From 86de56487e5f0017ffd5930b0dbd9dda43048849 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 30 Jul 2025 11:58:52 -0700 Subject: bpf: Allow syscall bpf programs to call non-recur helpers Allow syscall programs to call non-recur helpers too since syscall bpf programs runs in process context through bpf syscall, BPF_PROG_TEST_RUN, and cannot run recursively. bpf_task_storage_{get,set} have "_recur" versions that call trylock instead of taking the lock directly to avoid deadlock when called by bpf programs that run recursively. Currently, only bpf_lsm, bpf_iter, struct_ops without private stack are allow to call the non-recur helpers since they cannot be recursively called in another bpf program. Signed-off-by: Amery Hung Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20250730185903.3574598-2-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 94defa405c85..c823f8efe3ed 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -962,6 +962,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) case BPF_PROG_TYPE_STRUCT_OPS: return prog->aux->jits_use_priv_stack; case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_SYSCALL: return false; default: return true; -- cgit v1.2.3 From d87a513d093726d121dd5c816e26803111a259d0 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 6 Aug 2025 09:25:38 -0700 Subject: bpf: Allow struct_ops to get map id by kdata Add bpf_struct_ops_id() to enable struct_ops implementors to use struct_ops map id as the unique id of a struct_ops in their subsystem. A subsystem that wishes to create a mapping between id and struct_ops instance pointer can update the mapping accordingly during bpf_struct_ops::reg(), unreg(), and update(). Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250806162540.681679-2-ameryhung@gmail.com --- include/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc700925b802..e7ee089e8a31 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1985,6 +1985,7 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 687a3e9c76f5..a41e6730edcf 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1174,6 +1174,18 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } +u32 bpf_struct_ops_id(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + + return st_map->map.id; +} +EXPORT_SYMBOL_GPL(bpf_struct_ops_id); + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; -- cgit v1.2.3 From 8bcfcb3bd3e38b8f3bb7e5eb3acb4120500994a0 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Tue, 29 Jul 2025 15:06:32 +0200 Subject: ASoC: Intel: avs: Parse conditional path tuples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Conditional paths need information about their source and sink paths to be created which is then stored to keep track of who their parents are. That information allows to change their state accordingly to what is currently happening to their parent paths. Signed-off-by: Amadeusz Sławiński Signed-off-by: Cezary Rojewski Link: https://patch.msgid.link/20250729130633.310388-2-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/uapi/sound/intel/avs/tokens.h | 15 +++++ sound/soc/intel/avs/topology.c | 110 ++++++++++++++++++++++++++++++++++ sound/soc/intel/avs/topology.h | 8 +++ 3 files changed, 133 insertions(+) (limited to 'include') diff --git a/include/uapi/sound/intel/avs/tokens.h b/include/uapi/sound/intel/avs/tokens.h index c9f845b3c523..f3ff6aae09a9 100644 --- a/include/uapi/sound/intel/avs/tokens.h +++ b/include/uapi/sound/intel/avs/tokens.h @@ -133,6 +133,21 @@ enum avs_tplg_token { AVS_TKN_PATH_FE_FMT_ID_U32 = 1902, AVS_TKN_PATH_BE_FMT_ID_U32 = 1903, + /* struct avs_tplg_path_template (conditional) */ + AVS_TKN_CONDPATH_TMPL_ID_U32 = 1801, + AVS_TKN_CONDPATH_TMPL_SOURCE_TPLG_NAME_STRING = 2002, + AVS_TKN_CONDPATH_TMPL_SOURCE_PATH_TMPL_ID_U32 = 2003, + AVS_TKN_CONDPATH_TMPL_SINK_TPLG_NAME_STRING = 2004, + AVS_TKN_CONDPATH_TMPL_SINK_PATH_TMPL_ID_U32 = 2005, + AVS_TKN_CONDPATH_TMPL_COND_TYPE_U32 = 2006, + AVS_TKN_CONDPATH_TMPL_OVERRIDABLE_BOOL = 2007, + AVS_TKN_CONDPATH_TMPL_PRIORITY_U8 = 2008, + + /* struct avs_tplg_path (conditional) */ + AVS_TKN_CONDPATH_ID_U32 = 1901, + AVS_TKN_CONDPATH_SOURCE_PATH_ID_U32 = 2102, + AVS_TKN_CONDPATH_SINK_PATH_ID_U32 = 2103, + /* struct avs_tplg_pin_format */ AVS_TKN_PIN_FMT_INDEX_U32 = 2201, AVS_TKN_PIN_FMT_IOBS_U32 = 2202, diff --git a/sound/soc/intel/avs/topology.c b/sound/soc/intel/avs/topology.c index f2e4ad8b8e14..dfe8cf505381 100644 --- a/sound/soc/intel/avs/topology.c +++ b/sound/soc/intel/avs/topology.c @@ -1387,6 +1387,27 @@ static const struct avs_tplg_token_parser path_parsers[] = { }, }; +static const struct avs_tplg_token_parser condpath_parsers[] = { + { + .token = AVS_TKN_CONDPATH_ID_U32, + .type = SND_SOC_TPLG_TUPLE_TYPE_WORD, + .offset = offsetof(struct avs_tplg_path, id), + .parse = avs_parse_word_token, + }, + { + .token = AVS_TKN_CONDPATH_SOURCE_PATH_ID_U32, + .type = SND_SOC_TPLG_TUPLE_TYPE_WORD, + .offset = offsetof(struct avs_tplg_path, source_path_id), + .parse = avs_parse_word_token, + }, + { + .token = AVS_TKN_CONDPATH_SINK_PATH_ID_U32, + .type = SND_SOC_TPLG_TUPLE_TYPE_WORD, + .offset = offsetof(struct avs_tplg_path, sink_path_id), + .parse = avs_parse_word_token, + }, +}; + static struct avs_tplg_path * avs_tplg_path_create(struct snd_soc_component *comp, struct avs_tplg_path_template *owner, struct snd_soc_tplg_vendor_array *tuples, u32 block_size, @@ -1454,6 +1475,39 @@ static const struct avs_tplg_token_parser path_tmpl_parsers[] = { }, }; +static const struct avs_tplg_token_parser condpath_tmpl_parsers[] = { + { + .token = AVS_TKN_CONDPATH_TMPL_ID_U32, + .type = SND_SOC_TPLG_TUPLE_TYPE_WORD, + .offset = offsetof(struct avs_tplg_path_template, id), + .parse = avs_parse_word_token, + }, + { + .token = AVS_TKN_CONDPATH_TMPL_SOURCE_TPLG_NAME_STRING, + .type = SND_SOC_TPLG_TUPLE_TYPE_STRING, + .offset = offsetof(struct avs_tplg_path_template, source.tplg_name), + .parse = avs_parse_string_token, + }, + { + .token = AVS_TKN_CONDPATH_TMPL_SOURCE_PATH_TMPL_ID_U32, + .type = SND_SOC_TPLG_TUPLE_TYPE_WORD, + .offset = offsetof(struct avs_tplg_path_template, source.id), + .parse = avs_parse_word_token, + }, + { + .token = AVS_TKN_CONDPATH_TMPL_SINK_TPLG_NAME_STRING, + .type = SND_SOC_TPLG_TUPLE_TYPE_STRING, + .offset = offsetof(struct avs_tplg_path_template, sink.tplg_name), + .parse = avs_parse_string_token, + }, + { + .token = AVS_TKN_CONDPATH_TMPL_SINK_PATH_TMPL_ID_U32, + .type = SND_SOC_TPLG_TUPLE_TYPE_WORD, + .offset = offsetof(struct avs_tplg_path_template, sink.id), + .parse = avs_parse_word_token, + }, +}; + static int parse_path_template(struct snd_soc_component *comp, struct snd_soc_tplg_vendor_array *tuples, u32 block_size, struct avs_tplg_path_template *template, @@ -1524,6 +1578,56 @@ avs_tplg_path_template_create(struct snd_soc_component *comp, struct avs_tplg *o return template; } +static int avs_tplg_parse_condpath_templates(struct snd_soc_component *comp, + struct snd_soc_tplg_vendor_array *tuples, + u32 block_size) +{ + struct avs_soc_component *acomp = to_avs_soc_component(comp); + struct avs_tplg *tplg = acomp->tplg; + int ret, i; + + ret = parse_dictionary_header(comp, tuples, (void **)&tplg->condpath_tmpls, + &tplg->num_condpath_tmpls, + sizeof(*tplg->condpath_tmpls), + AVS_TKN_MANIFEST_NUM_CONDPATH_TMPLS_U32); + if (ret) + return ret; + + block_size -= le32_to_cpu(tuples->size); + /* With header parsed, move on to parsing entries. */ + tuples = avs_tplg_vendor_array_next(tuples); + + for (i = 0; i < tplg->num_condpath_tmpls; i++) { + struct avs_tplg_path_template *template; + u32 esize; + + template = &tplg->condpath_tmpls[i]; + template->owner = tplg; /* Used when building sysfs hierarchy. */ + INIT_LIST_HEAD(&template->path_list); + INIT_LIST_HEAD(&template->node); + + ret = avs_tplg_vendor_entry_size(tuples, block_size, + AVS_TKN_CONDPATH_TMPL_ID_U32, &esize); + if (ret) + return ret; + + ret = parse_path_template(comp, tuples, esize, template, + condpath_tmpl_parsers, + ARRAY_SIZE(condpath_tmpl_parsers), + condpath_parsers, + ARRAY_SIZE(condpath_parsers)); + if (ret < 0) { + dev_err(comp->dev, "parse condpath_tmpl: %d failed: %d\n", i, ret); + return ret; + } + + block_size -= esize; + tuples = avs_tplg_vendor_array_at(tuples, esize); + } + + return 0; +} + static const struct avs_tplg_token_parser mod_init_config_parsers[] = { { .token = AVS_TKN_INIT_CONFIG_ID_U32, @@ -1891,6 +1995,12 @@ static int avs_manifest(struct snd_soc_component *comp, int index, return ret; } + /* Condpaths dictionary. */ + ret = avs_tplg_parse_condpath_templates(comp, tuples, + has_init_config ? offset : remaining); + if (ret < 0) + return ret; + if (!has_init_config) return 0; diff --git a/sound/soc/intel/avs/topology.h b/sound/soc/intel/avs/topology.h index f5601a4e3ec8..1e83fccf2ea2 100644 --- a/sound/soc/intel/avs/topology.h +++ b/sound/soc/intel/avs/topology.h @@ -33,6 +33,7 @@ struct avs_tplg { u32 num_pplcfgs; struct avs_tplg_binding *bindings; u32 num_bindings; + struct avs_tplg_path_template *condpath_tmpls; u32 num_condpath_tmpls; struct avs_tplg_init_config *init_configs; u32 num_init_configs; @@ -155,6 +156,10 @@ struct avs_tplg_path_template { struct snd_soc_dapm_widget *w; + /* Conditional path. */ + struct avs_tplg_path_template_id source; + struct avs_tplg_path_template_id sink; + struct list_head path_list; struct avs_tplg *owner; @@ -176,6 +181,9 @@ struct avs_tplg_path { /* Path format requirements. */ struct avs_audio_format *fe_fmt; struct avs_audio_format *be_fmt; + /* Condpath path-variant requirements. */ + u32 source_path_id; + u32 sink_path_id; struct list_head ppl_list; -- cgit v1.2.3 From 40229ea9fa437d6a1feb86be9dd419e843ec754c Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 6 Aug 2025 04:41:41 +0000 Subject: ASoC: soc-dapm: move snd_soc_dapm_get_bias_level() to soc-dpcm Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. Because dapm will not be used on user driver in the future, Let's move snd_soc_dapm_get_bias_level() to soc-dpcm.c Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/874iul83ju.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 13 +------------ sound/soc/soc-dapm.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 0b5c7e6a90c8..f1318cdcf7e4 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -722,6 +722,7 @@ struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_dapm(struct snd_kcontrol *kco struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_widget(struct snd_kcontrol *kcontrol); int snd_soc_dapm_force_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level); +enum snd_soc_bias_level snd_soc_dapm_get_bias_level(struct snd_soc_dapm_context *dapm); #define for_each_dapm_widgets(list, i, widget) \ for ((i) = 0; \ @@ -747,18 +748,6 @@ static inline void snd_soc_dapm_init_bias_level( dapm->bias_level = level; } -/** - * snd_soc_dapm_get_bias_level() - Get current DAPM bias level - * @dapm: The context for which to get the bias level - * - * Returns: The current bias level of the passed DAPM context. - */ -static inline enum snd_soc_bias_level snd_soc_dapm_get_bias_level( - struct snd_soc_dapm_context *dapm) -{ - return dapm->bias_level; -} - /** * snd_soc_dapm_widget_for_each_path - Iterates over all paths in the * specified direction of a widget diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index a37d44cd04c6..6ee03cf16171 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1037,6 +1037,18 @@ out: return ret; } +/** + * snd_soc_dapm_get_bias_level() - Get current DAPM bias level + * @dapm: The context for which to get the bias level + * + * Returns: The current bias level of the passed DAPM context. + */ +enum snd_soc_bias_level snd_soc_dapm_get_bias_level(struct snd_soc_dapm_context *dapm) +{ + return dapm->bias_level; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_get_bias_level); + static int dapm_is_shared_kcontrol(struct snd_soc_dapm_context *dapm, struct snd_soc_dapm_widget *kcontrolw, const struct snd_kcontrol_new *kcontrol_new, -- cgit v1.2.3 From e93703687cd75a0c7d330e15a3d9ff20b12f3d1d Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 6 Aug 2025 04:41:46 +0000 Subject: ASoC: soc-dapm: move snd_soc_dapm_init_bias_level() to soc-dpcm Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. Because dapm will not be used on user driver in the future, Let's move snd_soc_dapm_init_bias_level() to soc-dpcm.c Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/8734a583jp.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 20 +------------------- sound/soc/soc-dapm.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index f1318cdcf7e4..53bf6590bd4b 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -723,31 +723,13 @@ struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_widget(struct snd_kcontrol *kc int snd_soc_dapm_force_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level); enum snd_soc_bias_level snd_soc_dapm_get_bias_level(struct snd_soc_dapm_context *dapm); +void snd_soc_dapm_init_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level); #define for_each_dapm_widgets(list, i, widget) \ for ((i) = 0; \ (i) < list->num_widgets && (widget = list->widgets[i]); \ (i)++) -/** - * snd_soc_dapm_init_bias_level() - Initialize DAPM bias level - * @dapm: The DAPM context to initialize - * @level: The DAPM level to initialize to - * - * This function only sets the driver internal state of the DAPM level and will - * not modify the state of the device. Hence it should not be used during normal - * operation, but only to synchronize the internal state to the device state. - * E.g. during driver probe to set the DAPM level to the one corresponding with - * the power-on reset state of the device. - * - * To change the DAPM state of the device use snd_soc_dapm_set_bias_level(). - */ -static inline void snd_soc_dapm_init_bias_level( - struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level) -{ - dapm->bias_level = level; -} - /** * snd_soc_dapm_widget_for_each_path - Iterates over all paths in the * specified direction of a widget diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 6ee03cf16171..6782a0d6cd47 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -999,6 +999,25 @@ int snd_soc_dapm_force_bias_level(struct snd_soc_dapm_context *dapm, } EXPORT_SYMBOL_GPL(snd_soc_dapm_force_bias_level); +/** + * snd_soc_dapm_init_bias_level() - Initialize DAPM bias level + * @dapm: The DAPM context to initialize + * @level: The DAPM level to initialize to + * + * This function only sets the driver internal state of the DAPM level and will + * not modify the state of the device. Hence it should not be used during normal + * operation, but only to synchronize the internal state to the device state. + * E.g. during driver probe to set the DAPM level to the one corresponding with + * the power-on reset state of the device. + * + * To change the DAPM state of the device use snd_soc_dapm_set_bias_level(). + */ +void snd_soc_dapm_init_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level) +{ + dapm->bias_level = level; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_init_bias_level); + /** * snd_soc_dapm_set_bias_level - set the bias level for the system * @dapm: DAPM context -- cgit v1.2.3 From 7b900b5878a910d60ecfa67448bbe81e4e2bb8b7 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 6 Aug 2025 04:41:51 +0000 Subject: ASoC: soc-component: unpack snd_soc_component_force_bias_level() Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. This patch unpack component wrapper to cleanup it. The function will be kept by using macro for a while, but will be replaced/cleanuped in the future. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/871ppp83jk.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 17 ----------------- include/sound/soc-dapm.h | 3 +++ 2 files changed, 3 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 2caa807c6249..c616f489c237 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -314,23 +314,6 @@ snd_soc_component_get_bias_level(struct snd_soc_component *component) snd_soc_component_get_dapm(component)); } -/** - * snd_soc_component_force_bias_level() - Set the COMPONENT DAPM bias level - * @component: The COMPONENT for which to set the level - * @level: The level to set to - * - * Forces the COMPONENT bias level to a specific state. See - * snd_soc_dapm_force_bias_level(). - */ -static inline int -snd_soc_component_force_bias_level(struct snd_soc_component *component, - enum snd_soc_bias_level level) -{ - return snd_soc_dapm_force_bias_level( - snd_soc_component_get_dapm(component), - level); -} - /** * snd_soc_dapm_kcontrol_component() - Returns the component associated to a * kcontrol diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 53bf6590bd4b..5bd9a27b12ba 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -725,6 +725,9 @@ int snd_soc_dapm_force_bias_level(struct snd_soc_dapm_context *dapm, enum snd_so enum snd_soc_bias_level snd_soc_dapm_get_bias_level(struct snd_soc_dapm_context *dapm); void snd_soc_dapm_init_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level); +// REMOVE ME !! +#define snd_soc_component_force_bias_level(c, l) snd_soc_dapm_force_bias_level(&(c)->dapm, l) + #define for_each_dapm_widgets(list, i, widget) \ for ((i) = 0; \ (i) < list->num_widgets && (widget = list->widgets[i]); \ -- cgit v1.2.3 From 7509e7e4288976e3028a6e8482d979ca77f584a7 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 6 Aug 2025 04:41:55 +0000 Subject: ASoC: soc-component: unpack snd_soc_component_get_bias_level() Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. This patch unpack component wrapper to cleanup it. The function will be kept by using macro for a while, but will be replaced/cleanuped in the future. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87zfcd6oz0.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 13 ------------- include/sound/soc-dapm.h | 1 + 2 files changed, 1 insertion(+), 13 deletions(-) (limited to 'include') diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index c616f489c237..86ad2802879f 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -301,19 +301,6 @@ snd_soc_component_init_bias_level(struct snd_soc_component *component, snd_soc_component_get_dapm(component), level); } -/** - * snd_soc_component_get_bias_level() - Get current COMPONENT DAPM bias level - * @component: The COMPONENT for which to get the DAPM bias level - * - * Returns: The current DAPM bias level of the COMPONENT. - */ -static inline enum snd_soc_bias_level -snd_soc_component_get_bias_level(struct snd_soc_component *component) -{ - return snd_soc_dapm_get_bias_level( - snd_soc_component_get_dapm(component)); -} - /** * snd_soc_dapm_kcontrol_component() - Returns the component associated to a * kcontrol diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 5bd9a27b12ba..6e2db79d6a97 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -727,6 +727,7 @@ void snd_soc_dapm_init_bias_level(struct snd_soc_dapm_context *dapm, enum snd_so // REMOVE ME !! #define snd_soc_component_force_bias_level(c, l) snd_soc_dapm_force_bias_level(&(c)->dapm, l) +#define snd_soc_component_get_bias_level(c) snd_soc_dapm_get_bias_level(&(c)->dapm) #define for_each_dapm_widgets(list, i, widget) \ for ((i) = 0; \ -- cgit v1.2.3 From cf25eb8eae91bcae9b2065d84b0c0ba0f6d9dd34 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 6 Aug 2025 04:42:05 +0000 Subject: ASoC: soc-component: unpack snd_soc_component_init_bias_level() Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. This patch unpack component wrapper to cleanup it. This patch keeps compatible by using define, but old name will be replaced on each drivers and removed from ASoC in the future. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87y0rx6oyx.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 15 --------------- include/sound/soc-dapm.h | 1 + 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'include') diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 86ad2802879f..54bfa0cb1085 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -286,21 +286,6 @@ static inline struct snd_soc_dapm_context *snd_soc_component_get_dapm( return &component->dapm; } -/** - * snd_soc_component_init_bias_level() - Initialize COMPONENT DAPM bias level - * @component: The COMPONENT for which to initialize the DAPM bias level - * @level: The DAPM level to initialize to - * - * Initializes the COMPONENT DAPM bias level. See snd_soc_dapm_init_bias_level() - */ -static inline void -snd_soc_component_init_bias_level(struct snd_soc_component *component, - enum snd_soc_bias_level level) -{ - snd_soc_dapm_init_bias_level( - snd_soc_component_get_dapm(component), level); -} - /** * snd_soc_dapm_kcontrol_component() - Returns the component associated to a * kcontrol diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 6e2db79d6a97..2e9196b6ffba 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -728,6 +728,7 @@ void snd_soc_dapm_init_bias_level(struct snd_soc_dapm_context *dapm, enum snd_so // REMOVE ME !! #define snd_soc_component_force_bias_level(c, l) snd_soc_dapm_force_bias_level(&(c)->dapm, l) #define snd_soc_component_get_bias_level(c) snd_soc_dapm_get_bias_level(&(c)->dapm) +#define snd_soc_component_init_bias_level(c, l) snd_soc_dapm_init_bias_level(&(c)->dapm, l) #define for_each_dapm_widgets(list, i, widget) \ for ((i) = 0; \ -- cgit v1.2.3 From 32dffd4c3e3129e3d9bb378af8d80bb57dc3038b Mon Sep 17 00:00:00 2001 From: Chancel Liu Date: Fri, 8 Aug 2025 15:17:41 +0900 Subject: ASoC: dmaengine_pcm: Add port_window_size to DAI dma data struct The port_window_size is a struct member of dma slave channel runtime config. It's the length of the register area in words the data need to be accessed on the device side. It is only used for devices which is using an area instead of a single register to send or receive the data. Typically the DMA loops in this area in order to transfer the data. It's useful for cases that reading/writing multiple registers in DMA transactions. Signed-off-by: Chancel Liu Link: https://patch.msgid.link/20250808061741.187414-1-chancel.liu@nxp.com Signed-off-by: Mark Brown --- include/sound/dmaengine_pcm.h | 5 +++++ sound/core/pcm_dmaengine.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/sound/dmaengine_pcm.h b/include/sound/dmaengine_pcm.h index 1ef13bcdc43f..9472f0a966a2 100644 --- a/include/sound/dmaengine_pcm.h +++ b/include/sound/dmaengine_pcm.h @@ -69,6 +69,10 @@ struct dma_chan *snd_dmaengine_pcm_get_chan(struct snd_pcm_substream *substream) * @peripheral_config: peripheral configuration for programming peripheral * for dmaengine transfer * @peripheral_size: peripheral configuration buffer size + * @port_window_size: The length of the register area in words the data need + * to be accessed on the device side. It is only used for devices which is using + * an area instead of a single register to send/receive the data. Typically the + * DMA loops in this area in order to transfer the data. */ struct snd_dmaengine_dai_dma_data { dma_addr_t addr; @@ -80,6 +84,7 @@ struct snd_dmaengine_dai_dma_data { unsigned int flags; void *peripheral_config; size_t peripheral_size; + u32 port_window_size; }; void snd_dmaengine_pcm_set_config_from_dai_data( diff --git a/sound/core/pcm_dmaengine.c b/sound/core/pcm_dmaengine.c index 72040964b6fd..f0c17503df42 100644 --- a/sound/core/pcm_dmaengine.c +++ b/sound/core/pcm_dmaengine.c @@ -111,6 +111,7 @@ void snd_dmaengine_pcm_set_config_from_dai_data( if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { slave_config->dst_addr = dma_data->addr; slave_config->dst_maxburst = dma_data->maxburst; + slave_config->dst_port_window_size = dma_data->port_window_size; if (dma_data->flags & SND_DMAENGINE_PCM_DAI_FLAG_PACK) slave_config->dst_addr_width = DMA_SLAVE_BUSWIDTH_UNDEFINED; @@ -119,6 +120,7 @@ void snd_dmaengine_pcm_set_config_from_dai_data( } else { slave_config->src_addr = dma_data->addr; slave_config->src_maxburst = dma_data->maxburst; + slave_config->src_port_window_size = dma_data->port_window_size; if (dma_data->flags & SND_DMAENGINE_PCM_DAI_FLAG_PACK) slave_config->src_addr_width = DMA_SLAVE_BUSWIDTH_UNDEFINED; -- cgit v1.2.3 From e83dcd139e776ebb86d5e88e13282580407278e4 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Sun, 3 Aug 2025 21:11:10 +0800 Subject: ASoC: tas2781: Add keyword "init" in profile section Since version 0x105, the keyword 'init' was introduced into the profile, which is used for chip initialization, particularly to store common settings for other non-initialization profiles. Signed-off-by: Shenghao Ding Link: https://patch.msgid.link/20250803131110.1443-1-shenghao-ding@ti.com Signed-off-by: Mark Brown --- include/sound/tas2781-dsp.h | 8 ++++++++ sound/soc/codecs/tas2781-fmwlib.c | 12 ++++++++++++ sound/soc/codecs/tas2781-i2c.c | 6 ++++++ 3 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/sound/tas2781-dsp.h b/include/sound/tas2781-dsp.h index c3a9efa73d5d..a21f34c0266e 100644 --- a/include/sound/tas2781-dsp.h +++ b/include/sound/tas2781-dsp.h @@ -198,6 +198,14 @@ struct tasdevice_rca { int ncfgs; struct tasdevice_config_info **cfg_info; int profile_cfg_id; + /* + * Since version 0x105, the keyword 'init' was introduced into the + * profile, which is used for chip initialization, particularly to + * store common settings for other non-initialization profiles. + * if (init_profile_id < 0) + * No init profile inside the RCA firmware. + */ + int init_profile_id; }; void tasdevice_select_cfg_blk(void *context, int conf_no, diff --git a/sound/soc/codecs/tas2781-fmwlib.c b/sound/soc/codecs/tas2781-fmwlib.c index c9c1e608ddb7..8baf56237624 100644 --- a/sound/soc/codecs/tas2781-fmwlib.c +++ b/sound/soc/codecs/tas2781-fmwlib.c @@ -180,6 +180,16 @@ static struct tasdevice_config_info *tasdevice_add_config( dev_err(tas_priv->dev, "add conf: Out of boundary\n"); goto out; } + /* If in the RCA bin file are several profiles with the + * keyword "init", init_profile_id only store the last + * init profile id. + */ + if (strnstr(&config_data[config_offset], "init", 64)) { + tas_priv->rcabin.init_profile_id = + tas_priv->rcabin.ncfgs - 1; + dev_dbg(tas_priv->dev, "%s: init profile id = %d\n", + __func__, tas_priv->rcabin.init_profile_id); + } config_offset += 64; } @@ -283,6 +293,8 @@ int tasdevice_rca_parser(void *context, const struct firmware *fmw) int i; rca = &(tas_priv->rcabin); + /* Initialize to none */ + rca->init_profile_id = -1; fw_hdr = &(rca->fw_hdr); if (!fmw || !fmw->data) { dev_err(tas_priv->dev, "Failed to read %s\n", diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c index 9f4d965a1335..f8faeffd983d 100644 --- a/sound/soc/codecs/tas2781-i2c.c +++ b/sound/soc/codecs/tas2781-i2c.c @@ -1641,6 +1641,12 @@ static void tasdevice_fw_ready(const struct firmware *fmw, tasdevice_prmg_load(tas_priv, 0); tas_priv->cur_prog = 0; + /* Init common setting for different audio profiles */ + if (tas_priv->rcabin.init_profile_id >= 0) + tasdevice_select_cfg_blk(tas_priv, + tas_priv->rcabin.init_profile_id, + TASDEVICE_BIN_BLK_PRE_POWER_UP); + #ifdef CONFIG_SND_SOC_TAS2781_ACOUST_I2C if (tas_priv->name_prefix) acoustic_debugfs_node = devm_kasprintf(tas_priv->dev, -- cgit v1.2.3 From f4ee43ae6ea8c509d470f20e7b446adf5a167dba Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Sun, 10 Aug 2025 20:23:58 +0800 Subject: ALSA: hda: Add TAS5825 support Add TAS5825 support in TI's HDA driver. TAS5825 is an on-chip DSP, but no calibration is required, and no global address support smart amplifier devices. Signed-off-by: Baojun Xu Acked-by: Mark Brown Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250810122358.1575-1-baojun.xu@ti.com --- include/sound/tas2781-dsp.h | 3 ++ include/sound/tas2781.h | 4 +- include/sound/tas5825-tlv.h | 24 +++++++++ sound/hda/codecs/side-codecs/tas2781_hda_i2c.c | 28 +++++++++++ sound/soc/codecs/tas2781-fmwlib.c | 67 ++++++++++++++++++++++++-- 5 files changed, 121 insertions(+), 5 deletions(-) create mode 100644 include/sound/tas5825-tlv.h (limited to 'include') diff --git a/include/sound/tas2781-dsp.h b/include/sound/tas2781-dsp.h index c3a9efa73d5d..49bbf24d6559 100644 --- a/include/sound/tas2781-dsp.h +++ b/include/sound/tas2781-dsp.h @@ -34,6 +34,7 @@ #define PPC3_VERSION_TAS2781_BASIC_MIN 0x14600 #define PPC3_VERSION_TAS2781_ALPHA_MIN 0x4a00 #define PPC3_VERSION_TAS2781_BETA_MIN 0x19400 +#define PPC3_VERSION_TAS5825_BASE 0x114200 #define TASDEVICE_DEVICE_SUM 8 #define TASDEVICE_CONFIG_SUM 64 @@ -53,6 +54,8 @@ enum tasdevice_dsp_dev_idx { TASDEVICE_DSP_TAS_2781_DUAL_MONO, TASDEVICE_DSP_TAS_2781_21, TASDEVICE_DSP_TAS_2781_QUAD, + TASDEVICE_DSP_TAS_5825_MONO, + TASDEVICE_DSP_TAS_5825_DUAL, TASDEVICE_DSP_TAS_MAX_DEVICE }; diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h index 3875e92f1ec5..f0aefc04a957 100644 --- a/include/sound/tas2781.h +++ b/include/sound/tas2781.h @@ -49,9 +49,9 @@ #define TASDEVICE_REG(book, page, reg) (((book * 256 * 128) + \ (page * 128)) + reg) -/* Software Reset */ +/* Software Reset, compatble with new device (TAS5825). */ #define TASDEVICE_REG_SWRESET TASDEVICE_REG(0x0, 0x0, 0x01) -#define TASDEVICE_REG_SWRESET_RESET BIT(0) +#define TASDEVICE_REG_SWRESET_RESET (BIT(0) | BIT(4)) /* Checksum */ #define TASDEVICE_CHECKSUM_REG TASDEVICE_REG(0x0, 0x0, 0x7e) diff --git a/include/sound/tas5825-tlv.h b/include/sound/tas5825-tlv.h new file mode 100644 index 000000000000..95f2d3fad120 --- /dev/null +++ b/include/sound/tas5825-tlv.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// +// ALSA SoC Texas Instruments TAS5825 Audio Smart Amplifier +// +// Copyright (C) 2025 Texas Instruments Incorporated +// https://www.ti.com +// +// The TAS5825 hda driver implements for one or two TAS5825 chips. +// +// Author: Baojun Xu +// + +#ifndef __TAS5825_TLV_H__ +#define __TAS5825_TLV_H__ + +#define TAS5825_DVC_LEVEL TASDEVICE_REG(0x0, 0x0, 0x4c) +#define TAS5825_AMP_LEVEL TASDEVICE_REG(0x0, 0x0, 0x54) + +static const __maybe_unused DECLARE_TLV_DB_SCALE( + tas5825_dvc_tlv, -10300, 50, 0); +static const __maybe_unused DECLARE_TLV_DB_SCALE( + tas5825_amp_tlv, -1550, 50, 0); + +#endif diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c index e1d60da50897..8fae0fea37a0 100644 --- a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c +++ b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "hda_local.h" #include "hda_auto_parser.h" @@ -50,6 +51,7 @@ enum device_chip_id { HDA_TAS2563, HDA_TAS2770, HDA_TAS2781, + HDA_TAS5825, HDA_OTHERS }; @@ -272,6 +274,17 @@ static const struct snd_kcontrol_new tas2781_snd_controls[] = { tas2781_force_fwload_get, tas2781_force_fwload_put), }; +static const struct snd_kcontrol_new tas5825_snd_controls[] = { + ACARD_SINGLE_RANGE_EXT_TLV("Speaker Analog Volume", TAS5825_AMP_LEVEL, + 0, 0, 31, 1, tas2781_amp_getvol, + tas2781_amp_putvol, tas5825_amp_tlv), + ACARD_SINGLE_RANGE_EXT_TLV("Speaker Digital Volume", TAS5825_DVC_LEVEL, + 0, 0, 254, 1, tas2781_amp_getvol, + tas2781_amp_putvol, tas5825_dvc_tlv), + ACARD_SINGLE_BOOL_EXT("Speaker Force Firmware Load", 0, + tas2781_force_fwload_get, tas2781_force_fwload_put), +}; + static const struct snd_kcontrol_new tasdevice_prof_ctrl = { .name = "Speaker Profile Id", .iface = SNDRV_CTL_ELEM_IFACE_CARD, @@ -501,6 +514,12 @@ static void tasdev_fw_ready(const struct firmware *fmw, void *context) ARRAY_SIZE(tas2781_snd_controls)); tasdevice_dspfw_init(context); break; + case HDA_TAS5825: + tasdev_add_kcontrols(tas_priv, hda_priv->snd_ctls, codec, + &tas5825_snd_controls[0], + ARRAY_SIZE(tas5825_snd_controls)); + tasdevice_dspfw_init(context); + break; case HDA_TAS2563: tasdevice_dspfw_init(context); break; @@ -628,6 +647,7 @@ static int tas2781_hda_i2c_probe(struct i2c_client *clt) } else if (strstarts(dev_name(&clt->dev), "i2c-TXNW2781:00-tas2781-hda.0")) { device_name = "TXNW2781"; + hda_priv->hda_chip_id = HDA_TAS2781; hda_priv->save_calibration = tas2781_save_calibration; tas_hda->priv->global_addr = TAS2781_GLOBAL_ADDR; } else if (strstr(dev_name(&clt->dev), "INT8866")) { @@ -639,6 +659,13 @@ static int tas2781_hda_i2c_probe(struct i2c_client *clt) hda_priv->hda_chip_id = HDA_TAS2563; hda_priv->save_calibration = tas2563_save_calibration; tas_hda->priv->global_addr = TAS2563_GLOBAL_ADDR; + } else if (strstarts(dev_name(&clt->dev), "i2c-TXNW5825")) { + /* + * TAS5825, integrated on-chip DSP without + * global I2C address and calibration supported. + */ + device_name = "TXNW5825"; + hda_priv->hda_chip_id = HDA_TAS5825; } else { return -ENODEV; } @@ -775,6 +802,7 @@ static const struct acpi_device_id tas2781_acpi_hda_match[] = { {"TIAS2781", 0 }, {"TXNW2770", 0 }, {"TXNW2781", 0 }, + {"TXNW5825", 0 }, {} }; MODULE_DEVICE_TABLE(acpi, tas2781_acpi_hda_match); diff --git a/sound/soc/codecs/tas2781-fmwlib.c b/sound/soc/codecs/tas2781-fmwlib.c index c9c1e608ddb7..d69faef8a4d7 100644 --- a/sound/soc/codecs/tas2781-fmwlib.c +++ b/sound/soc/codecs/tas2781-fmwlib.c @@ -91,7 +91,7 @@ struct blktyp_devidx_map { }; static const char deviceNumber[TASDEVICE_DSP_TAS_MAX_DEVICE] = { - 1, 2, 1, 2, 1, 1, 0, 2, 4, 3, 1, 2, 3, 4 + 1, 2, 1, 2, 1, 1, 0, 2, 4, 3, 1, 2, 3, 4, 1, 2 }; /* fixed m68k compiling issue: mapping table can save code field */ @@ -509,6 +509,56 @@ out: return offset; } +static int fw_parse_tas5825_program_data_kernel( + struct tasdevice_priv *tas_priv, struct tasdevice_fw *tas_fmw, + const struct firmware *fmw, int offset) +{ + struct tasdevice_prog *program; + unsigned int i; + + for (i = 0; i < tas_fmw->nr_programs; i++) { + program = &(tas_fmw->programs[i]); + if (offset + 72 > fmw->size) { + dev_err(tas_priv->dev, "%s: mpName error\n", __func__); + return -EINVAL; + } + /* Skip 65 unused byts*/ + offset += 65; + offset = fw_parse_data_kernel(tas_fmw, &(program->dev_data), + fmw, offset); + if (offset < 0) + return offset; + } + + return offset; +} + +static int fw_parse_tas5825_configuration_data_kernel( + struct tasdevice_priv *tas_priv, + struct tasdevice_fw *tas_fmw, const struct firmware *fmw, int offset) +{ + const unsigned char *data = fmw->data; + struct tasdevice_config *config; + unsigned int i; + + for (i = 0; i < tas_fmw->nr_configurations; i++) { + config = &(tas_fmw->configs[i]); + if (offset + 80 > fmw->size) { + dev_err(tas_priv->dev, "%s: mpName error\n", __func__); + return -EINVAL; + } + memcpy(config->name, &data[offset], 64); + /* Skip extra 8 bytes*/ + offset += 72; + offset = fw_parse_data_kernel(tas_fmw, &(config->dev_data), + fmw, offset); + if (offset < 0) + return offset; + } + + return offset; +} + static int fw_parse_program_data_kernel( struct tasdevice_priv *tas_priv, struct tasdevice_fw *tas_fmw, const struct firmware *fmw, int offset) @@ -1826,7 +1876,8 @@ static void dspbin_type_check(struct tasdevice_priv *tas_priv, else tas_priv->dspbin_typ = TASDEV_ALPHA; } - if (tas_priv->dspbin_typ != TASDEV_BASIC) + if ((tas_priv->dspbin_typ != TASDEV_BASIC) && + (ppcver < PPC3_VERSION_TAS5825_BASE)) tas_priv->fw_parse_fct_param_address = fw_parse_fct_param_address; } @@ -1837,7 +1888,17 @@ static int dspfw_default_callback(struct tasdevice_priv *tas_priv, int rc = 0; if (drv_ver == 0x100) { - if (ppcver >= PPC3_VERSION_BASE) { + if (ppcver >= PPC3_VERSION_TAS5825_BASE) { + tas_priv->fw_parse_variable_header = + fw_parse_variable_header_kernel; + tas_priv->fw_parse_program_data = + fw_parse_tas5825_program_data_kernel; + tas_priv->fw_parse_configuration_data = + fw_parse_tas5825_configuration_data_kernel; + tas_priv->tasdevice_load_block = + tasdevice_load_block_kernel; + dspbin_type_check(tas_priv, ppcver); + } else if (ppcver >= PPC3_VERSION_BASE) { tas_priv->fw_parse_variable_header = fw_parse_variable_header_kernel; tas_priv->fw_parse_program_data = -- cgit v1.2.3 From 17e8b7e08fa8bf7a936f70444a42a88750410251 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jul 2025 09:48:54 +0200 Subject: fs: mark file_remove_privs_flags static file_remove_privs_flags is only used inside of inode.c, mark it static. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250724074854.3316911-1-hch@lst.de Signed-off-by: Christian Brauner --- fs/inode.c | 3 +-- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/fs/inode.c b/fs/inode.c index 01ebdc40021e..3cbb78412a3e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2189,7 +2189,7 @@ static int __remove_privs(struct mnt_idmap *idmap, return notify_change(idmap, dentry, &newattrs, NULL); } -int file_remove_privs_flags(struct file *file, unsigned int flags) +static int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); @@ -2214,7 +2214,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags) inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..796319914b0a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3393,7 +3393,6 @@ static inline struct inode *new_inode_pseudo(struct super_block *sb) extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); -extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); -- cgit v1.2.3 From 4e021920812d164bb02c30cc40e08a3681b1c755 Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Wed, 30 Jul 2025 20:18:53 +0000 Subject: fs: document 'name' parameter for name_contains_dotdot() The kernel-doc for name_contains_dotdot() was missing the @name parameter description, leading to a warning during make htmldocs. Add the missing documentation to resolve this warning. Signed-off-by: Kriish Sharma Link: https://lore.kernel.org/20250730201853.8436-1-kriish.sharma2006@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 796319914b0a..780e9c774c54 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3281,7 +3281,7 @@ static inline bool is_dot_dotdot(const char *name, size_t len) /** * name_contains_dotdot - check if a file name contains ".." path components - * + * @name: File path string to check * Search for ".." surrounded by either '/' or start/end of string. */ static inline bool name_contains_dotdot(const char *name) -- cgit v1.2.3 From 56ecfd9175b999dfc303ac6a0f9ea4bd1bee49d7 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 23 Jul 2025 14:21:54 +0100 Subject: fs: Remove mount_nodev mount_nodev has had no in-tree users since cc0876f817d6 ("vfs: Convert devpts to use the new mount API"). Remove it. Signed-off-by: Pedro Falcato Link: https://lore.kernel.org/20250723132156.225410-2-pfalcato@suse.de Signed-off-by: Christian Brauner --- fs/super.c | 20 -------------------- include/linux/fs.h | 3 --- 2 files changed, 23 deletions(-) (limited to 'include') diff --git a/fs/super.c b/fs/super.c index 7f876f32343a..7daa20737f2e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1773,26 +1773,6 @@ void kill_block_super(struct super_block *sb) EXPORT_SYMBOL(kill_block_super); #endif -struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); - - if (IS_ERR(s)) - return ERR_CAST(s); - - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_nodev); - /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..204328ed7ebb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2716,9 +2716,6 @@ static inline bool is_mgtime(const struct inode *inode) extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); -- cgit v1.2.3 From f7d161c2804f3ad36bdc3222cb93c8fee67be98c Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 23 Jul 2025 14:21:55 +0100 Subject: fs: Remove mount_bdev mount_bdev has no in-tree users ever since f2fs adopted the new mount API. Remove it. Signed-off-by: Pedro Falcato Link: https://lore.kernel.org/20250723132156.225410-3-pfalcato@suse.de Signed-off-by: Christian Brauner --- fs/super.c | 43 ------------------------------------------- include/linux/fs.h | 3 --- 2 files changed, 46 deletions(-) (limited to 'include') diff --git a/fs/super.c b/fs/super.c index 7daa20737f2e..a038848e8d1f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1716,49 +1716,6 @@ int get_tree_bdev(struct fs_context *fc, } EXPORT_SYMBOL(get_tree_bdev); -static int test_bdev_super(struct super_block *s, void *data) -{ - return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; -} - -struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *s; - int error; - dev_t dev; - - error = lookup_bdev(dev_name, &dev); - if (error) - return ERR_PTR(error); - - flags |= SB_NOSEC; - s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (s->s_root) { - if ((flags ^ s->s_flags) & SB_RDONLY) { - deactivate_locked_super(s); - return ERR_PTR(-EBUSY); - } - } else { - error = setup_bdev_super(s, flags, NULL); - if (!error) - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - s->s_flags |= SB_ACTIVE; - } - - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_bdev); - void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; diff --git a/include/linux/fs.h b/include/linux/fs.h index 204328ed7ebb..98afcf455b28 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2713,9 +2713,6 @@ static inline bool is_mgtime(const struct inode *inode) return inode->i_opflags & IOP_MGTIME; } -extern struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); -- cgit v1.2.3 From ad7fe23b4b0dc0c26187df92a5649948ef7049fa Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 6 Aug 2025 16:07:05 +1000 Subject: fscontext: add custom-prefix log helpers Sometimes, errors associated with an fscontext come from the VFS or otherwise outside of the filesystem driver itself. However, the default logging of errorfc will always prefix the message with the filesystem name. So, add some *fcp() wrappers that allow for custom prefixes to be used when emitting information to the fscontext log. Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250806-errorfc-mount-too-revealing-v2-1-534b9b4d45bb@cyphar.com Signed-off-by: Christian Brauner --- include/linux/fs_context.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 7773eb870039..671f031be173 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -186,10 +186,12 @@ struct fc_log { extern __attribute__((format(printf, 4, 5))) void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...); -#define __logfc(fc, l, fmt, ...) logfc((fc)->log.log, NULL, \ - l, fmt, ## __VA_ARGS__) -#define __plog(p, l, fmt, ...) logfc((p)->log, (p)->prefix, \ - l, fmt, ## __VA_ARGS__) +#define __logfc(fc, l, fmt, ...) \ + logfc((fc)->log.log, NULL, (l), (fmt), ## __VA_ARGS__) +#define __plogp(p, prefix, l, fmt, ...) \ + logfc((p)->log, (prefix), (l), (fmt), ## __VA_ARGS__) +#define __plog(p, l, fmt, ...) __plogp(p, (p)->prefix, l, fmt, ## __VA_ARGS__) + /** * infof - Store supplementary informational message * @fc: The context in which to log the informational message @@ -201,6 +203,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__) #define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__) #define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__) +#define infofcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'i', fmt, ## __VA_ARGS__) /** * warnf - Store supplementary warning message @@ -213,6 +217,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define warnf(fc, fmt, ...) __logfc(fc, 'w', fmt, ## __VA_ARGS__) #define warn_plog(p, fmt, ...) __plog(p, 'w', fmt, ## __VA_ARGS__) #define warnfc(fc, fmt, ...) __plog((&(fc)->log), 'w', fmt, ## __VA_ARGS__) +#define warnfcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'w', fmt, ## __VA_ARGS__) /** * errorf - Store supplementary error message @@ -225,6 +231,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define errorf(fc, fmt, ...) __logfc(fc, 'e', fmt, ## __VA_ARGS__) #define error_plog(p, fmt, ...) __plog(p, 'e', fmt, ## __VA_ARGS__) #define errorfc(fc, fmt, ...) __plog((&(fc)->log), 'e', fmt, ## __VA_ARGS__) +#define errorfcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'e', fmt, ## __VA_ARGS__) /** * invalf - Store supplementary invalid argument error message @@ -237,5 +245,7 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define invalf(fc, fmt, ...) (errorf(fc, fmt, ## __VA_ARGS__), -EINVAL) #define inval_plog(p, fmt, ...) (error_plog(p, fmt, ## __VA_ARGS__), -EINVAL) #define invalfc(fc, fmt, ...) (errorfc(fc, fmt, ## __VA_ARGS__), -EINVAL) +#define invalfcp(fc, prefix, fmt, ...) \ + (errorfcp(fc, prefix, fmt, ## __VA_ARGS__), -EINVAL) #endif /* _LINUX_FS_CONTEXT_H */ -- cgit v1.2.3 From bb2441402392ef1f49563be68e8f0dcb127ac965 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Tue, 5 Aug 2025 22:40:56 +0300 Subject: regulator: add s2dos05 regulator support S2DOS05 has 1 buck and 4 LDO regulators, used for powering panel/touchscreen. Signed-off-by: Dzmitry Sankouski Link: https://patch.msgid.link/20250805-starqltechn_integration_upstream-v8-1-09d8a321fafe@gmail.com Signed-off-by: Mark Brown --- MAINTAINERS | 2 +- drivers/regulator/Kconfig | 8 ++ drivers/regulator/Makefile | 1 + drivers/regulator/s2dos05-regulator.c | 165 ++++++++++++++++++++++++++++++++++ include/linux/regulator/s2dos05.h | 73 +++++++++++++++ 5 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 drivers/regulator/s2dos05-regulator.c create mode 100644 include/linux/regulator/s2dos05.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa4..f35f9b503956 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22387,7 +22387,7 @@ F: Documentation/devicetree/bindings/regulator/samsung,s2m*.yaml F: Documentation/devicetree/bindings/regulator/samsung,s5m*.yaml F: drivers/clk/clk-s2mps11.c F: drivers/mfd/sec*.[ch] -F: drivers/regulator/s2m*.c +F: drivers/regulator/s2*.c F: drivers/regulator/s5m*.c F: drivers/rtc/rtc-s5m.c F: include/linux/mfd/samsung/ diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index eaa6df1c9f80..2399c8a9a1c5 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1344,6 +1344,14 @@ config REGULATOR_RTQ2208 and two ldos. It features wide output voltage range from 0.4V to 2.05V and the capability to configure the corresponding power stages. +config REGULATOR_S2DOS05 + tristate "Samsung S2DOS05 voltage regulator" + depends on MFD_SEC_CORE || COMPILE_TEST + help + This driver provides support for the voltage regulators of the S2DOS05. + The S2DOS05 is a companion power management IC for the smart phones. + The S2DOS05 has 4 LDOs and 1 BUCK outputs. + config REGULATOR_S2MPA01 tristate "Samsung S2MPA01 voltage regulator" depends on MFD_SEC_CORE || COMPILE_TEST diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index be98b29d6675..78605b0fa0b2 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -156,6 +156,7 @@ obj-$(CONFIG_REGULATOR_RTMV20) += rtmv20-regulator.o obj-$(CONFIG_REGULATOR_RTQ2134) += rtq2134-regulator.o obj-$(CONFIG_REGULATOR_RTQ6752) += rtq6752-regulator.o obj-$(CONFIG_REGULATOR_RTQ2208) += rtq2208-regulator.o +obj-$(CONFIG_REGULATOR_S2DOS05) += s2dos05-regulator.o obj-$(CONFIG_REGULATOR_S2MPA01) += s2mpa01.o obj-$(CONFIG_REGULATOR_S2MPS11) += s2mps11.o obj-$(CONFIG_REGULATOR_S5M8767) += s5m8767.o diff --git a/drivers/regulator/s2dos05-regulator.c b/drivers/regulator/s2dos05-regulator.c new file mode 100644 index 000000000000..1463585c4565 --- /dev/null +++ b/drivers/regulator/s2dos05-regulator.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// s2dos05.c - Regulator driver for the Samsung s2dos05 +// +// Copyright (C) 2025 Dzmitry Sankouski + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct s2dos05_data { + struct regmap *regmap; + struct device *dev; +}; + +#define _BUCK(macro) S2DOS05_BUCK##macro +#define _buck_ops(num) s2dos05_ops##num +#define _LDO(macro) S2DOS05_LDO##macro +#define _REG(ctrl) S2DOS05_REG##ctrl +#define _ldo_ops(num) s2dos05_ops##num +#define _MASK(macro) S2DOS05_ENABLE_MASK##macro +#define _TIME(macro) S2DOS05_ENABLE_TIME##macro + +#define BUCK_DESC(_name, _id, _ops, m, s, v, e, em, t, a) { \ + .name = _name, \ + .id = _id, \ + .ops = _ops, \ + .of_match = of_match_ptr(_name), \ + .of_match_full_name = true, \ + .regulators_node = of_match_ptr("regulators"), \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = m, \ + .uV_step = s, \ + .n_voltages = S2DOS05_BUCK_N_VOLTAGES, \ + .vsel_reg = v, \ + .vsel_mask = S2DOS05_BUCK_VSEL_MASK, \ + .enable_reg = e, \ + .enable_mask = em, \ + .enable_time = t, \ + .active_discharge_off = 0, \ + .active_discharge_on = S2DOS05_BUCK_FD_MASK, \ + .active_discharge_reg = a, \ + .active_discharge_mask = S2DOS05_BUCK_FD_MASK \ +} + +#define LDO_DESC(_name, _id, _ops, m, s, v, e, em, t, a) { \ + .name = _name, \ + .id = _id, \ + .ops = _ops, \ + .of_match = of_match_ptr(_name), \ + .of_match_full_name = true, \ + .regulators_node = of_match_ptr("regulators"), \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = m, \ + .uV_step = s, \ + .n_voltages = S2DOS05_LDO_N_VOLTAGES, \ + .vsel_reg = v, \ + .vsel_mask = S2DOS05_LDO_VSEL_MASK, \ + .enable_reg = e, \ + .enable_mask = em, \ + .enable_time = t, \ + .active_discharge_off = 0, \ + .active_discharge_on = S2DOS05_LDO_FD_MASK, \ + .active_discharge_reg = a, \ + .active_discharge_mask = S2DOS05_LDO_FD_MASK \ +} + +static const struct regulator_ops s2dos05_ops = { + .list_voltage = regulator_list_voltage_linear, + .map_voltage = regulator_map_voltage_linear, + .is_enabled = regulator_is_enabled_regmap, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .set_active_discharge = regulator_set_active_discharge_regmap, +}; + +static const struct regulator_desc regulators[S2DOS05_REGULATOR_MAX] = { + // name, id, ops, min_uv, uV_step, vsel_reg, enable_reg + LDO_DESC("ldo1", _LDO(1), &_ldo_ops(), _LDO(_MIN1), + _LDO(_STEP1), _REG(_LDO1_CFG), + _REG(_EN), _MASK(_L1), _TIME(_LDO), _REG(_LDO1_CFG)), + LDO_DESC("ldo2", _LDO(2), &_ldo_ops(), _LDO(_MIN1), + _LDO(_STEP1), _REG(_LDO2_CFG), + _REG(_EN), _MASK(_L2), _TIME(_LDO), _REG(_LDO2_CFG)), + LDO_DESC("ldo3", _LDO(3), &_ldo_ops(), _LDO(_MIN2), + _LDO(_STEP1), _REG(_LDO3_CFG), + _REG(_EN), _MASK(_L3), _TIME(_LDO), _REG(_LDO3_CFG)), + LDO_DESC("ldo4", _LDO(4), &_ldo_ops(), _LDO(_MIN2), + _LDO(_STEP1), _REG(_LDO4_CFG), + _REG(_EN), _MASK(_L4), _TIME(_LDO), _REG(_LDO4_CFG)), + BUCK_DESC("buck", _BUCK(1), &_buck_ops(), _BUCK(_MIN1), + _BUCK(_STEP1), _REG(_BUCK_VOUT), + _REG(_EN), _MASK(_B1), _TIME(_BUCK), _REG(_BUCK_CFG)), +}; + +static int s2dos05_pmic_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct sec_pmic_dev *iodev = dev_get_drvdata(pdev->dev.parent); + struct s2dos05_data *s2dos05; + struct regulator_config config = { }; + unsigned int rdev_num = ARRAY_SIZE(regulators); + + s2dos05 = devm_kzalloc(dev, sizeof(*s2dos05), GFP_KERNEL); + if (!s2dos05) + return -ENOMEM; + + platform_set_drvdata(pdev, s2dos05); + + s2dos05->regmap = iodev->regmap_pmic; + s2dos05->dev = dev; + if (!dev->of_node) + dev->of_node = dev->parent->of_node; + + config.dev = dev; + config.driver_data = s2dos05; + + for (int i = 0; i < rdev_num; i++) { + struct regulator_dev *regulator; + + regulator = devm_regulator_register(&pdev->dev, + ®ulators[i], &config); + if (IS_ERR(regulator)) { + return dev_err_probe(&pdev->dev, PTR_ERR(regulator), + "regulator init failed for %d\n", i); + } + } + + return 0; +} + +static const struct platform_device_id s2dos05_pmic_id[] = { + { "s2dos05-regulator" }, + { }, +}; +MODULE_DEVICE_TABLE(platform, s2dos05_pmic_id); + +static struct platform_driver s2dos05_platform_driver = { + .driver = { + .name = "s2dos05", + }, + .probe = s2dos05_pmic_probe, + .id_table = s2dos05_pmic_id, +}; +module_platform_driver(s2dos05_platform_driver); + +MODULE_AUTHOR("Dzmitry Sankouski "); +MODULE_DESCRIPTION("Samsung S2DOS05 Regulator Driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/regulator/s2dos05.h b/include/linux/regulator/s2dos05.h new file mode 100644 index 000000000000..2e89fcbce769 --- /dev/null +++ b/include/linux/regulator/s2dos05.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +// s2dos05.h +// +// Copyright (c) 2016 Samsung Electronics Co., Ltd +// http://www.samsung.com +// Copyright (C) 2024 Dzmitry Sankouski + +#ifndef __LINUX_S2DOS05_H +#define __LINUX_S2DOS05_H + +// S2DOS05 registers +// Slave Addr : 0xC0 +enum S2DOS05_reg { + S2DOS05_REG_DEV_ID, + S2DOS05_REG_TOPSYS_STAT, + S2DOS05_REG_STAT, + S2DOS05_REG_EN, + S2DOS05_REG_LDO1_CFG, + S2DOS05_REG_LDO2_CFG, + S2DOS05_REG_LDO3_CFG, + S2DOS05_REG_LDO4_CFG, + S2DOS05_REG_BUCK_CFG, + S2DOS05_REG_BUCK_VOUT, + S2DOS05_REG_IRQ_MASK = 0x0D, + S2DOS05_REG_SSD_TSD = 0x0E, + S2DOS05_REG_OCL = 0x10, + S2DOS05_REG_IRQ = 0x11 +}; + +// S2DOS05 regulator ids +enum S2DOS05_regulators { + S2DOS05_LDO1, + S2DOS05_LDO2, + S2DOS05_LDO3, + S2DOS05_LDO4, + S2DOS05_BUCK1, + S2DOS05_REG_MAX, +}; + +#define S2DOS05_IRQ_PWRMT_MASK BIT(5) +#define S2DOS05_IRQ_TSD_MASK BIT(4) +#define S2DOS05_IRQ_SSD_MASK BIT(3) +#define S2DOS05_IRQ_SCP_MASK BIT(2) +#define S2DOS05_IRQ_UVLO_MASK BIT(1) +#define S2DOS05_IRQ_OCD_MASK BIT(0) + +#define S2DOS05_BUCK_MIN1 506250 +#define S2DOS05_LDO_MIN1 1500000 +#define S2DOS05_LDO_MIN2 2700000 +#define S2DOS05_BUCK_STEP1 6250 +#define S2DOS05_LDO_STEP1 25000 +#define S2DOS05_LDO_VSEL_MASK 0x7F +#define S2DOS05_LDO_FD_MASK 0x80 +#define S2DOS05_BUCK_VSEL_MASK 0xFF +#define S2DOS05_BUCK_FD_MASK 0x08 + +#define S2DOS05_ENABLE_MASK_L1 BIT(0) +#define S2DOS05_ENABLE_MASK_L2 BIT(1) +#define S2DOS05_ENABLE_MASK_L3 BIT(2) +#define S2DOS05_ENABLE_MASK_L4 BIT(3) +#define S2DOS05_ENABLE_MASK_B1 BIT(4) + +#define S2DOS05_RAMP_DELAY 12000 + +#define S2DOS05_ENABLE_TIME_LDO 50 +#define S2DOS05_ENABLE_TIME_BUCK 350 + +#define S2DOS05_LDO_N_VOLTAGES (S2DOS05_LDO_VSEL_MASK + 1) +#define S2DOS05_BUCK_N_VOLTAGES (S2DOS05_BUCK_VSEL_MASK + 1) + +#define S2DOS05_REGULATOR_MAX (S2DOS05_REG_MAX) + +#endif // __LINUX_S2DOS05_H -- cgit v1.2.3 From 181fe022ecf8a8e85def0e94852c631c59a8b3f6 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:44 +0200 Subject: gpiolib: add support to register sparse pin range Add support to register for GPIO<->pin mapping using a list of non consecutive pins. The core already supports sparse pin range (pins member of struct pinctrl_gpio_range), but it was not possible to register one. If pins is not NULL the core uses it, otherwise it assumes that a consecutive pin range was registered and it uses pin_base. The function gpiochip_add_pin_range() which allocates and fills the struct pinctrl_gpio_range was renamed to gpiochip_add_pin_range_with_pins() and the pins parameter was added. Two new functions were added, gpiochip_add_pin_range() and gpiochip_add_sparse_pin_range() to register a consecutive or sparse pins range. Both use gpiochip_add_pin_range_with_pins(). Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-1-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 29 ++++++++++++++++++-------- include/linux/gpio/driver.h | 51 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 68 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 0d2b470a252e..98d2fa602490 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2349,11 +2349,13 @@ int gpiochip_add_pingroup_range(struct gpio_chip *gc, EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); /** - * gpiochip_add_pin_range() - add a range for GPIO <-> pin mapping + * gpiochip_add_pin_range_with_pins() - add a range for GPIO <-> pin mapping * @gc: the gpiochip to add the range for * @pinctl_name: the dev_name() of the pin controller to map to * @gpio_offset: the start offset in the current gpio_chip number space * @pin_offset: the start offset in the pin controller number space + * @pins: the list of non consecutive pins to accumulate in this range (if not + * NULL, pin_offset is ignored by pinctrl core) * @npins: the number of pins from the offset of each pin space (GPIO and * pin controller) to accumulate in this range * @@ -2365,9 +2367,12 @@ EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); * Returns: * 0 on success, or a negative errno on failure. */ -int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, - unsigned int gpio_offset, unsigned int pin_offset, - unsigned int npins) +int gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int const *pins, + unsigned int npins) { struct gpio_pin_range *pin_range; struct gpio_device *gdev = gc->gpiodev; @@ -2385,6 +2390,7 @@ int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, pin_range->range.name = gc->label; pin_range->range.base = gdev->base + gpio_offset; pin_range->range.pin_base = pin_offset; + pin_range->range.pins = pins; pin_range->range.npins = npins; pin_range->pctldev = pinctrl_find_and_add_gpio_range(pinctl_name, &pin_range->range); @@ -2394,16 +2400,21 @@ int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, kfree(pin_range); return ret; } - chip_dbg(gc, "created GPIO range %d->%d ==> %s PIN %d->%d\n", - gpio_offset, gpio_offset + npins - 1, - pinctl_name, - pin_offset, pin_offset + npins - 1); + if (pin_range->range.pins) + chip_dbg(gc, "created GPIO range %d->%d ==> %s %d sparse PIN range { %d, ... }", + gpio_offset, gpio_offset + npins - 1, + pinctl_name, npins, pins[0]); + else + chip_dbg(gc, "created GPIO range %d->%d ==> %s PIN %d->%d\n", + gpio_offset, gpio_offset + npins - 1, + pinctl_name, + pin_offset, pin_offset + npins - 1); list_add_tail(&pin_range->node, &gdev->pin_ranges); return 0; } -EXPORT_SYMBOL_GPL(gpiochip_add_pin_range); +EXPORT_SYMBOL_GPL(gpiochip_add_pin_range_with_pins); /** * gpiochip_remove_pin_ranges() - remove all the GPIO <-> pin mappings diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 667f8fd58a79..9fcd4a988081 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -772,16 +772,50 @@ struct gpio_pin_range { #ifdef CONFIG_PINCTRL -int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, - unsigned int gpio_offset, unsigned int pin_offset, - unsigned int npins); +int gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int const *pins, + unsigned int npins); int gpiochip_add_pingroup_range(struct gpio_chip *gc, struct pinctrl_dev *pctldev, unsigned int gpio_offset, const char *pin_group); void gpiochip_remove_pin_ranges(struct gpio_chip *gc); +static inline int +gpiochip_add_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int npins) +{ + return gpiochip_add_pin_range_with_pins(gc, pinctl_name, gpio_offset, + pin_offset, NULL, npins); +} + +static inline int +gpiochip_add_sparse_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int const *pins, + unsigned int npins) +{ + return gpiochip_add_pin_range_with_pins(gc, pinctl_name, gpio_offset, 0, + pins, npins); +} #else /* ! CONFIG_PINCTRL */ +static inline int +gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int npins) +{ + return 0; +} + static inline int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, @@ -789,6 +823,17 @@ gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, { return 0; } + +static inline int +gpiochip_add_sparse_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int const *pins, + unsigned int npins) +{ + return 0; +} + static inline int gpiochip_add_pingroup_range(struct gpio_chip *gc, struct pinctrl_dev *pctldev, -- cgit v1.2.3 From 6e986f8852f56cf9214ea2ec02b4b432e201d02c Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:49 +0200 Subject: gpio: aggregator: export symbols of the GPIO forwarder library Export all symbols and create header file for the GPIO forwarder library. It will be used in the next changes. Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-6-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 202 +++++++++++++++++++++++++++++++++++++++-- include/linux/gpio/forwarder.h | 37 ++++++++ 2 files changed, 233 insertions(+), 6 deletions(-) create mode 100644 include/linux/gpio/forwarder.h (limited to 'include') diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index cc54d19f2dd1..c1952b28b7b7 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include +#include #include #include "dev-sync-probe.h" @@ -475,8 +477,180 @@ static int gpiochip_fwd_setup_delay_line(struct gpiochip_fwd *fwd) } #endif /* !CONFIG_OF_GPIO */ -static struct gpiochip_fwd * -devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios) +/** + * gpiochip_fwd_get_gpiochip - Get the GPIO chip for the GPIO forwarder + * @fwd: GPIO forwarder + * + * Returns: The GPIO chip for the GPIO forwarder + */ +struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd) +{ + return &fwd->chip; +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_gpiochip, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get_direction - Return the current direction of a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: 0 for output, 1 for input, or an error code in case of error. + */ +int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get_direction(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get_direction, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_direction_output - Set a GPIO forwarder line direction to + * output + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @value: value to set + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_direction_output(struct gpiochip_fwd *fwd, unsigned int offset, + int value) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_direction_output(gc, offset, value); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_direction_output, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_direction_input - Set a GPIO forwarder line direction to input + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_direction_input(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_direction_input, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get - Return a GPIO forwarder line's value + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: The GPIO's logical value, i.e. taking the ACTIVE_LOW status into + * account, or negative errno on failure. + */ +int gpiochip_fwd_gpio_get(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get_multiple - Get values for multiple GPIO forwarder lines + * @fwd: GPIO forwarder + * @mask: bit mask array; one bit per line; BITS_PER_LONG bits per word defines + * which lines are to be read + * @bits: bit value array; one bit per line; BITS_PER_LONG bits per word will + * contains the read values for the lines specified by mask + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_get_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, + unsigned long *bits) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get_multiple_locked(gc, mask, bits); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get_multiple, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set - Assign value to a GPIO forwarder line. + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @value: value to set + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_set(struct gpiochip_fwd *fwd, unsigned int offset, int value) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set(gc, offset, value); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set_multiple - Assign values to multiple GPIO forwarder lines + * @fwd: GPIO forwarder + * @mask: bit mask array; one bit per output; BITS_PER_LONG bits per word + * defines which outputs are to be changed + * @bits: bit value array; one bit per output; BITS_PER_LONG bits per word + * defines the values the outputs specified by mask are to be set to + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_set_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, + unsigned long *bits) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set_multiple_locked(gc, mask, bits); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set_multiple, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set_config - Set @config for a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @config: Same packed config format as generic pinconf + * + * Returns: 0 on success, %-ENOTSUPP if the controller doesn't support setting + * the configuration. + */ +int gpiochip_fwd_gpio_set_config(struct gpiochip_fwd *fwd, unsigned int offset, + unsigned long config) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set_config(gc, offset, config); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set_config, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_to_irq - Return the IRQ corresponding to a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: The Linux IRQ corresponding to the passed line, or an error code in + * case of error. + */ +int gpiochip_fwd_gpio_to_irq(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_to_irq(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_to_irq, "GPIO_FORWARDER"); + +/** + * devm_gpiochip_fwd_alloc - Allocate and initialize a new GPIO forwarder + * @dev: Parent device pointer + * @ngpios: Number of GPIOs in the forwarder + * + * Returns: An opaque object pointer, or an ERR_PTR()-encoded negative error + * code on failure. + */ +struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, + unsigned int ngpios) { struct gpiochip_fwd *fwd; struct gpio_chip *chip; @@ -507,10 +681,18 @@ devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios) return fwd; } +EXPORT_SYMBOL_NS_GPL(devm_gpiochip_fwd_alloc, "GPIO_FORWARDER"); -static int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, - struct gpio_desc *desc, - unsigned int offset) +/** + * gpiochip_fwd_desc_add - Add a GPIO desc in the forwarder + * @fwd: GPIO forwarder + * @desc: GPIO descriptor to register + * @offset: offset for the GPIO in the forwarder + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, + unsigned int offset) { struct gpio_chip *parent = gpiod_to_chip(desc); struct gpio_chip *chip = &fwd->chip; @@ -537,8 +719,15 @@ static int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, return 0; } +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_add, "GPIO_FORWARDER"); -static int gpiochip_fwd_register(struct gpiochip_fwd *fwd) +/** + * gpiochip_fwd_register - Register a GPIO forwarder + * @fwd: GPIO forwarder + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_register(struct gpiochip_fwd *fwd) { struct gpio_chip *chip = &fwd->chip; @@ -549,6 +738,7 @@ static int gpiochip_fwd_register(struct gpiochip_fwd *fwd) return devm_gpiochip_add_data(chip->parent, chip, fwd); } +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_register, "GPIO_FORWARDER"); /** * gpiochip_fwd_create() - Create a new GPIO forwarder diff --git a/include/linux/gpio/forwarder.h b/include/linux/gpio/forwarder.h new file mode 100644 index 000000000000..e21a1b7b1905 --- /dev/null +++ b/include/linux/gpio/forwarder.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_GPIO_FORWARDER_H +#define __LINUX_GPIO_FORWARDER_H + +struct gpio_desc; +struct gpio_chip; +struct gpiochip_fwd; + +struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, + unsigned int ngpios); +int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, + struct gpio_desc *desc, unsigned int offset); +int gpiochip_fwd_register(struct gpiochip_fwd *fwd); + +struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd); + +int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, + unsigned int offset); +int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, + unsigned int offset); +int gpiochip_fwd_gpio_direction_output(struct gpiochip_fwd *fwd, + unsigned int offset, + int value); +int gpiochip_fwd_gpio_get(struct gpiochip_fwd *fwd, unsigned int offset); +int gpiochip_fwd_gpio_get_multiple(struct gpiochip_fwd *fwd, + unsigned long *mask, + unsigned long *bits); +int gpiochip_fwd_gpio_set(struct gpiochip_fwd *fwd, unsigned int offset, + int value); +int gpiochip_fwd_gpio_set_multiple(struct gpiochip_fwd *fwd, + unsigned long *mask, + unsigned long *bits); +int gpiochip_fwd_gpio_set_config(struct gpiochip_fwd *fwd, unsigned int offset, + unsigned long config); +int gpiochip_fwd_gpio_to_irq(struct gpiochip_fwd *fwd, unsigned int offset); + +#endif -- cgit v1.2.3 From b31c68fd851e74526ad963362ea205eb97b9a710 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:50 +0200 Subject: gpio: aggregator: handle runtime registration of gpio_desc in gpiochip_fwd Add request() callback to check if the GPIO descriptor was well registered in the gpiochip_fwd before using it. This is done to handle the case where GPIO descriptor is added at runtime in the forwarder. If at least one GPIO descriptor was not added before the forwarder registration, we assume the forwarder can sleep as if a GPIO is added at runtime it may sleep. Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-7-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 63 ++++++++++++++++++++++++++++++++++++++---- include/linux/gpio/forwarder.h | 2 ++ 2 files changed, 59 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index c1952b28b7b7..f0d38d76cf73 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -246,6 +246,7 @@ struct gpiochip_fwd { spinlock_t slock; /* protects tmp[] if !can_sleep */ }; struct gpiochip_fwd_timing *delay_timings; + unsigned long *valid_mask; unsigned long tmp[]; /* values and descs for multiple ops */ }; @@ -254,10 +255,24 @@ struct gpiochip_fwd { #define fwd_tmp_size(ngpios) (BITS_TO_LONGS((ngpios)) + (ngpios)) +static int gpio_fwd_request(struct gpio_chip *chip, unsigned int offset) +{ + struct gpiochip_fwd *fwd = gpiochip_get_data(chip); + + return test_bit(offset, fwd->valid_mask) ? 0 : -ENODEV; +} + static int gpio_fwd_get_direction(struct gpio_chip *chip, unsigned int offset) { struct gpiochip_fwd *fwd = gpiochip_get_data(chip); + /* + * get_direction() is called during gpiochip registration, return + * -ENODEV if there is no GPIO desc for the line. + */ + if (!test_bit(offset, fwd->valid_mask)) + return -ENODEV; + return gpiod_get_direction(fwd->descs[offset]); } @@ -489,6 +504,21 @@ struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd) } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_gpiochip, "GPIO_FORWARDER"); +/** + * gpiochip_fwd_gpio_request - Request a line of the GPIO forwarder + * @fwd: GPIO forwarder + * @offset: the offset of the line to request + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_request(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_request, "GPIO_FORWARDER"); + /** * gpiochip_fwd_gpio_get_direction - Return the current direction of a GPIO forwarder line * @fwd: GPIO forwarder @@ -663,11 +693,16 @@ struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, if (!fwd->descs) return ERR_PTR(-ENOMEM); + fwd->valid_mask = devm_bitmap_zalloc(dev, ngpios, GFP_KERNEL); + if (!fwd->valid_mask) + return ERR_PTR(-ENOMEM); + chip = &fwd->chip; chip->label = dev_name(dev); chip->parent = dev; chip->owner = THIS_MODULE; + chip->request = gpio_fwd_request; chip->get_direction = gpio_fwd_get_direction; chip->direction_input = gpio_fwd_direction_input; chip->direction_output = gpio_fwd_direction_output; @@ -694,24 +729,21 @@ EXPORT_SYMBOL_NS_GPL(devm_gpiochip_fwd_alloc, "GPIO_FORWARDER"); int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, unsigned int offset) { - struct gpio_chip *parent = gpiod_to_chip(desc); struct gpio_chip *chip = &fwd->chip; if (offset > chip->ngpio) return -EINVAL; + if (test_and_set_bit(offset, fwd->valid_mask)) + return -EEXIST; + /* * If any of the GPIO lines are sleeping, then the entire forwarder * will be sleeping. - * If any of the chips support .set_config(), then the forwarder will - * support setting configs. */ if (gpiod_cansleep(desc)) chip->can_sleep = true; - if (parent && parent->set_config) - chip->set_config = gpio_fwd_set_config; - fwd->descs[offset] = desc; dev_dbg(chip->parent, "%u => gpio %d irq %d\n", offset, @@ -721,6 +753,18 @@ int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_add, "GPIO_FORWARDER"); +/** + * gpiochip_fwd_desc_free - Remove a GPIO desc from the forwarder + * @fwd: GPIO forwarder + * @offset: offset of GPIO desc to remove + */ +void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset) +{ + if (test_and_clear_bit(offset, fwd->valid_mask)) + gpiod_put(fwd->descs[offset]); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_free, "GPIO_FORWARDER"); + /** * gpiochip_fwd_register - Register a GPIO forwarder * @fwd: GPIO forwarder @@ -731,6 +775,13 @@ int gpiochip_fwd_register(struct gpiochip_fwd *fwd) { struct gpio_chip *chip = &fwd->chip; + /* + * Some gpio_desc were not registered. They will be registered at runtime + * but we have to suppose they can sleep. + */ + if (!bitmap_full(fwd->valid_mask, chip->ngpio)) + chip->can_sleep = true; + if (chip->can_sleep) mutex_init(&fwd->mlock); else diff --git a/include/linux/gpio/forwarder.h b/include/linux/gpio/forwarder.h index e21a1b7b1905..45e0190308f0 100644 --- a/include/linux/gpio/forwarder.h +++ b/include/linux/gpio/forwarder.h @@ -10,10 +10,12 @@ struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios); int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, unsigned int offset); +void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_register(struct gpiochip_fwd *fwd); struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd); +int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, -- cgit v1.2.3 From 60e92c1009c7c6abd4a9d0caf33a8cba5d09f67c Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:51 +0200 Subject: gpio: aggregator: add possibility to attach data to the forwarder Add a data pointer to store private data in the forwarder. Reviewed-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-8-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 20 ++++++++++++++++++-- include/linux/gpio/forwarder.h | 4 +++- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index f0d38d76cf73..fb3694d581d1 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -246,6 +246,7 @@ struct gpiochip_fwd { spinlock_t slock; /* protects tmp[] if !can_sleep */ }; struct gpiochip_fwd_timing *delay_timings; + void *data; unsigned long *valid_mask; unsigned long tmp[]; /* values and descs for multiple ops */ }; @@ -504,6 +505,18 @@ struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd) } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_gpiochip, "GPIO_FORWARDER"); +/** + * gpiochip_fwd_get_data - Get driver-private data for the GPIO forwarder + * @fwd: GPIO forwarder + * + * Returns: The driver-private data for the GPIO forwarder + */ +void *gpiochip_fwd_get_data(struct gpiochip_fwd *fwd) +{ + return fwd->data; +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_data, "GPIO_FORWARDER"); + /** * gpiochip_fwd_gpio_request - Request a line of the GPIO forwarder * @fwd: GPIO forwarder @@ -768,10 +781,11 @@ EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_free, "GPIO_FORWARDER"); /** * gpiochip_fwd_register - Register a GPIO forwarder * @fwd: GPIO forwarder + * @data: driver-private data associated with this forwarder * * Returns: 0 on success, or negative errno on failure. */ -int gpiochip_fwd_register(struct gpiochip_fwd *fwd) +int gpiochip_fwd_register(struct gpiochip_fwd *fwd, void *data) { struct gpio_chip *chip = &fwd->chip; @@ -787,6 +801,8 @@ int gpiochip_fwd_register(struct gpiochip_fwd *fwd) else spin_lock_init(&fwd->slock); + fwd->data = data; + return devm_gpiochip_add_data(chip->parent, chip, fwd); } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_register, "GPIO_FORWARDER"); @@ -831,7 +847,7 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, return ERR_PTR(error); } - error = gpiochip_fwd_register(fwd); + error = gpiochip_fwd_register(fwd, NULL); if (error) return ERR_PTR(error); diff --git a/include/linux/gpio/forwarder.h b/include/linux/gpio/forwarder.h index 45e0190308f0..ee5d8355f735 100644 --- a/include/linux/gpio/forwarder.h +++ b/include/linux/gpio/forwarder.h @@ -11,10 +11,12 @@ struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, unsigned int offset); void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset); -int gpiochip_fwd_register(struct gpiochip_fwd *fwd); +int gpiochip_fwd_register(struct gpiochip_fwd *fwd, void *data); struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd); +void *gpiochip_fwd_get_data(struct gpiochip_fwd *fwd); + int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, unsigned int offset); -- cgit v1.2.3 From 53ec9169db1345f04174febb90f88a871fc28d9e Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:52 +0200 Subject: lib/string_choices: Add str_input_output() helper Add str_input_output() helper to return 'input' or 'output' string literal. Also add the inversed variant str_output_input(). Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-9-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- include/linux/string_choices.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index f3ba4f52ff26..a27c87c954ae 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -41,6 +41,12 @@ static inline const char *str_high_low(bool v) } #define str_low_high(v) str_high_low(!(v)) +static inline const char *str_input_output(bool v) +{ + return v ? "input" : "output"; +} +#define str_output_input(v) str_input_output(!(v)) + static inline const char *str_on_off(bool v) { return v ? "on" : "off"; -- cgit v1.2.3 From 8a5a0294f40a50e5be83e9b7ebbc15b546f64e41 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Mon, 4 Aug 2025 21:26:42 +0100 Subject: dt-bindings: clock: renesas,r9a09g077/87: Add USB_CLK clock ID Add the USB clock (USB_CLK) definition for the Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs. USB_CLK is used as the reference clock for USB PHY layer. Signed-off-by: Lad Prabhakar Acked-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250804202643.3967484-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h | 1 + include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h index 7ecc4f0b235a..0c2ce81a8744 100644 --- a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h +++ b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h @@ -25,5 +25,6 @@ #define R9A09G077_CLK_PCLKM 13 #define R9A09G077_CLK_PCLKL 14 #define R9A09G077_SDHI_CLKHS 15 +#define R9A09G077_USB_CLK 16 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G077_CPG_H__ */ diff --git a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h index 925e57703925..70ee883f2386 100644 --- a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h +++ b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h @@ -25,5 +25,6 @@ #define R9A09G087_CLK_PCLKM 13 #define R9A09G087_CLK_PCLKL 14 #define R9A09G087_SDHI_CLKHS 15 +#define R9A09G087_USB_CLK 16 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G087_CPG_H__ */ -- cgit v1.2.3 From 5293e8f2a854344ef9aba2391b44c7a437889ebb Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Fri, 8 Aug 2025 14:30:15 +0100 Subject: dt-bindings: pinctrl: renesas: Document RZ/T2H and RZ/N2H SoCs Document the pin and GPIO controller IP for the Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs, and add the shared DTSI header file used by both the bindings and the driver. The RZ/T2H SoC supports 729 pins, while RZ/N2H supports 576 pins. Both share the same controller architecture; separate compatible strings are added for each SoC to distinguish them. Co-developed-by: Thierry Bultel Signed-off-by: Thierry Bultel Signed-off-by: Lad Prabhakar Reviewed-by: "Rob Herring (Arm)" Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250808133017.2053637-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- .../pinctrl/renesas,r9a09g077-pinctrl.yaml | 172 +++++++++++++++++++++ .../pinctrl/renesas,r9a09g077-pinctrl.h | 22 +++ 2 files changed, 194 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml create mode 100644 include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml new file mode 100644 index 000000000000..36d665971484 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/renesas,r9a09g077-pinctrl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas RZ/T2H and RZ/N2H Pin and GPIO controller + +maintainers: + - Lad Prabhakar + +description: + The Renesas RZ/T2H and RZ/N2H SoCs feature a combined Pin and GPIO controller. + Pin multiplexing and GPIO configuration are performed on a per-pin basis. + Each port supports up to 8 pins, each configurable for either GPIO (port mode) + or alternate function mode. Each pin supports function mode values ranging from + 0x0 to 0x2A, allowing selection from up to 43 different functions. + +properties: + compatible: + enum: + - renesas,r9a09g077-pinctrl # RZ/T2H + - renesas,r9a09g087-pinctrl # RZ/N2H + + reg: + minItems: 1 + items: + - description: Non-safety I/O Port base + - description: Safety I/O Port safety region base + - description: Safety I/O Port Non-safety region base + + reg-names: + minItems: 1 + items: + - const: nsr + - const: srs + - const: srn + + gpio-controller: true + + '#gpio-cells': + const: 2 + description: + The first cell contains the global GPIO port index, constructed using the + RZT2H_GPIO() helper macro from + (e.g. "RZT2H_GPIO(3, 0)" for P03_0). The second cell represents the consumer + flag. Use the macros defined in include/dt-bindings/gpio/gpio.h. + + gpio-ranges: + maxItems: 1 + + clocks: + maxItems: 1 + + power-domains: + maxItems: 1 + +definitions: + renesas-rzt2h-n2h-pins-node: + type: object + allOf: + - $ref: pincfg-node.yaml# + - $ref: pinmux-node.yaml# + properties: + pinmux: + description: + Values are constructed from I/O port number, pin number, and + alternate function configuration number using the RZT2H_PORT_PINMUX() + helper macro from . + pins: true + phandle: true + input: true + input-enable: true + output-enable: true + oneOf: + - required: [pinmux] + - required: [pins] + additionalProperties: false + +patternProperties: + # Grouping nodes: allow multiple "-pins" subnodes within a "-group" + '.*-group$': + type: object + description: + Pin controller client devices can organize pin configuration entries into + grouping nodes ending in "-group". These group nodes may contain multiple + child nodes each ending in "-pins" to configure distinct sets of pins. + additionalProperties: false + patternProperties: + '-pins$': + $ref: '#/definitions/renesas-rzt2h-n2h-pins-node' + + # Standalone "-pins" nodes under client devices or groups + '-pins$': + $ref: '#/definitions/renesas-rzt2h-n2h-pins-node' + + '-hog$': + type: object + description: GPIO hog node + properties: + gpio-hog: true + gpios: true + input: true + output-high: true + output-low: true + line-name: true + required: + - gpio-hog + - gpios + additionalProperties: false + +allOf: + - $ref: pinctrl.yaml# + +required: + - compatible + - reg + - reg-names + - gpio-controller + - '#gpio-cells' + - gpio-ranges + - clocks + - power-domains + +unevaluatedProperties: false + +examples: + - | + #include + #include + + pinctrl@802c0000 { + compatible = "renesas,r9a09g077-pinctrl"; + reg = <0x802c0000 0x2000>, + <0x812c0000 0x2000>, + <0x802b0000 0x2000>; + reg-names = "nsr", "srs", "srn"; + clocks = <&cpg CPG_CORE R9A09G077_CLK_PCLKM>; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&pinctrl 0 0 288>; + power-domains = <&cpg>; + + serial0-pins { + pinmux = , /* Tx */ + ; /* Rx */ + }; + + sd1-pwr-en-hog { + gpio-hog; + gpios = ; + output-high; + line-name = "sd1_pwr_en"; + }; + + i2c0-pins { + pins = "RIIC0_SDA", "RIIC0_SCL"; + input-enable; + }; + + sd0-sd-group { + ctrl-pins { + pinmux = , /* SD0_CLK */ + ; /* SD0_CMD */ + }; + + data-pins { + pinmux = , /* SD0_CLK */ + ; /* SD0_CMD */ + }; + }; + }; diff --git a/include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h b/include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h new file mode 100644 index 000000000000..f088793f23ee --- /dev/null +++ b/include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * This header provides constants for Renesas RZ/T2H family pinctrl bindings. + * + * Copyright (C) 2025 Renesas Electronics Corp. + */ + +#ifndef __DT_BINDINGS_PINCTRL_RENESAS_R9A09G077_PINCTRL_H__ +#define __DT_BINDINGS_PINCTRL_RENESAS_R9A09G077_PINCTRL_H__ + +#define RZT2H_PINS_PER_PORT 8 + +/* + * Create the pin index from its bank and position numbers and store in + * the upper 16 bits the alternate function identifier + */ +#define RZT2H_PORT_PINMUX(b, p, f) ((b) * RZT2H_PINS_PER_PORT + (p) | ((f) << 16)) + +/* Convert a port and pin label to its global pin index */ +#define RZT2H_GPIO(port, pin) ((port) * RZT2H_PINS_PER_PORT + (pin)) + +#endif /* __DT_BINDINGS_PINCTRL_RENESAS_R9A09G077_PINCTRL_H__ */ -- cgit v1.2.3 From c17ccefb611fdb346eef9be6bfbd0bfd04afa204 Mon Sep 17 00:00:00 2001 From: Sricharan Ramabadhran Date: Mon, 11 Aug 2025 14:39:51 +0530 Subject: dt-bindings: clock: ipq5424-apss-clk: Add ipq5424 apss clock controller The CPU core in ipq5424 is clocked by a huayra PLL with RCG support. The RCG and PLL have a separate register space from the GCC. Also the L3 cache has a separate pll and needs to be scaled along with the CPU. Co-developed-by: Md Sadre Alam Signed-off-by: Md Sadre Alam Signed-off-by: Sricharan Ramabadhran [ Added interconnect related changes ] Reviewed-by: Krzysztof Kozlowski Signed-off-by: Varadarajan Narayanan Link: https://lore.kernel.org/r/20250811090954.2854440-2-quic_varada@quicinc.com Signed-off-by: Bjorn Andersson --- .../bindings/clock/qcom,ipq5424-apss-clk.yaml | 55 ++++++++++++++++++++++ include/dt-bindings/clock/qcom,apss-ipq.h | 6 +++ include/dt-bindings/interconnect/qcom,ipq5424.h | 3 ++ 3 files changed, 64 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/qcom,ipq5424-apss-clk.yaml (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,ipq5424-apss-clk.yaml b/Documentation/devicetree/bindings/clock/qcom,ipq5424-apss-clk.yaml new file mode 100644 index 000000000000..def739fa0a8c --- /dev/null +++ b/Documentation/devicetree/bindings/clock/qcom,ipq5424-apss-clk.yaml @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/qcom,ipq5424-apss-clk.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm APSS IPQ5424 Clock Controller + +maintainers: + - Varadarajan Narayanan + +description: + The CPU core in ipq5424 is clocked by a huayra PLL with RCG support. + The RCG and PLL have a separate register space from the GCC. + +properties: + compatible: + enum: + - qcom,ipq5424-apss-clk + + reg: + maxItems: 1 + + clocks: + items: + - description: Reference to the XO clock. + - description: Reference to the GPLL0 clock. + + '#clock-cells': + const: 1 + + '#interconnect-cells': + const: 1 + +required: + - compatible + - reg + - clocks + - '#clock-cells' + - '#interconnect-cells' + +additionalProperties: false + +examples: + - | + #include + + apss_clk: clock-controller@fa80000 { + compatible = "qcom,ipq5424-apss-clk"; + reg = <0x0fa80000 0x20000>; + clocks = <&xo_board>, + <&gcc GPLL0>; + #clock-cells = <1>; + #interconnect-cells = <1>; + }; diff --git a/include/dt-bindings/clock/qcom,apss-ipq.h b/include/dt-bindings/clock/qcom,apss-ipq.h index 77b6e05492e2..0bb41e5efdef 100644 --- a/include/dt-bindings/clock/qcom,apss-ipq.h +++ b/include/dt-bindings/clock/qcom,apss-ipq.h @@ -8,5 +8,11 @@ #define APCS_ALIAS0_CLK_SRC 0 #define APCS_ALIAS0_CORE_CLK 1 +#define APSS_PLL_EARLY 2 +#define APSS_SILVER_CLK_SRC 3 +#define APSS_SILVER_CORE_CLK 4 +#define L3_PLL 5 +#define L3_CLK_SRC 6 +#define L3_CORE_CLK 7 #endif diff --git a/include/dt-bindings/interconnect/qcom,ipq5424.h b/include/dt-bindings/interconnect/qcom,ipq5424.h index a770356112ee..afd7e0683a24 100644 --- a/include/dt-bindings/interconnect/qcom,ipq5424.h +++ b/include/dt-bindings/interconnect/qcom,ipq5424.h @@ -21,4 +21,7 @@ #define MASTER_CNOC_USB 16 #define SLAVE_CNOC_USB 17 +#define MASTER_CPU 0 +#define SLAVE_L3 1 + #endif /* INTERCONNECT_QCOM_IPQ5424_H */ -- cgit v1.2.3 From 0daf35da397b083ea0ea5407196bb6bd210530ec Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Thu, 7 Aug 2025 13:13:10 +0530 Subject: soc: qcom: mdt_loader: Remove pas id parameter pas id is not used in qcom_mdt_load_no_init() and it should not be used as it is non-PAS specific function and has no relation to PAS specific mechanism. Reviewed-by: Dikshita Agarwal Acked-by: Jeff Johnson # drivers/net/wireless/ath/ath12k/ahb.c Signed-off-by: Mukesh Ojha Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20250807074311.2381713-2-mukesh.ojha@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/media/platform/qcom/venus/firmware.c | 4 ++-- drivers/net/wireless/ath/ath12k/ahb.c | 2 +- drivers/remoteproc/qcom_q6v5_adsp.c | 2 +- drivers/remoteproc/qcom_q6v5_pas.c | 7 +++---- drivers/remoteproc/qcom_q6v5_wcss.c | 2 +- drivers/soc/qcom/mdt_loader.c | 10 ++++------ include/linux/soc/qcom/mdt_loader.h | 7 +++---- 7 files changed, 15 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/media/platform/qcom/venus/firmware.c b/drivers/media/platform/qcom/venus/firmware.c index 66a18830e66d..862d0718f694 100644 --- a/drivers/media/platform/qcom/venus/firmware.c +++ b/drivers/media/platform/qcom/venus/firmware.c @@ -136,8 +136,8 @@ static int venus_load_fw(struct venus_core *core, const char *fwname, ret = qcom_mdt_load(dev, mdt, fwname, VENUS_PAS_ID, mem_va, *mem_phys, *mem_size, NULL); else - ret = qcom_mdt_load_no_init(dev, mdt, fwname, VENUS_PAS_ID, - mem_va, *mem_phys, *mem_size, NULL); + ret = qcom_mdt_load_no_init(dev, mdt, fwname, mem_va, + *mem_phys, *mem_size, NULL); memunmap(mem_va); err_release_fw: diff --git a/drivers/net/wireless/ath/ath12k/ahb.c b/drivers/net/wireless/ath/ath12k/ahb.c index 3b983f4e3268..b30527c402f6 100644 --- a/drivers/net/wireless/ath/ath12k/ahb.c +++ b/drivers/net/wireless/ath/ath12k/ahb.c @@ -414,7 +414,7 @@ static int ath12k_ahb_power_up(struct ath12k_base *ab) goto err_fw2; } - ret = qcom_mdt_load_no_init(dev, fw2, fw2_name, pasid, mem_region, mem_phys, + ret = qcom_mdt_load_no_init(dev, fw2, fw2_name, mem_region, mem_phys, mem_size, &mem_phys); if (ret) { ath12k_err(ab, "Failed to load MDT segments: %d\n", ret); diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c b/drivers/remoteproc/qcom_q6v5_adsp.c index 94af77baa7a1..e98b7e03162c 100644 --- a/drivers/remoteproc/qcom_q6v5_adsp.c +++ b/drivers/remoteproc/qcom_q6v5_adsp.c @@ -317,7 +317,7 @@ static int adsp_load(struct rproc *rproc, const struct firmware *fw) struct qcom_adsp *adsp = rproc->priv; int ret; - ret = qcom_mdt_load_no_init(adsp->dev, fw, rproc->firmware, 0, + ret = qcom_mdt_load_no_init(adsp->dev, fw, rproc->firmware, adsp->mem_region, adsp->mem_phys, adsp->mem_size, &adsp->mem_reloc); if (ret) diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index 02e29171cbbe..55a7da801183 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -242,9 +242,8 @@ static int qcom_pas_load(struct rproc *rproc, const struct firmware *fw) goto release_dtb_firmware; ret = qcom_mdt_load_no_init(pas->dev, pas->dtb_firmware, pas->dtb_firmware_name, - pas->dtb_pas_id, pas->dtb_mem_region, - pas->dtb_mem_phys, pas->dtb_mem_size, - &pas->dtb_mem_reloc); + pas->dtb_mem_region, pas->dtb_mem_phys, + pas->dtb_mem_size, &pas->dtb_mem_reloc); if (ret) goto release_dtb_metadata; } @@ -307,7 +306,7 @@ static int qcom_pas_start(struct rproc *rproc) if (ret) goto disable_px_supply; - ret = qcom_mdt_load_no_init(pas->dev, pas->firmware, rproc->firmware, pas->pas_id, + ret = qcom_mdt_load_no_init(pas->dev, pas->firmware, rproc->firmware, pas->mem_region, pas->mem_phys, pas->mem_size, &pas->mem_reloc); if (ret) diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index 93648734a2f2..07c88623f597 100644 --- a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -757,7 +757,7 @@ static int q6v5_wcss_load(struct rproc *rproc, const struct firmware *fw) int ret; ret = qcom_mdt_load_no_init(wcss->dev, fw, rproc->firmware, - 0, wcss->mem_region, wcss->mem_phys, + wcss->mem_region, wcss->mem_phys, wcss->mem_size, &wcss->mem_reloc); if (ret) return ret; diff --git a/drivers/soc/qcom/mdt_loader.c b/drivers/soc/qcom/mdt_loader.c index dfd15d189087..74c415774657 100644 --- a/drivers/soc/qcom/mdt_loader.c +++ b/drivers/soc/qcom/mdt_loader.c @@ -331,7 +331,7 @@ static bool qcom_mdt_bins_are_split(const struct firmware *fw, const char *fw_na } static int __qcom_mdt_load(struct device *dev, const struct firmware *fw, - const char *fw_name, int pas_id, void *mem_region, + const char *fw_name, void *mem_region, phys_addr_t mem_phys, size_t mem_size, phys_addr_t *reloc_base) { @@ -458,7 +458,7 @@ int qcom_mdt_load(struct device *dev, const struct firmware *fw, if (ret) return ret; - return __qcom_mdt_load(dev, fw, firmware, pas_id, mem_region, mem_phys, + return __qcom_mdt_load(dev, fw, firmware, mem_region, mem_phys, mem_size, reloc_base); } EXPORT_SYMBOL_GPL(qcom_mdt_load); @@ -468,7 +468,6 @@ EXPORT_SYMBOL_GPL(qcom_mdt_load); * @dev: device handle to associate resources with * @fw: firmware object for the mdt file * @firmware: name of the firmware, for construction of segment file names - * @pas_id: PAS identifier * @mem_region: allocated memory region to load firmware into * @mem_phys: physical address of allocated memory region * @mem_size: size of the allocated memory region @@ -477,11 +476,10 @@ EXPORT_SYMBOL_GPL(qcom_mdt_load); * Returns 0 on success, negative errno otherwise. */ int qcom_mdt_load_no_init(struct device *dev, const struct firmware *fw, - const char *firmware, int pas_id, - void *mem_region, phys_addr_t mem_phys, + const char *firmware, void *mem_region, phys_addr_t mem_phys, size_t mem_size, phys_addr_t *reloc_base) { - return __qcom_mdt_load(dev, fw, firmware, pas_id, mem_region, mem_phys, + return __qcom_mdt_load(dev, fw, firmware, mem_region, mem_phys, mem_size, reloc_base); } EXPORT_SYMBOL_GPL(qcom_mdt_load_no_init); diff --git a/include/linux/soc/qcom/mdt_loader.h b/include/linux/soc/qcom/mdt_loader.h index 9e8e60421192..8ea8230579a2 100644 --- a/include/linux/soc/qcom/mdt_loader.h +++ b/include/linux/soc/qcom/mdt_loader.h @@ -24,7 +24,7 @@ int qcom_mdt_load(struct device *dev, const struct firmware *fw, phys_addr_t *reloc_base); int qcom_mdt_load_no_init(struct device *dev, const struct firmware *fw, - const char *fw_name, int pas_id, void *mem_region, + const char *fw_name, void *mem_region, phys_addr_t mem_phys, size_t mem_size, phys_addr_t *reloc_base); void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len, @@ -54,9 +54,8 @@ static inline int qcom_mdt_load(struct device *dev, const struct firmware *fw, static inline int qcom_mdt_load_no_init(struct device *dev, const struct firmware *fw, - const char *fw_name, int pas_id, - void *mem_region, phys_addr_t mem_phys, - size_t mem_size, + const char *fw_name, void *mem_region, + phys_addr_t mem_phys, size_t mem_size, phys_addr_t *reloc_base) { return -ENODEV; -- cgit v1.2.3 From ce8370e2e62a903e18be7dd0e0be2eee079501e1 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 6 Aug 2025 17:04:07 -0400 Subject: audit: record fanotify event regardless of presence of rules When no audit rules are in place, fanotify event results are unconditionally dropped due to an explicit check for the existence of any audit rules. Given this is a report from another security sub-system, allow it to be recorded regardless of the existence of any audit rules. To test, install and run the fapolicyd daemon with default config. Then as an unprivileged user, create and run a very simple binary that should be denied. Then check for an event with ausearch -m FANOTIFY -ts recent Link: https://issues.redhat.com/browse/RHEL-9065 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index a394614ccd0b..e3f06eba9c6e 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -527,7 +527,7 @@ static inline void audit_log_kern_module(const char *name) static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { - if (!audit_dummy_context()) + if (audit_enabled) __audit_fanotify(response, friar); } -- cgit v1.2.3 From b41dc83f0790fd3488a45b31de0b0c3af7d441fe Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 Aug 2025 11:26:29 -0700 Subject: kunit, lib/crypto: Move run_irq_test() to common header Rename run_irq_test() to kunit_run_irq_test() and move it to a public header so that it can be reused by crc_kunit. Link: https://lore.kernel.org/r/20250811182631.376302-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/kunit/run-in-irq-context.h | 129 ++++++++++++++++++++++++++++++++++ lib/crypto/tests/hash-test-template.h | 123 ++------------------------------ 2 files changed, 133 insertions(+), 119 deletions(-) create mode 100644 include/kunit/run-in-irq-context.h (limited to 'include') diff --git a/include/kunit/run-in-irq-context.h b/include/kunit/run-in-irq-context.h new file mode 100644 index 000000000000..108e96433ea4 --- /dev/null +++ b/include/kunit/run-in-irq-context.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Helper function for testing code in interrupt contexts + * + * Copyright 2025 Google LLC + */ +#ifndef _KUNIT_RUN_IN_IRQ_CONTEXT_H +#define _KUNIT_RUN_IN_IRQ_CONTEXT_H + +#include +#include +#include +#include + +#define KUNIT_IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5) + +struct kunit_irq_test_state { + bool (*func)(void *test_specific_state); + void *test_specific_state; + bool task_func_reported_failure; + bool hardirq_func_reported_failure; + bool softirq_func_reported_failure; + unsigned long hardirq_func_calls; + unsigned long softirq_func_calls; + struct hrtimer timer; + struct work_struct bh_work; +}; + +static enum hrtimer_restart kunit_irq_test_timer_func(struct hrtimer *timer) +{ + struct kunit_irq_test_state *state = + container_of(timer, typeof(*state), timer); + + WARN_ON_ONCE(!in_hardirq()); + state->hardirq_func_calls++; + + if (!state->func(state->test_specific_state)) + state->hardirq_func_reported_failure = true; + + hrtimer_forward_now(&state->timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL); + queue_work(system_bh_wq, &state->bh_work); + return HRTIMER_RESTART; +} + +static void kunit_irq_test_bh_work_func(struct work_struct *work) +{ + struct kunit_irq_test_state *state = + container_of(work, typeof(*state), bh_work); + + WARN_ON_ONCE(!in_serving_softirq()); + state->softirq_func_calls++; + + if (!state->func(state->test_specific_state)) + state->softirq_func_reported_failure = true; +} + +/* + * Helper function which repeatedly runs the given @func in task, softirq, and + * hardirq context concurrently, and reports a failure to KUnit if any + * invocation of @func in any context returns false. @func is passed + * @test_specific_state as its argument. At most 3 invocations of @func will + * run concurrently: one in each of task, softirq, and hardirq context. + * + * The main purpose of this interrupt context testing is to validate fallback + * code paths that run in contexts where the normal code path cannot be used, + * typically due to the FPU or vector registers already being in-use in kernel + * mode. These code paths aren't covered when the test code is executed only by + * the KUnit test runner thread in task context. The reason for the concurrency + * is because merely using hardirq context is not sufficient to reach a fallback + * code path on some architectures; the hardirq actually has to occur while the + * FPU or vector unit was already in-use in kernel mode. + * + * Another purpose of this testing is to detect issues with the architecture's + * irq_fpu_usable() and kernel_fpu_begin/end() or equivalent functions, + * especially in softirq context when the softirq may have interrupted a task + * already using kernel-mode FPU or vector (if the arch didn't prevent that). + * Crypto functions are often executed in softirqs, so this is important. + */ +static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *), + int max_iterations, + void *test_specific_state) +{ + struct kunit_irq_test_state state = { + .func = func, + .test_specific_state = test_specific_state, + }; + unsigned long end_jiffies; + + /* + * Set up a hrtimer (the way we access hardirq context) and a work + * struct for the BH workqueue (the way we access softirq context). + */ + hrtimer_setup_on_stack(&state.timer, kunit_irq_test_timer_func, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + INIT_WORK_ONSTACK(&state.bh_work, kunit_irq_test_bh_work_func); + + /* Run for up to max_iterations or 1 second, whichever comes first. */ + end_jiffies = jiffies + HZ; + hrtimer_start(&state.timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL, + HRTIMER_MODE_REL_HARD); + for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies); + i++) { + if (!func(test_specific_state)) + state.task_func_reported_failure = true; + } + + /* Cancel the timer and work. */ + hrtimer_cancel(&state.timer); + flush_work(&state.bh_work); + + /* Sanity check: the timer and BH functions should have been run. */ + KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0, + "Timer function was not called"); + KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0, + "BH work function was not called"); + + /* Check for incorrect hash values reported from any context. */ + KUNIT_EXPECT_FALSE_MSG( + test, state.task_func_reported_failure, + "Incorrect hash values reported from task context"); + KUNIT_EXPECT_FALSE_MSG( + test, state.hardirq_func_reported_failure, + "Incorrect hash values reported from hardirq context"); + KUNIT_EXPECT_FALSE_MSG( + test, state.softirq_func_reported_failure, + "Incorrect hash values reported from softirq context"); +} + +#endif /* _KUNIT_RUN_IN_IRQ_CONTEXT_H */ diff --git a/lib/crypto/tests/hash-test-template.h b/lib/crypto/tests/hash-test-template.h index f437a0a9ac6c..61b43e62779f 100644 --- a/lib/crypto/tests/hash-test-template.h +++ b/lib/crypto/tests/hash-test-template.h @@ -5,11 +5,9 @@ * * Copyright 2025 Google LLC */ +#include #include -#include -#include #include -#include /* test_buf is a guarded buffer, i.e. &test_buf[TEST_BUF_LEN] is not mapped. */ #define TEST_BUF_LEN 16384 @@ -319,119 +317,6 @@ static void test_hash_ctx_zeroization(struct kunit *test) "Hash context was not zeroized by finalization"); } -#define IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5) - -struct hash_irq_test_state { - bool (*func)(void *test_specific_state); - void *test_specific_state; - bool task_func_reported_failure; - bool hardirq_func_reported_failure; - bool softirq_func_reported_failure; - unsigned long hardirq_func_calls; - unsigned long softirq_func_calls; - struct hrtimer timer; - struct work_struct bh_work; -}; - -static enum hrtimer_restart hash_irq_test_timer_func(struct hrtimer *timer) -{ - struct hash_irq_test_state *state = - container_of(timer, typeof(*state), timer); - - WARN_ON_ONCE(!in_hardirq()); - state->hardirq_func_calls++; - - if (!state->func(state->test_specific_state)) - state->hardirq_func_reported_failure = true; - - hrtimer_forward_now(&state->timer, IRQ_TEST_HRTIMER_INTERVAL); - queue_work(system_bh_wq, &state->bh_work); - return HRTIMER_RESTART; -} - -static void hash_irq_test_bh_work_func(struct work_struct *work) -{ - struct hash_irq_test_state *state = - container_of(work, typeof(*state), bh_work); - - WARN_ON_ONCE(!in_serving_softirq()); - state->softirq_func_calls++; - - if (!state->func(state->test_specific_state)) - state->softirq_func_reported_failure = true; -} - -/* - * Helper function which repeatedly runs the given @func in task, softirq, and - * hardirq context concurrently, and reports a failure to KUnit if any - * invocation of @func in any context returns false. @func is passed - * @test_specific_state as its argument. At most 3 invocations of @func will - * run concurrently: one in each of task, softirq, and hardirq context. - * - * The main purpose of this interrupt context testing is to validate fallback - * code paths that run in contexts where the normal code path cannot be used, - * typically due to the FPU or vector registers already being in-use in kernel - * mode. These code paths aren't covered when the test code is executed only by - * the KUnit test runner thread in task context. The reason for the concurrency - * is because merely using hardirq context is not sufficient to reach a fallback - * code path on some architectures; the hardirq actually has to occur while the - * FPU or vector unit was already in-use in kernel mode. - * - * Another purpose of this testing is to detect issues with the architecture's - * irq_fpu_usable() and kernel_fpu_begin/end() or equivalent functions, - * especially in softirq context when the softirq may have interrupted a task - * already using kernel-mode FPU or vector (if the arch didn't prevent that). - * Crypto functions are often executed in softirqs, so this is important. - */ -static void run_irq_test(struct kunit *test, bool (*func)(void *), - int max_iterations, void *test_specific_state) -{ - struct hash_irq_test_state state = { - .func = func, - .test_specific_state = test_specific_state, - }; - unsigned long end_jiffies; - - /* - * Set up a hrtimer (the way we access hardirq context) and a work - * struct for the BH workqueue (the way we access softirq context). - */ - hrtimer_setup_on_stack(&state.timer, hash_irq_test_timer_func, - CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - INIT_WORK_ONSTACK(&state.bh_work, hash_irq_test_bh_work_func); - - /* Run for up to max_iterations or 1 second, whichever comes first. */ - end_jiffies = jiffies + HZ; - hrtimer_start(&state.timer, IRQ_TEST_HRTIMER_INTERVAL, - HRTIMER_MODE_REL_HARD); - for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies); - i++) { - if (!func(test_specific_state)) - state.task_func_reported_failure = true; - } - - /* Cancel the timer and work. */ - hrtimer_cancel(&state.timer); - flush_work(&state.bh_work); - - /* Sanity check: the timer and BH functions should have been run. */ - KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0, - "Timer function was not called"); - KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0, - "BH work function was not called"); - - /* Check for incorrect hash values reported from any context. */ - KUNIT_EXPECT_FALSE_MSG( - test, state.task_func_reported_failure, - "Incorrect hash values reported from task context"); - KUNIT_EXPECT_FALSE_MSG( - test, state.hardirq_func_reported_failure, - "Incorrect hash values reported from hardirq context"); - KUNIT_EXPECT_FALSE_MSG( - test, state.softirq_func_reported_failure, - "Incorrect hash values reported from softirq context"); -} - #define IRQ_TEST_DATA_LEN 256 #define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */ @@ -469,7 +354,7 @@ static void test_hash_interrupt_context_1(struct kunit *test) HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN, state.expected_hashes[i]); - run_irq_test(test, hash_irq_test1_func, 100000, &state); + kunit_run_irq_test(test, hash_irq_test1_func, 100000, &state); } struct hash_irq_test2_hash_ctx { @@ -500,7 +385,7 @@ static bool hash_irq_test2_func(void *state_) if (WARN_ON_ONCE(ctx == &state->ctxs[ARRAY_SIZE(state->ctxs)])) { /* * This should never happen, as the number of contexts is equal - * to the maximum concurrency level of run_irq_test(). + * to the maximum concurrency level of kunit_run_irq_test(). */ return false; } @@ -566,7 +451,7 @@ static void test_hash_interrupt_context_2(struct kunit *test) state->update_lens[state->num_steps++] = remaining; state->num_steps += 2; /* for init and final */ - run_irq_test(test, hash_irq_test2_func, 250000, state); + kunit_run_irq_test(test, hash_irq_test2_func, 250000, state); } #define UNKEYED_HASH_KUNIT_CASES \ -- cgit v1.2.3 From 5816bf4273edb32716a88c796e0b04f0e12962eb Mon Sep 17 00:00:00 2001 From: Blaise Boscaccy Date: Tue, 22 Jul 2025 14:21:34 -0700 Subject: lsm,selinux: Add LSM blob support for BPF objects This patch introduces LSM blob support for BPF maps, programs, and tokens to enable LSM stacking and multiplexing of LSM modules that govern BPF objects. Additionally, the existing BPF hooks used by SELinux have been updated to utilize the new blob infrastructure, removing the assumption of exclusive ownership of the security pointer. Signed-off-by: Blaise Boscaccy [PM: dropped local variable init, style fixes] Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 3 ++ security/security.c | 86 +++++++++++++++++++++++++++++++++++++-- security/selinux/hooks.c | 56 +++++-------------------- security/selinux/include/objsec.h | 20 +++++++++ 4 files changed, 116 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 090d1d3e19fe..79ec5a2bdcca 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -116,6 +116,9 @@ struct lsm_blob_sizes { int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ int lbs_tun_dev; int lbs_bdev; + int lbs_bpf_map; + int lbs_bpf_prog; + int lbs_bpf_token; }; /* diff --git a/security/security.c b/security/security.c index a88ebfca3224..ca126b02d2fe 100644 --- a/security/security.c +++ b/security/security.c @@ -283,6 +283,9 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) lsm_set_blob_size(&needed->lbs_xattr_count, &blob_sizes.lbs_xattr_count); lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev); + lsm_set_blob_size(&needed->lbs_bpf_map, &blob_sizes.lbs_bpf_map); + lsm_set_blob_size(&needed->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog); + lsm_set_blob_size(&needed->lbs_bpf_token, &blob_sizes.lbs_bpf_token); } /* Prepare LSM for initialization. */ @@ -480,6 +483,9 @@ static void __init ordered_lsm_init(void) init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev); init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); init_debug("bdev blob size = %d\n", blob_sizes.lbs_bdev); + init_debug("bpf map blob size = %d\n", blob_sizes.lbs_bpf_map); + init_debug("bpf prog blob size = %d\n", blob_sizes.lbs_bpf_prog); + init_debug("bpf token blob size = %d\n", blob_sizes.lbs_bpf_token); /* * Create any kmem_caches needed for blobs @@ -827,6 +833,47 @@ static int lsm_bdev_alloc(struct block_device *bdev) GFP_KERNEL); } +#ifdef CONFIG_BPF_SYSCALL +/** + * lsm_bpf_map_alloc - allocate a composite bpf_map blob + * @map: the bpf_map that needs a blob + * + * Allocate the bpf_map blob for all the modules + * + * Returns 0, or -ENOMEM if memory can't be allocated. + */ +static int lsm_bpf_map_alloc(struct bpf_map *map) +{ + return lsm_blob_alloc(&map->security, blob_sizes.lbs_bpf_map, GFP_KERNEL); +} + +/** + * lsm_bpf_prog_alloc - allocate a composite bpf_prog blob + * @prog: the bpf_prog that needs a blob + * + * Allocate the bpf_prog blob for all the modules + * + * Returns 0, or -ENOMEM if memory can't be allocated. + */ +static int lsm_bpf_prog_alloc(struct bpf_prog *prog) +{ + return lsm_blob_alloc(&prog->aux->security, blob_sizes.lbs_bpf_prog, GFP_KERNEL); +} + +/** + * lsm_bpf_token_alloc - allocate a composite bpf_token blob + * @token: the bpf_token that needs a blob + * + * Allocate the bpf_token blob for all the modules + * + * Returns 0, or -ENOMEM if memory can't be allocated. + */ +static int lsm_bpf_token_alloc(struct bpf_token *token) +{ + return lsm_blob_alloc(&token->security, blob_sizes.lbs_bpf_token, GFP_KERNEL); +} +#endif /* CONFIG_BPF_SYSCALL */ + /** * lsm_early_task - during initialization allocate a composite task blob * @task: the task that needs a blob @@ -5706,7 +5753,16 @@ int security_bpf_prog(struct bpf_prog *prog) int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_map_create, map, attr, token, kernel); + int rc; + + rc = lsm_bpf_map_alloc(map); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_map_create, map, attr, token, kernel); + if (unlikely(rc)) + security_bpf_map_free(map); + return rc; } /** @@ -5725,7 +5781,16 @@ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_prog_load, prog, attr, token, kernel); + int rc; + + rc = lsm_bpf_prog_alloc(prog); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_prog_load, prog, attr, token, kernel); + if (unlikely(rc)) + security_bpf_prog_free(prog); + return rc; } /** @@ -5742,7 +5807,16 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { - return call_int_hook(bpf_token_create, token, attr, path); + int rc; + + rc = lsm_bpf_token_alloc(token); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_token_create, token, attr, path); + if (unlikely(rc)) + security_bpf_token_free(token); + return rc; } /** @@ -5786,6 +5860,8 @@ int security_bpf_token_capable(const struct bpf_token *token, int cap) void security_bpf_map_free(struct bpf_map *map) { call_void_hook(bpf_map_free, map); + kfree(map->security); + map->security = NULL; } /** @@ -5797,6 +5873,8 @@ void security_bpf_map_free(struct bpf_map *map) void security_bpf_prog_free(struct bpf_prog *prog) { call_void_hook(bpf_prog_free, prog); + kfree(prog->aux->security); + prog->aux->security = NULL; } /** @@ -5808,6 +5886,8 @@ void security_bpf_prog_free(struct bpf_prog *prog) void security_bpf_token_free(struct bpf_token *token) { call_void_hook(bpf_token_free, token); + kfree(token->security); + token->security = NULL; } #endif /* CONFIG_BPF_SYSCALL */ diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..4da5e792b42e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7062,14 +7062,14 @@ static int bpf_fd_pass(const struct file *file, u32 sid) if (file->f_op == &bpf_map_fops) { map = file->private_data; - bpfsec = map->security; + bpfsec = selinux_bpf_map_security(map); ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(file->f_mode), NULL); if (ret) return ret; } else if (file->f_op == &bpf_prog_fops) { prog = file->private_data; - bpfsec = prog->aux->security; + bpfsec = selinux_bpf_prog_security(prog); ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); if (ret) @@ -7083,7 +7083,7 @@ static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode) u32 sid = current_sid(); struct bpf_security_struct *bpfsec; - bpfsec = map->security; + bpfsec = selinux_bpf_map_security(map); return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(fmode), NULL); } @@ -7093,7 +7093,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog) u32 sid = current_sid(); struct bpf_security_struct *bpfsec; - bpfsec = prog->aux->security; + bpfsec = selinux_bpf_prog_security(prog); return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); } @@ -7103,69 +7103,33 @@ static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_map_security(map); bpfsec->sid = current_sid(); - map->security = bpfsec; return 0; } -static void selinux_bpf_map_free(struct bpf_map *map) -{ - struct bpf_security_struct *bpfsec = map->security; - - map->security = NULL; - kfree(bpfsec); -} - static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_prog_security(prog); bpfsec->sid = current_sid(); - prog->aux->security = bpfsec; return 0; } -static void selinux_bpf_prog_free(struct bpf_prog *prog) -{ - struct bpf_security_struct *bpfsec = prog->aux->security; - - prog->aux->security = NULL; - kfree(bpfsec); -} - static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_token_security(token); bpfsec->sid = current_sid(); - token->security = bpfsec; return 0; } - -static void selinux_bpf_token_free(struct bpf_token *token) -{ - struct bpf_security_struct *bpfsec = token->security; - - token->security = NULL; - kfree(bpfsec); -} #endif struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { @@ -7183,6 +7147,9 @@ struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { .lbs_xattr_count = SELINUX_INODE_INIT_XATTRS, .lbs_tun_dev = sizeof(struct tun_security_struct), .lbs_ib = sizeof(struct ib_security_struct), + .lbs_bpf_map = sizeof(struct bpf_security_struct), + .lbs_bpf_prog = sizeof(struct bpf_security_struct), + .lbs_bpf_token = sizeof(struct bpf_security_struct), }; #ifdef CONFIG_PERF_EVENTS @@ -7536,9 +7503,6 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(bpf, selinux_bpf), LSM_HOOK_INIT(bpf_map, selinux_bpf_map), LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog), - LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free), - LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free), - LSM_HOOK_INIT(bpf_token_free, selinux_bpf_token_free), #endif #ifdef CONFIG_PERF_EVENTS diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 1d7ac59015a1..2d5139c6d45b 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -26,6 +26,7 @@ #include #include #include +#include #include "flask.h" #include "avc.h" @@ -245,4 +246,23 @@ selinux_perf_event(void *perf_event) return perf_event + selinux_blob_sizes.lbs_perf_event; } +#ifdef CONFIG_BPF_SYSCALL +static inline struct bpf_security_struct * +selinux_bpf_map_security(struct bpf_map *map) +{ + return map->security + selinux_blob_sizes.lbs_bpf_map; +} + +static inline struct bpf_security_struct * +selinux_bpf_prog_security(struct bpf_prog *prog) +{ + return prog->aux->security + selinux_blob_sizes.lbs_bpf_prog; +} + +static inline struct bpf_security_struct * +selinux_bpf_token_security(struct bpf_token *token) +{ + return token->security + selinux_blob_sizes.lbs_bpf_token; +} +#endif /* CONFIG_BPF_SYSCALL */ #endif /* _SELINUX_OBJSEC_H_ */ -- cgit v1.2.3 From 039a504cda2cb69354387aa453391ec89a9e0e49 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 11 Aug 2025 22:11:33 -0500 Subject: dt-bindings: clock: dispcc-sc7280: Add display resets Like other platforms the sc7280 display clock controller provides a couple of resets, add the defines to allow referring to them. Signed-off-by: Bjorn Andersson Reviewed-by: Taniya Das Link: https://lore.kernel.org/r/20250811-sc7280-mdss-reset-v1-1-83ceff1d48de@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,dispcc-sc7280.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,dispcc-sc7280.h b/include/dt-bindings/clock/qcom,dispcc-sc7280.h index a4a692c20acf..9f113f346be8 100644 --- a/include/dt-bindings/clock/qcom,dispcc-sc7280.h +++ b/include/dt-bindings/clock/qcom,dispcc-sc7280.h @@ -52,4 +52,8 @@ /* DISP_CC power domains */ #define DISP_CC_MDSS_CORE_GDSC 0 +/* DISPCC resets */ +#define DISP_CC_MDSS_CORE_BCR 0 +#define DISP_CC_MDSS_RSCC_BCR 1 + #endif -- cgit v1.2.3 From bea90085dcb0f9a75748e73d723bde557a5ebf1a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 23 Jul 2025 11:21:53 -0400 Subject: dlm: use defines for force values in dlm_release_lockspace Clarify the use of the force parameter by renaming it to "release_option" and adding defines (with descriptions) for each of the accepted values. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- drivers/md/md-cluster.c | 4 ++-- fs/dlm/lockspace.c | 20 +++++++++----------- fs/dlm/user.c | 6 +++--- fs/gfs2/lock_dlm.c | 4 ++-- fs/ocfs2/stack_user.c | 2 +- include/linux/dlm.h | 26 +++++++++++++++++++++++++- 6 files changed, 42 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 5497eaee96e7..6e9a0045f0ff 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -979,7 +979,7 @@ err: lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); mddev->cluster_info = NULL; kfree(cinfo); return ret; @@ -1042,7 +1042,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); kfree(cinfo); return 0; } diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index ee11a70def92..be8dbf482229 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -671,19 +671,20 @@ int dlm_new_user_lockspace(const char *name, const char *cluster, This is because there may be LKBs queued as ASTs that have been unlinked from their RSBs and are pending deletion once the AST has been delivered */ -static int lockspace_busy(struct dlm_ls *ls, int force) +static int lockspace_busy(struct dlm_ls *ls, int release_option) { struct dlm_lkb *lkb; unsigned long id; int rv = 0; read_lock_bh(&ls->ls_lkbxa_lock); - if (force == 0) { + if (release_option == DLM_RELEASE_NO_LOCKS) { xa_for_each(&ls->ls_lkbxa, id, lkb) { rv = 1; break; } - } else if (force == 1) { + } else if (release_option == DLM_RELEASE_UNUSED) { + /* TODO: handle this UNUSED option as NO_LOCKS in later patch */ xa_for_each(&ls->ls_lkbxa, id, lkb) { if (lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV) { @@ -698,11 +699,11 @@ static int lockspace_busy(struct dlm_ls *ls, int force) return rv; } -static int release_lockspace(struct dlm_ls *ls, int force) +static int release_lockspace(struct dlm_ls *ls, int release_option) { int busy, rv; - busy = lockspace_busy(ls, force); + busy = lockspace_busy(ls, release_option); spin_lock_bh(&lslist_lock); if (ls->ls_create_count == 1) { @@ -730,7 +731,8 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_device_deregister(ls); - if (force != 3 && dlm_user_daemon_available()) + if (release_option != DLM_RELEASE_NO_EVENT && + dlm_user_daemon_available()) do_uevent(ls, 0); dlm_recoverd_stop(ls); @@ -782,11 +784,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) * lockspace must continue to function as usual, participating in recoveries, * until this returns. * - * Force has 4 possible values: - * 0 - don't destroy lockspace if it has any LKBs - * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs - * 2 - destroy lockspace regardless of LKBs - * 3 - destroy lockspace as part of a forced shutdown + * See DLM_RELEASE defines for release_option values and their meaning. */ int dlm_release_lockspace(void *lockspace, int force) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 5cb3896be826..51daf4acbe31 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -425,7 +425,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params) dlm_put_lockspace(ls); if (error) - dlm_release_lockspace(lockspace, 0); + dlm_release_lockspace(lockspace, DLM_RELEASE_NO_LOCKS); else error = ls->ls_device.minor; @@ -436,7 +436,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) { dlm_lockspace_t *lockspace; struct dlm_ls *ls; - int error, force = 0; + int error, force = DLM_RELEASE_NO_LOCKS; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -446,7 +446,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) return -ENOENT; if (params->flags & DLM_USER_LSFLG_FORCEFREE) - force = 2; + force = DLM_RELEASE_NORMAL; lockspace = ls; dlm_put_lockspace(ls); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index cee5d199d2d8..aac4dd6d0381 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1400,7 +1400,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) return 0; fail_release: - dlm_release_lockspace(ls->ls_dlm, 2); + dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL); fail_free: free_recover_size(ls); fail: @@ -1437,7 +1437,7 @@ static void gdlm_unmount(struct gfs2_sbd *sdp) /* mounted_lock and control_lock will be purged in dlm recovery */ release: if (ls->ls_dlm) { - dlm_release_lockspace(ls->ls_dlm, 2); + dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL); ls->ls_dlm = NULL; } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 0f045e45fa0c..765105f1ff8a 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -952,7 +952,7 @@ static const struct dlm_lockspace_ops ocfs2_ls_ops = { static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) { version_unlock(conn); - dlm_release_lockspace(conn->cc_lockspace, 2); + dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL); conn->cc_lockspace = NULL; ocfs2_live_connection_drop(conn->cc_private); conn->cc_private = NULL; diff --git a/include/linux/dlm.h b/include/linux/dlm.h index bacda9898f2b..cc7a36244893 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -87,13 +87,37 @@ int dlm_new_lockspace(const char *name, const char *cluster, const struct dlm_lockspace_ops *ops, void *ops_arg, int *ops_result, dlm_lockspace_t **lockspace); +/* + * dlm_release_lockspace() release_option values: + * + * DLM_RELEASE_NO_LOCKS returns -EBUSY if any locks (lkb's) + * exist in the local lockspace. + * + * DLM_RELEASE_UNUSED previous value that is no longer used. + * + * DLM_RELEASE_NORMAL releases the lockspace regardless of any + * locks managed in the local lockspace. + * + * DLM_RELEASE_NO_EVENT release the lockspace regardless of any + * locks managed in the local lockspace, and does not submit + * a leave event to the cluster manager, so other nodes will + * not be notified that the node should be removed from the + * list of lockspace members. + */ +#define DLM_RELEASE_NO_LOCKS 0 +#define DLM_RELEASE_UNUSED 1 +#define DLM_RELEASE_NORMAL 2 +#define DLM_RELEASE_NO_EVENT 3 + /* * dlm_release_lockspace * * Stop a lockspace. + * + * release_option: see DLM_RELEASE values above. */ -int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force); +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int release_option); /* * dlm_lock -- cgit v1.2.3 From 6f4f4ca5caf73de5e86329547d4527b3e0c08488 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 23 Jul 2025 11:21:56 -0400 Subject: dlm: add new flag DLM_RELEASE_RECOVER for dlm_lockspace_release When dlm_lockspace_release() is passed DLM_RELEASE_RECOVER, it tells the dlm to handle the release/leave as if the node had failed, i.e. perform recovery steps for a failed node, like recover_slot(). When DLM_RELEASE_RECOVER is set: - dlm_release_lockspace() includes RELEASE_RECOVER=1 in the OFFLINE uevent sent to userspace. - userspace/dlm_controld sends a message to all lockspace members indicating that the subsequent node removal should be handled as if the node had failed. - when dlm_controld on all nodes receives the new message, it sets the release_recover configfs entry to 1 for the node. - when the dlm/kernel next performs recovery and removes the node, it will see that release_recover has been set, and will perform recovery steps for the node as if it had failed, e.g. the recover_slot() callback is called to notify the fs. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 2 +- fs/dlm/member.c | 14 ++++++++++---- include/linux/dlm.h | 5 +++++ 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 6ff666a511c7..d986b7ef153d 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -738,7 +738,7 @@ static int release_lockspace(struct dlm_ls *ls, int release_option) if (release_option != DLM_RELEASE_NO_EVENT && dlm_user_daemon_available()) - do_uevent(ls, 0, 0); + do_uevent(ls, 0, (release_option == DLM_RELEASE_RECOVER)); dlm_recoverd_stop(ls); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 152d2cb16f59..356337102015 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -478,7 +478,8 @@ static void dlm_lsop_recover_prep(struct dlm_ls *ls) ls->ls_ops->recover_prep(ls->ls_ops_arg); } -static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) +static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb, + unsigned int release_recover) { struct dlm_slot slot; uint32_t seq; @@ -495,7 +496,7 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) error = dlm_comm_seq(memb->nodeid, &seq, false); - if (!error && seq == memb->comm_seq) + if (!release_recover && !error && seq == memb->comm_seq) return; slot.nodeid = memb->nodeid; @@ -552,6 +553,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) struct dlm_member *memb, *safe; struct dlm_config_node *node; int i, error, neg = 0, low = -1; + unsigned int release_recover; /* previously removed members that we've not finished removing need to * count as a negative change so the "neg" recovery steps will happen @@ -572,8 +574,12 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) if (node && !node->new && !node->gone) continue; + release_recover = 0; + if (node->gone) { - log_rinfo(ls, "remove member %d", memb->nodeid); + release_recover = node->release_recover; + log_rinfo(ls, "remove member %d%s", memb->nodeid, + release_recover ? " (release_recover)" : ""); } else { /* removed and re-added */ log_rinfo(ls, "remove member %d comm_seq %u %u", @@ -584,7 +590,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) list_move(&memb->list, &ls->ls_nodes_gone); remove_remote_member(memb->nodeid); ls->ls_num_nodes--; - dlm_lsop_recover_slot(ls, memb); + dlm_lsop_recover_slot(ls, memb, release_recover); } /* add new members to ls_nodes */ diff --git a/include/linux/dlm.h b/include/linux/dlm.h index cc7a36244893..108eb953eb18 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -103,11 +103,16 @@ int dlm_new_lockspace(const char *name, const char *cluster, * a leave event to the cluster manager, so other nodes will * not be notified that the node should be removed from the * list of lockspace members. + * + * DLM_RELEASE_RECOVER like DLM_RELEASE_NORMAL, but the remaining + * nodes will handle the removal of the node as if the node + * had failed, e.g. the recover_slot() callback would be used. */ #define DLM_RELEASE_NO_LOCKS 0 #define DLM_RELEASE_UNUSED 1 #define DLM_RELEASE_NORMAL 2 #define DLM_RELEASE_NO_EVENT 3 +#define DLM_RELEASE_RECOVER 4 /* * dlm_release_lockspace -- cgit v1.2.3 From 534b9bdeb4b80d843ca9f924524d4d103ad6605e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 29 Jun 2025 22:52:45 -0700 Subject: Input: tca6416-keypad - remove the driver This input driver predates proper GPIO driver for the chip and now can be replaced with the generic gpio-keys. Remove the driver. Link: https://lore.kernel.org/r/ajfsei3keh4jjasd4lshjicgqixew7bak3cmty3suoliskzgz4@vj3ijycfxy4i Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 18 -- drivers/input/keyboard/Makefile | 1 - drivers/input/keyboard/tca6416-keypad.c | 305 -------------------------------- include/linux/tca6416_keypad.h | 30 ---- 4 files changed, 354 deletions(-) delete mode 100644 drivers/input/keyboard/tca6416-keypad.c delete mode 100644 include/linux/tca6416_keypad.h (limited to 'include') diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 7c4f309a4cb6..f3474026d192 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -262,24 +262,6 @@ config KEYBOARD_GPIO_POLLED To compile this driver as a module, choose M here: the module will be called gpio_keys_polled. -config KEYBOARD_TCA6416 - tristate "TCA6416/TCA6408A Keypad Support" - depends on I2C - help - This driver implements basic keypad functionality - for keys connected through TCA6416/TCA6408A IO expanders. - - Say Y here if your device has keys connected to - TCA6416/TCA6408A IO expander. Your board-specific setup logic - must also provide pin-mask details(of which TCA6416 pins - are used for keypad). - - If enabled the entire TCA6416 device will be managed through - this driver. - - To compile this driver as a module, choose M here: the - module will be called tca6416_keypad. - config KEYBOARD_TCA8418 tristate "TCA8418 Keypad Support" depends on I2C diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile index 8bc20ab2b103..c0134da14580 100644 --- a/drivers/input/keyboard/Makefile +++ b/drivers/input/keyboard/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_KEYBOARD_EP93XX) += ep93xx_keypad.o obj-$(CONFIG_KEYBOARD_GOLDFISH_EVENTS) += goldfish_events.o obj-$(CONFIG_KEYBOARD_GPIO) += gpio_keys.o obj-$(CONFIG_KEYBOARD_GPIO_POLLED) += gpio_keys_polled.o -obj-$(CONFIG_KEYBOARD_TCA6416) += tca6416-keypad.o obj-$(CONFIG_KEYBOARD_TCA8418) += tca8418_keypad.o obj-$(CONFIG_KEYBOARD_HIL) += hil_kbd.o obj-$(CONFIG_KEYBOARD_HIL_OLD) += hilkbd.o diff --git a/drivers/input/keyboard/tca6416-keypad.c b/drivers/input/keyboard/tca6416-keypad.c deleted file mode 100644 index fbc674d7b9f0..000000000000 --- a/drivers/input/keyboard/tca6416-keypad.c +++ /dev/null @@ -1,305 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Driver for keys on TCA6416 I2C IO expander - * - * Copyright (C) 2010 Texas Instruments - * - * Author : Sriramakrishnan.A.G. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define TCA6416_INPUT 0 -#define TCA6416_OUTPUT 1 -#define TCA6416_INVERT 2 -#define TCA6416_DIRECTION 3 - -#define TCA6416_POLL_INTERVAL 100 /* msec */ - -static const struct i2c_device_id tca6416_id[] = { - { "tca6416-keys", 16, }, - { "tca6408-keys", 8, }, - { } -}; -MODULE_DEVICE_TABLE(i2c, tca6416_id); - -struct tca6416_keypad_chip { - uint16_t reg_output; - uint16_t reg_direction; - uint16_t reg_input; - - struct i2c_client *client; - struct input_dev *input; - int io_size; - u16 pinmask; - bool use_polling; - struct tca6416_button buttons[]; -}; - -static int tca6416_write_reg(struct tca6416_keypad_chip *chip, int reg, u16 val) -{ - int error; - - error = chip->io_size > 8 ? - i2c_smbus_write_word_data(chip->client, reg << 1, val) : - i2c_smbus_write_byte_data(chip->client, reg, val); - if (error < 0) { - dev_err(&chip->client->dev, - "%s failed, reg: %d, val: %d, error: %d\n", - __func__, reg, val, error); - return error; - } - - return 0; -} - -static int tca6416_read_reg(struct tca6416_keypad_chip *chip, int reg, u16 *val) -{ - int retval; - - retval = chip->io_size > 8 ? - i2c_smbus_read_word_data(chip->client, reg << 1) : - i2c_smbus_read_byte_data(chip->client, reg); - if (retval < 0) { - dev_err(&chip->client->dev, "%s failed, reg: %d, error: %d\n", - __func__, reg, retval); - return retval; - } - - *val = (u16)retval; - return 0; -} - -static void tca6416_keys_scan(struct input_dev *input) -{ - struct tca6416_keypad_chip *chip = input_get_drvdata(input); - u16 reg_val, val; - int error, i, pin_index; - - error = tca6416_read_reg(chip, TCA6416_INPUT, ®_val); - if (error) - return; - - reg_val &= chip->pinmask; - - /* Figure out which lines have changed */ - val = reg_val ^ chip->reg_input; - chip->reg_input = reg_val; - - for (i = 0, pin_index = 0; i < 16; i++) { - if (val & (1 << i)) { - struct tca6416_button *button = &chip->buttons[pin_index]; - unsigned int type = button->type ?: EV_KEY; - int state = ((reg_val & (1 << i)) ? 1 : 0) - ^ button->active_low; - - input_event(input, type, button->code, !!state); - input_sync(input); - } - - if (chip->pinmask & (1 << i)) - pin_index++; - } -} - -/* - * This is threaded IRQ handler and this can (and will) sleep. - */ -static irqreturn_t tca6416_keys_isr(int irq, void *dev_id) -{ - tca6416_keys_scan(dev_id); - - return IRQ_HANDLED; -} - -static int tca6416_keys_open(struct input_dev *dev) -{ - struct tca6416_keypad_chip *chip = input_get_drvdata(dev); - - if (!chip->use_polling) { - /* Get initial device state in case it has switches */ - tca6416_keys_scan(dev); - enable_irq(chip->client->irq); - } - - return 0; -} - -static void tca6416_keys_close(struct input_dev *dev) -{ - struct tca6416_keypad_chip *chip = input_get_drvdata(dev); - - if (!chip->use_polling) - disable_irq(chip->client->irq); -} - -static int tca6416_setup_registers(struct tca6416_keypad_chip *chip) -{ - int error; - - error = tca6416_read_reg(chip, TCA6416_OUTPUT, &chip->reg_output); - if (error) - return error; - - error = tca6416_read_reg(chip, TCA6416_DIRECTION, &chip->reg_direction); - if (error) - return error; - - /* ensure that keypad pins are set to input */ - error = tca6416_write_reg(chip, TCA6416_DIRECTION, - chip->reg_direction | chip->pinmask); - if (error) - return error; - - error = tca6416_read_reg(chip, TCA6416_DIRECTION, &chip->reg_direction); - if (error) - return error; - - error = tca6416_read_reg(chip, TCA6416_INPUT, &chip->reg_input); - if (error) - return error; - - chip->reg_input &= chip->pinmask; - - return 0; -} - -static int tca6416_keypad_probe(struct i2c_client *client) -{ - const struct i2c_device_id *id = i2c_client_get_device_id(client); - struct tca6416_keys_platform_data *pdata; - struct tca6416_keypad_chip *chip; - struct input_dev *input; - int error; - int i; - - /* Check functionality */ - if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE)) { - dev_err(&client->dev, "%s adapter not supported\n", - dev_driver_string(&client->adapter->dev)); - return -ENODEV; - } - - pdata = dev_get_platdata(&client->dev); - if (!pdata) { - dev_dbg(&client->dev, "no platform data\n"); - return -EINVAL; - } - - chip = devm_kzalloc(&client->dev, - struct_size(chip, buttons, pdata->nbuttons), - GFP_KERNEL); - if (!chip) - return -ENOMEM; - - input = devm_input_allocate_device(&client->dev); - if (!input) - return -ENOMEM; - - chip->client = client; - chip->input = input; - chip->io_size = id->driver_data; - chip->pinmask = pdata->pinmask; - chip->use_polling = pdata->use_polling; - - input->phys = "tca6416-keys/input0"; - input->name = client->name; - - input->open = tca6416_keys_open; - input->close = tca6416_keys_close; - - input->id.bustype = BUS_HOST; - input->id.vendor = 0x0001; - input->id.product = 0x0001; - input->id.version = 0x0100; - - /* Enable auto repeat feature of Linux input subsystem */ - if (pdata->rep) - __set_bit(EV_REP, input->evbit); - - for (i = 0; i < pdata->nbuttons; i++) { - unsigned int type; - - chip->buttons[i] = pdata->buttons[i]; - type = (pdata->buttons[i].type) ?: EV_KEY; - input_set_capability(input, type, pdata->buttons[i].code); - } - - input_set_drvdata(input, chip); - - /* - * Initialize cached registers from their original values. - * we can't share this chip with another i2c master. - */ - error = tca6416_setup_registers(chip); - if (error) - return error; - - if (chip->use_polling) { - error = input_setup_polling(input, tca6416_keys_scan); - if (error) { - dev_err(&client->dev, "Failed to setup polling\n"); - return error; - } - - input_set_poll_interval(input, TCA6416_POLL_INTERVAL); - } else { - error = devm_request_threaded_irq(&client->dev, client->irq, - NULL, tca6416_keys_isr, - IRQF_TRIGGER_FALLING | - IRQF_ONESHOT | - IRQF_NO_AUTOEN, - "tca6416-keypad", input); - if (error) { - dev_dbg(&client->dev, - "Unable to claim irq %d; error %d\n", - client->irq, error); - return error; - } - } - - error = input_register_device(input); - if (error) { - dev_dbg(&client->dev, - "Unable to register input device, error: %d\n", error); - return error; - } - - i2c_set_clientdata(client, chip); - - return 0; -} - -static struct i2c_driver tca6416_keypad_driver = { - .driver = { - .name = "tca6416-keypad", - }, - .probe = tca6416_keypad_probe, - .id_table = tca6416_id, -}; - -static int __init tca6416_keypad_init(void) -{ - return i2c_add_driver(&tca6416_keypad_driver); -} - -subsys_initcall(tca6416_keypad_init); - -static void __exit tca6416_keypad_exit(void) -{ - i2c_del_driver(&tca6416_keypad_driver); -} -module_exit(tca6416_keypad_exit); - -MODULE_AUTHOR("Sriramakrishnan "); -MODULE_DESCRIPTION("Keypad driver over tca6416 IO expander"); -MODULE_LICENSE("GPL"); diff --git a/include/linux/tca6416_keypad.h b/include/linux/tca6416_keypad.h deleted file mode 100644 index 5cf6f6f82aa7..000000000000 --- a/include/linux/tca6416_keypad.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tca6416 keypad platform support - * - * Copyright (C) 2010 Texas Instruments - * - * Author: Sriramakrishnan - */ - -#ifndef _TCA6416_KEYS_H -#define _TCA6416_KEYS_H - -#include - -struct tca6416_button { - /* Configuration parameters */ - int code; /* input event code (KEY_*, SW_*) */ - int active_low; - int type; /* input event type (EV_KEY, EV_SW) */ -}; - -struct tca6416_keys_platform_data { - struct tca6416_button *buttons; - int nbuttons; - unsigned int rep:1; /* enable input subsystem auto repeat */ - uint16_t pinmask; - uint16_t invert; - int use_polling; /* use polling if Interrupt is not connected*/ -}; -#endif -- cgit v1.2.3 From c93c59baa5ab57e94b874000cec56e26611b7a23 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 11 Aug 2025 20:58:20 +0200 Subject: bpf: Tidy verifier bug message Yonghong noticed that error messages for potential verifier bugs often have a '(1)' at the end. This is happening because verifier_bug_if(cond, env, fmt, args...) prints "(" #cond ")\n" as part of the message and verifier_bug() is defined as: #define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args) Hence, verifier_bug() always ends up displaying '(1)'. This small patch fixes it by having verifier_bug_if conditionally call verifier_bug instead of the other way around. Fixes: 1cb0f56d9618 ("bpf: WARN_ONCE on verifier bugs") Reported-by: Yonghong Song Signed-off-by: Paul Chaignon Signed-off-by: Andrii Nakryiko Tested-by: Eduard Zingerman Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/aJo9THBrzo8jFXsh@mail.gmail.com --- include/linux/bpf_verifier.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c823f8efe3ed..020de62bd09c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -875,13 +875,15 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, #define verifier_bug_if(cond, env, fmt, args...) \ ({ \ bool __cond = (cond); \ - if (unlikely(__cond)) { \ - BPF_WARN_ONCE(1, "verifier bug: " fmt "(" #cond ")\n", ##args); \ - bpf_log(&env->log, "verifier bug: " fmt "(" #cond ")\n", ##args); \ - } \ + if (unlikely(__cond)) \ + verifier_bug(env, fmt " (" #cond ")", ##args); \ (__cond); \ }) -#define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args) +#define verifier_bug(env, fmt, args...) \ + ({ \ + BPF_WARN_ONCE(1, "verifier bug: " fmt "\n", ##args); \ + bpf_log(&env->log, "verifier bug: " fmt "\n", ##args); \ + }) static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From 07bbbfe7addf5b032e04f3c38f0b183d067a3f0d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 11 Aug 2025 19:50:43 +0100 Subject: net: stmmac: add suspend()/resume() platform ops Add suspend/resume platform operations, which, when populated, override the init/exit platform operations when we suspend and resume. These suspend()/resume() methods are called by core code, and thus are designed to support any struct device, not just platform devices. This allows them to be used by the PCI drivers we have. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/E1ulXbX-008gqZ-Bb@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 9 +++++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 12 ++++++++---- include/linux/stmmac.h | 2 ++ 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f1abf4242cd2..2da4f7bb2899 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7879,6 +7879,9 @@ int stmmac_suspend(struct device *dev) if (stmmac_fpe_supported(priv)) ethtool_mmsv_stop(&priv->fpe_cfg.mmsv); + if (priv->plat->suspend) + return priv->plat->suspend(dev, priv->plat->bsp_priv); + return 0; } EXPORT_SYMBOL_GPL(stmmac_suspend); @@ -7931,6 +7934,12 @@ int stmmac_resume(struct device *dev) struct stmmac_priv *priv = netdev_priv(ndev); int ret; + if (priv->plat->resume) { + ret = priv->plat->resume(dev, priv->plat->bsp_priv); + if (ret) + return ret; + } + if (!netif_running(ndev)) return 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 030fcf1b5993..21df052eeed0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -901,7 +901,9 @@ static int __maybe_unused stmmac_pltfr_suspend(struct device *dev) struct platform_device *pdev = to_platform_device(dev); ret = stmmac_suspend(dev); - stmmac_pltfr_exit(pdev, priv->plat); + + if (!priv->plat->suspend) + stmmac_pltfr_exit(pdev, priv->plat); return ret; } @@ -920,9 +922,11 @@ static int __maybe_unused stmmac_pltfr_resume(struct device *dev) struct platform_device *pdev = to_platform_device(dev); int ret; - ret = stmmac_pltfr_init(pdev, priv->plat); - if (ret) - return ret; + if (!priv->plat->resume) { + ret = stmmac_pltfr_init(pdev, priv->plat); + if (ret) + return ret; + } return stmmac_resume(dev); } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 26ddf95d23f9..22c24dacbc65 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -248,6 +248,8 @@ struct plat_stmmacenet_data { void (*ptp_clk_freq_config)(struct stmmac_priv *priv); int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); + int (*suspend)(struct device *dev, void *priv); + int (*resume)(struct device *dev, void *priv); struct mac_device_info *(*setup)(void *priv); int (*clks_config)(void *priv, bool enabled); int (*crosststamp)(ktime_t *device, struct system_counterval_t *system, -- cgit v1.2.3 From b3ef7bdec66fb1813e865fd39d179a93cefd2015 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 11 Aug 2025 17:31:42 +0200 Subject: net: airoha: Add airoha_offload.h header Move NPU definitions to airoha_offload.h in include/linux/soc/airoha/ in order to allow the MT76 driver to access the callback definitions. Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250811-airoha-en7581-wlan-offlaod-v7-7-58823603bb4e@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_npu.c | 2 +- drivers/net/ethernet/airoha/airoha_npu.h | 103 ------------ drivers/net/ethernet/airoha/airoha_ppe.c | 2 +- include/linux/soc/airoha/airoha_offload.h | 260 ++++++++++++++++++++++++++++++ 4 files changed, 262 insertions(+), 105 deletions(-) delete mode 100644 drivers/net/ethernet/airoha/airoha_npu.h create mode 100644 include/linux/soc/airoha/airoha_offload.h (limited to 'include') diff --git a/drivers/net/ethernet/airoha/airoha_npu.c b/drivers/net/ethernet/airoha/airoha_npu.c index 66a8a992dbf2..1a6b191ae0b0 100644 --- a/drivers/net/ethernet/airoha/airoha_npu.c +++ b/drivers/net/ethernet/airoha/airoha_npu.c @@ -11,9 +11,9 @@ #include #include #include +#include #include "airoha_eth.h" -#include "airoha_npu.h" #define NPU_EN7581_FIRMWARE_DATA "airoha/en7581_npu_data.bin" #define NPU_EN7581_FIRMWARE_RV32 "airoha/en7581_npu_rv32.bin" diff --git a/drivers/net/ethernet/airoha/airoha_npu.h b/drivers/net/ethernet/airoha/airoha_npu.h deleted file mode 100644 index a448c74208a9..000000000000 --- a/drivers/net/ethernet/airoha/airoha_npu.h +++ /dev/null @@ -1,103 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (c) 2025 AIROHA Inc - * Author: Lorenzo Bianconi - */ - -#define NPU_NUM_CORES 8 -#define NPU_NUM_IRQ 6 - -enum airoha_npu_wlan_set_cmd { - WLAN_FUNC_SET_WAIT_PCIE_ADDR, - WLAN_FUNC_SET_WAIT_DESC, - WLAN_FUNC_SET_WAIT_NPU_INIT_DONE, - WLAN_FUNC_SET_WAIT_TRAN_TO_CPU, - WLAN_FUNC_SET_WAIT_BA_WIN_SIZE, - WLAN_FUNC_SET_WAIT_DRIVER_MODEL, - WLAN_FUNC_SET_WAIT_DEL_STA, - WLAN_FUNC_SET_WAIT_DRAM_BA_NODE_ADDR, - WLAN_FUNC_SET_WAIT_PKT_BUF_ADDR, - WLAN_FUNC_SET_WAIT_IS_TEST_NOBA, - WLAN_FUNC_SET_WAIT_FLUSHONE_TIMEOUT, - WLAN_FUNC_SET_WAIT_FLUSHALL_TIMEOUT, - WLAN_FUNC_SET_WAIT_IS_FORCE_TO_CPU, - WLAN_FUNC_SET_WAIT_PCIE_STATE, - WLAN_FUNC_SET_WAIT_PCIE_PORT_TYPE, - WLAN_FUNC_SET_WAIT_ERROR_RETRY_TIMES, - WLAN_FUNC_SET_WAIT_BAR_INFO, - WLAN_FUNC_SET_WAIT_FAST_FLAG, - WLAN_FUNC_SET_WAIT_NPU_BAND0_ONCPU, - WLAN_FUNC_SET_WAIT_TX_RING_PCIE_ADDR, - WLAN_FUNC_SET_WAIT_TX_DESC_HW_BASE, - WLAN_FUNC_SET_WAIT_TX_BUF_SPACE_HW_BASE, - WLAN_FUNC_SET_WAIT_RX_RING_FOR_TXDONE_HW_BASE, - WLAN_FUNC_SET_WAIT_TX_PKT_BUF_ADDR, - WLAN_FUNC_SET_WAIT_INODE_TXRX_REG_ADDR, - WLAN_FUNC_SET_WAIT_INODE_DEBUG_FLAG, - WLAN_FUNC_SET_WAIT_INODE_HW_CFG_INFO, - WLAN_FUNC_SET_WAIT_INODE_STOP_ACTION, - WLAN_FUNC_SET_WAIT_INODE_PCIE_SWAP, - WLAN_FUNC_SET_WAIT_RATELIMIT_CTRL, - WLAN_FUNC_SET_WAIT_HWNAT_INIT, - WLAN_FUNC_SET_WAIT_ARHT_CHIP_INFO, - WLAN_FUNC_SET_WAIT_TX_BUF_CHECK_ADDR, - WLAN_FUNC_SET_WAIT_TOKEN_ID_SIZE, -}; - -enum airoha_npu_wlan_get_cmd { - WLAN_FUNC_GET_WAIT_NPU_INFO, - WLAN_FUNC_GET_WAIT_LAST_RATE, - WLAN_FUNC_GET_WAIT_COUNTER, - WLAN_FUNC_GET_WAIT_DBG_COUNTER, - WLAN_FUNC_GET_WAIT_RXDESC_BASE, - WLAN_FUNC_GET_WAIT_WCID_DBG_COUNTER, - WLAN_FUNC_GET_WAIT_DMA_ADDR, - WLAN_FUNC_GET_WAIT_RING_SIZE, - WLAN_FUNC_GET_WAIT_NPU_SUPPORT_MAP, - WLAN_FUNC_GET_WAIT_MDC_LOCK_ADDRESS, - WLAN_FUNC_GET_WAIT_NPU_VERSION, -}; - -struct airoha_npu { - struct device *dev; - struct regmap *regmap; - - struct airoha_npu_core { - struct airoha_npu *npu; - /* protect concurrent npu memory accesses */ - spinlock_t lock; - struct work_struct wdt_work; - } cores[NPU_NUM_CORES]; - - int irqs[NPU_NUM_IRQ]; - - struct airoha_foe_stats __iomem *stats; - - struct { - int (*ppe_init)(struct airoha_npu *npu); - int (*ppe_deinit)(struct airoha_npu *npu); - int (*ppe_flush_sram_entries)(struct airoha_npu *npu, - dma_addr_t foe_addr, - int sram_num_entries); - int (*ppe_foe_commit_entry)(struct airoha_npu *npu, - dma_addr_t foe_addr, - u32 entry_size, u32 hash, - bool ppe2); - int (*wlan_init_reserved_memory)(struct airoha_npu *npu); - int (*wlan_send_msg)(struct airoha_npu *npu, int ifindex, - enum airoha_npu_wlan_set_cmd func_id, - void *data, int data_len, gfp_t gfp); - int (*wlan_get_msg)(struct airoha_npu *npu, int ifindex, - enum airoha_npu_wlan_get_cmd func_id, - void *data, int data_len, gfp_t gfp); - u32 (*wlan_get_queue_addr)(struct airoha_npu *npu, int qid, - bool xmit); - void (*wlan_set_irq_status)(struct airoha_npu *npu, u32 val); - u32 (*wlan_get_irq_status)(struct airoha_npu *npu, int q); - void (*wlan_enable_irq)(struct airoha_npu *npu, int q); - void (*wlan_disable_irq)(struct airoha_npu *npu, int q); - } ops; -}; - -struct airoha_npu *airoha_npu_get(struct device *dev, dma_addr_t *stats_addr); -void airoha_npu_put(struct airoha_npu *npu); diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 47411d2cbd28..82163392332c 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -7,10 +7,10 @@ #include #include #include +#include #include #include -#include "airoha_npu.h" #include "airoha_regs.h" #include "airoha_eth.h" diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h new file mode 100644 index 000000000000..117c63c2448d --- /dev/null +++ b/include/linux/soc/airoha/airoha_offload.h @@ -0,0 +1,260 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 AIROHA Inc + * Author: Lorenzo Bianconi + */ +#ifndef AIROHA_OFFLOAD_H +#define AIROHA_OFFLOAD_H + +#include +#include + +#define NPU_NUM_CORES 8 +#define NPU_NUM_IRQ 6 +#define NPU_RX0_DESC_NUM 512 +#define NPU_RX1_DESC_NUM 512 + +/* CTRL */ +#define NPU_RX_DMA_DESC_LAST_MASK BIT(29) +#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(28, 15) +#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(14, 1) +#define NPU_RX_DMA_DESC_DONE_MASK BIT(0) +/* INFO */ +#define NPU_RX_DMA_PKT_COUNT_MASK GENMASK(31, 28) +#define NPU_RX_DMA_PKT_ID_MASK GENMASK(28, 26) +#define NPU_RX_DMA_SRC_PORT_MASK GENMASK(25, 21) +#define NPU_RX_DMA_CRSN_MASK GENMASK(20, 16) +#define NPU_RX_DMA_FOE_ID_MASK GENMASK(15, 0) +/* DATA */ +#define NPU_RX_DMA_SID_MASK GENMASK(31, 16) +#define NPU_RX_DMA_FRAG_TYPE_MASK GENMASK(15, 14) +#define NPU_RX_DMA_PRIORITY_MASK GENMASK(13, 10) +#define NPU_RX_DMA_RADIO_ID_MASK GENMASK(9, 6) +#define NPU_RX_DMA_VAP_ID_MASK GENMASK(5, 2) +#define NPU_RX_DMA_FRAME_TYPE_MASK GENMASK(1, 0) + +struct airoha_npu_rx_dma_desc { + u32 ctrl; + u32 info; + u32 data; + u32 addr; + u64 rsv; +} __packed; + +/* CTRL */ +#define NPU_TX_DMA_DESC_SCHED_MASK BIT(31) +#define NPU_TX_DMA_DESC_LEN_MASK GENMASK(30, 18) +#define NPU_TX_DMA_DESC_VEND_LEN_MASK GENMASK(17, 1) +#define NPU_TX_DMA_DESC_DONE_MASK BIT(0) + +#define NPU_TXWI_LEN 192 + +struct airoha_npu_tx_dma_desc { + u32 ctrl; + u32 addr; + u64 rsv; + u8 txwi[NPU_TXWI_LEN]; +} __packed; + +enum airoha_npu_wlan_set_cmd { + WLAN_FUNC_SET_WAIT_PCIE_ADDR, + WLAN_FUNC_SET_WAIT_DESC, + WLAN_FUNC_SET_WAIT_NPU_INIT_DONE, + WLAN_FUNC_SET_WAIT_TRAN_TO_CPU, + WLAN_FUNC_SET_WAIT_BA_WIN_SIZE, + WLAN_FUNC_SET_WAIT_DRIVER_MODEL, + WLAN_FUNC_SET_WAIT_DEL_STA, + WLAN_FUNC_SET_WAIT_DRAM_BA_NODE_ADDR, + WLAN_FUNC_SET_WAIT_PKT_BUF_ADDR, + WLAN_FUNC_SET_WAIT_IS_TEST_NOBA, + WLAN_FUNC_SET_WAIT_FLUSHONE_TIMEOUT, + WLAN_FUNC_SET_WAIT_FLUSHALL_TIMEOUT, + WLAN_FUNC_SET_WAIT_IS_FORCE_TO_CPU, + WLAN_FUNC_SET_WAIT_PCIE_STATE, + WLAN_FUNC_SET_WAIT_PCIE_PORT_TYPE, + WLAN_FUNC_SET_WAIT_ERROR_RETRY_TIMES, + WLAN_FUNC_SET_WAIT_BAR_INFO, + WLAN_FUNC_SET_WAIT_FAST_FLAG, + WLAN_FUNC_SET_WAIT_NPU_BAND0_ONCPU, + WLAN_FUNC_SET_WAIT_TX_RING_PCIE_ADDR, + WLAN_FUNC_SET_WAIT_TX_DESC_HW_BASE, + WLAN_FUNC_SET_WAIT_TX_BUF_SPACE_HW_BASE, + WLAN_FUNC_SET_WAIT_RX_RING_FOR_TXDONE_HW_BASE, + WLAN_FUNC_SET_WAIT_TX_PKT_BUF_ADDR, + WLAN_FUNC_SET_WAIT_INODE_TXRX_REG_ADDR, + WLAN_FUNC_SET_WAIT_INODE_DEBUG_FLAG, + WLAN_FUNC_SET_WAIT_INODE_HW_CFG_INFO, + WLAN_FUNC_SET_WAIT_INODE_STOP_ACTION, + WLAN_FUNC_SET_WAIT_INODE_PCIE_SWAP, + WLAN_FUNC_SET_WAIT_RATELIMIT_CTRL, + WLAN_FUNC_SET_WAIT_HWNAT_INIT, + WLAN_FUNC_SET_WAIT_ARHT_CHIP_INFO, + WLAN_FUNC_SET_WAIT_TX_BUF_CHECK_ADDR, + WLAN_FUNC_SET_WAIT_TOKEN_ID_SIZE, +}; + +enum airoha_npu_wlan_get_cmd { + WLAN_FUNC_GET_WAIT_NPU_INFO, + WLAN_FUNC_GET_WAIT_LAST_RATE, + WLAN_FUNC_GET_WAIT_COUNTER, + WLAN_FUNC_GET_WAIT_DBG_COUNTER, + WLAN_FUNC_GET_WAIT_RXDESC_BASE, + WLAN_FUNC_GET_WAIT_WCID_DBG_COUNTER, + WLAN_FUNC_GET_WAIT_DMA_ADDR, + WLAN_FUNC_GET_WAIT_RING_SIZE, + WLAN_FUNC_GET_WAIT_NPU_SUPPORT_MAP, + WLAN_FUNC_GET_WAIT_MDC_LOCK_ADDRESS, + WLAN_FUNC_GET_WAIT_NPU_VERSION, +}; + +struct airoha_npu { +#if (IS_BUILTIN(CONFIG_NET_AIROHA_NPU) || IS_MODULE(CONFIG_NET_AIROHA_NPU)) + struct device *dev; + struct regmap *regmap; + + struct airoha_npu_core { + struct airoha_npu *npu; + /* protect concurrent npu memory accesses */ + spinlock_t lock; + struct work_struct wdt_work; + } cores[NPU_NUM_CORES]; + + int irqs[NPU_NUM_IRQ]; + + struct airoha_foe_stats __iomem *stats; + + struct { + int (*ppe_init)(struct airoha_npu *npu); + int (*ppe_deinit)(struct airoha_npu *npu); + int (*ppe_flush_sram_entries)(struct airoha_npu *npu, + dma_addr_t foe_addr, + int sram_num_entries); + int (*ppe_foe_commit_entry)(struct airoha_npu *npu, + dma_addr_t foe_addr, + u32 entry_size, u32 hash, + bool ppe2); + int (*wlan_init_reserved_memory)(struct airoha_npu *npu); + int (*wlan_send_msg)(struct airoha_npu *npu, int ifindex, + enum airoha_npu_wlan_set_cmd func_id, + void *data, int data_len, gfp_t gfp); + int (*wlan_get_msg)(struct airoha_npu *npu, int ifindex, + enum airoha_npu_wlan_get_cmd func_id, + void *data, int data_len, gfp_t gfp); + u32 (*wlan_get_queue_addr)(struct airoha_npu *npu, int qid, + bool xmit); + void (*wlan_set_irq_status)(struct airoha_npu *npu, u32 val); + u32 (*wlan_get_irq_status)(struct airoha_npu *npu, int q); + void (*wlan_enable_irq)(struct airoha_npu *npu, int q); + void (*wlan_disable_irq)(struct airoha_npu *npu, int q); + } ops; +#endif +}; + +#if (IS_BUILTIN(CONFIG_NET_AIROHA_NPU) || IS_MODULE(CONFIG_NET_AIROHA_NPU)) +struct airoha_npu *airoha_npu_get(struct device *dev, dma_addr_t *stats_addr); +void airoha_npu_put(struct airoha_npu *npu); + +static inline int airoha_npu_wlan_init_reserved_memory(struct airoha_npu *npu) +{ + return npu->ops.wlan_init_reserved_memory(npu); +} + +static inline int airoha_npu_wlan_send_msg(struct airoha_npu *npu, + int ifindex, + enum airoha_npu_wlan_set_cmd cmd, + void *data, int data_len, gfp_t gfp) +{ + return npu->ops.wlan_send_msg(npu, ifindex, cmd, data, data_len, gfp); +} + +static inline int airoha_npu_wlan_get_msg(struct airoha_npu *npu, int ifindex, + enum airoha_npu_wlan_get_cmd cmd, + void *data, int data_len, gfp_t gfp) +{ + return npu->ops.wlan_get_msg(npu, ifindex, cmd, data, data_len, gfp); +} + +static inline u32 airoha_npu_wlan_get_queue_addr(struct airoha_npu *npu, + int qid, bool xmit) +{ + return npu->ops.wlan_get_queue_addr(npu, qid, xmit); +} + +static inline void airoha_npu_wlan_set_irq_status(struct airoha_npu *npu, + u32 val) +{ + npu->ops.wlan_set_irq_status(npu, val); +} + +static inline u32 airoha_npu_wlan_get_irq_status(struct airoha_npu *npu, int q) +{ + return npu->ops.wlan_get_irq_status(npu, q); +} + +static inline void airoha_npu_wlan_enable_irq(struct airoha_npu *npu, int q) +{ + npu->ops.wlan_enable_irq(npu, q); +} + +static inline void airoha_npu_wlan_disable_irq(struct airoha_npu *npu, int q) +{ + npu->ops.wlan_disable_irq(npu, q); +} +#else +static inline struct airoha_npu *airoha_npu_get(struct device *dev, + dma_addr_t *foe_stats_addr) +{ + return NULL; +} + +static inline void airoha_npu_put(struct airoha_npu *npu) +{ +} + +static inline int airoha_npu_wlan_init_reserved_memory(struct airoha_npu *npu) +{ + return -EOPNOTSUPP; +} + +static inline int airoha_npu_wlan_send_msg(struct airoha_npu *npu, + int ifindex, + enum airoha_npu_wlan_set_cmd cmd, + void *data, int data_len, gfp_t gfp) +{ + return -EOPNOTSUPP; +} + +static inline int airoha_npu_wlan_get_msg(struct airoha_npu *npu, int ifindex, + enum airoha_npu_wlan_get_cmd cmd, + void *data, int data_len, gfp_t gfp) +{ + return -EOPNOTSUPP; +} + +static inline u32 airoha_npu_wlan_get_queue_addr(struct airoha_npu *npu, + int qid, bool xmit) +{ + return 0; +} + +static inline void airoha_npu_wlan_set_irq_status(struct airoha_npu *npu, + u32 val) +{ +} + +static inline u32 airoha_npu_wlan_get_irq_status(struct airoha_npu *npu, + int q) +{ + return 0; +} + +static inline void airoha_npu_wlan_enable_irq(struct airoha_npu *npu, int q) +{ +} + +static inline void airoha_npu_wlan_disable_irq(struct airoha_npu *npu, int q) +{ +} +#endif + +#endif /* AIROHA_OFFLOAD_H */ -- cgit v1.2.3 From 618882c92681de18e9bd99d2a88bb21c897283f3 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 10 Aug 2025 04:29:50 +0300 Subject: media: Wrap file->private_data access with a helper function Accessing file->private_data manually to retrieve the v4l2_fh pointer is error-prone, as the field is a void * and will happily convert implicitly to any pointer type. To avoid direct access to file->private_data, introduce a new inline function that retrieves the v4l2_fh pointer, and use it to replace common access patterns through the kernel. Changes to drivers have been generated with the following coccinelle semantic patch: @@ struct file *filp; identifier fh; @@ - struct v4l2_fh *fh = filp->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(filp); Manual changes have been applied to Documentation/ to update the usage patterns, and to include/media/v4l2-fh.h to add the new function. While at it, fix a typo in the title of v4l2-fh.rst: the file describes the "file handles" API, not "file handlers". No functional change is intended, this only paves the way to remove direct accesses to file->private_data and make V4L2 drivers safer. Other accesses to the field will be addressed separately. Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- Documentation/driver-api/media/v4l2-fh.rst | 16 ++++++---- .../zh_CN/video4linux/v4l2-framework.txt | 2 +- drivers/media/common/videobuf2/videobuf2-v4l2.c | 2 +- drivers/media/pci/cx18/cx18-fileops.c | 2 +- drivers/media/pci/ivtv/ivtv-fileops.c | 2 +- drivers/media/platform/allegro-dvt/allegro-core.c | 2 +- .../media/platform/mediatek/jpeg/mtk_jpeg_core.c | 2 +- drivers/media/platform/nvidia/tegra-vde/v4l2.c | 2 +- drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c | 4 +-- drivers/media/platform/renesas/vsp1/vsp1_histo.c | 6 ++-- drivers/media/platform/renesas/vsp1/vsp1_video.c | 12 ++++---- drivers/media/platform/ti/omap3isp/ispvideo.c | 2 +- drivers/media/platform/xilinx/xilinx-dma.c | 10 +++---- drivers/media/usb/uvc/uvc_metadata.c | 10 +++---- drivers/media/v4l2-core/v4l2-ctrls-api.c | 4 +-- drivers/media/v4l2-core/v4l2-fh.c | 2 +- drivers/media/v4l2-core/v4l2-mem2mem.c | 34 +++++++++++----------- drivers/media/v4l2-core/v4l2-subdev.c | 8 ++--- include/media/v4l2-fh.h | 14 +++++++++ 19 files changed, 77 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/media/v4l2-fh.rst b/Documentation/driver-api/media/v4l2-fh.rst index 3eeaa8da0c9e..2c87b74578d9 100644 --- a/Documentation/driver-api/media/v4l2-fh.rst +++ b/Documentation/driver-api/media/v4l2-fh.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 -V4L2 File handlers ------------------- +V4L2 File handles +----------------- struct v4l2_fh provides a way to easily keep file handle specific data that is used by the V4L2 framework. @@ -18,7 +18,9 @@ This bit is set whenever :c:func:`v4l2_fh_init` is called. struct v4l2_fh is allocated as a part of the driver's own file handle structure and ``file->private_data`` is set to it in the driver's ``open()`` -function by the driver. +function by the driver. The :c:type:`v4l2_fh` file handle can be retrieved +from the :c:type:`file` using :c:func:`file_to_v4l2_fh`. Drivers must not +access ``file->private_data`` directly. In many cases the struct v4l2_fh will be embedded in a larger structure. In that case you should call: @@ -63,7 +65,7 @@ Example: int my_release(struct file *file) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct my_fh *my_fh = container_of(fh, struct my_fh, fh); ... @@ -78,11 +80,9 @@ Below is a short description of the :c:type:`v4l2_fh` functions used: :c:func:`v4l2_fh_init ` (:c:type:`fh `, :c:type:`vdev `) - - Initialise the file handle. This **MUST** be performed in the driver's :c:type:`v4l2_file_operations`->open() handler. - :c:func:`v4l2_fh_add ` (:c:type:`fh `) @@ -101,6 +101,10 @@ Below is a short description of the :c:type:`v4l2_fh` functions used: - Uninitialise the file handle. After uninitialisation the :c:type:`v4l2_fh` memory can be freed. +:c:func:`file_to_v4l2_fh ` +(struct file \*filp) + +- Retrieve the :c:type:`v4l2_fh` instance associated with a :c:type:`file`. If struct v4l2_fh is not embedded, then you can use these helper functions: diff --git a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt index 9cc97ec75d7a..a9eb62fa1531 100644 --- a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt +++ b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt @@ -819,7 +819,7 @@ int my_open(struct file *file) int my_release(struct file *file) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct my_fh *my_fh = container_of(fh, struct my_fh, fh); ... diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index 1cd26faee503..f29307e59be5 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -979,7 +979,7 @@ __poll_t vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait) res = vb2_core_poll(q, file, wait); if (test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags)) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); poll_wait(file, &fh->wait, wait); if (v4l2_event_pending(fh)) diff --git a/drivers/media/pci/cx18/cx18-fileops.c b/drivers/media/pci/cx18/cx18-fileops.c index cefa91b37f89..af25628b11ba 100644 --- a/drivers/media/pci/cx18/cx18-fileops.c +++ b/drivers/media/pci/cx18/cx18-fileops.c @@ -678,7 +678,7 @@ void cx18_stop_capture(struct cx18_stream *s, int gop_end) int cx18_v4l2_close(struct file *filp) { - struct v4l2_fh *fh = filp->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(filp); struct cx18_open_id *id = fh2id(fh); struct cx18 *cx = id->cx; struct cx18_stream *s = &cx->streams[id->type]; diff --git a/drivers/media/pci/ivtv/ivtv-fileops.c b/drivers/media/pci/ivtv/ivtv-fileops.c index cfa28d035586..230d498108b5 100644 --- a/drivers/media/pci/ivtv/ivtv-fileops.c +++ b/drivers/media/pci/ivtv/ivtv-fileops.c @@ -877,7 +877,7 @@ static void ivtv_stop_decoding(struct ivtv_open_id *id, int flags, u64 pts) int ivtv_v4l2_close(struct file *filp) { - struct v4l2_fh *fh = filp->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(filp); struct ivtv_open_id *id = fh2id(fh); struct ivtv *itv = id->itv; struct ivtv_stream *s = &itv->streams[id->type]; diff --git a/drivers/media/platform/allegro-dvt/allegro-core.c b/drivers/media/platform/allegro-dvt/allegro-core.c index eb03df0d8652..1f134e08923a 100644 --- a/drivers/media/platform/allegro-dvt/allegro-core.c +++ b/drivers/media/platform/allegro-dvt/allegro-core.c @@ -3483,7 +3483,7 @@ static int allegro_enum_framesizes(struct file *file, void *fh, static int allegro_ioctl_streamon(struct file *file, void *priv, enum v4l2_buf_type type) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct allegro_channel *channel = fh_to_channel(fh); int err; diff --git a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c index 7eb12449b63a..329e5787c2c2 100644 --- a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c +++ b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c @@ -588,7 +588,7 @@ static int mtk_jpeg_enc_s_selection(struct file *file, void *priv, static int mtk_jpeg_qbuf(struct file *file, void *priv, struct v4l2_buffer *buf) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct mtk_jpeg_ctx *ctx = mtk_jpeg_fh_to_ctx(priv); struct vb2_queue *vq; struct vb2_buffer *vb; diff --git a/drivers/media/platform/nvidia/tegra-vde/v4l2.c b/drivers/media/platform/nvidia/tegra-vde/v4l2.c index e3726cab0c82..531a85e3fe49 100644 --- a/drivers/media/platform/nvidia/tegra-vde/v4l2.c +++ b/drivers/media/platform/nvidia/tegra-vde/v4l2.c @@ -853,7 +853,7 @@ free_ctx: static int tegra_release(struct file *file) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct tegra_ctx *ctx = fh_to_tegra_ctx(fh); struct tegra_vde *vde = ctx->vde; diff --git a/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c b/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c index 8681dd193033..1b2148578cb6 100644 --- a/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c +++ b/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c @@ -1604,7 +1604,7 @@ end: static int mxc_jpeg_decoder_cmd(struct file *file, void *priv, struct v4l2_decoder_cmd *cmd) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct mxc_jpeg_ctx *ctx = mxc_jpeg_fh_to_ctx(fh); unsigned long flags; int ret; @@ -1637,7 +1637,7 @@ static int mxc_jpeg_decoder_cmd(struct file *file, void *priv, static int mxc_jpeg_encoder_cmd(struct file *file, void *priv, struct v4l2_encoder_cmd *cmd) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct mxc_jpeg_ctx *ctx = mxc_jpeg_fh_to_ctx(fh); unsigned long flags; int ret; diff --git a/drivers/media/platform/renesas/vsp1/vsp1_histo.c b/drivers/media/platform/renesas/vsp1/vsp1_histo.c index c762202877ba..390ea50f1595 100644 --- a/drivers/media/platform/renesas/vsp1/vsp1_histo.c +++ b/drivers/media/platform/renesas/vsp1/vsp1_histo.c @@ -392,7 +392,7 @@ static const struct v4l2_subdev_ops histo_ops = { static int histo_v4l2_querycap(struct file *file, void *fh, struct v4l2_capability *cap) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct vsp1_histogram *histo = vdev_to_histo(vfh->vdev); cap->capabilities = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_STREAMING @@ -409,7 +409,7 @@ static int histo_v4l2_querycap(struct file *file, void *fh, static int histo_v4l2_enum_format(struct file *file, void *fh, struct v4l2_fmtdesc *f) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct vsp1_histogram *histo = vdev_to_histo(vfh->vdev); if (f->index > 0 || f->type != histo->queue.type) @@ -423,7 +423,7 @@ static int histo_v4l2_enum_format(struct file *file, void *fh, static int histo_v4l2_get_format(struct file *file, void *fh, struct v4l2_format *format) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct vsp1_histogram *histo = vdev_to_histo(vfh->vdev); struct v4l2_meta_format *meta = &format->fmt.meta; diff --git a/drivers/media/platform/renesas/vsp1/vsp1_video.c b/drivers/media/platform/renesas/vsp1/vsp1_video.c index bc66fbdde3cc..656fb5e6cb30 100644 --- a/drivers/media/platform/renesas/vsp1/vsp1_video.c +++ b/drivers/media/platform/renesas/vsp1/vsp1_video.c @@ -896,7 +896,7 @@ static const struct vb2_ops vsp1_video_queue_qops = { static int vsp1_video_querycap(struct file *file, void *fh, struct v4l2_capability *cap) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct vsp1_video *video = to_vsp1_video(vfh->vdev); cap->capabilities = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_STREAMING @@ -912,7 +912,7 @@ vsp1_video_querycap(struct file *file, void *fh, struct v4l2_capability *cap) static int vsp1_video_enum_format(struct file *file, void *fh, struct v4l2_fmtdesc *f) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct vsp1_video *video = to_vsp1_video(vfh->vdev); const struct vsp1_format_info *info; @@ -933,7 +933,7 @@ static int vsp1_video_enum_format(struct file *file, void *fh, static int vsp1_video_get_format(struct file *file, void *fh, struct v4l2_format *format) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct vsp1_video *video = to_vsp1_video(vfh->vdev); if (format->type != video->queue.type) @@ -949,7 +949,7 @@ vsp1_video_get_format(struct file *file, void *fh, struct v4l2_format *format) static int vsp1_video_try_format(struct file *file, void *fh, struct v4l2_format *format) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct vsp1_video *video = to_vsp1_video(vfh->vdev); if (format->type != video->queue.type) @@ -961,7 +961,7 @@ vsp1_video_try_format(struct file *file, void *fh, struct v4l2_format *format) static int vsp1_video_set_format(struct file *file, void *fh, struct v4l2_format *format) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct vsp1_video *video = to_vsp1_video(vfh->vdev); const struct vsp1_format_info *info; int ret; @@ -991,7 +991,7 @@ done: static int vsp1_video_streamon(struct file *file, void *fh, enum v4l2_buf_type type) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct vsp1_video *video = to_vsp1_video(vfh->vdev); struct media_device *mdev = &video->vsp1->media_dev; struct vsp1_pipeline *pipe; diff --git a/drivers/media/platform/ti/omap3isp/ispvideo.c b/drivers/media/platform/ti/omap3isp/ispvideo.c index 78e30298c7ad..a777135c6a6c 100644 --- a/drivers/media/platform/ti/omap3isp/ispvideo.c +++ b/drivers/media/platform/ti/omap3isp/ispvideo.c @@ -1348,7 +1348,7 @@ done: static int isp_video_release(struct file *file) { struct isp_video *video = video_drvdata(file); - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct isp_video_fh *handle = to_isp_video_fh(vfh); /* Disable streaming and free the buffers queue resources. */ diff --git a/drivers/media/platform/xilinx/xilinx-dma.c b/drivers/media/platform/xilinx/xilinx-dma.c index 18bfa6001909..fcfe0883aba5 100644 --- a/drivers/media/platform/xilinx/xilinx-dma.c +++ b/drivers/media/platform/xilinx/xilinx-dma.c @@ -469,7 +469,7 @@ static const struct vb2_ops xvip_dma_queue_qops = { static int xvip_dma_querycap(struct file *file, void *fh, struct v4l2_capability *cap) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct xvip_dma *dma = to_xvip_dma(vfh->vdev); cap->capabilities = dma->xdev->v4l2_caps | V4L2_CAP_STREAMING | @@ -491,7 +491,7 @@ xvip_dma_querycap(struct file *file, void *fh, struct v4l2_capability *cap) static int xvip_dma_enum_format(struct file *file, void *fh, struct v4l2_fmtdesc *f) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct xvip_dma *dma = to_xvip_dma(vfh->vdev); if (f->index > 0) @@ -505,7 +505,7 @@ xvip_dma_enum_format(struct file *file, void *fh, struct v4l2_fmtdesc *f) static int xvip_dma_get_format(struct file *file, void *fh, struct v4l2_format *format) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct xvip_dma *dma = to_xvip_dma(vfh->vdev); format->fmt.pix = dma->format; @@ -565,7 +565,7 @@ __xvip_dma_try_format(struct xvip_dma *dma, struct v4l2_pix_format *pix, static int xvip_dma_try_format(struct file *file, void *fh, struct v4l2_format *format) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct xvip_dma *dma = to_xvip_dma(vfh->vdev); __xvip_dma_try_format(dma, &format->fmt.pix, NULL); @@ -575,7 +575,7 @@ xvip_dma_try_format(struct file *file, void *fh, struct v4l2_format *format) static int xvip_dma_set_format(struct file *file, void *fh, struct v4l2_format *format) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct xvip_dma *dma = to_xvip_dma(vfh->vdev); const struct xvip_video_format *info; diff --git a/drivers/media/usb/uvc/uvc_metadata.c b/drivers/media/usb/uvc/uvc_metadata.c index 229e08ff323e..4cbf6ce314fd 100644 --- a/drivers/media/usb/uvc/uvc_metadata.c +++ b/drivers/media/usb/uvc/uvc_metadata.c @@ -26,7 +26,7 @@ static int uvc_meta_v4l2_querycap(struct file *file, void *fh, struct v4l2_capability *cap) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct uvc_streaming *stream = video_get_drvdata(vfh->vdev); struct uvc_video_chain *chain = stream->chain; @@ -42,7 +42,7 @@ static int uvc_meta_v4l2_querycap(struct file *file, void *fh, static int uvc_meta_v4l2_get_format(struct file *file, void *fh, struct v4l2_format *format) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct uvc_streaming *stream = video_get_drvdata(vfh->vdev); struct v4l2_meta_format *fmt = &format->fmt.meta; @@ -60,7 +60,7 @@ static int uvc_meta_v4l2_get_format(struct file *file, void *fh, static int uvc_meta_v4l2_try_format(struct file *file, void *fh, struct v4l2_format *format) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct uvc_streaming *stream = video_get_drvdata(vfh->vdev); struct uvc_device *dev = stream->dev; struct v4l2_meta_format *fmt = &format->fmt.meta; @@ -86,7 +86,7 @@ static int uvc_meta_v4l2_try_format(struct file *file, void *fh, static int uvc_meta_v4l2_set_format(struct file *file, void *fh, struct v4l2_format *format) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct uvc_streaming *stream = video_get_drvdata(vfh->vdev); struct v4l2_meta_format *fmt = &format->fmt.meta; int ret; @@ -115,7 +115,7 @@ static int uvc_meta_v4l2_set_format(struct file *file, void *fh, static int uvc_meta_v4l2_enum_formats(struct file *file, void *fh, struct v4l2_fmtdesc *fdesc) { - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct uvc_streaming *stream = video_get_drvdata(vfh->vdev); struct uvc_device *dev = stream->dev; u32 i = fdesc->index; diff --git a/drivers/media/v4l2-core/v4l2-ctrls-api.c b/drivers/media/v4l2-core/v4l2-ctrls-api.c index d49a68b36c28..d46b2c8f3d23 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-api.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-api.c @@ -1253,7 +1253,7 @@ EXPORT_SYMBOL(v4l2_querymenu); int v4l2_ctrl_log_status(struct file *file, void *fh) { struct video_device *vfd = video_devdata(file); - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); if (test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags) && vfd->v4l2_dev) v4l2_ctrl_handler_log_status(vfh->ctrl_handler, @@ -1348,7 +1348,7 @@ EXPORT_SYMBOL(v4l2_ctrl_subdev_subscribe_event); */ __poll_t v4l2_ctrl_poll(struct file *file, struct poll_table_struct *wait) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); poll_wait(file, &fh->wait, wait); if (v4l2_event_pending(fh)) diff --git a/drivers/media/v4l2-core/v4l2-fh.c b/drivers/media/v4l2-core/v4l2-fh.c index 90eec79ee995..7a5f7aa5e253 100644 --- a/drivers/media/v4l2-core/v4l2-fh.c +++ b/drivers/media/v4l2-core/v4l2-fh.c @@ -90,7 +90,7 @@ EXPORT_SYMBOL_GPL(v4l2_fh_exit); int v4l2_fh_release(struct file *filp) { - struct v4l2_fh *fh = filp->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(filp); if (fh) { v4l2_fh_del(fh); diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c index eb22d6172462..e67e67f76f72 100644 --- a/drivers/media/v4l2-core/v4l2-mem2mem.c +++ b/drivers/media/v4l2-core/v4l2-mem2mem.c @@ -971,7 +971,7 @@ __poll_t v4l2_m2m_poll(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, rc = v4l2_m2m_poll_for_data(file, m2m_ctx, wait); if (test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags)) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); poll_wait(file, &fh->wait, wait); if (v4l2_event_pending(fh)) @@ -1004,7 +1004,7 @@ unsigned long v4l2_m2m_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); unsigned long offset = pgoff << PAGE_SHIFT; struct vb2_queue *vq; @@ -1371,7 +1371,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_request_queue); int v4l2_m2m_ioctl_reqbufs(struct file *file, void *priv, struct v4l2_requestbuffers *rb) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_reqbufs(file, fh->m2m_ctx, rb); } @@ -1380,7 +1380,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_reqbufs); int v4l2_m2m_ioctl_create_bufs(struct file *file, void *priv, struct v4l2_create_buffers *create) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_create_bufs(file, fh->m2m_ctx, create); } @@ -1389,7 +1389,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_create_bufs); int v4l2_m2m_ioctl_remove_bufs(struct file *file, void *priv, struct v4l2_remove_buffers *remove) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct vb2_queue *q = v4l2_m2m_get_vq(fh->m2m_ctx, remove->type); if (!q) @@ -1404,7 +1404,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_remove_bufs); int v4l2_m2m_ioctl_querybuf(struct file *file, void *priv, struct v4l2_buffer *buf) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_querybuf(file, fh->m2m_ctx, buf); } @@ -1413,7 +1413,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_querybuf); int v4l2_m2m_ioctl_qbuf(struct file *file, void *priv, struct v4l2_buffer *buf) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_qbuf(file, fh->m2m_ctx, buf); } @@ -1422,7 +1422,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_qbuf); int v4l2_m2m_ioctl_dqbuf(struct file *file, void *priv, struct v4l2_buffer *buf) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_dqbuf(file, fh->m2m_ctx, buf); } @@ -1431,7 +1431,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_dqbuf); int v4l2_m2m_ioctl_prepare_buf(struct file *file, void *priv, struct v4l2_buffer *buf) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_prepare_buf(file, fh->m2m_ctx, buf); } @@ -1440,7 +1440,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_prepare_buf); int v4l2_m2m_ioctl_expbuf(struct file *file, void *priv, struct v4l2_exportbuffer *eb) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_expbuf(file, fh->m2m_ctx, eb); } @@ -1449,7 +1449,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_expbuf); int v4l2_m2m_ioctl_streamon(struct file *file, void *priv, enum v4l2_buf_type type) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_streamon(file, fh->m2m_ctx, type); } @@ -1458,7 +1458,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_streamon); int v4l2_m2m_ioctl_streamoff(struct file *file, void *priv, enum v4l2_buf_type type) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_streamoff(file, fh->m2m_ctx, type); } @@ -1542,7 +1542,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_decoder_cmd); int v4l2_m2m_ioctl_encoder_cmd(struct file *file, void *priv, struct v4l2_encoder_cmd *ec) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_encoder_cmd(file, fh->m2m_ctx, ec); } @@ -1551,7 +1551,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_encoder_cmd); int v4l2_m2m_ioctl_decoder_cmd(struct file *file, void *priv, struct v4l2_decoder_cmd *dc) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_decoder_cmd(file, fh->m2m_ctx, dc); } @@ -1572,7 +1572,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_stateless_try_decoder_cmd); int v4l2_m2m_ioctl_stateless_decoder_cmd(struct file *file, void *priv, struct v4l2_decoder_cmd *dc) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct vb2_v4l2_buffer *out_vb, *cap_vb; struct v4l2_m2m_dev *m2m_dev = fh->m2m_ctx->m2m_dev; unsigned long flags; @@ -1617,7 +1617,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_stateless_decoder_cmd); int v4l2_m2m_fop_mmap(struct file *file, struct vm_area_struct *vma) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); return v4l2_m2m_mmap(file, fh->m2m_ctx, vma); } @@ -1625,7 +1625,7 @@ EXPORT_SYMBOL_GPL(v4l2_m2m_fop_mmap); __poll_t v4l2_m2m_fop_poll(struct file *file, poll_table *wait) { - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct v4l2_m2m_ctx *m2m_ctx = fh->m2m_ctx; __poll_t ret; diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c index 4fd25fea3b58..29d3b788b288 100644 --- a/drivers/media/v4l2-core/v4l2-subdev.c +++ b/drivers/media/v4l2-core/v4l2-subdev.c @@ -122,7 +122,7 @@ static int subdev_close(struct file *file) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct v4l2_subdev_fh *subdev_fh = to_v4l2_subdev_fh(vfh); if (sd->internal_ops && sd->internal_ops->close) @@ -612,7 +612,7 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg, { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct v4l2_subdev_fh *subdev_fh = to_v4l2_subdev_fh(vfh); bool ro_subdev = test_bit(V4L2_FL_SUBDEV_RO_DEVNODE, &vdev->flags); bool streams_subdev = sd->flags & V4L2_SUBDEV_FL_STREAMS; @@ -1135,7 +1135,7 @@ static long subdev_do_ioctl_lock(struct file *file, unsigned int cmd, void *arg) if (video_is_registered(vdev)) { struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); - struct v4l2_fh *vfh = file->private_data; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct v4l2_subdev_fh *subdev_fh = to_v4l2_subdev_fh(vfh); struct v4l2_subdev_state *state; @@ -1192,7 +1192,7 @@ static __poll_t subdev_poll(struct file *file, poll_table *wait) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); - struct v4l2_fh *fh = file->private_data; + struct v4l2_fh *fh = file_to_v4l2_fh(file); if (!(sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS)) return EPOLLERR; diff --git a/include/media/v4l2-fh.h b/include/media/v4l2-fh.h index b5b3e00c8e6a..823fa8ebeb8f 100644 --- a/include/media/v4l2-fh.h +++ b/include/media/v4l2-fh.h @@ -56,6 +56,20 @@ struct v4l2_fh { struct v4l2_m2m_ctx *m2m_ctx; }; +/** + * file_to_v4l2_fh - Return the v4l2_fh associated with a struct file + * + * @filp: pointer to &struct file + * + * This function should be used by drivers to retrieve the &struct v4l2_fh + * instance pointer stored in the file private_data instead of accessing the + * private_data field directly. + */ +static inline struct v4l2_fh *file_to_v4l2_fh(struct file *filp) +{ + return filp->private_data; +} + /** * v4l2_fh_init - Initialise the file handle. * -- cgit v1.2.3 From bbe4debfaa6a16f11064d5c40ef6d468dad4398d Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 10 Aug 2025 04:29:58 +0300 Subject: media: v4l2-fh: Move piece of documentation to correct function The paragraph in the v4l2_fh_del() documentation that indicates the function sets filp->private_data was added in the wrong place. It is meant for v4l2_fh_open(). Move it to where it belongs. Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- include/media/v4l2-fh.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/media/v4l2-fh.h b/include/media/v4l2-fh.h index 823fa8ebeb8f..14e7136e693f 100644 --- a/include/media/v4l2-fh.h +++ b/include/media/v4l2-fh.h @@ -101,6 +101,9 @@ void v4l2_fh_add(struct v4l2_fh *fh); * * It allocates a v4l2_fh and inits and adds it to the &struct video_device * associated with the file pointer. + * + * On error filp->private_data will be %NULL, otherwise it will point to + * the &struct v4l2_fh. */ int v4l2_fh_open(struct file *filp); @@ -109,9 +112,6 @@ int v4l2_fh_open(struct file *filp); * * @fh: pointer to &struct v4l2_fh * - * On error filp->private_data will be %NULL, otherwise it will point to - * the &struct v4l2_fh. - * * .. note:: * Must be called in v4l2_file_operations->release\(\) handler if the driver * uses &struct v4l2_fh. -- cgit v1.2.3 From 47f4b1acb4d505b1e7e81d8e0ebce774422b8c2e Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 10 Aug 2025 04:30:08 +0300 Subject: media: Set file->private_data in v4l2_fh_add() All the drivers that use v4l2_fh and call v4l2_fh_add() manually store a pointer to the v4l2_fh instance in file->private_data in their video device .open() file operation handler. Move the code to the v4l2_fh_add() function to avoid direct access to file->private_data in drivers. This requires adding a file pointer argument to the function. Changes to drivers have been generated with the following coccinelle semantic patch: @@ expression fh; identifier filp; identifier open; type ret; @@ ret open(..., struct file *filp, ...) { <... - filp->private_data = fh; ... - v4l2_fh_add(fh); + v4l2_fh_add(fh, filp); ...> } @@ expression fh; identifier filp; identifier open; type ret; @@ ret open(..., struct file *filp, ...) { <... - v4l2_fh_add(fh); + v4l2_fh_add(fh, filp); ... - filp->private_data = fh; ...> } Manual changes have been applied to Documentation/ to update the usage patterns, to drivers/media/v4l2-core/v4l2-fh.c to update the v4l2_fh_add() prototype set file->private_data, and to include/media/v4l2-fh.h to update the v4l2_fh_add() function prototype and its documentation. Additionally, white space issues have been fixed manually in drivers/media/platform/nvidia/tegra-vde/v4l2.c, drivers/media/platform/rockchip/rkvdec/rkvdec.c, drivers/media/v4l2-core/v4l2-fh.c and drivers/staging/most/video/video.c. Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- Documentation/driver-api/media/v4l2-fh.rst | 46 +++++++++++----------- .../zh_CN/video4linux/v4l2-framework.txt | 5 +-- drivers/media/pci/cx18/cx18-fileops.c | 3 +- drivers/media/pci/ivtv/ivtv-fileops.c | 4 +- drivers/media/pci/saa7164/saa7164-encoder.c | 3 +- drivers/media/pci/saa7164/saa7164-vbi.c | 3 +- drivers/media/platform/allegro-dvt/allegro-core.c | 3 +- drivers/media/platform/amlogic/meson-ge2d/ge2d.c | 3 +- drivers/media/platform/amphion/vpu_v4l2.c | 3 +- .../media/platform/chips-media/coda/coda-common.c | 3 +- .../platform/chips-media/wave5/wave5-vpu-dec.c | 3 +- .../platform/chips-media/wave5/wave5-vpu-enc.c | 3 +- .../media/platform/imagination/e5010-jpeg-enc.c | 3 +- drivers/media/platform/m2m-deinterlace.c | 3 +- .../media/platform/mediatek/jpeg/mtk_jpeg_core.c | 3 +- drivers/media/platform/mediatek/mdp/mtk_mdp_m2m.c | 3 +- .../media/platform/mediatek/mdp3/mtk-mdp3-m2m.c | 3 +- .../mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c | 3 +- .../mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c | 3 +- drivers/media/platform/nvidia/tegra-vde/v4l2.c | 3 +- drivers/media/platform/nxp/dw100/dw100.c | 3 +- drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c | 3 +- drivers/media/platform/nxp/imx-pxp.c | 3 +- drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c | 3 +- drivers/media/platform/nxp/mx2_emmaprp.c | 3 +- drivers/media/platform/qcom/iris/iris_vidc.c | 3 +- drivers/media/platform/qcom/venus/vdec.c | 3 +- drivers/media/platform/qcom/venus/venc.c | 3 +- drivers/media/platform/renesas/rcar_fdp1.c | 3 +- drivers/media/platform/renesas/rcar_jpu.c | 3 +- drivers/media/platform/renesas/vsp1/vsp1_video.c | 4 +- drivers/media/platform/rockchip/rga/rga.c | 3 +- drivers/media/platform/rockchip/rkvdec/rkvdec.c | 3 +- .../media/platform/samsung/exynos-gsc/gsc-m2m.c | 3 +- .../media/platform/samsung/exynos4-is/fimc-m2m.c | 3 +- drivers/media/platform/samsung/s5p-g2d/g2d.c | 3 +- .../media/platform/samsung/s5p-jpeg/jpeg-core.c | 3 +- drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c | 3 +- drivers/media/platform/st/sti/bdisp/bdisp-v4l2.c | 3 +- drivers/media/platform/st/sti/delta/delta-v4l2.c | 3 +- drivers/media/platform/st/sti/hva/hva-v4l2.c | 3 +- drivers/media/platform/st/stm32/dma2d/dma2d.c | 3 +- drivers/media/platform/sunxi/sun8i-di/sun8i-di.c | 3 +- .../platform/sunxi/sun8i-rotate/sun8i_rotate.c | 3 +- drivers/media/platform/ti/omap3isp/ispvideo.c | 3 +- drivers/media/platform/ti/vpe/vpe.c | 3 +- drivers/media/platform/verisilicon/hantro_drv.c | 3 +- drivers/media/test-drivers/vicodec/vicodec-core.c | 3 +- drivers/media/test-drivers/vim2m.c | 3 +- drivers/media/test-drivers/visl/visl-core.c | 3 +- drivers/media/usb/hdpvr/hdpvr-video.c | 3 +- drivers/media/usb/pvrusb2/pvrusb2-v4l2.c | 3 +- drivers/media/usb/uvc/uvc_v4l2.c | 3 +- drivers/media/v4l2-core/v4l2-fh.c | 7 ++-- drivers/media/v4l2-core/v4l2-subdev.c | 3 +- drivers/staging/media/imx/imx-media-csc-scaler.c | 3 +- drivers/staging/media/meson/vdec/vdec.c | 3 +- drivers/staging/media/sunxi/cedrus/cedrus.c | 3 +- drivers/staging/most/video/video.c | 4 +- drivers/usb/gadget/function/uvc_v4l2.c | 3 +- include/media/v4l2-fh.h | 5 ++- 61 files changed, 91 insertions(+), 146 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/media/v4l2-fh.rst b/Documentation/driver-api/media/v4l2-fh.rst index 2c87b74578d9..a7393067f5db 100644 --- a/Documentation/driver-api/media/v4l2-fh.rst +++ b/Documentation/driver-api/media/v4l2-fh.rst @@ -11,25 +11,22 @@ data that is used by the V4L2 framework. since it is also used to implement priority handling (:ref:`VIDIOC_G_PRIORITY`). -The users of :c:type:`v4l2_fh` (in the V4L2 framework, not the driver) know -whether a driver uses :c:type:`v4l2_fh` as its ``file->private_data`` pointer -by testing the ``V4L2_FL_USES_V4L2_FH`` bit in :c:type:`video_device`->flags. -This bit is set whenever :c:func:`v4l2_fh_init` is called. - -struct v4l2_fh is allocated as a part of the driver's own file handle -structure and ``file->private_data`` is set to it in the driver's ``open()`` -function by the driver. The :c:type:`v4l2_fh` file handle can be retrieved -from the :c:type:`file` using :c:func:`file_to_v4l2_fh`. Drivers must not -access ``file->private_data`` directly. - -In many cases the struct v4l2_fh will be embedded in a larger -structure. In that case you should call: - -#) :c:func:`v4l2_fh_init` and :c:func:`v4l2_fh_add` in ``open()`` -#) :c:func:`v4l2_fh_del` and :c:func:`v4l2_fh_exit` in ``release()`` - -Drivers can extract their own file handle structure by using the container_of -macro. +struct v4l2_fh is allocated in the driver's ``open()`` file operation handler. +It is typically embedded in a larger driver-specific structure. The +:c:type:`v4l2_fh` must be initialized with a call to :c:func:`v4l2_fh_init`, +and added to the video device with :c:func:`v4l2_fh_add`. This associates the +:c:type:`v4l2_fh` with the :c:type:`file` by setting ``file->private_data`` to +point to the :c:type:`v4l2_fh`. + +Similarly, the struct v4l2_fh is freed in the driver's ``release()`` file +operation handler. It must be removed from the video device with +:c:func:`v4l2_fh_del` and cleaned up with :c:func:`v4l2_fh_exit` before being +freed. + +Drivers must not access ``file->private_data`` directly. They can retrieve the +:c:type:`v4l2_fh` associated with a :c:type:`file` by calling +:c:func:`file_to_v4l2_fh`. Drivers can extract their own file handle structure +by using the container_of macro. Example: @@ -58,8 +55,7 @@ Example: ... - file->private_data = &my_fh->fh; - v4l2_fh_add(&my_fh->fh); + v4l2_fh_add(&my_fh->fh, file); return 0; } @@ -84,7 +80,7 @@ Below is a short description of the :c:type:`v4l2_fh` functions used: :c:type:`v4l2_file_operations`->open() handler. :c:func:`v4l2_fh_add ` -(:c:type:`fh `) +(:c:type:`fh `, struct file \*filp) - Add a :c:type:`v4l2_fh` to :c:type:`video_device` file handle list. Must be called once the file handle is completely initialized. @@ -138,6 +134,12 @@ associated device node: - Same, but it calls v4l2_fh_is_singular with filp->private_data. +.. note:: + The V4L2 framework knows whether a driver uses :c:type:`v4l2_fh` as its + ``file->private_data`` pointer by testing the ``V4L2_FL_USES_V4L2_FH`` + bit in :c:type:`video_device`->flags. This bit is set whenever + :c:func:`v4l2_fh_init` is called. + V4L2 fh functions and data structures ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt index a9eb62fa1531..2d38ae17d940 100644 --- a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt +++ b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt @@ -812,8 +812,7 @@ int my_open(struct file *file) ... - file->private_data = &my_fh->fh; - v4l2_fh_add(&my_fh->fh); + v4l2_fh_add(&my_fh->fh, file); return 0; } @@ -836,7 +835,7 @@ void v4l2_fh_init(struct v4l2_fh *fh, struct video_device *vdev) 初始化文件句柄。这*必须*在驱动的 v4l2_file_operations->open() 函数中执行。 -void v4l2_fh_add(struct v4l2_fh *fh) +void v4l2_fh_add(struct v4l2_fh *fh, struct file *filp) 添加一个 v4l2_fh 到 video_device 文件句柄列表。一旦文件句柄 初始化完成就必须调用。 diff --git a/drivers/media/pci/cx18/cx18-fileops.c b/drivers/media/pci/cx18/cx18-fileops.c index 89e38b303630..f90b547f5d67 100644 --- a/drivers/media/pci/cx18/cx18-fileops.c +++ b/drivers/media/pci/cx18/cx18-fileops.c @@ -743,8 +743,7 @@ static int cx18_serialized_open(struct cx18_stream *s, struct file *filp) item->type = s->type; item->open_id = cx->open_id++; - filp->private_data = &item->fh; - v4l2_fh_add(&item->fh); + v4l2_fh_add(&item->fh, filp); if (item->type == CX18_ENC_STREAM_TYPE_RAD && v4l2_fh_is_singular_file(filp)) { diff --git a/drivers/media/pci/ivtv/ivtv-fileops.c b/drivers/media/pci/ivtv/ivtv-fileops.c index 67964a3c382c..aa5f5f16427c 100644 --- a/drivers/media/pci/ivtv/ivtv-fileops.c +++ b/drivers/media/pci/ivtv/ivtv-fileops.c @@ -998,9 +998,7 @@ static int ivtv_open(struct file *filp) v4l2_fh_init(&item->fh, &s->vdev); item->itv = itv; item->type = s->type; - - filp->private_data = &item->fh; - v4l2_fh_add(&item->fh); + v4l2_fh_add(&item->fh, filp); if (item->type == IVTV_ENC_STREAM_TYPE_RAD && v4l2_fh_is_singular_file(filp)) { diff --git a/drivers/media/pci/saa7164/saa7164-encoder.c b/drivers/media/pci/saa7164/saa7164-encoder.c index 296f50b6b8d3..e6e353a251cf 100644 --- a/drivers/media/pci/saa7164/saa7164-encoder.c +++ b/drivers/media/pci/saa7164/saa7164-encoder.c @@ -725,8 +725,7 @@ static int fops_open(struct file *file) fh->port = port; v4l2_fh_init(&fh->fh, video_devdata(file)); - v4l2_fh_add(&fh->fh); - file->private_data = &fh->fh; + v4l2_fh_add(&fh->fh, file); return 0; } diff --git a/drivers/media/pci/saa7164/saa7164-vbi.c b/drivers/media/pci/saa7164/saa7164-vbi.c index a7e398f30472..181442fcb43b 100644 --- a/drivers/media/pci/saa7164/saa7164-vbi.c +++ b/drivers/media/pci/saa7164/saa7164-vbi.c @@ -428,8 +428,7 @@ static int fops_open(struct file *file) fh->port = port; v4l2_fh_init(&fh->fh, video_devdata(file)); - v4l2_fh_add(&fh->fh); - file->private_data = &fh->fh; + v4l2_fh_add(&fh->fh, file); return 0; } diff --git a/drivers/media/platform/allegro-dvt/allegro-core.c b/drivers/media/platform/allegro-dvt/allegro-core.c index 74977f3ae484..8c30f3cd4fc5 100644 --- a/drivers/media/platform/allegro-dvt/allegro-core.c +++ b/drivers/media/platform/allegro-dvt/allegro-core.c @@ -3219,8 +3219,7 @@ static int allegro_open(struct file *file) } list_add(&channel->list, &dev->channels); - file->private_data = &channel->fh; - v4l2_fh_add(&channel->fh); + v4l2_fh_add(&channel->fh, file); allegro_channel_adjust(channel); diff --git a/drivers/media/platform/amlogic/meson-ge2d/ge2d.c b/drivers/media/platform/amlogic/meson-ge2d/ge2d.c index c7df29a2d820..d36891b546bc 100644 --- a/drivers/media/platform/amlogic/meson-ge2d/ge2d.c +++ b/drivers/media/platform/amlogic/meson-ge2d/ge2d.c @@ -860,8 +860,7 @@ static int ge2d_open(struct file *file) return ret; } v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ge2d_setup_ctrls(ctx); diff --git a/drivers/media/platform/amphion/vpu_v4l2.c b/drivers/media/platform/amphion/vpu_v4l2.c index 57ca6262bb04..e13bfe09af1b 100644 --- a/drivers/media/platform/amphion/vpu_v4l2.c +++ b/drivers/media/platform/amphion/vpu_v4l2.c @@ -760,7 +760,7 @@ int vpu_v4l2_open(struct file *file, struct vpu_inst *inst) inst->min_buffer_cap = 2; inst->min_buffer_out = 2; v4l2_fh_init(&inst->fh, func->vfd); - v4l2_fh_add(&inst->fh); + v4l2_fh_add(&inst->fh, file); ret = call_vop(inst, ctrl_init); if (ret) @@ -774,7 +774,6 @@ int vpu_v4l2_open(struct file *file, struct vpu_inst *inst) } inst->fh.ctrl_handler = &inst->ctrl_handler; - file->private_data = &inst->fh; inst->state = VPU_CODEC_STATE_DEINIT; inst->workqueue = alloc_ordered_workqueue("vpu_inst", WQ_MEM_RECLAIM); if (inst->workqueue) { diff --git a/drivers/media/platform/chips-media/coda/coda-common.c b/drivers/media/platform/chips-media/coda/coda-common.c index 459b59149390..7d874fd502b8 100644 --- a/drivers/media/platform/chips-media/coda/coda-common.c +++ b/drivers/media/platform/chips-media/coda/coda-common.c @@ -2642,8 +2642,7 @@ static int coda_open(struct file *file) if (ctx->ops->seq_end_work) INIT_WORK(&ctx->seq_end_work, ctx->ops->seq_end_work); v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ctx->dev = dev; ctx->idx = idx; diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c b/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c index f3188d720ed3..88eb933a5144 100644 --- a/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c +++ b/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c @@ -1761,8 +1761,7 @@ static int wave5_vpu_open_dec(struct file *filp) return -ENOMEM; v4l2_fh_init(&inst->v4l2_fh, vdev); - filp->private_data = &inst->v4l2_fh; - v4l2_fh_add(&inst->v4l2_fh); + v4l2_fh_add(&inst->v4l2_fh, filp); INIT_LIST_HEAD(&inst->list); diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c b/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c index b69a1206fa12..322c1498758a 100644 --- a/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c +++ b/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c @@ -1587,8 +1587,7 @@ static int wave5_vpu_open_enc(struct file *filp) return -ENOMEM; v4l2_fh_init(&inst->v4l2_fh, vdev); - filp->private_data = &inst->v4l2_fh; - v4l2_fh_add(&inst->v4l2_fh); + v4l2_fh_add(&inst->v4l2_fh, filp); INIT_LIST_HEAD(&inst->list); diff --git a/drivers/media/platform/imagination/e5010-jpeg-enc.c b/drivers/media/platform/imagination/e5010-jpeg-enc.c index 295461325862..1da00ff4b1e3 100644 --- a/drivers/media/platform/imagination/e5010-jpeg-enc.c +++ b/drivers/media/platform/imagination/e5010-jpeg-enc.c @@ -742,8 +742,7 @@ static int e5010_open(struct file *file) } v4l2_fh_init(&ctx->fh, vdev); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ctx->e5010 = e5010; ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(e5010->m2m_dev, ctx, queue_init); diff --git a/drivers/media/platform/m2m-deinterlace.c b/drivers/media/platform/m2m-deinterlace.c index 1812c07837ad..a343dffd19f0 100644 --- a/drivers/media/platform/m2m-deinterlace.c +++ b/drivers/media/platform/m2m-deinterlace.c @@ -847,7 +847,6 @@ static int deinterlace_open(struct file *file) return -ENOMEM; v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->dev = pcdev; ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(pcdev->m2m_dev, ctx, &queue_init); @@ -866,7 +865,7 @@ static int deinterlace_open(struct file *file) } ctx->colorspace = V4L2_COLORSPACE_REC709; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); dprintk(pcdev, "Created instance %p, m2m_ctx: %p\n", ctx, ctx->fh.m2m_ctx); diff --git a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c index 3a7a6eb53d89..5178a1b170fe 100644 --- a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c +++ b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c @@ -1176,8 +1176,7 @@ static int mtk_jpeg_open(struct file *file) INIT_LIST_HEAD(&ctx->dst_done_queue); spin_lock_init(&ctx->done_queue_lock); v4l2_fh_init(&ctx->fh, vfd); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ctx->jpeg = jpeg; ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(jpeg->m2m_dev, ctx, diff --git a/drivers/media/platform/mediatek/mdp/mtk_mdp_m2m.c b/drivers/media/platform/mediatek/mdp/mtk_mdp_m2m.c index 2d894b5bfaa7..7a1a8e51dbca 100644 --- a/drivers/media/platform/mediatek/mdp/mtk_mdp_m2m.c +++ b/drivers/media/platform/mediatek/mdp/mtk_mdp_m2m.c @@ -1070,14 +1070,13 @@ static int mtk_mdp_m2m_open(struct file *file) mutex_init(&ctx->slock); ctx->id = mdp->id_counter++; v4l2_fh_init(&ctx->fh, vfd); - file->private_data = &ctx->fh; ret = mtk_mdp_ctrls_create(ctx); if (ret) goto error_ctrls; /* Use separate control handler per file handle */ ctx->fh.ctrl_handler = &ctx->ctrl_handler; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); INIT_LIST_HEAD(&ctx->list); ctx->mdp_dev = mdp; diff --git a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c index 886ff25c70eb..847d6b310e74 100644 --- a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c +++ b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c @@ -590,14 +590,13 @@ static int mdp_m2m_open(struct file *file) ctx->mdp_dev = mdp; v4l2_fh_init(&ctx->fh, vdev); - file->private_data = &ctx->fh; ret = mdp_m2m_ctrls_create(ctx); if (ret) goto err_exit_fh; /* Use separate control handler per file handle */ ctx->fh.ctrl_handler = &ctx->ctrl_handler; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); mutex_init(&ctx->ctx_lock); ctx->m2m_ctx = v4l2_m2m_ctx_init(mdp->m2m_dev, ctx, mdp_m2m_queue_init); diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c index 18801883c31a..952a77c383bd 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c @@ -206,8 +206,7 @@ static int fops_vcodec_open(struct file *file) mutex_lock(&dev->dev_mutex); ctx->id = dev->id_counter++; v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); INIT_LIST_HEAD(&ctx->list); ctx->dev = dev; if (ctx->dev->vdec_pdata->is_subdev_supported) { diff --git a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c index e26a6c3ffa0c..9cacb6cbcf28 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c +++ b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c @@ -129,8 +129,7 @@ static int fops_vcodec_open(struct file *file) */ ctx->id = dev->id_counter++; v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); INIT_LIST_HEAD(&ctx->list); ctx->dev = dev; init_waitqueue_head(&ctx->queue[0]); diff --git a/drivers/media/platform/nvidia/tegra-vde/v4l2.c b/drivers/media/platform/nvidia/tegra-vde/v4l2.c index 393dc3f07d5c..688b776b3010 100644 --- a/drivers/media/platform/nvidia/tegra-vde/v4l2.c +++ b/drivers/media/platform/nvidia/tegra-vde/v4l2.c @@ -832,8 +832,7 @@ static int tegra_open(struct file *file) goto free_ctrls; } - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); tegra_reset_coded_fmt(ctx); tegra_try_coded_fmt(file, &ctx->fh, &ctx->coded_fmt); diff --git a/drivers/media/platform/nxp/dw100/dw100.c b/drivers/media/platform/nxp/dw100/dw100.c index 2460f09a6813..2bd30910ddf9 100644 --- a/drivers/media/platform/nxp/dw100/dw100.c +++ b/drivers/media/platform/nxp/dw100/dw100.c @@ -607,7 +607,6 @@ static int dw100_open(struct file *file) mutex_init(&ctx->vq_mutex); v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->dw_dev = dw_dev; ctx->q_data[DW100_QUEUE_SRC].fmt = &formats[0]; @@ -651,7 +650,7 @@ static int dw100_open(struct file *file) goto err; } - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); return 0; diff --git a/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c b/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c index 8eef7ebd0428..d7cecc56a9eb 100644 --- a/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c +++ b/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c @@ -2205,8 +2205,7 @@ static int mxc_jpeg_open(struct file *file) } v4l2_fh_init(&ctx->fh, mxc_vfd); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ctx->mxc_jpeg = mxc_jpeg; diff --git a/drivers/media/platform/nxp/imx-pxp.c b/drivers/media/platform/nxp/imx-pxp.c index 879b1803a2b3..9602409f3ece 100644 --- a/drivers/media/platform/nxp/imx-pxp.c +++ b/drivers/media/platform/nxp/imx-pxp.c @@ -1660,7 +1660,6 @@ static int pxp_open(struct file *file) } v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->dev = dev; hdl = &ctx->hdl; v4l2_ctrl_handler_init(hdl, 4); @@ -1699,7 +1698,7 @@ static int pxp_open(struct file *file) goto open_unlock; } - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); atomic_inc(&dev->num_inst); dprintk(dev, "Created instance: %p, m2m_ctx: %p\n", diff --git a/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c b/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c index 6444392c5e62..d6df6e2725f5 100644 --- a/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c +++ b/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c @@ -673,7 +673,6 @@ static int mxc_isi_m2m_open(struct file *file) mutex_init(&ctx->vb2_lock); v4l2_fh_init(&ctx->fh, vdev); - file->private_data = &ctx->fh; ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(m2m->m2m_dev, ctx, &mxc_isi_m2m_queue_init); @@ -694,7 +693,7 @@ static int mxc_isi_m2m_open(struct file *file) if (ret) goto err_ctrls; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); return 0; diff --git a/drivers/media/platform/nxp/mx2_emmaprp.c b/drivers/media/platform/nxp/mx2_emmaprp.c index 5c8c6285ec1e..8c8f834e6250 100644 --- a/drivers/media/platform/nxp/mx2_emmaprp.c +++ b/drivers/media/platform/nxp/mx2_emmaprp.c @@ -730,7 +730,6 @@ static int emmaprp_open(struct file *file) return -ENOMEM; v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->dev = pcdev; if (mutex_lock_interruptible(&pcdev->dev_mutex)) { @@ -752,7 +751,7 @@ static int emmaprp_open(struct file *file) clk_prepare_enable(pcdev->clk_emma_ahb); ctx->q_data[V4L2_M2M_SRC].fmt = &formats[1]; ctx->q_data[V4L2_M2M_DST].fmt = &formats[0]; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); mutex_unlock(&pcdev->dev_mutex); dprintk(pcdev, "Created instance %p, m2m_ctx: %p\n", ctx, ctx->fh.m2m_ctx); diff --git a/drivers/media/platform/qcom/iris/iris_vidc.c b/drivers/media/platform/qcom/iris/iris_vidc.c index 64ebec2ca6b3..cdd34a3b71ff 100644 --- a/drivers/media/platform/qcom/iris/iris_vidc.c +++ b/drivers/media/platform/qcom/iris/iris_vidc.c @@ -25,8 +25,7 @@ static void iris_v4l2_fh_init(struct iris_inst *inst, struct file *filp) { v4l2_fh_init(&inst->fh, inst->core->vdev_dec); inst->fh.ctrl_handler = &inst->ctrl_handler; - v4l2_fh_add(&inst->fh); - filp->private_data = &inst->fh; + v4l2_fh_add(&inst->fh, filp); } static void iris_v4l2_fh_deinit(struct iris_inst *inst, struct file *filp) diff --git a/drivers/media/platform/qcom/venus/vdec.c b/drivers/media/platform/qcom/venus/vdec.c index d10ca6d89f6d..55c27345b7d8 100644 --- a/drivers/media/platform/qcom/venus/vdec.c +++ b/drivers/media/platform/qcom/venus/vdec.c @@ -1732,9 +1732,8 @@ static int vdec_open(struct file *file) v4l2_fh_init(&inst->fh, core->vdev_dec); inst->fh.ctrl_handler = &inst->ctrl_handler; - v4l2_fh_add(&inst->fh); + v4l2_fh_add(&inst->fh, file); inst->fh.m2m_ctx = inst->m2m_ctx; - file->private_data = &inst->fh; return 0; diff --git a/drivers/media/platform/qcom/venus/venc.c b/drivers/media/platform/qcom/venus/venc.c index 0838d64ce8fe..fba07557a399 100644 --- a/drivers/media/platform/qcom/venus/venc.c +++ b/drivers/media/platform/qcom/venus/venc.c @@ -1515,9 +1515,8 @@ static int venc_open(struct file *file) v4l2_fh_init(&inst->fh, core->vdev_enc); inst->fh.ctrl_handler = &inst->ctrl_handler; - v4l2_fh_add(&inst->fh); + v4l2_fh_add(&inst->fh, file); inst->fh.m2m_ctx = inst->m2m_ctx; - file->private_data = &inst->fh; return 0; diff --git a/drivers/media/platform/renesas/rcar_fdp1.c b/drivers/media/platform/renesas/rcar_fdp1.c index 12a5dcc0ca6c..f1ea303ac038 100644 --- a/drivers/media/platform/renesas/rcar_fdp1.c +++ b/drivers/media/platform/renesas/rcar_fdp1.c @@ -2093,7 +2093,6 @@ static int fdp1_open(struct file *file) } v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->fdp1 = fdp1; /* Initialise Queues */ @@ -2142,7 +2141,7 @@ static int fdp1_open(struct file *file) if (ret < 0) goto error_pm; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); dprintk(fdp1, "Created instance: %p, m2m_ctx: %p\n", ctx, ctx->fh.m2m_ctx); diff --git a/drivers/media/platform/renesas/rcar_jpu.c b/drivers/media/platform/renesas/rcar_jpu.c index 7d5e9df53dfb..d0d4ee3f8bdc 100644 --- a/drivers/media/platform/renesas/rcar_jpu.c +++ b/drivers/media/platform/renesas/rcar_jpu.c @@ -1231,8 +1231,7 @@ static int jpu_open(struct file *file) v4l2_fh_init(&ctx->fh, vfd); ctx->fh.ctrl_handler = &ctx->ctrl_handler; - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ctx->jpu = jpu; ctx->encoder = vfd == &jpu->vfd_encoder; diff --git a/drivers/media/platform/renesas/vsp1/vsp1_video.c b/drivers/media/platform/renesas/vsp1/vsp1_video.c index 656fb5e6cb30..b6dc1ee3dc50 100644 --- a/drivers/media/platform/renesas/vsp1/vsp1_video.c +++ b/drivers/media/platform/renesas/vsp1/vsp1_video.c @@ -1079,9 +1079,7 @@ static int vsp1_video_open(struct file *file) return -ENOMEM; v4l2_fh_init(vfh, &video->video); - v4l2_fh_add(vfh); - - file->private_data = vfh; + v4l2_fh_add(vfh, file); ret = vsp1_device_get(video->vsp1); if (ret < 0) { diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c index 8a6e618d605c..d88817023996 100644 --- a/drivers/media/platform/rockchip/rga/rga.c +++ b/drivers/media/platform/rockchip/rga/rga.c @@ -395,8 +395,7 @@ static int rga_open(struct file *file) return ret; } v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); rga_setup_ctrls(ctx); diff --git a/drivers/media/platform/rockchip/rkvdec/rkvdec.c b/drivers/media/platform/rockchip/rkvdec/rkvdec.c index 41ab90cbcc0c..2fbad685e92c 100644 --- a/drivers/media/platform/rockchip/rkvdec/rkvdec.c +++ b/drivers/media/platform/rockchip/rkvdec/rkvdec.c @@ -938,8 +938,7 @@ static int rkvdec_open(struct file *filp) if (ret) goto err_cleanup_m2m_ctx; - filp->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, filp); return 0; diff --git a/drivers/media/platform/samsung/exynos-gsc/gsc-m2m.c b/drivers/media/platform/samsung/exynos-gsc/gsc-m2m.c index 968bb4327b7a..39d84ffd1b05 100644 --- a/drivers/media/platform/samsung/exynos-gsc/gsc-m2m.c +++ b/drivers/media/platform/samsung/exynos-gsc/gsc-m2m.c @@ -625,8 +625,7 @@ static int gsc_m2m_open(struct file *file) /* Use separate control handler per file handle */ ctx->fh.ctrl_handler = &ctx->ctrl_handler; - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ctx->gsc_dev = gsc; /* Default color format */ diff --git a/drivers/media/platform/samsung/exynos4-is/fimc-m2m.c b/drivers/media/platform/samsung/exynos4-is/fimc-m2m.c index feedf60dad09..b002b02a899e 100644 --- a/drivers/media/platform/samsung/exynos4-is/fimc-m2m.c +++ b/drivers/media/platform/samsung/exynos4-is/fimc-m2m.c @@ -634,8 +634,7 @@ static int fimc_m2m_open(struct file *file) /* Use separate control handler per file handle */ ctx->fh.ctrl_handler = &ctx->ctrls.handler; - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); /* Setup the device context for memory-to-memory mode */ ctx->state = FIMC_CTX_M2M; diff --git a/drivers/media/platform/samsung/s5p-g2d/g2d.c b/drivers/media/platform/samsung/s5p-g2d/g2d.c index 44fcedbbc90a..e34cae9c9cf6 100644 --- a/drivers/media/platform/samsung/s5p-g2d/g2d.c +++ b/drivers/media/platform/samsung/s5p-g2d/g2d.c @@ -257,8 +257,7 @@ static int g2d_open(struct file *file) return ret; } v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); g2d_setup_ctrls(ctx); diff --git a/drivers/media/platform/samsung/s5p-jpeg/jpeg-core.c b/drivers/media/platform/samsung/s5p-jpeg/jpeg-core.c index a5ecfe03db09..9e35dd939ad7 100644 --- a/drivers/media/platform/samsung/s5p-jpeg/jpeg-core.c +++ b/drivers/media/platform/samsung/s5p-jpeg/jpeg-core.c @@ -970,8 +970,7 @@ static int s5p_jpeg_open(struct file *file) v4l2_fh_init(&ctx->fh, vfd); /* Use separate control handler per file handle */ ctx->fh.ctrl_handler = &ctx->ctrl_handler; - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ctx->jpeg = jpeg; if (vfd == jpeg->vfd_encoder) { diff --git a/drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c b/drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c index dd9761df59df..74629db05121 100644 --- a/drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c +++ b/drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c @@ -801,8 +801,7 @@ static int s5p_mfc_open(struct file *file) } init_waitqueue_head(&ctx->queue); v4l2_fh_init(&ctx->fh, vdev); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ctx->dev = dev; INIT_LIST_HEAD(&ctx->src_queue); INIT_LIST_HEAD(&ctx->dst_queue); diff --git a/drivers/media/platform/st/sti/bdisp/bdisp-v4l2.c b/drivers/media/platform/st/sti/bdisp/bdisp-v4l2.c index f3844e4e47ca..5e983799e298 100644 --- a/drivers/media/platform/st/sti/bdisp/bdisp-v4l2.c +++ b/drivers/media/platform/st/sti/bdisp/bdisp-v4l2.c @@ -608,8 +608,7 @@ static int bdisp_open(struct file *file) /* Use separate control handler per file handle */ ctx->fh.ctrl_handler = &ctx->ctrl_handler; - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); /* Default format */ ctx->src = bdisp_dflt_fmt; diff --git a/drivers/media/platform/st/sti/delta/delta-v4l2.c b/drivers/media/platform/st/sti/delta/delta-v4l2.c index a12fdbd8abed..3063a98ed25b 100644 --- a/drivers/media/platform/st/sti/delta/delta-v4l2.c +++ b/drivers/media/platform/st/sti/delta/delta-v4l2.c @@ -1639,8 +1639,7 @@ static int delta_open(struct file *file) ctx->dev = delta; v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); INIT_WORK(&ctx->run_work, delta_run_work); mutex_init(&ctx->lock); diff --git a/drivers/media/platform/st/sti/hva/hva-v4l2.c b/drivers/media/platform/st/sti/hva/hva-v4l2.c index 29142c806cb7..2f9413fa7318 100644 --- a/drivers/media/platform/st/sti/hva/hva-v4l2.c +++ b/drivers/media/platform/st/sti/hva/hva-v4l2.c @@ -1174,8 +1174,7 @@ static int hva_open(struct file *file) INIT_WORK(&ctx->run_work, hva_run_work); v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ret = hva_ctrls_setup(ctx); if (ret) { diff --git a/drivers/media/platform/st/stm32/dma2d/dma2d.c b/drivers/media/platform/st/stm32/dma2d/dma2d.c index f4c5d73447a7..b2bced06a1e6 100644 --- a/drivers/media/platform/st/stm32/dma2d/dma2d.c +++ b/drivers/media/platform/st/stm32/dma2d/dma2d.c @@ -304,8 +304,7 @@ static int dma2d_open(struct file *file) } v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); dma2d_setup_ctrls(ctx); diff --git a/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c b/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c index e9c7c99fbc55..7823eb97faf7 100644 --- a/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c +++ b/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c @@ -730,7 +730,6 @@ static int deinterlace_open(struct file *file) deinterlace_prepare_format(&ctx->dst_fmt); v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->dev = dev; ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(dev->m2m_dev, ctx, @@ -740,7 +739,7 @@ static int deinterlace_open(struct file *file) goto err_free; } - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); mutex_unlock(&dev->dev_mutex); diff --git a/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c b/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c index 9ea65cb7187f..368a858b8c0f 100644 --- a/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c +++ b/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c @@ -659,7 +659,6 @@ static int rotate_open(struct file *file) rotate_set_cap_format(ctx, &ctx->dst_fmt, ctx->rotate); v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->dev = dev; ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(dev->m2m_dev, ctx, @@ -669,7 +668,7 @@ static int rotate_open(struct file *file) goto err_free; } - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ret = rotate_setup_ctrls(ctx); if (ret) diff --git a/drivers/media/platform/ti/omap3isp/ispvideo.c b/drivers/media/platform/ti/omap3isp/ispvideo.c index b76d40aeca17..d10a2b96c13c 100644 --- a/drivers/media/platform/ti/omap3isp/ispvideo.c +++ b/drivers/media/platform/ti/omap3isp/ispvideo.c @@ -1297,7 +1297,7 @@ static int isp_video_open(struct file *file) return -ENOMEM; v4l2_fh_init(&handle->vfh, &video->video); - v4l2_fh_add(&handle->vfh); + v4l2_fh_add(&handle->vfh, file); /* If this is the first user, initialise the pipeline. */ if (omap3isp_get(video->isp) == NULL) { @@ -1333,7 +1333,6 @@ static int isp_video_open(struct file *file) handle->timeperframe.denominator = 1; handle->video = video; - file->private_data = &handle->vfh; done: if (ret < 0) { diff --git a/drivers/media/platform/ti/vpe/vpe.c b/drivers/media/platform/ti/vpe/vpe.c index 4b9b2bec7377..a47c5d31c475 100644 --- a/drivers/media/platform/ti/vpe/vpe.c +++ b/drivers/media/platform/ti/vpe/vpe.c @@ -2310,7 +2310,6 @@ static int vpe_open(struct file *file) init_adb_hdrs(ctx); v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; hdl = &ctx->hdl; v4l2_ctrl_handler_init(hdl, 1); @@ -2364,7 +2363,7 @@ static int vpe_open(struct file *file) goto exit_fh; } - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); /* * for now, just report the creation of the first instance, we can later diff --git a/drivers/media/platform/verisilicon/hantro_drv.c b/drivers/media/platform/verisilicon/hantro_drv.c index b20b9c7f4131..aadc3d8fb3d1 100644 --- a/drivers/media/platform/verisilicon/hantro_drv.c +++ b/drivers/media/platform/verisilicon/hantro_drv.c @@ -663,8 +663,7 @@ static int hantro_open(struct file *filp) } v4l2_fh_init(&ctx->fh, vdev); - filp->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, filp); hantro_reset_fmts(ctx); diff --git a/drivers/media/test-drivers/vicodec/vicodec-core.c b/drivers/media/test-drivers/vicodec/vicodec-core.c index e27f6761cba1..f20d9d9643f5 100644 --- a/drivers/media/test-drivers/vicodec/vicodec-core.c +++ b/drivers/media/test-drivers/vicodec/vicodec-core.c @@ -1848,7 +1848,6 @@ static int vicodec_open(struct file *file) ctx->is_stateless = true; v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->dev = dev; hdl = &ctx->hdl; v4l2_ctrl_handler_init(hdl, 5); @@ -1932,7 +1931,7 @@ static int vicodec_open(struct file *file) goto open_unlock; } - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); open_unlock: mutex_unlock(vfd->lock); diff --git a/drivers/media/test-drivers/vim2m.c b/drivers/media/test-drivers/vim2m.c index 55d885be5bcc..24574025f58f 100644 --- a/drivers/media/test-drivers/vim2m.c +++ b/drivers/media/test-drivers/vim2m.c @@ -1389,7 +1389,6 @@ static int vim2m_open(struct file *file) } v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->dev = dev; hdl = &ctx->hdl; v4l2_ctrl_handler_init(hdl, 4); @@ -1433,7 +1432,7 @@ static int vim2m_open(struct file *file) goto open_unlock; } - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); atomic_inc(&dev->num_inst); dprintk(dev, 1, "Created instance: %p, m2m_ctx: %p\n", diff --git a/drivers/media/test-drivers/visl/visl-core.c b/drivers/media/test-drivers/visl/visl-core.c index 5bf3136b36eb..0f43ec23f40b 100644 --- a/drivers/media/test-drivers/visl/visl-core.c +++ b/drivers/media/test-drivers/visl/visl-core.c @@ -341,7 +341,6 @@ static int visl_open(struct file *file) ctx->tpg_str_buf = kzalloc(TPG_STR_BUF_SZ, GFP_KERNEL); v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->dev = dev; rc = visl_init_ctrls(ctx); @@ -361,7 +360,7 @@ static int visl_open(struct file *file) if (rc) goto free_m2m_ctx; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); dprintk(dev, "Created instance: %p, m2m_ctx: %p\n", ctx, ctx->fh.m2m_ctx); diff --git a/drivers/media/usb/hdpvr/hdpvr-video.c b/drivers/media/usb/hdpvr/hdpvr-video.c index ea17f1a5f5b0..6c6e467f8554 100644 --- a/drivers/media/usb/hdpvr/hdpvr-video.c +++ b/drivers/media/usb/hdpvr/hdpvr-video.c @@ -380,8 +380,7 @@ static int hdpvr_open(struct file *file) return -ENOMEM; fh->legacy_mode = true; v4l2_fh_init(&fh->fh, video_devdata(file)); - v4l2_fh_add(&fh->fh); - file->private_data = &fh->fh; + v4l2_fh_add(&fh->fh, file); return 0; } diff --git a/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c b/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c index 481b03bbecf8..04c77af0c51e 100644 --- a/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c +++ b/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c @@ -1003,10 +1003,9 @@ static int pvr2_v4l2_open(struct file *file) } fhp->file = file; - file->private_data = &fhp->fh; fhp->fw_mode_flag = pvr2_hdw_cpufw_get_enabled(hdw); - v4l2_fh_add(&fhp->fh); + v4l2_fh_add(&fhp->fh, file); return 0; } diff --git a/drivers/media/usb/uvc/uvc_v4l2.c b/drivers/media/usb/uvc/uvc_v4l2.c index 6dd329a972fd..09677ed639ae 100644 --- a/drivers/media/usb/uvc/uvc_v4l2.c +++ b/drivers/media/usb/uvc/uvc_v4l2.c @@ -600,10 +600,9 @@ static int uvc_v4l2_open(struct file *file) return -ENOMEM; v4l2_fh_init(&handle->vfh, &stream->vdev); - v4l2_fh_add(&handle->vfh); + v4l2_fh_add(&handle->vfh, file); handle->chain = stream->chain; handle->stream = stream; - file->private_data = &handle->vfh; return 0; } diff --git a/drivers/media/v4l2-core/v4l2-fh.c b/drivers/media/v4l2-core/v4l2-fh.c index 7a5f7aa5e253..b59b1084d8cd 100644 --- a/drivers/media/v4l2-core/v4l2-fh.c +++ b/drivers/media/v4l2-core/v4l2-fh.c @@ -41,10 +41,12 @@ void v4l2_fh_init(struct v4l2_fh *fh, struct video_device *vdev) } EXPORT_SYMBOL_GPL(v4l2_fh_init); -void v4l2_fh_add(struct v4l2_fh *fh) +void v4l2_fh_add(struct v4l2_fh *fh, struct file *filp) { unsigned long flags; + filp->private_data = fh; + v4l2_prio_open(fh->vdev->prio, &fh->prio); spin_lock_irqsave(&fh->vdev->fh_lock, flags); list_add(&fh->list, &fh->vdev->fh_list); @@ -57,11 +59,10 @@ int v4l2_fh_open(struct file *filp) struct video_device *vdev = video_devdata(filp); struct v4l2_fh *fh = kzalloc(sizeof(*fh), GFP_KERNEL); - filp->private_data = fh; if (fh == NULL) return -ENOMEM; v4l2_fh_init(fh, vdev); - v4l2_fh_add(fh); + v4l2_fh_add(fh, filp); return 0; } EXPORT_SYMBOL_GPL(v4l2_fh_open); diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c index 29d3b788b288..bf35ac436249 100644 --- a/drivers/media/v4l2-core/v4l2-subdev.c +++ b/drivers/media/v4l2-core/v4l2-subdev.c @@ -86,8 +86,7 @@ static int subdev_open(struct file *file) } v4l2_fh_init(&subdev_fh->vfh, vdev); - v4l2_fh_add(&subdev_fh->vfh); - file->private_data = &subdev_fh->vfh; + v4l2_fh_add(&subdev_fh->vfh, file); if (sd->v4l2_dev->mdev && sd->entity.graph_obj.mdev->dev) { struct module *owner; diff --git a/drivers/staging/media/imx/imx-media-csc-scaler.c b/drivers/staging/media/imx/imx-media-csc-scaler.c index fb67b383436d..7fedb33dda34 100644 --- a/drivers/staging/media/imx/imx-media-csc-scaler.c +++ b/drivers/staging/media/imx/imx-media-csc-scaler.c @@ -765,8 +765,7 @@ static int ipu_csc_scaler_open(struct file *file) ctx->rot_mode = IPU_ROTATE_NONE; v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); ctx->priv = priv; ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(priv->m2m_dev, ctx, diff --git a/drivers/staging/media/meson/vdec/vdec.c b/drivers/staging/media/meson/vdec/vdec.c index f1ee53f9f298..b92666ff50a1 100644 --- a/drivers/staging/media/meson/vdec/vdec.c +++ b/drivers/staging/media/meson/vdec/vdec.c @@ -908,9 +908,8 @@ static int vdec_open(struct file *file) v4l2_fh_init(&sess->fh, core->vdev_dec); sess->fh.ctrl_handler = &sess->ctrl_handler; - v4l2_fh_add(&sess->fh); + v4l2_fh_add(&sess->fh, file); sess->fh.m2m_ctx = sess->m2m_ctx; - file->private_data = &sess->fh; return 0; diff --git a/drivers/staging/media/sunxi/cedrus/cedrus.c b/drivers/staging/media/sunxi/cedrus/cedrus.c index 80b43187f6ee..ebefd646dbdb 100644 --- a/drivers/staging/media/sunxi/cedrus/cedrus.c +++ b/drivers/staging/media/sunxi/cedrus/cedrus.c @@ -366,7 +366,6 @@ static int cedrus_open(struct file *file) } v4l2_fh_init(&ctx->fh, video_devdata(file)); - file->private_data = &ctx->fh; ctx->dev = dev; ctx->bit_depth = 8; @@ -383,7 +382,7 @@ static int cedrus_open(struct file *file) if (ret) goto err_m2m_release; - v4l2_fh_add(&ctx->fh); + v4l2_fh_add(&ctx->fh, file); mutex_unlock(&dev->dev_mutex); diff --git a/drivers/staging/most/video/video.c b/drivers/staging/most/video/video.c index 116331cead2a..24a68e3e5419 100644 --- a/drivers/staging/most/video/video.c +++ b/drivers/staging/most/video/video.c @@ -96,9 +96,7 @@ static int comp_vdev_open(struct file *filp) fh->mdev = mdev; v4l2_fh_init(&fh->fh, vdev); - filp->private_data = &fh->fh; - - v4l2_fh_add(&fh->fh); + v4l2_fh_add(&fh->fh, filp); ret = most_start_channel(mdev->iface, mdev->ch_idx, &comp); if (ret) { diff --git a/drivers/usb/gadget/function/uvc_v4l2.c b/drivers/usb/gadget/function/uvc_v4l2.c index 886300a29b90..680f25d17dc2 100644 --- a/drivers/usb/gadget/function/uvc_v4l2.c +++ b/drivers/usb/gadget/function/uvc_v4l2.c @@ -672,10 +672,9 @@ uvc_v4l2_open(struct file *file) return -ENOMEM; v4l2_fh_init(&handle->vfh, vdev); - v4l2_fh_add(&handle->vfh); + v4l2_fh_add(&handle->vfh, file); handle->device = &uvc->video; - file->private_data = &handle->vfh; return 0; } diff --git a/include/media/v4l2-fh.h b/include/media/v4l2-fh.h index 14e7136e693f..d8fcf49f10e0 100644 --- a/include/media/v4l2-fh.h +++ b/include/media/v4l2-fh.h @@ -87,11 +87,14 @@ void v4l2_fh_init(struct v4l2_fh *fh, struct video_device *vdev); * v4l2_fh_add - Add the fh to the list of file handles on a video_device. * * @fh: pointer to &struct v4l2_fh + * @filp: pointer to &struct file associated with @fh + * + * The function sets filp->private_data to point to @fh. * * .. note:: * The @fh file handle must be initialised first. */ -void v4l2_fh_add(struct v4l2_fh *fh); +void v4l2_fh_add(struct v4l2_fh *fh, struct file *filp); /** * v4l2_fh_open - Ancillary routine that can be used as the open\(\) op -- cgit v1.2.3 From 277966749f46bc6292c4052b4e66a554f193a78a Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 10 Aug 2025 04:30:09 +0300 Subject: media: Reset file->private_data to NULL in v4l2_fh_del() Multiple drivers that use v4l2_fh and call v4l2_fh_del() manually reset the file->private_data pointer to NULL in their video device .release() file operation handler. Move the code to the v4l2_fh_del() function to avoid direct access to file->private_data in drivers. This requires adding a file pointer argument to the function. Changes to drivers have been generated with the following coccinelle semantic patch: @@ expression fh; identifier filp; identifier release; type ret; @@ ret release(..., struct file *filp, ...) { <... - filp->private_data = NULL; ... - v4l2_fh_del(fh); + v4l2_fh_del(fh, filp); ...> } @@ expression fh; identifier filp; identifier release; type ret; @@ ret release(..., struct file *filp, ...) { <... - v4l2_fh_del(fh); + v4l2_fh_del(fh, filp); ... - filp->private_data = NULL; ...> } @@ expression fh; identifier filp; identifier release; type ret; @@ ret release(..., struct file *filp, ...) { <... - v4l2_fh_del(fh); + v4l2_fh_del(fh, filp); ...> } Manual changes have been applied to Documentation/ to update the usage patterns, to drivers/media/v4l2-core/v4l2-fh.c to update the v4l2_fh_del() prototype and reset file->private_data, and to include/media/v4l2-fh.h to update the v4l2_fh_del() function prototype and its documentation. Additionally, white space issues have been fixed manually in drivers/usb/gadget/function/uvc_v4l2.c Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- Documentation/driver-api/media/v4l2-fh.rst | 4 ++-- Documentation/translations/zh_CN/video4linux/v4l2-framework.txt | 4 ++-- drivers/media/pci/cx18/cx18-fileops.c | 4 ++-- drivers/media/pci/ivtv/ivtv-fileops.c | 4 ++-- drivers/media/pci/saa7164/saa7164-encoder.c | 2 +- drivers/media/pci/saa7164/saa7164-vbi.c | 2 +- drivers/media/platform/allegro-dvt/allegro-core.c | 2 +- drivers/media/platform/amlogic/meson-ge2d/ge2d.c | 2 +- drivers/media/platform/amphion/vpu_v4l2.c | 4 ++-- drivers/media/platform/chips-media/coda/coda-common.c | 4 ++-- drivers/media/platform/chips-media/wave5/wave5-helper.c | 2 +- drivers/media/platform/imagination/e5010-jpeg-enc.c | 4 ++-- drivers/media/platform/m2m-deinterlace.c | 2 +- drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c | 4 ++-- drivers/media/platform/mediatek/mdp/mtk_mdp_m2m.c | 4 ++-- drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c | 4 ++-- .../media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c | 4 ++-- .../media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c | 4 ++-- drivers/media/platform/nvidia/tegra-vde/v4l2.c | 2 +- drivers/media/platform/nxp/dw100/dw100.c | 2 +- drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c | 4 ++-- drivers/media/platform/nxp/imx-pxp.c | 2 +- drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c | 2 +- drivers/media/platform/nxp/mx2_emmaprp.c | 2 +- drivers/media/platform/qcom/iris/iris_vidc.c | 3 +-- drivers/media/platform/qcom/venus/core.c | 2 +- drivers/media/platform/renesas/rcar_fdp1.c | 2 +- drivers/media/platform/renesas/rcar_jpu.c | 4 ++-- drivers/media/platform/renesas/vsp1/vsp1_video.c | 2 +- drivers/media/platform/rockchip/rga/rga.c | 2 +- drivers/media/platform/rockchip/rkvdec/rkvdec.c | 2 +- drivers/media/platform/samsung/exynos-gsc/gsc-m2m.c | 4 ++-- drivers/media/platform/samsung/exynos4-is/fimc-m2m.c | 4 ++-- drivers/media/platform/samsung/s5p-g2d/g2d.c | 2 +- drivers/media/platform/samsung/s5p-jpeg/jpeg-core.c | 4 ++-- drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c | 4 ++-- drivers/media/platform/st/sti/bdisp/bdisp-v4l2.c | 4 ++-- drivers/media/platform/st/sti/delta/delta-v4l2.c | 4 ++-- drivers/media/platform/st/sti/hva/hva-v4l2.c | 4 ++-- drivers/media/platform/st/stm32/dma2d/dma2d.c | 2 +- drivers/media/platform/sunxi/sun8i-di/sun8i-di.c | 2 +- drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c | 2 +- drivers/media/platform/ti/omap3isp/ispvideo.c | 5 ++--- drivers/media/platform/ti/vpe/vpe.c | 2 +- drivers/media/platform/verisilicon/hantro_drv.c | 4 ++-- drivers/media/test-drivers/vicodec/vicodec-core.c | 2 +- drivers/media/test-drivers/vim2m.c | 2 +- drivers/media/test-drivers/visl/visl-core.c | 2 +- drivers/media/usb/pvrusb2/pvrusb2-v4l2.c | 3 +-- drivers/media/v4l2-core/v4l2-fh.c | 7 ++++--- drivers/media/v4l2-core/v4l2-subdev.c | 5 ++--- drivers/staging/media/imx/imx-media-csc-scaler.c | 4 ++-- drivers/staging/media/meson/vdec/vdec.c | 2 +- drivers/staging/media/sunxi/cedrus/cedrus.c | 2 +- drivers/staging/most/video/video.c | 4 ++-- drivers/usb/gadget/function/uvc_v4l2.c | 3 +-- include/media/v4l2-fh.h | 5 ++++- 57 files changed, 89 insertions(+), 90 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/media/v4l2-fh.rst b/Documentation/driver-api/media/v4l2-fh.rst index a7393067f5db..afcad22ead7c 100644 --- a/Documentation/driver-api/media/v4l2-fh.rst +++ b/Documentation/driver-api/media/v4l2-fh.rst @@ -65,7 +65,7 @@ Example: struct my_fh *my_fh = container_of(fh, struct my_fh, fh); ... - v4l2_fh_del(&my_fh->fh); + v4l2_fh_del(&my_fh->fh, file); v4l2_fh_exit(&my_fh->fh); kfree(my_fh); return 0; @@ -86,7 +86,7 @@ Below is a short description of the :c:type:`v4l2_fh` functions used: Must be called once the file handle is completely initialized. :c:func:`v4l2_fh_del ` -(:c:type:`fh `) +(:c:type:`fh `, struct file \*filp) - Unassociate the file handle from :c:type:`video_device`. The file handle exit function may now be called. diff --git a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt index 2d38ae17d940..1653c6e2cb46 100644 --- a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt +++ b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt @@ -822,7 +822,7 @@ int my_release(struct file *file) struct my_fh *my_fh = container_of(fh, struct my_fh, fh); ... - v4l2_fh_del(&my_fh->fh); + v4l2_fh_del(&my_fh->fh, file); v4l2_fh_exit(&my_fh->fh); kfree(my_fh); return 0; @@ -840,7 +840,7 @@ void v4l2_fh_add(struct v4l2_fh *fh, struct file *filp) 添加一个 v4l2_fh 到 video_device 文件句柄列表。一旦文件句柄 初始化完成就必须调用。 -void v4l2_fh_del(struct v4l2_fh *fh) +void v4l2_fh_del(struct v4l2_fh *fh, struct file *filp) 从 video_device() 中解除文件句柄的关联。文件句柄的退出函数也 将被调用。 diff --git a/drivers/media/pci/cx18/cx18-fileops.c b/drivers/media/pci/cx18/cx18-fileops.c index f90b547f5d67..d49fa4c4119b 100644 --- a/drivers/media/pci/cx18/cx18-fileops.c +++ b/drivers/media/pci/cx18/cx18-fileops.c @@ -713,7 +713,7 @@ int cx18_v4l2_close(struct file *filp) vb2_queue_release(vdev->queue); vdev->queue->owner = NULL; } - v4l2_fh_del(fh); + v4l2_fh_del(fh, filp); v4l2_fh_exit(fh); /* 'Unclaim' this stream */ @@ -751,7 +751,7 @@ static int cx18_serialized_open(struct cx18_stream *s, struct file *filp) if (atomic_read(&cx->ana_capturing) > 0) { /* switching to radio while capture is in progress is not polite */ - v4l2_fh_del(&item->fh); + v4l2_fh_del(&item->fh, filp); v4l2_fh_exit(&item->fh); kfree(item); return -EBUSY; diff --git a/drivers/media/pci/ivtv/ivtv-fileops.c b/drivers/media/pci/ivtv/ivtv-fileops.c index aa5f5f16427c..0040a5e7f654 100644 --- a/drivers/media/pci/ivtv/ivtv-fileops.c +++ b/drivers/media/pci/ivtv/ivtv-fileops.c @@ -911,7 +911,7 @@ int ivtv_v4l2_close(struct file *filp) ivtv_unmute(itv); } - v4l2_fh_del(fh); + v4l2_fh_del(fh, filp); v4l2_fh_exit(fh); /* Easy case first: this stream was never claimed by us */ @@ -1006,7 +1006,7 @@ static int ivtv_open(struct file *filp) if (atomic_read(&itv->capturing) > 0) { /* switching to radio while capture is in progress is not polite */ - v4l2_fh_del(&item->fh); + v4l2_fh_del(&item->fh, filp); v4l2_fh_exit(&item->fh); kfree(item); return -EBUSY; diff --git a/drivers/media/pci/saa7164/saa7164-encoder.c b/drivers/media/pci/saa7164/saa7164-encoder.c index e6e353a251cf..66d650b5f69a 100644 --- a/drivers/media/pci/saa7164/saa7164-encoder.c +++ b/drivers/media/pci/saa7164/saa7164-encoder.c @@ -746,7 +746,7 @@ static int fops_release(struct file *file) } } - v4l2_fh_del(&fh->fh); + v4l2_fh_del(&fh->fh, file); v4l2_fh_exit(&fh->fh); kfree(fh); diff --git a/drivers/media/pci/saa7164/saa7164-vbi.c b/drivers/media/pci/saa7164/saa7164-vbi.c index 181442fcb43b..57e4362c0d19 100644 --- a/drivers/media/pci/saa7164/saa7164-vbi.c +++ b/drivers/media/pci/saa7164/saa7164-vbi.c @@ -449,7 +449,7 @@ static int fops_release(struct file *file) } } - v4l2_fh_del(&fh->fh); + v4l2_fh_del(&fh->fh, file); v4l2_fh_exit(&fh->fh); kfree(fh); diff --git a/drivers/media/platform/allegro-dvt/allegro-core.c b/drivers/media/platform/allegro-dvt/allegro-core.c index 8c30f3cd4fc5..5e3b1f5d7206 100644 --- a/drivers/media/platform/allegro-dvt/allegro-core.c +++ b/drivers/media/platform/allegro-dvt/allegro-core.c @@ -3241,7 +3241,7 @@ static int allegro_release(struct file *file) v4l2_ctrl_handler_free(&channel->ctrl_handler); - v4l2_fh_del(&channel->fh); + v4l2_fh_del(&channel->fh, file); v4l2_fh_exit(&channel->fh); kfree(channel); diff --git a/drivers/media/platform/amlogic/meson-ge2d/ge2d.c b/drivers/media/platform/amlogic/meson-ge2d/ge2d.c index d36891b546bc..b1b0b6535fb1 100644 --- a/drivers/media/platform/amlogic/meson-ge2d/ge2d.c +++ b/drivers/media/platform/amlogic/meson-ge2d/ge2d.c @@ -883,7 +883,7 @@ static int ge2d_release(struct file *file) v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); diff --git a/drivers/media/platform/amphion/vpu_v4l2.c b/drivers/media/platform/amphion/vpu_v4l2.c index e13bfe09af1b..fcb2eff813ac 100644 --- a/drivers/media/platform/amphion/vpu_v4l2.c +++ b/drivers/media/platform/amphion/vpu_v4l2.c @@ -791,7 +791,7 @@ int vpu_v4l2_open(struct file *file, struct vpu_inst *inst) return 0; error: - v4l2_fh_del(&inst->fh); + v4l2_fh_del(&inst->fh, file); v4l2_fh_exit(&inst->fh); vpu_inst_put(inst); return ret; @@ -812,7 +812,7 @@ int vpu_v4l2_close(struct file *file) call_void_vop(inst, release); vpu_inst_unlock(inst); - v4l2_fh_del(&inst->fh); + v4l2_fh_del(&inst->fh, file); v4l2_fh_exit(&inst->fh); vpu_inst_unregister(inst); diff --git a/drivers/media/platform/chips-media/coda/coda-common.c b/drivers/media/platform/chips-media/coda/coda-common.c index 7d874fd502b8..583759eed610 100644 --- a/drivers/media/platform/chips-media/coda/coda-common.c +++ b/drivers/media/platform/chips-media/coda/coda-common.c @@ -2725,7 +2725,7 @@ err_clk_ahb: err_clk_enable: pm_runtime_put_sync(dev->dev); err_pm_get: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); err_coda_name_init: ida_free(&dev->ida, ctx->idx); @@ -2763,7 +2763,7 @@ static int coda_release(struct file *file) clk_disable_unprepare(dev->clk_ahb); clk_disable_unprepare(dev->clk_per); pm_runtime_put_sync(dev->dev); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); ida_free(&dev->ida, ctx->idx); if (ctx->ops->release) diff --git a/drivers/media/platform/chips-media/wave5/wave5-helper.c b/drivers/media/platform/chips-media/wave5/wave5-helper.c index ed8ff04a899d..0bce62f0c039 100644 --- a/drivers/media/platform/chips-media/wave5/wave5-helper.c +++ b/drivers/media/platform/chips-media/wave5/wave5-helper.c @@ -46,7 +46,7 @@ void wave5_cleanup_instance(struct vpu_instance *inst, struct file *filp) wave5_vdi_free_dma_memory(inst->dev, &inst->bitstream_vbuf); v4l2_ctrl_handler_free(&inst->v4l2_ctrl_hdl); if (inst->v4l2_fh.vdev) { - v4l2_fh_del(&inst->v4l2_fh); + v4l2_fh_del(&inst->v4l2_fh, filp); v4l2_fh_exit(&inst->v4l2_fh); } list_del_init(&inst->list); diff --git a/drivers/media/platform/imagination/e5010-jpeg-enc.c b/drivers/media/platform/imagination/e5010-jpeg-enc.c index 1da00ff4b1e3..c4e0097cb8b7 100644 --- a/drivers/media/platform/imagination/e5010-jpeg-enc.c +++ b/drivers/media/platform/imagination/e5010-jpeg-enc.c @@ -769,7 +769,7 @@ static int e5010_open(struct file *file) err_ctrls_setup: v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); exit: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); mutex_unlock(&e5010->mutex); free: @@ -786,7 +786,7 @@ static int e5010_release(struct file *file) mutex_lock(&e5010->mutex); v4l2_ctrl_handler_free(&ctx->ctrl_handler); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); mutex_unlock(&e5010->mutex); diff --git a/drivers/media/platform/m2m-deinterlace.c b/drivers/media/platform/m2m-deinterlace.c index a343dffd19f0..51c2f206cb1f 100644 --- a/drivers/media/platform/m2m-deinterlace.c +++ b/drivers/media/platform/m2m-deinterlace.c @@ -880,7 +880,7 @@ static int deinterlace_release(struct file *file) dprintk(pcdev, "Releasing instance %p\n", ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); kfree(ctx->xt); diff --git a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c index 5178a1b170fe..de15d6f0b490 100644 --- a/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c +++ b/drivers/media/platform/mediatek/jpeg/mtk_jpeg_core.c @@ -1201,7 +1201,7 @@ static int mtk_jpeg_open(struct file *file) return 0; error: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); mutex_unlock(&jpeg->lock); free: @@ -1217,7 +1217,7 @@ static int mtk_jpeg_release(struct file *file) mutex_lock(&jpeg->lock); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); v4l2_ctrl_handler_free(&ctx->ctrl_hdl); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); mutex_unlock(&jpeg->lock); diff --git a/drivers/media/platform/mediatek/mdp/mtk_mdp_m2m.c b/drivers/media/platform/mediatek/mdp/mtk_mdp_m2m.c index 7a1a8e51dbca..7e89a8443707 100644 --- a/drivers/media/platform/mediatek/mdp/mtk_mdp_m2m.c +++ b/drivers/media/platform/mediatek/mdp/mtk_mdp_m2m.c @@ -1130,7 +1130,7 @@ err_load_vpu: error_m2m_ctx: v4l2_ctrl_handler_free(&ctx->ctrl_handler); error_ctrls: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); mutex_unlock(&mdp->lock); err_lock: @@ -1148,7 +1148,7 @@ static int mtk_mdp_m2m_release(struct file *file) mutex_lock(&mdp->lock); v4l2_m2m_ctx_release(ctx->m2m_ctx); v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); mtk_mdp_vpu_deinit(&ctx->vpu); mdp->ctx_num--; diff --git a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c index 847d6b310e74..e68ae19d71a9 100644 --- a/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c +++ b/drivers/media/platform/mediatek/mdp3/mtk-mdp3-m2m.c @@ -633,7 +633,7 @@ err_release_m2m_ctx: v4l2_m2m_ctx_release(ctx->m2m_ctx); err_release_handler: v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); err_exit_fh: v4l2_fh_exit(&ctx->fh); ida_free(&mdp->mdp_ida, ctx->id); @@ -657,7 +657,7 @@ static int mdp_m2m_release(struct file *file) mdp_vpu_put_locked(mdp); v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); ida_free(&mdp->mdp_ida, ctx->id); mutex_unlock(&mdp->m2m_lock); diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c index 952a77c383bd..46d176e6de63 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c @@ -282,7 +282,7 @@ err_load_fw: err_m2m_ctx_init: v4l2_ctrl_handler_free(&ctx->ctrl_hdl); err_ctrls_setup: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); mutex_unlock(&dev->dev_mutex); @@ -307,7 +307,7 @@ static int fops_vcodec_release(struct file *file) v4l2_m2m_ctx_release(ctx->m2m_ctx); mtk_vcodec_dec_release(ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->ctrl_hdl); diff --git a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c index 9cacb6cbcf28..fb1c3bdc2dae 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c +++ b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c @@ -191,7 +191,7 @@ err_load_fw: err_m2m_ctx_init: v4l2_ctrl_handler_free(&ctx->ctrl_hdl); err_ctrls_setup: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); mutex_unlock(&dev->dev_mutex); @@ -209,7 +209,7 @@ static int fops_vcodec_release(struct file *file) v4l2_m2m_ctx_release(ctx->m2m_ctx); mtk_vcodec_enc_release(ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->ctrl_hdl); diff --git a/drivers/media/platform/nvidia/tegra-vde/v4l2.c b/drivers/media/platform/nvidia/tegra-vde/v4l2.c index 688b776b3010..0c50f4ff82e0 100644 --- a/drivers/media/platform/nvidia/tegra-vde/v4l2.c +++ b/drivers/media/platform/nvidia/tegra-vde/v4l2.c @@ -856,7 +856,7 @@ static int tegra_release(struct file *file) struct tegra_ctx *ctx = fh_to_tegra_ctx(fh); struct tegra_vde *vde = ctx->vde; - v4l2_fh_del(fh); + v4l2_fh_del(fh, file); v4l2_m2m_ctx_release(fh->m2m_ctx); v4l2_ctrl_handler_free(&ctx->hdl); v4l2_fh_exit(fh); diff --git a/drivers/media/platform/nxp/dw100/dw100.c b/drivers/media/platform/nxp/dw100/dw100.c index 2bd30910ddf9..97744c7b7c03 100644 --- a/drivers/media/platform/nxp/dw100/dw100.c +++ b/drivers/media/platform/nxp/dw100/dw100.c @@ -667,7 +667,7 @@ static int dw100_release(struct file *file) { struct dw100_ctx *ctx = dw100_file2ctx(file); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->hdl); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); diff --git a/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c b/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c index d7cecc56a9eb..a34e644b2cb1 100644 --- a/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c +++ b/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c @@ -2238,7 +2238,7 @@ static int mxc_jpeg_open(struct file *file) err_ctrls_setup: v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); error: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); mutex_unlock(&mxc_jpeg->lock); free: @@ -2751,7 +2751,7 @@ static int mxc_jpeg_release(struct file *file) ctx->slot); v4l2_ctrl_handler_free(&ctx->ctrl_handler); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); mutex_unlock(&mxc_jpeg->lock); diff --git a/drivers/media/platform/nxp/imx-pxp.c b/drivers/media/platform/nxp/imx-pxp.c index 9602409f3ece..6cc9b07ea53a 100644 --- a/drivers/media/platform/nxp/imx-pxp.c +++ b/drivers/media/platform/nxp/imx-pxp.c @@ -1716,7 +1716,7 @@ static int pxp_release(struct file *file) dprintk(dev, "Releasing instance %p\n", ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->hdl); mutex_lock(&dev->dev_mutex); diff --git a/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c b/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c index d6df6e2725f5..31298307c672 100644 --- a/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c +++ b/drivers/media/platform/nxp/imx8-isi/imx8-isi-m2m.c @@ -716,7 +716,7 @@ static int mxc_isi_m2m_release(struct file *file) v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); mxc_isi_m2m_ctx_ctrls_delete(ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); mutex_destroy(&ctx->vb2_lock); diff --git a/drivers/media/platform/nxp/mx2_emmaprp.c b/drivers/media/platform/nxp/mx2_emmaprp.c index 8c8f834e6250..d23da93304bd 100644 --- a/drivers/media/platform/nxp/mx2_emmaprp.c +++ b/drivers/media/platform/nxp/mx2_emmaprp.c @@ -769,7 +769,7 @@ static int emmaprp_release(struct file *file) mutex_lock(&pcdev->dev_mutex); clk_disable_unprepare(pcdev->clk_emma_ahb); clk_disable_unprepare(pcdev->clk_emma_ipg); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); mutex_unlock(&pcdev->dev_mutex); diff --git a/drivers/media/platform/qcom/iris/iris_vidc.c b/drivers/media/platform/qcom/iris/iris_vidc.c index cdd34a3b71ff..541ae86f7892 100644 --- a/drivers/media/platform/qcom/iris/iris_vidc.c +++ b/drivers/media/platform/qcom/iris/iris_vidc.c @@ -30,8 +30,7 @@ static void iris_v4l2_fh_init(struct iris_inst *inst, struct file *filp) static void iris_v4l2_fh_deinit(struct iris_inst *inst, struct file *filp) { - filp->private_data = NULL; - v4l2_fh_del(&inst->fh); + v4l2_fh_del(&inst->fh, filp); inst->fh.ctrl_handler = NULL; v4l2_fh_exit(&inst->fh); } diff --git a/drivers/media/platform/qcom/venus/core.c b/drivers/media/platform/qcom/venus/core.c index 5e1ace16a490..90de29f166ad 100644 --- a/drivers/media/platform/qcom/venus/core.c +++ b/drivers/media/platform/qcom/venus/core.c @@ -607,7 +607,7 @@ void venus_close_common(struct venus_inst *inst, struct file *filp) v4l2_m2m_ctx_release(inst->m2m_ctx); v4l2_m2m_release(inst->m2m_dev); hfi_session_destroy(inst); - v4l2_fh_del(&inst->fh); + v4l2_fh_del(&inst->fh, filp); v4l2_fh_exit(&inst->fh); v4l2_ctrl_handler_free(&inst->ctrl_handler); diff --git a/drivers/media/platform/renesas/rcar_fdp1.c b/drivers/media/platform/renesas/rcar_fdp1.c index f1ea303ac038..e2dba0e4f315 100644 --- a/drivers/media/platform/renesas/rcar_fdp1.c +++ b/drivers/media/platform/renesas/rcar_fdp1.c @@ -2166,7 +2166,7 @@ static int fdp1_release(struct file *file) dprintk(fdp1, "Releasing instance %p\n", ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->hdl); mutex_lock(&fdp1->dev_mutex); diff --git a/drivers/media/platform/renesas/rcar_jpu.c b/drivers/media/platform/renesas/rcar_jpu.c index d0d4ee3f8bdc..0b479dfa2917 100644 --- a/drivers/media/platform/renesas/rcar_jpu.c +++ b/drivers/media/platform/renesas/rcar_jpu.c @@ -1276,7 +1276,7 @@ jpu_reset_rollback: device_prepare_rollback: mutex_unlock(&jpu->mutex); v4l_prepare_rollback: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); return ret; @@ -1289,7 +1289,7 @@ static int jpu_release(struct file *file) v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); diff --git a/drivers/media/platform/renesas/vsp1/vsp1_video.c b/drivers/media/platform/renesas/vsp1/vsp1_video.c index b6dc1ee3dc50..75f9a1a85d55 100644 --- a/drivers/media/platform/renesas/vsp1/vsp1_video.c +++ b/drivers/media/platform/renesas/vsp1/vsp1_video.c @@ -1083,7 +1083,7 @@ static int vsp1_video_open(struct file *file) ret = vsp1_device_get(video->vsp1); if (ret < 0) { - v4l2_fh_del(vfh); + v4l2_fh_del(vfh, file); v4l2_fh_exit(vfh); kfree(vfh); } diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c index d88817023996..45c42c7ad846 100644 --- a/drivers/media/platform/rockchip/rga/rga.c +++ b/drivers/media/platform/rockchip/rga/rga.c @@ -418,7 +418,7 @@ static int rga_release(struct file *file) v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); diff --git a/drivers/media/platform/rockchip/rkvdec/rkvdec.c b/drivers/media/platform/rockchip/rkvdec/rkvdec.c index 2fbad685e92c..481c2488f9ac 100644 --- a/drivers/media/platform/rockchip/rkvdec/rkvdec.c +++ b/drivers/media/platform/rockchip/rkvdec/rkvdec.c @@ -954,7 +954,7 @@ static int rkvdec_release(struct file *filp) { struct rkvdec_ctx *ctx = file_to_rkvdec_ctx(filp); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, filp); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); v4l2_ctrl_handler_free(&ctx->ctrl_hdl); v4l2_fh_exit(&ctx->fh); diff --git a/drivers/media/platform/samsung/exynos-gsc/gsc-m2m.c b/drivers/media/platform/samsung/exynos-gsc/gsc-m2m.c index 39d84ffd1b05..2999fb2610f0 100644 --- a/drivers/media/platform/samsung/exynos-gsc/gsc-m2m.c +++ b/drivers/media/platform/samsung/exynos-gsc/gsc-m2m.c @@ -654,7 +654,7 @@ static int gsc_m2m_open(struct file *file) error_ctrls: gsc_ctrls_delete(ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); error_fh: v4l2_fh_exit(&ctx->fh); kfree(ctx); @@ -675,7 +675,7 @@ static int gsc_m2m_release(struct file *file) v4l2_m2m_ctx_release(ctx->m2m_ctx); gsc_ctrls_delete(ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); if (--gsc->m2m.refcnt <= 0) diff --git a/drivers/media/platform/samsung/exynos4-is/fimc-m2m.c b/drivers/media/platform/samsung/exynos4-is/fimc-m2m.c index b002b02a899e..609fd84f89d4 100644 --- a/drivers/media/platform/samsung/exynos4-is/fimc-m2m.c +++ b/drivers/media/platform/samsung/exynos4-is/fimc-m2m.c @@ -663,7 +663,7 @@ error_m2m_ctx: v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); error_c: fimc_ctrls_delete(ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); error_fh: v4l2_fh_exit(&ctx->fh); kfree(ctx); @@ -684,7 +684,7 @@ static int fimc_m2m_release(struct file *file) v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); fimc_ctrls_delete(ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); if (--fimc->m2m.refcnt <= 0) diff --git a/drivers/media/platform/samsung/s5p-g2d/g2d.c b/drivers/media/platform/samsung/s5p-g2d/g2d.c index e34cae9c9cf6..922262f61e7b 100644 --- a/drivers/media/platform/samsung/s5p-g2d/g2d.c +++ b/drivers/media/platform/samsung/s5p-g2d/g2d.c @@ -280,7 +280,7 @@ static int g2d_release(struct file *file) v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); mutex_unlock(&dev->mutex); v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); v4l2_info(&dev->v4l2_dev, "instance closed\n"); diff --git a/drivers/media/platform/samsung/s5p-jpeg/jpeg-core.c b/drivers/media/platform/samsung/s5p-jpeg/jpeg-core.c index 9e35dd939ad7..65f256db4c76 100644 --- a/drivers/media/platform/samsung/s5p-jpeg/jpeg-core.c +++ b/drivers/media/platform/samsung/s5p-jpeg/jpeg-core.c @@ -1005,7 +1005,7 @@ static int s5p_jpeg_open(struct file *file) return 0; error: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); mutex_unlock(&jpeg->lock); free: @@ -1021,7 +1021,7 @@ static int s5p_jpeg_release(struct file *file) mutex_lock(&jpeg->lock); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); mutex_unlock(&jpeg->lock); diff --git a/drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c b/drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c index 74629db05121..a5e756049620 100644 --- a/drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c +++ b/drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c @@ -955,7 +955,7 @@ err_ctrls_setup: err_bad_node: dev->ctx[ctx->num] = NULL; err_no_ctx: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); err_alloc: @@ -1010,7 +1010,7 @@ static int s5p_mfc_release(struct file *file) if (dev) dev->ctx[ctx->num] = NULL; s5p_mfc_dec_ctrls_delete(ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); /* vdev is gone if dev is null */ if (dev) v4l2_fh_exit(&ctx->fh); diff --git a/drivers/media/platform/st/sti/bdisp/bdisp-v4l2.c b/drivers/media/platform/st/sti/bdisp/bdisp-v4l2.c index 5e983799e298..38b2a2924443 100644 --- a/drivers/media/platform/st/sti/bdisp/bdisp-v4l2.c +++ b/drivers/media/platform/st/sti/bdisp/bdisp-v4l2.c @@ -634,7 +634,7 @@ static int bdisp_open(struct file *file) error_ctrls: bdisp_ctrls_delete(ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); error_fh: v4l2_fh_exit(&ctx->fh); bdisp_hw_free_nodes(ctx); @@ -659,7 +659,7 @@ static int bdisp_release(struct file *file) bdisp_ctrls_delete(ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); if (--bdisp->m2m.refcnt <= 0) diff --git a/drivers/media/platform/st/sti/delta/delta-v4l2.c b/drivers/media/platform/st/sti/delta/delta-v4l2.c index 3063a98ed25b..385b26d21408 100644 --- a/drivers/media/platform/st/sti/delta/delta-v4l2.c +++ b/drivers/media/platform/st/sti/delta/delta-v4l2.c @@ -1684,7 +1684,7 @@ static int delta_open(struct file *file) return 0; err_fh_del: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); err: @@ -1712,7 +1712,7 @@ static int delta_release(struct file *file) v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); /* disable ST231 clocks */ diff --git a/drivers/media/platform/st/sti/hva/hva-v4l2.c b/drivers/media/platform/st/sti/hva/hva-v4l2.c index 2f9413fa7318..3581b73a99b8 100644 --- a/drivers/media/platform/st/sti/hva/hva-v4l2.c +++ b/drivers/media/platform/st/sti/hva/hva-v4l2.c @@ -1218,7 +1218,7 @@ static int hva_open(struct file *file) err_ctrls: v4l2_ctrl_handler_free(&ctx->ctrl_handler); err_fh: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); out: @@ -1249,7 +1249,7 @@ static int hva_release(struct file *file) v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); #ifdef CONFIG_VIDEO_STI_HVA_DEBUGFS diff --git a/drivers/media/platform/st/stm32/dma2d/dma2d.c b/drivers/media/platform/st/stm32/dma2d/dma2d.c index b2bced06a1e6..bc0f81e78018 100644 --- a/drivers/media/platform/st/stm32/dma2d/dma2d.c +++ b/drivers/media/platform/st/stm32/dma2d/dma2d.c @@ -326,7 +326,7 @@ static int dma2d_release(struct file *file) v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); mutex_unlock(&dev->mutex); v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); diff --git a/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c b/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c index 7823eb97faf7..eb519afb30ca 100644 --- a/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c +++ b/drivers/media/platform/sunxi/sun8i-di/sun8i-di.c @@ -759,7 +759,7 @@ static int deinterlace_release(struct file *file) mutex_lock(&dev->dev_mutex); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); diff --git a/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c b/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c index 368a858b8c0f..89992feaab60 100644 --- a/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c +++ b/drivers/media/platform/sunxi/sun8i-rotate/sun8i_rotate.c @@ -695,7 +695,7 @@ static int rotate_release(struct file *file) mutex_lock(&dev->dev_mutex); v4l2_ctrl_handler_free(&ctx->ctrl_handler); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); diff --git a/drivers/media/platform/ti/omap3isp/ispvideo.c b/drivers/media/platform/ti/omap3isp/ispvideo.c index d10a2b96c13c..2c0008444b7e 100644 --- a/drivers/media/platform/ti/omap3isp/ispvideo.c +++ b/drivers/media/platform/ti/omap3isp/ispvideo.c @@ -1336,7 +1336,7 @@ static int isp_video_open(struct file *file) done: if (ret < 0) { - v4l2_fh_del(&handle->vfh); + v4l2_fh_del(&handle->vfh, file); v4l2_fh_exit(&handle->vfh); kfree(handle); } @@ -1360,10 +1360,9 @@ static int isp_video_release(struct file *file) v4l2_pipeline_pm_put(&video->video.entity); /* Release the file handle. */ - v4l2_fh_del(vfh); + v4l2_fh_del(vfh, file); v4l2_fh_exit(vfh); kfree(handle); - file->private_data = NULL; omap3isp_put(video->isp); diff --git a/drivers/media/platform/ti/vpe/vpe.c b/drivers/media/platform/ti/vpe/vpe.c index a47c5d31c475..6029d4e8e0bd 100644 --- a/drivers/media/platform/ti/vpe/vpe.c +++ b/drivers/media/platform/ti/vpe/vpe.c @@ -2421,7 +2421,7 @@ static int vpe_release(struct file *file) vpdma_free_desc_buf(&ctx->sc_coeff_v); vpdma_free_desc_buf(&ctx->sc_coeff_h); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->hdl); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); diff --git a/drivers/media/platform/verisilicon/hantro_drv.c b/drivers/media/platform/verisilicon/hantro_drv.c index aadc3d8fb3d1..4cc9d00fd293 100644 --- a/drivers/media/platform/verisilicon/hantro_drv.c +++ b/drivers/media/platform/verisilicon/hantro_drv.c @@ -677,7 +677,7 @@ static int hantro_open(struct file *filp) return 0; err_fh_free: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, filp); v4l2_fh_exit(&ctx->fh); err_ctx_free: kfree(ctx); @@ -693,7 +693,7 @@ static int hantro_release(struct file *filp) * to this file. */ v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, filp); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->ctrl_handler); kfree(ctx); diff --git a/drivers/media/test-drivers/vicodec/vicodec-core.c b/drivers/media/test-drivers/vicodec/vicodec-core.c index f20d9d9643f5..c340fd226040 100644 --- a/drivers/media/test-drivers/vicodec/vicodec-core.c +++ b/drivers/media/test-drivers/vicodec/vicodec-core.c @@ -1946,7 +1946,7 @@ static int vicodec_release(struct file *file) mutex_lock(vfd->lock); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); mutex_unlock(vfd->lock); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->hdl); kvfree(ctx->state.compressed_frame); diff --git a/drivers/media/test-drivers/vim2m.c b/drivers/media/test-drivers/vim2m.c index 24574025f58f..d0e760118c82 100644 --- a/drivers/media/test-drivers/vim2m.c +++ b/drivers/media/test-drivers/vim2m.c @@ -1450,7 +1450,7 @@ static int vim2m_release(struct file *file) dprintk(dev, 1, "Releasing instance %p\n", ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->hdl); mutex_lock(&dev->dev_mutex); diff --git a/drivers/media/test-drivers/visl/visl-core.c b/drivers/media/test-drivers/visl/visl-core.c index 0f43ec23f40b..26c6c6835f79 100644 --- a/drivers/media/test-drivers/visl/visl-core.c +++ b/drivers/media/test-drivers/visl/visl-core.c @@ -389,7 +389,7 @@ static int visl_release(struct file *file) dprintk(dev, "Releasing instance %p\n", ctx); tpg_free(&ctx->tpg); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->hdl); mutex_lock(&dev->dev_mutex); diff --git a/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c b/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c index 04c77af0c51e..f9535a484738 100644 --- a/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c +++ b/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c @@ -900,9 +900,8 @@ static int pvr2_v4l2_release(struct file *file) fhp->rhp = NULL; } - v4l2_fh_del(&fhp->fh); + v4l2_fh_del(&fhp->fh, file); v4l2_fh_exit(&fhp->fh); - file->private_data = NULL; pvr2_channel_done(&fhp->channel); pvr2_trace(PVR2_TRACE_STRUCT, diff --git a/drivers/media/v4l2-core/v4l2-fh.c b/drivers/media/v4l2-core/v4l2-fh.c index b59b1084d8cd..df3ba9d4674b 100644 --- a/drivers/media/v4l2-core/v4l2-fh.c +++ b/drivers/media/v4l2-core/v4l2-fh.c @@ -67,7 +67,7 @@ int v4l2_fh_open(struct file *filp) } EXPORT_SYMBOL_GPL(v4l2_fh_open); -void v4l2_fh_del(struct v4l2_fh *fh) +void v4l2_fh_del(struct v4l2_fh *fh, struct file *filp) { unsigned long flags; @@ -75,6 +75,8 @@ void v4l2_fh_del(struct v4l2_fh *fh) list_del_init(&fh->list); spin_unlock_irqrestore(&fh->vdev->fh_lock, flags); v4l2_prio_close(fh->vdev->prio, fh->prio); + + filp->private_data = NULL; } EXPORT_SYMBOL_GPL(v4l2_fh_del); @@ -94,10 +96,9 @@ int v4l2_fh_release(struct file *filp) struct v4l2_fh *fh = file_to_v4l2_fh(filp); if (fh) { - v4l2_fh_del(fh); + v4l2_fh_del(fh, filp); v4l2_fh_exit(fh); kfree(fh); - filp->private_data = NULL; } return 0; } diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c index bf35ac436249..41e4aca77b7f 100644 --- a/drivers/media/v4l2-core/v4l2-subdev.c +++ b/drivers/media/v4l2-core/v4l2-subdev.c @@ -109,7 +109,7 @@ static int subdev_open(struct file *file) err: module_put(subdev_fh->owner); - v4l2_fh_del(&subdev_fh->vfh); + v4l2_fh_del(&subdev_fh->vfh, file); v4l2_fh_exit(&subdev_fh->vfh); subdev_fh_free(subdev_fh); kfree(subdev_fh); @@ -127,11 +127,10 @@ static int subdev_close(struct file *file) if (sd->internal_ops && sd->internal_ops->close) sd->internal_ops->close(sd, subdev_fh); module_put(subdev_fh->owner); - v4l2_fh_del(vfh); + v4l2_fh_del(vfh, file); v4l2_fh_exit(vfh); subdev_fh_free(subdev_fh); kfree(subdev_fh); - file->private_data = NULL; return 0; } diff --git a/drivers/staging/media/imx/imx-media-csc-scaler.c b/drivers/staging/media/imx/imx-media-csc-scaler.c index 7fedb33dda34..dc7f9a77cbe6 100644 --- a/drivers/staging/media/imx/imx-media-csc-scaler.c +++ b/drivers/staging/media/imx/imx-media-csc-scaler.c @@ -792,7 +792,7 @@ static int ipu_csc_scaler_open(struct file *file) err_ctrls: v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); err_ctx: - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); return ret; @@ -807,7 +807,7 @@ static int ipu_csc_scaler_release(struct file *file) v4l2_ctrl_handler_free(&ctx->ctrl_hdlr); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_fh_exit(&ctx->fh); kfree(ctx); diff --git a/drivers/staging/media/meson/vdec/vdec.c b/drivers/staging/media/meson/vdec/vdec.c index b92666ff50a1..49e497a32973 100644 --- a/drivers/staging/media/meson/vdec/vdec.c +++ b/drivers/staging/media/meson/vdec/vdec.c @@ -926,7 +926,7 @@ static int vdec_close(struct file *file) v4l2_m2m_ctx_release(sess->m2m_ctx); v4l2_m2m_release(sess->m2m_dev); - v4l2_fh_del(&sess->fh); + v4l2_fh_del(&sess->fh, file); v4l2_fh_exit(&sess->fh); mutex_destroy(&sess->lock); diff --git a/drivers/staging/media/sunxi/cedrus/cedrus.c b/drivers/staging/media/sunxi/cedrus/cedrus.c index ebefd646dbdb..bff42ea1871f 100644 --- a/drivers/staging/media/sunxi/cedrus/cedrus.c +++ b/drivers/staging/media/sunxi/cedrus/cedrus.c @@ -404,7 +404,7 @@ static int cedrus_release(struct file *file) mutex_lock(&dev->dev_mutex); - v4l2_fh_del(&ctx->fh); + v4l2_fh_del(&ctx->fh, file); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); v4l2_ctrl_handler_free(&ctx->hdl); diff --git a/drivers/staging/most/video/video.c b/drivers/staging/most/video/video.c index 24a68e3e5419..32f71d9a9cf7 100644 --- a/drivers/staging/most/video/video.c +++ b/drivers/staging/most/video/video.c @@ -107,7 +107,7 @@ static int comp_vdev_open(struct file *filp) return 0; err_rm: - v4l2_fh_del(&fh->fh); + v4l2_fh_del(&fh->fh, filp); v4l2_fh_exit(&fh->fh); err_dec: @@ -143,7 +143,7 @@ static int comp_vdev_close(struct file *filp) most_stop_channel(mdev->iface, mdev->ch_idx, &comp); mdev->mute = false; - v4l2_fh_del(&fh->fh); + v4l2_fh_del(&fh->fh, filp); v4l2_fh_exit(&fh->fh); atomic_dec(&mdev->access_ref); diff --git a/drivers/usb/gadget/function/uvc_v4l2.c b/drivers/usb/gadget/function/uvc_v4l2.c index 680f25d17dc2..fd4b998ccd16 100644 --- a/drivers/usb/gadget/function/uvc_v4l2.c +++ b/drivers/usb/gadget/function/uvc_v4l2.c @@ -692,8 +692,7 @@ uvc_v4l2_release(struct file *file) uvc_v4l2_disable(uvc); mutex_unlock(&video->mutex); - file->private_data = NULL; - v4l2_fh_del(&handle->vfh); + v4l2_fh_del(&handle->vfh, file); v4l2_fh_exit(&handle->vfh); kfree(handle); diff --git a/include/media/v4l2-fh.h b/include/media/v4l2-fh.h index d8fcf49f10e0..5e4c76163512 100644 --- a/include/media/v4l2-fh.h +++ b/include/media/v4l2-fh.h @@ -114,12 +114,15 @@ int v4l2_fh_open(struct file *filp); * v4l2_fh_del - Remove file handle from the list of file handles. * * @fh: pointer to &struct v4l2_fh + * @filp: pointer to &struct file associated with @fh + * + * The function resets filp->private_data to NULL. * * .. note:: * Must be called in v4l2_file_operations->release\(\) handler if the driver * uses &struct v4l2_fh. */ -void v4l2_fh_del(struct v4l2_fh *fh); +void v4l2_fh_del(struct v4l2_fh *fh, struct file *filp); /** * v4l2_fh_exit - Release resources related to a file handle. -- cgit v1.2.3 From bb4d6be205dae94aa2d3c3a1ad814dad90d4fd62 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 10 Aug 2025 04:30:14 +0300 Subject: media: Drop V4L2_FL_USES_V4L2_FH checks Now that all drivers use v4l2_fh, we can drop the V4L2_FL_USES_V4L2_FH checks through the V4L2 core. To ensure that all new drivers use v4l2_fh, keep setting the V4L2_FL_USES_V4L2_FH flag in v4l2_fh_init(), and verify it is set after the .open() file operation returns. Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- Documentation/driver-api/media/v4l2-fh.rst | 15 ++------- .../zh_CN/video4linux/v4l2-framework.txt | 5 --- drivers/media/common/videobuf2/videobuf2-v4l2.c | 12 +++---- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 7 ++-- drivers/media/v4l2-core/v4l2-ctrls-api.c | 2 +- drivers/media/v4l2-core/v4l2-dev.c | 21 +++++++++--- drivers/media/v4l2-core/v4l2-ioctl.c | 37 +++++++--------------- drivers/media/v4l2-core/v4l2-mem2mem.c | 12 +++---- include/media/v4l2-dev.h | 2 +- include/media/v4l2-fh.h | 2 +- 10 files changed, 43 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/media/v4l2-fh.rst b/Documentation/driver-api/media/v4l2-fh.rst index afcad22ead7c..a934caa483a4 100644 --- a/Documentation/driver-api/media/v4l2-fh.rst +++ b/Documentation/driver-api/media/v4l2-fh.rst @@ -3,13 +3,8 @@ V4L2 File handles ----------------- -struct v4l2_fh provides a way to easily keep file handle specific -data that is used by the V4L2 framework. - -.. attention:: - New drivers must use struct v4l2_fh - since it is also used to implement priority handling - (:ref:`VIDIOC_G_PRIORITY`). +struct v4l2_fh provides a way to easily keep file handle specific data that is +used by the V4L2 framework. Its usage is mandatory in all drivers. struct v4l2_fh is allocated in the driver's ``open()`` file operation handler. It is typically embedded in a larger driver-specific structure. The @@ -134,12 +129,6 @@ associated device node: - Same, but it calls v4l2_fh_is_singular with filp->private_data. -.. note:: - The V4L2 framework knows whether a driver uses :c:type:`v4l2_fh` as its - ``file->private_data`` pointer by testing the ``V4L2_FL_USES_V4L2_FH`` - bit in :c:type:`video_device`->flags. This bit is set whenever - :c:func:`v4l2_fh_init` is called. - V4L2 fh functions and data structures ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt index 1653c6e2cb46..f0be21a60a0f 100644 --- a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt +++ b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt @@ -775,11 +775,6 @@ v4l2_fh 结构体提供一个保存用于 V4L2 框架的文件句柄特定数据 如果 video_device 标志,新驱动 必须使用 v4l2_fh 结构体,因为它也用于实现优先级处理(VIDIOC_G/S_PRIORITY)。 -v4l2_fh 的用户(位于 V4l2 框架中,并非驱动)可通过测试 -video_device->flags 中的 V4L2_FL_USES_V4L2_FH 位得知驱动是否使用 -v4l2_fh 作为他的 file->private_data 指针。这个位会在调用 v4l2_fh_init() -时被设置。 - v4l2_fh 结构体作为驱动自身文件句柄结构体的一部分被分配,且驱动在 其打开函数中将 file->private_data 指向它。 diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index f29307e59be5..d911021c1bb0 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -973,18 +973,14 @@ EXPORT_SYMBOL_GPL(vb2_queue_change_type); __poll_t vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait) { - struct video_device *vfd = video_devdata(file); + struct v4l2_fh *fh = file_to_v4l2_fh(file); __poll_t res; res = vb2_core_poll(q, file, wait); - if (test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags)) { - struct v4l2_fh *fh = file_to_v4l2_fh(file); - - poll_wait(file, &fh->wait, wait); - if (v4l2_event_pending(fh)) - res |= EPOLLPRI; - } + poll_wait(file, &fh->wait, wait); + if (v4l2_event_pending(fh)) + res |= EPOLLPRI; return res; } diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index 8a5559225ff2..e5642e639811 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -672,15 +672,12 @@ struct v4l2_ext_control32 { static inline bool ctrl_is_pointer(struct file *file, u32 id) { struct video_device *vdev = video_devdata(file); - struct v4l2_fh *fh = NULL; + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct v4l2_ctrl_handler *hdl = NULL; struct v4l2_query_ext_ctrl qec = { id }; const struct v4l2_ioctl_ops *ops = vdev->ioctl_ops; - if (test_bit(V4L2_FL_USES_V4L2_FH, &vdev->flags)) - fh = file_to_v4l2_fh(file); - - if (fh && fh->ctrl_handler) + if (fh->ctrl_handler) hdl = fh->ctrl_handler; else if (vdev->ctrl_handler) hdl = vdev->ctrl_handler; diff --git a/drivers/media/v4l2-core/v4l2-ctrls-api.c b/drivers/media/v4l2-core/v4l2-ctrls-api.c index b0bba8eec143..afb4e5581b90 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-api.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-api.c @@ -1254,7 +1254,7 @@ int v4l2_ctrl_log_status(struct file *file, void *fh) { struct video_device *vfd = video_devdata(file); - if (test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags) && vfd->v4l2_dev) { + if (vfd->v4l2_dev) { struct v4l2_fh *vfh = file_to_v4l2_fh(file); v4l2_ctrl_handler_log_status(vfh->ctrl_handler, diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c index 1a4184b94838..10a126e50c1c 100644 --- a/drivers/media/v4l2-core/v4l2-dev.c +++ b/drivers/media/v4l2-core/v4l2-dev.c @@ -425,14 +425,26 @@ static int v4l2_open(struct inode *inode, struct file *filp) video_get(vdev); mutex_unlock(&videodev_lock); - if (video_is_registered(vdev)) - ret = vdev->fops->open(filp); - else + if (!video_is_registered(vdev)) { + ret = -ENODEV; + goto done; + } + + ret = vdev->fops->open(filp); + if (ret) + goto done; + + /* All drivers must use v4l2_fh. */ + if (WARN_ON(!test_bit(V4L2_FL_USES_V4L2_FH, &vdev->flags))) { + vdev->fops->release(filp); ret = -ENODEV; + } +done: if (vdev->dev_debug & V4L2_DEV_DEBUG_FOP) dprintk("%s: open (%d)\n", video_device_node_name(vdev), ret); + /* decrease the refcount in case of an error */ if (ret) video_put(vdev); @@ -1114,8 +1126,7 @@ void video_unregister_device(struct video_device *vdev) */ clear_bit(V4L2_FL_REGISTERED, &vdev->flags); mutex_unlock(&videodev_lock); - if (test_bit(V4L2_FL_USES_V4L2_FH, &vdev->flags)) - v4l2_event_wake_all(vdev); + v4l2_event_wake_all(vdev); device_unregister(&vdev->dev); } EXPORT_SYMBOL(video_unregister_device); diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 8c81852c3046..6c684884873e 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1195,8 +1195,6 @@ static int v4l_s_priority(const struct v4l2_ioctl_ops *ops, u32 *p = arg; vfd = video_devdata(file); - if (!test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags)) - return -ENOTTY; vfh = file_to_v4l2_fh(file); return v4l2_prio_change(vfd->prio, &vfh->prio, *p); } @@ -2297,8 +2295,7 @@ static int v4l_queryctrl(const struct v4l2_ioctl_ops *ops, struct video_device *vfd = video_devdata(file); struct v4l2_query_ext_ctrl qec = {}; struct v4l2_queryctrl *p = arg; - struct v4l2_fh *vfh = - test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags) ? fh : NULL; + struct v4l2_fh *vfh = fh; int ret; if (vfh && vfh->ctrl_handler) @@ -2322,8 +2319,7 @@ static int v4l_query_ext_ctrl(const struct v4l2_ioctl_ops *ops, { struct video_device *vfd = video_devdata(file); struct v4l2_query_ext_ctrl *p = arg; - struct v4l2_fh *vfh = - test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags) ? fh : NULL; + struct v4l2_fh *vfh = fh; if (vfh && vfh->ctrl_handler) return v4l2_query_ext_ctrl(vfh->ctrl_handler, p); @@ -2339,8 +2335,7 @@ static int v4l_querymenu(const struct v4l2_ioctl_ops *ops, { struct video_device *vfd = video_devdata(file); struct v4l2_querymenu *p = arg; - struct v4l2_fh *vfh = - test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags) ? fh : NULL; + struct v4l2_fh *vfh = fh; if (vfh && vfh->ctrl_handler) return v4l2_querymenu(vfh->ctrl_handler, p); @@ -2356,8 +2351,7 @@ static int v4l_g_ctrl(const struct v4l2_ioctl_ops *ops, { struct video_device *vfd = video_devdata(file); struct v4l2_control *p = arg; - struct v4l2_fh *vfh = - test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags) ? fh : NULL; + struct v4l2_fh *vfh = fh; struct v4l2_ext_controls ctrls; struct v4l2_ext_control ctrl; @@ -2388,8 +2382,7 @@ static int v4l_s_ctrl(const struct v4l2_ioctl_ops *ops, { struct video_device *vfd = video_devdata(file); struct v4l2_control *p = arg; - struct v4l2_fh *vfh = - test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags) ? fh : NULL; + struct v4l2_fh *vfh = fh; struct v4l2_ext_controls ctrls; struct v4l2_ext_control ctrl; int ret; @@ -2418,8 +2411,7 @@ static int v4l_g_ext_ctrls(const struct v4l2_ioctl_ops *ops, { struct video_device *vfd = video_devdata(file); struct v4l2_ext_controls *p = arg; - struct v4l2_fh *vfh = - test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags) ? fh : NULL; + struct v4l2_fh *vfh = fh; p->error_idx = p->count; if (vfh && vfh->ctrl_handler) @@ -2439,8 +2431,7 @@ static int v4l_s_ext_ctrls(const struct v4l2_ioctl_ops *ops, { struct video_device *vfd = video_devdata(file); struct v4l2_ext_controls *p = arg; - struct v4l2_fh *vfh = - test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags) ? fh : NULL; + struct v4l2_fh *vfh = fh; p->error_idx = p->count; if (vfh && vfh->ctrl_handler) @@ -2460,8 +2451,7 @@ static int v4l_try_ext_ctrls(const struct v4l2_ioctl_ops *ops, { struct video_device *vfd = video_devdata(file); struct v4l2_ext_controls *p = arg; - struct v4l2_fh *vfh = - test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags) ? fh : NULL; + struct v4l2_fh *vfh = fh; p->error_idx = p->count; if (vfh && vfh->ctrl_handler) @@ -3073,7 +3063,7 @@ static long __video_do_ioctl(struct file *file, struct v4l2_ioctl_info default_info; const struct v4l2_ioctl_info *info; void *fh = file->private_data; - struct v4l2_fh *vfh = NULL; + struct v4l2_fh *vfh = file_to_v4l2_fh(file); int dev_debug = vfd->dev_debug; long ret = -ENOTTY; @@ -3083,9 +3073,6 @@ static long __video_do_ioctl(struct file *file, return ret; } - if (test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags)) - vfh = file_to_v4l2_fh(file); - /* * We need to serialize streamon/off with queueing new requests. * These ioctls may trigger the cancellation of a streaming @@ -3117,10 +3104,10 @@ static long __video_do_ioctl(struct file *file, info = &v4l2_ioctls[_IOC_NR(cmd)]; if (!is_valid_ioctl(vfd, cmd) && - !((info->flags & INFO_FL_CTRL) && vfh && vfh->ctrl_handler)) + !((info->flags & INFO_FL_CTRL) && vfh->ctrl_handler)) goto done; - if (vfh && (info->flags & INFO_FL_PRIO)) { + if (info->flags & INFO_FL_PRIO) { ret = v4l2_prio_check(vfd->prio, vfh->prio); if (ret) goto done; @@ -3139,7 +3126,7 @@ static long __video_do_ioctl(struct file *file, ret = -ENOTTY; } else { ret = ops->vidioc_default(file, fh, - vfh ? v4l2_prio_check(vfd->prio, vfh->prio) >= 0 : 0, + v4l2_prio_check(vfd->prio, vfh->prio) >= 0, cmd, arg); } diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c index e67e67f76f72..7678b8dbedbd 100644 --- a/drivers/media/v4l2-core/v4l2-mem2mem.c +++ b/drivers/media/v4l2-core/v4l2-mem2mem.c @@ -951,7 +951,7 @@ static __poll_t v4l2_m2m_poll_for_data(struct file *file, __poll_t v4l2_m2m_poll(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct poll_table_struct *wait) { - struct video_device *vfd = video_devdata(file); + struct v4l2_fh *fh = file_to_v4l2_fh(file); struct vb2_queue *src_q = v4l2_m2m_get_src_vq(m2m_ctx); struct vb2_queue *dst_q = v4l2_m2m_get_dst_vq(m2m_ctx); __poll_t req_events = poll_requested_events(wait); @@ -970,13 +970,9 @@ __poll_t v4l2_m2m_poll(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, if (req_events & (EPOLLOUT | EPOLLWRNORM | EPOLLIN | EPOLLRDNORM)) rc = v4l2_m2m_poll_for_data(file, m2m_ctx, wait); - if (test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags)) { - struct v4l2_fh *fh = file_to_v4l2_fh(file); - - poll_wait(file, &fh->wait, wait); - if (v4l2_event_pending(fh)) - rc |= EPOLLPRI; - } + poll_wait(file, &fh->wait, wait); + if (v4l2_event_pending(fh)) + rc |= EPOLLPRI; return rc; } diff --git a/include/media/v4l2-dev.h b/include/media/v4l2-dev.h index a69801274800..a213c3398dcf 100644 --- a/include/media/v4l2-dev.h +++ b/include/media/v4l2-dev.h @@ -74,7 +74,7 @@ struct dentry; * @V4L2_FL_USES_V4L2_FH: * indicates that file->private_data points to &struct v4l2_fh. * This flag is set by the core when v4l2_fh_init() is called. - * All new drivers should use it. + * All drivers must use it. * @V4L2_FL_QUIRK_INVERTED_CROP: * some old M2M drivers use g/s_crop/cropcap incorrectly: crop and * compose are swapped. If this flag is set, then the selection diff --git a/include/media/v4l2-fh.h b/include/media/v4l2-fh.h index 5e4c76163512..aad4b3689d7e 100644 --- a/include/media/v4l2-fh.h +++ b/include/media/v4l2-fh.h @@ -3,7 +3,7 @@ * v4l2-fh.h * * V4L2 file handle. Store per file handle data for the V4L2 - * framework. Using file handles is optional for the drivers. + * framework. Using file handles is mandatory for the drivers. * * Copyright (C) 2009--2010 Nokia Corporation. * -- cgit v1.2.3 From 0f1a7facb64abe5104c3eb235824ea8d6296474d Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 10 Aug 2025 04:30:51 +0300 Subject: media: v4l2-ioctl: Stop passing fh pointer to ioctl handlers Now that all drivers access the v4l2_fh from the file structure, there is no need to pass it as an explicit argument to ioctl handlers. Set the argument to NULL in the __video_do_ioctl() function, and rename the 'fh' argument in the ioctl handler declarations to 'priv' indicate it does not contain a file handle. The argument could be removed altogether with a mechanical change (probably using coccinelle), but there are plans to pass a new argument to the ioctl handlers in the near future. The tree-wide change to remove the argument, only to add another one soon after, would be too much churn. While at it, fix argument alignment in vidioc_try_fmt_vid_out_overlay(). Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 2 +- drivers/media/v4l2-core/v4l2-ioctl.c | 4 +- include/media/v4l2-ioctl.h | 238 +++++++++++++------------- 3 files changed, 122 insertions(+), 122 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index e5642e639811..85c10ba9f813 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -691,7 +691,7 @@ static inline bool ctrl_is_pointer(struct file *file, u32 id) if (!ops || !ops->vidioc_query_ext_ctrl) return false; - return !ops->vidioc_query_ext_ctrl(file, fh, &qec) && + return !ops->vidioc_query_ext_ctrl(file, NULL, &qec) && (qec.flags & V4L2_CTRL_FLAG_HAS_PAYLOAD); } diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 378a2a149fec..d815291624a9 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -3126,11 +3126,11 @@ static long __video_do_ioctl(struct file *file, write_only = _IOC_DIR(cmd) == _IOC_WRITE; if (info != &default_info) { - ret = info->func(ops, file, vfh, arg); + ret = info->func(ops, file, NULL, arg); } else if (!ops->vidioc_default) { ret = -ENOTTY; } else { - ret = ops->vidioc_default(file, vfh, + ret = ops->vidioc_default(file, NULL, v4l2_prio_check(vfd->prio, vfh->prio) >= 0, cmd, arg); } diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h index 82695c3a300a..6f7a58350441 100644 --- a/include/media/v4l2-ioctl.h +++ b/include/media/v4l2-ioctl.h @@ -293,144 +293,144 @@ struct v4l2_ioctl_ops { /* ioctl callbacks */ /* VIDIOC_QUERYCAP handler */ - int (*vidioc_querycap)(struct file *file, void *fh, + int (*vidioc_querycap)(struct file *file, void *priv, struct v4l2_capability *cap); /* VIDIOC_ENUM_FMT handlers */ - int (*vidioc_enum_fmt_vid_cap)(struct file *file, void *fh, + int (*vidioc_enum_fmt_vid_cap)(struct file *file, void *priv, struct v4l2_fmtdesc *f); - int (*vidioc_enum_fmt_vid_overlay)(struct file *file, void *fh, + int (*vidioc_enum_fmt_vid_overlay)(struct file *file, void *priv, struct v4l2_fmtdesc *f); - int (*vidioc_enum_fmt_vid_out)(struct file *file, void *fh, + int (*vidioc_enum_fmt_vid_out)(struct file *file, void *priv, struct v4l2_fmtdesc *f); - int (*vidioc_enum_fmt_sdr_cap)(struct file *file, void *fh, + int (*vidioc_enum_fmt_sdr_cap)(struct file *file, void *priv, struct v4l2_fmtdesc *f); - int (*vidioc_enum_fmt_sdr_out)(struct file *file, void *fh, + int (*vidioc_enum_fmt_sdr_out)(struct file *file, void *priv, struct v4l2_fmtdesc *f); - int (*vidioc_enum_fmt_meta_cap)(struct file *file, void *fh, + int (*vidioc_enum_fmt_meta_cap)(struct file *file, void *priv, struct v4l2_fmtdesc *f); - int (*vidioc_enum_fmt_meta_out)(struct file *file, void *fh, + int (*vidioc_enum_fmt_meta_out)(struct file *file, void *priv, struct v4l2_fmtdesc *f); /* VIDIOC_G_FMT handlers */ - int (*vidioc_g_fmt_vid_cap)(struct file *file, void *fh, + int (*vidioc_g_fmt_vid_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_vid_overlay)(struct file *file, void *fh, + int (*vidioc_g_fmt_vid_overlay)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_vid_out)(struct file *file, void *fh, + int (*vidioc_g_fmt_vid_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_vid_out_overlay)(struct file *file, void *fh, + int (*vidioc_g_fmt_vid_out_overlay)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_vbi_cap)(struct file *file, void *fh, + int (*vidioc_g_fmt_vbi_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_vbi_out)(struct file *file, void *fh, + int (*vidioc_g_fmt_vbi_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_sliced_vbi_cap)(struct file *file, void *fh, + int (*vidioc_g_fmt_sliced_vbi_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_sliced_vbi_out)(struct file *file, void *fh, + int (*vidioc_g_fmt_sliced_vbi_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_vid_cap_mplane)(struct file *file, void *fh, + int (*vidioc_g_fmt_vid_cap_mplane)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_vid_out_mplane)(struct file *file, void *fh, + int (*vidioc_g_fmt_vid_out_mplane)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_sdr_cap)(struct file *file, void *fh, + int (*vidioc_g_fmt_sdr_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_sdr_out)(struct file *file, void *fh, + int (*vidioc_g_fmt_sdr_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_meta_cap)(struct file *file, void *fh, + int (*vidioc_g_fmt_meta_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_g_fmt_meta_out)(struct file *file, void *fh, + int (*vidioc_g_fmt_meta_out)(struct file *file, void *priv, struct v4l2_format *f); /* VIDIOC_S_FMT handlers */ - int (*vidioc_s_fmt_vid_cap)(struct file *file, void *fh, + int (*vidioc_s_fmt_vid_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_vid_overlay)(struct file *file, void *fh, + int (*vidioc_s_fmt_vid_overlay)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_vid_out)(struct file *file, void *fh, + int (*vidioc_s_fmt_vid_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_vid_out_overlay)(struct file *file, void *fh, + int (*vidioc_s_fmt_vid_out_overlay)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_vbi_cap)(struct file *file, void *fh, + int (*vidioc_s_fmt_vbi_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_vbi_out)(struct file *file, void *fh, + int (*vidioc_s_fmt_vbi_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_sliced_vbi_cap)(struct file *file, void *fh, + int (*vidioc_s_fmt_sliced_vbi_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_sliced_vbi_out)(struct file *file, void *fh, + int (*vidioc_s_fmt_sliced_vbi_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_vid_cap_mplane)(struct file *file, void *fh, + int (*vidioc_s_fmt_vid_cap_mplane)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_vid_out_mplane)(struct file *file, void *fh, + int (*vidioc_s_fmt_vid_out_mplane)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_sdr_cap)(struct file *file, void *fh, + int (*vidioc_s_fmt_sdr_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_sdr_out)(struct file *file, void *fh, + int (*vidioc_s_fmt_sdr_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_meta_cap)(struct file *file, void *fh, + int (*vidioc_s_fmt_meta_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_s_fmt_meta_out)(struct file *file, void *fh, + int (*vidioc_s_fmt_meta_out)(struct file *file, void *priv, struct v4l2_format *f); /* VIDIOC_TRY_FMT handlers */ - int (*vidioc_try_fmt_vid_cap)(struct file *file, void *fh, + int (*vidioc_try_fmt_vid_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_vid_overlay)(struct file *file, void *fh, + int (*vidioc_try_fmt_vid_overlay)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_vid_out)(struct file *file, void *fh, + int (*vidioc_try_fmt_vid_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_vid_out_overlay)(struct file *file, void *fh, - struct v4l2_format *f); - int (*vidioc_try_fmt_vbi_cap)(struct file *file, void *fh, + int (*vidioc_try_fmt_vid_out_overlay)(struct file *file, void *priv, + struct v4l2_format *f); + int (*vidioc_try_fmt_vbi_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_vbi_out)(struct file *file, void *fh, + int (*vidioc_try_fmt_vbi_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_sliced_vbi_cap)(struct file *file, void *fh, + int (*vidioc_try_fmt_sliced_vbi_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_sliced_vbi_out)(struct file *file, void *fh, + int (*vidioc_try_fmt_sliced_vbi_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_vid_cap_mplane)(struct file *file, void *fh, + int (*vidioc_try_fmt_vid_cap_mplane)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_vid_out_mplane)(struct file *file, void *fh, + int (*vidioc_try_fmt_vid_out_mplane)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_sdr_cap)(struct file *file, void *fh, + int (*vidioc_try_fmt_sdr_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_sdr_out)(struct file *file, void *fh, + int (*vidioc_try_fmt_sdr_out)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_meta_cap)(struct file *file, void *fh, + int (*vidioc_try_fmt_meta_cap)(struct file *file, void *priv, struct v4l2_format *f); - int (*vidioc_try_fmt_meta_out)(struct file *file, void *fh, + int (*vidioc_try_fmt_meta_out)(struct file *file, void *priv, struct v4l2_format *f); /* Buffer handlers */ - int (*vidioc_reqbufs)(struct file *file, void *fh, + int (*vidioc_reqbufs)(struct file *file, void *priv, struct v4l2_requestbuffers *b); - int (*vidioc_querybuf)(struct file *file, void *fh, + int (*vidioc_querybuf)(struct file *file, void *priv, struct v4l2_buffer *b); - int (*vidioc_qbuf)(struct file *file, void *fh, + int (*vidioc_qbuf)(struct file *file, void *priv, struct v4l2_buffer *b); - int (*vidioc_expbuf)(struct file *file, void *fh, + int (*vidioc_expbuf)(struct file *file, void *priv, struct v4l2_exportbuffer *e); - int (*vidioc_dqbuf)(struct file *file, void *fh, + int (*vidioc_dqbuf)(struct file *file, void *priv, struct v4l2_buffer *b); - int (*vidioc_create_bufs)(struct file *file, void *fh, + int (*vidioc_create_bufs)(struct file *file, void *priv, struct v4l2_create_buffers *b); - int (*vidioc_prepare_buf)(struct file *file, void *fh, + int (*vidioc_prepare_buf)(struct file *file, void *priv, struct v4l2_buffer *b); - int (*vidioc_remove_bufs)(struct file *file, void *fh, + int (*vidioc_remove_bufs)(struct file *file, void *priv, struct v4l2_remove_buffers *d); - int (*vidioc_overlay)(struct file *file, void *fh, unsigned int i); - int (*vidioc_g_fbuf)(struct file *file, void *fh, + int (*vidioc_overlay)(struct file *file, void *priv, unsigned int i); + int (*vidioc_g_fbuf)(struct file *file, void *priv, struct v4l2_framebuffer *a); - int (*vidioc_s_fbuf)(struct file *file, void *fh, + int (*vidioc_s_fbuf)(struct file *file, void *priv, const struct v4l2_framebuffer *a); /* Stream on/off */ - int (*vidioc_streamon)(struct file *file, void *fh, + int (*vidioc_streamon)(struct file *file, void *priv, enum v4l2_buf_type i); - int (*vidioc_streamoff)(struct file *file, void *fh, + int (*vidioc_streamoff)(struct file *file, void *priv, enum v4l2_buf_type i); /* @@ -438,135 +438,135 @@ struct v4l2_ioctl_ops { * * Note: ENUMSTD is handled by videodev.c */ - int (*vidioc_g_std)(struct file *file, void *fh, v4l2_std_id *norm); - int (*vidioc_s_std)(struct file *file, void *fh, v4l2_std_id norm); - int (*vidioc_querystd)(struct file *file, void *fh, v4l2_std_id *a); + int (*vidioc_g_std)(struct file *file, void *priv, v4l2_std_id *norm); + int (*vidioc_s_std)(struct file *file, void *priv, v4l2_std_id norm); + int (*vidioc_querystd)(struct file *file, void *priv, v4l2_std_id *a); /* Input handling */ - int (*vidioc_enum_input)(struct file *file, void *fh, + int (*vidioc_enum_input)(struct file *file, void *priv, struct v4l2_input *inp); - int (*vidioc_g_input)(struct file *file, void *fh, unsigned int *i); - int (*vidioc_s_input)(struct file *file, void *fh, unsigned int i); + int (*vidioc_g_input)(struct file *file, void *priv, unsigned int *i); + int (*vidioc_s_input)(struct file *file, void *priv, unsigned int i); /* Output handling */ - int (*vidioc_enum_output)(struct file *file, void *fh, + int (*vidioc_enum_output)(struct file *file, void *priv, struct v4l2_output *a); - int (*vidioc_g_output)(struct file *file, void *fh, unsigned int *i); - int (*vidioc_s_output)(struct file *file, void *fh, unsigned int i); + int (*vidioc_g_output)(struct file *file, void *priv, unsigned int *i); + int (*vidioc_s_output)(struct file *file, void *priv, unsigned int i); /* Control handling */ - int (*vidioc_query_ext_ctrl)(struct file *file, void *fh, + int (*vidioc_query_ext_ctrl)(struct file *file, void *priv, struct v4l2_query_ext_ctrl *a); - int (*vidioc_g_ext_ctrls)(struct file *file, void *fh, + int (*vidioc_g_ext_ctrls)(struct file *file, void *priv, struct v4l2_ext_controls *a); - int (*vidioc_s_ext_ctrls)(struct file *file, void *fh, + int (*vidioc_s_ext_ctrls)(struct file *file, void *priv, struct v4l2_ext_controls *a); - int (*vidioc_try_ext_ctrls)(struct file *file, void *fh, + int (*vidioc_try_ext_ctrls)(struct file *file, void *priv, struct v4l2_ext_controls *a); - int (*vidioc_querymenu)(struct file *file, void *fh, + int (*vidioc_querymenu)(struct file *file, void *priv, struct v4l2_querymenu *a); /* Audio ioctls */ - int (*vidioc_enumaudio)(struct file *file, void *fh, + int (*vidioc_enumaudio)(struct file *file, void *priv, struct v4l2_audio *a); - int (*vidioc_g_audio)(struct file *file, void *fh, + int (*vidioc_g_audio)(struct file *file, void *priv, struct v4l2_audio *a); - int (*vidioc_s_audio)(struct file *file, void *fh, + int (*vidioc_s_audio)(struct file *file, void *priv, const struct v4l2_audio *a); /* Audio out ioctls */ - int (*vidioc_enumaudout)(struct file *file, void *fh, + int (*vidioc_enumaudout)(struct file *file, void *priv, struct v4l2_audioout *a); - int (*vidioc_g_audout)(struct file *file, void *fh, + int (*vidioc_g_audout)(struct file *file, void *priv, struct v4l2_audioout *a); - int (*vidioc_s_audout)(struct file *file, void *fh, + int (*vidioc_s_audout)(struct file *file, void *priv, const struct v4l2_audioout *a); - int (*vidioc_g_modulator)(struct file *file, void *fh, + int (*vidioc_g_modulator)(struct file *file, void *priv, struct v4l2_modulator *a); - int (*vidioc_s_modulator)(struct file *file, void *fh, + int (*vidioc_s_modulator)(struct file *file, void *priv, const struct v4l2_modulator *a); /* Crop ioctls */ - int (*vidioc_g_pixelaspect)(struct file *file, void *fh, + int (*vidioc_g_pixelaspect)(struct file *file, void *priv, int buf_type, struct v4l2_fract *aspect); - int (*vidioc_g_selection)(struct file *file, void *fh, + int (*vidioc_g_selection)(struct file *file, void *priv, struct v4l2_selection *s); - int (*vidioc_s_selection)(struct file *file, void *fh, + int (*vidioc_s_selection)(struct file *file, void *priv, struct v4l2_selection *s); /* Compression ioctls */ - int (*vidioc_g_jpegcomp)(struct file *file, void *fh, + int (*vidioc_g_jpegcomp)(struct file *file, void *priv, struct v4l2_jpegcompression *a); - int (*vidioc_s_jpegcomp)(struct file *file, void *fh, + int (*vidioc_s_jpegcomp)(struct file *file, void *priv, const struct v4l2_jpegcompression *a); - int (*vidioc_g_enc_index)(struct file *file, void *fh, + int (*vidioc_g_enc_index)(struct file *file, void *priv, struct v4l2_enc_idx *a); - int (*vidioc_encoder_cmd)(struct file *file, void *fh, + int (*vidioc_encoder_cmd)(struct file *file, void *priv, struct v4l2_encoder_cmd *a); - int (*vidioc_try_encoder_cmd)(struct file *file, void *fh, + int (*vidioc_try_encoder_cmd)(struct file *file, void *priv, struct v4l2_encoder_cmd *a); - int (*vidioc_decoder_cmd)(struct file *file, void *fh, + int (*vidioc_decoder_cmd)(struct file *file, void *priv, struct v4l2_decoder_cmd *a); - int (*vidioc_try_decoder_cmd)(struct file *file, void *fh, + int (*vidioc_try_decoder_cmd)(struct file *file, void *priv, struct v4l2_decoder_cmd *a); /* Stream type-dependent parameter ioctls */ - int (*vidioc_g_parm)(struct file *file, void *fh, + int (*vidioc_g_parm)(struct file *file, void *priv, struct v4l2_streamparm *a); - int (*vidioc_s_parm)(struct file *file, void *fh, + int (*vidioc_s_parm)(struct file *file, void *priv, struct v4l2_streamparm *a); /* Tuner ioctls */ - int (*vidioc_g_tuner)(struct file *file, void *fh, + int (*vidioc_g_tuner)(struct file *file, void *priv, struct v4l2_tuner *a); - int (*vidioc_s_tuner)(struct file *file, void *fh, + int (*vidioc_s_tuner)(struct file *file, void *priv, const struct v4l2_tuner *a); - int (*vidioc_g_frequency)(struct file *file, void *fh, + int (*vidioc_g_frequency)(struct file *file, void *priv, struct v4l2_frequency *a); - int (*vidioc_s_frequency)(struct file *file, void *fh, + int (*vidioc_s_frequency)(struct file *file, void *priv, const struct v4l2_frequency *a); - int (*vidioc_enum_freq_bands)(struct file *file, void *fh, + int (*vidioc_enum_freq_bands)(struct file *file, void *priv, struct v4l2_frequency_band *band); /* Sliced VBI cap */ - int (*vidioc_g_sliced_vbi_cap)(struct file *file, void *fh, + int (*vidioc_g_sliced_vbi_cap)(struct file *file, void *priv, struct v4l2_sliced_vbi_cap *a); /* Log status ioctl */ - int (*vidioc_log_status)(struct file *file, void *fh); + int (*vidioc_log_status)(struct file *file, void *priv); - int (*vidioc_s_hw_freq_seek)(struct file *file, void *fh, + int (*vidioc_s_hw_freq_seek)(struct file *file, void *priv, const struct v4l2_hw_freq_seek *a); /* Debugging ioctls */ #ifdef CONFIG_VIDEO_ADV_DEBUG - int (*vidioc_g_register)(struct file *file, void *fh, + int (*vidioc_g_register)(struct file *file, void *priv, struct v4l2_dbg_register *reg); - int (*vidioc_s_register)(struct file *file, void *fh, + int (*vidioc_s_register)(struct file *file, void *priv, const struct v4l2_dbg_register *reg); - int (*vidioc_g_chip_info)(struct file *file, void *fh, + int (*vidioc_g_chip_info)(struct file *file, void *priv, struct v4l2_dbg_chip_info *chip); #endif - int (*vidioc_enum_framesizes)(struct file *file, void *fh, + int (*vidioc_enum_framesizes)(struct file *file, void *priv, struct v4l2_frmsizeenum *fsize); - int (*vidioc_enum_frameintervals)(struct file *file, void *fh, + int (*vidioc_enum_frameintervals)(struct file *file, void *priv, struct v4l2_frmivalenum *fival); /* DV Timings IOCTLs */ - int (*vidioc_s_dv_timings)(struct file *file, void *fh, + int (*vidioc_s_dv_timings)(struct file *file, void *priv, struct v4l2_dv_timings *timings); - int (*vidioc_g_dv_timings)(struct file *file, void *fh, + int (*vidioc_g_dv_timings)(struct file *file, void *priv, struct v4l2_dv_timings *timings); - int (*vidioc_query_dv_timings)(struct file *file, void *fh, + int (*vidioc_query_dv_timings)(struct file *file, void *priv, struct v4l2_dv_timings *timings); - int (*vidioc_enum_dv_timings)(struct file *file, void *fh, + int (*vidioc_enum_dv_timings)(struct file *file, void *priv, struct v4l2_enum_dv_timings *timings); - int (*vidioc_dv_timings_cap)(struct file *file, void *fh, + int (*vidioc_dv_timings_cap)(struct file *file, void *priv, struct v4l2_dv_timings_cap *cap); - int (*vidioc_g_edid)(struct file *file, void *fh, + int (*vidioc_g_edid)(struct file *file, void *priv, struct v4l2_edid *edid); - int (*vidioc_s_edid)(struct file *file, void *fh, + int (*vidioc_s_edid)(struct file *file, void *priv, struct v4l2_edid *edid); int (*vidioc_subscribe_event)(struct v4l2_fh *fh, @@ -575,7 +575,7 @@ struct v4l2_ioctl_ops { const struct v4l2_event_subscription *sub); /* For other private ioctls */ - long (*vidioc_default)(struct file *file, void *fh, + long (*vidioc_default)(struct file *file, void *priv, bool valid_prio, unsigned int cmd, void *arg); }; -- cgit v1.2.3 From 9d05191c4ed31bb817c03a1b7028ed81fefa8bfb Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 10 Aug 2025 04:30:56 +0300 Subject: media: v4l2-core: Rename second ioctl handlers argument to 'void *priv' The second argument to the ioctl handlers is not a file handle any more. Rename it from 'void *fh' to 'void *priv' in the V4L2 core, to avoid misconceptions. While at it, align function arguments in include/media/v4l2-mem2mem.h. Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- drivers/media/v4l2-core/v4l2-ctrls-api.c | 2 +- drivers/media/v4l2-core/v4l2-mem2mem.c | 6 ++--- include/media/v4l2-ctrls.h | 4 +-- include/media/v4l2-mem2mem.h | 42 ++++++++++++++++---------------- 4 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-ctrls-api.c b/drivers/media/v4l2-core/v4l2-ctrls-api.c index afb4e5581b90..49a5c7538a09 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-api.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-api.c @@ -1250,7 +1250,7 @@ EXPORT_SYMBOL(v4l2_querymenu); * VIDIOC_LOG_STATUS helpers */ -int v4l2_ctrl_log_status(struct file *file, void *fh) +int v4l2_ctrl_log_status(struct file *file, void *priv) { struct video_device *vfd = video_devdata(file); diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c index 7678b8dbedbd..21acd9bc8607 100644 --- a/drivers/media/v4l2-core/v4l2-mem2mem.c +++ b/drivers/media/v4l2-core/v4l2-mem2mem.c @@ -1460,7 +1460,7 @@ int v4l2_m2m_ioctl_streamoff(struct file *file, void *priv, } EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_streamoff); -int v4l2_m2m_ioctl_try_encoder_cmd(struct file *file, void *fh, +int v4l2_m2m_ioctl_try_encoder_cmd(struct file *file, void *priv, struct v4l2_encoder_cmd *ec) { if (ec->cmd != V4L2_ENC_CMD_STOP && ec->cmd != V4L2_ENC_CMD_START) @@ -1471,7 +1471,7 @@ int v4l2_m2m_ioctl_try_encoder_cmd(struct file *file, void *fh, } EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_try_encoder_cmd); -int v4l2_m2m_ioctl_try_decoder_cmd(struct file *file, void *fh, +int v4l2_m2m_ioctl_try_decoder_cmd(struct file *file, void *priv, struct v4l2_decoder_cmd *dc) { if (dc->cmd != V4L2_DEC_CMD_STOP && dc->cmd != V4L2_DEC_CMD_START) @@ -1553,7 +1553,7 @@ int v4l2_m2m_ioctl_decoder_cmd(struct file *file, void *priv, } EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_decoder_cmd); -int v4l2_m2m_ioctl_stateless_try_decoder_cmd(struct file *file, void *fh, +int v4l2_m2m_ioctl_stateless_try_decoder_cmd(struct file *file, void *priv, struct v4l2_decoder_cmd *dc) { if (dc->cmd != V4L2_DEC_CMD_FLUSH) diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index c32c46286441..4a294a5c7bdd 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -1313,13 +1313,13 @@ void v4l2_ctrl_merge(const struct v4l2_event *old, struct v4l2_event *new); * v4l2_ctrl_log_status - helper function to implement %VIDIOC_LOG_STATUS ioctl * * @file: pointer to struct file - * @fh: unused. Kept just to be compatible to the arguments expected by + * @priv: unused. Kept just to be compatible to the arguments expected by * &struct v4l2_ioctl_ops.vidioc_log_status. * * Can be used as a vidioc_log_status function that just dumps all controls * associated with the filehandle. */ -int v4l2_ctrl_log_status(struct file *file, void *fh); +int v4l2_ctrl_log_status(struct file *file, void *priv); /** * v4l2_ctrl_subscribe_event - Subscribes to an event diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h index 0af330cf91c3..09c6164577cc 100644 --- a/include/media/v4l2-mem2mem.h +++ b/include/media/v4l2-mem2mem.h @@ -864,34 +864,34 @@ void v4l2_m2m_request_queue(struct media_request *req); /* v4l2 ioctl helpers */ int v4l2_m2m_ioctl_reqbufs(struct file *file, void *priv, - struct v4l2_requestbuffers *rb); -int v4l2_m2m_ioctl_create_bufs(struct file *file, void *fh, - struct v4l2_create_buffers *create); + struct v4l2_requestbuffers *rb); +int v4l2_m2m_ioctl_create_bufs(struct file *file, void *priv, + struct v4l2_create_buffers *create); int v4l2_m2m_ioctl_remove_bufs(struct file *file, void *priv, struct v4l2_remove_buffers *d); -int v4l2_m2m_ioctl_querybuf(struct file *file, void *fh, - struct v4l2_buffer *buf); -int v4l2_m2m_ioctl_expbuf(struct file *file, void *fh, - struct v4l2_exportbuffer *eb); -int v4l2_m2m_ioctl_qbuf(struct file *file, void *fh, - struct v4l2_buffer *buf); -int v4l2_m2m_ioctl_dqbuf(struct file *file, void *fh, - struct v4l2_buffer *buf); -int v4l2_m2m_ioctl_prepare_buf(struct file *file, void *fh, +int v4l2_m2m_ioctl_querybuf(struct file *file, void *priv, + struct v4l2_buffer *buf); +int v4l2_m2m_ioctl_expbuf(struct file *file, void *priv, + struct v4l2_exportbuffer *eb); +int v4l2_m2m_ioctl_qbuf(struct file *file, void *priv, + struct v4l2_buffer *buf); +int v4l2_m2m_ioctl_dqbuf(struct file *file, void *priv, + struct v4l2_buffer *buf); +int v4l2_m2m_ioctl_prepare_buf(struct file *file, void *priv, struct v4l2_buffer *buf); -int v4l2_m2m_ioctl_streamon(struct file *file, void *fh, - enum v4l2_buf_type type); -int v4l2_m2m_ioctl_streamoff(struct file *file, void *fh, - enum v4l2_buf_type type); -int v4l2_m2m_ioctl_encoder_cmd(struct file *file, void *fh, +int v4l2_m2m_ioctl_streamon(struct file *file, void *priv, + enum v4l2_buf_type type); +int v4l2_m2m_ioctl_streamoff(struct file *file, void *priv, + enum v4l2_buf_type type); +int v4l2_m2m_ioctl_encoder_cmd(struct file *file, void *priv, struct v4l2_encoder_cmd *ec); -int v4l2_m2m_ioctl_decoder_cmd(struct file *file, void *fh, +int v4l2_m2m_ioctl_decoder_cmd(struct file *file, void *priv, struct v4l2_decoder_cmd *dc); -int v4l2_m2m_ioctl_try_encoder_cmd(struct file *file, void *fh, +int v4l2_m2m_ioctl_try_encoder_cmd(struct file *file, void *priv, struct v4l2_encoder_cmd *ec); -int v4l2_m2m_ioctl_try_decoder_cmd(struct file *file, void *fh, +int v4l2_m2m_ioctl_try_decoder_cmd(struct file *file, void *priv, struct v4l2_decoder_cmd *dc); -int v4l2_m2m_ioctl_stateless_try_decoder_cmd(struct file *file, void *fh, +int v4l2_m2m_ioctl_stateless_try_decoder_cmd(struct file *file, void *priv, struct v4l2_decoder_cmd *dc); int v4l2_m2m_ioctl_stateless_decoder_cmd(struct file *file, void *priv, struct v4l2_decoder_cmd *dc); -- cgit v1.2.3 From 1fd143c24fb621f063f913cb1e48cc688c7eca15 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 May 2024 23:08:31 -0400 Subject: scsi: switch scsi_bios_ptable() and scsi_partsize() to gendisk Both helpers are reading the partition table of the disk specified by block_device of some partition on it; result depends only upon the disk in question, so we might as well pass the struct gendisk instead. Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: Al Viro --- Documentation/scsi/scsi_mid_low_api.rst | 4 ++-- drivers/scsi/BusLogic.c | 2 +- drivers/scsi/aacraid/linit.c | 2 +- drivers/scsi/aic7xxx/aic79xx_osm.c | 2 +- drivers/scsi/aic7xxx/aic7xxx_osm.c | 2 +- drivers/scsi/arcmsr/arcmsr_hba.c | 2 +- drivers/scsi/fdomain.c | 2 +- drivers/scsi/megaraid.c | 2 +- drivers/scsi/scsicam.c | 12 ++++++------ include/scsi/scsicam.h | 5 +++-- 10 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/Documentation/scsi/scsi_mid_low_api.rst b/Documentation/scsi/scsi_mid_low_api.rst index 3ac4c7fafb55..96c8230d638e 100644 --- a/Documentation/scsi/scsi_mid_low_api.rst +++ b/Documentation/scsi/scsi_mid_low_api.rst @@ -380,7 +380,7 @@ Details:: /** * scsi_bios_ptable - return copy of block device's partition table - * @dev: pointer to block device + * @dev: pointer to gendisk * * Returns pointer to partition table, or NULL for failure * @@ -390,7 +390,7 @@ Details:: * * Defined in: drivers/scsi/scsicam.c **/ - unsigned char *scsi_bios_ptable(struct block_device *dev) + unsigned char *scsi_bios_ptable(struct gendisk *dev) /** diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 1f100270cd38..743be2ef6d1a 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -3261,7 +3261,7 @@ static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, diskparam->sectors = 32; } diskparam->cylinders = (unsigned long) capacity / (diskparam->heads * diskparam->sectors); - buf = scsi_bios_ptable(dev); + buf = scsi_bios_ptable(dev->bd_disk); if (buf == NULL) return 0; /* diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 4b12e6dd8f07..2264a97d91a0 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -324,7 +324,7 @@ static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, * entry whose end_head matches one of the standard geometry * translations ( 64/32, 128/32, 255/63 ). */ - buf = scsi_bios_ptable(bdev); + buf = scsi_bios_ptable(bdev->bd_disk); if (!buf) return 0; if (*(__le16 *)(buf + 0x40) == cpu_to_le16(MSDOS_LABEL_MAGIC)) { diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 17dfc3c72110..2cff19e95fec 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -731,7 +731,7 @@ ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahd = *((struct ahd_softc **)sdev->host->hostdata); - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(bdev->bd_disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c index cebf8c5d0caf..05bdb73d1157 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -696,7 +696,7 @@ ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahc = *((struct ahc_softc **)sdev->host->hostdata); channel = sdev_channel(sdev); - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(bdev->bd_disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index fb57343a97bd..6968da9fb67c 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -381,7 +381,7 @@ static int arcmsr_bios_param(struct scsi_device *sdev, { int heads, sectors, cylinders, total_capacity; - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(bdev->bd_disk, capacity, geom)) return 0; total_capacity = capacity; diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c index 504c4e0c5d17..4a3716dc644c 100644 --- a/drivers/scsi/fdomain.c +++ b/drivers/scsi/fdomain.c @@ -472,7 +472,7 @@ static int fdomain_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) { - unsigned char *p = scsi_bios_ptable(bdev); + unsigned char *p = scsi_bios_ptable(bdev->bd_disk); if (p && p[65] == 0xaa && p[64] == 0x55 /* Partition table valid */ && p[4]) { /* Partition type */ diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 2006094af418..c7581c7829af 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -2813,7 +2813,7 @@ megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, geom[2] = cylinders; } else { - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(bdev->bd_disk, capacity, geom)) return 0; dev_info(&adapter->dev->dev, diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 19e6c3852d50..3ff2cf51d5b0 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -30,9 +30,9 @@ * starting at offset %0x1be. * Returns: partition table in kmalloc(GFP_KERNEL) memory, or NULL on error. */ -unsigned char *scsi_bios_ptable(struct block_device *dev) +unsigned char *scsi_bios_ptable(struct gendisk *dev) { - struct address_space *mapping = bdev_whole(dev)->bd_mapping; + struct address_space *mapping = dev->part0->bd_mapping; unsigned char *res = NULL; struct folio *folio; @@ -48,7 +48,7 @@ EXPORT_SYMBOL(scsi_bios_ptable); /** * scsi_partsize - Parse cylinders/heads/sectors from PC partition table - * @bdev: block device to parse + * @disk: gendisk of the disk to parse * @capacity: size of the disk in sectors * @geom: output in form of [hds, cylinders, sectors] * @@ -57,7 +57,7 @@ EXPORT_SYMBOL(scsi_bios_ptable); * * Returns: %false on failure, %true on success. */ -bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]) +bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]) { int cyl, ext_cyl, end_head, end_cyl, end_sector; unsigned int logical_end, physical_end, ext_physical_end; @@ -65,7 +65,7 @@ bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]) void *buf; int ret = false; - buf = scsi_bios_ptable(bdev); + buf = scsi_bios_ptable(disk); if (!buf) return false; @@ -221,7 +221,7 @@ int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip) int ret = 0; /* try to infer mapping from partition table */ - if (scsi_partsize(bdev, capacity, ip)) + if (scsi_partsize(bdev->bd_disk, capacity, ip)) return 0; if (capacity64 < (1ULL << 32)) { diff --git a/include/scsi/scsicam.h b/include/scsi/scsicam.h index 08edd603e521..67f4e8835bc8 100644 --- a/include/scsi/scsicam.h +++ b/include/scsi/scsicam.h @@ -13,7 +13,8 @@ #ifndef SCSICAM_H #define SCSICAM_H +struct gendisk; int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip); -bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]); -unsigned char *scsi_bios_ptable(struct block_device *bdev); +bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]); +unsigned char *scsi_bios_ptable(struct gendisk *disk); #endif /* def SCSICAM_H */ -- cgit v1.2.3 From 3eb50369c09efb0f668a7f568a7e6f7cf4194cde Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 May 2024 23:22:01 -0400 Subject: scsi: switch ->bios_param() to passing gendisk Instances are passed struct block_device *bdev argument; the only thing it is used for (if it's used in the first place) is bdev->bd_disk. Might as well pass that in the first place... Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: Al Viro --- Documentation/scsi/scsi_mid_low_api.rst | 4 ++-- drivers/ata/libata-scsi.c | 4 ++-- drivers/message/fusion/mptscsih.c | 2 +- drivers/message/fusion/mptscsih.h | 2 +- drivers/scsi/3w-9xxx.c | 2 +- drivers/scsi/3w-sas.c | 2 +- drivers/scsi/3w-xxxx.c | 2 +- drivers/scsi/BusLogic.c | 4 ++-- drivers/scsi/BusLogic.h | 2 +- drivers/scsi/aacraid/linit.c | 6 +++--- drivers/scsi/advansys.c | 2 +- drivers/scsi/aha152x.c | 4 ++-- drivers/scsi/aha1542.c | 2 +- drivers/scsi/aha1740.c | 2 +- drivers/scsi/aic7xxx/aic79xx_osm.c | 4 ++-- drivers/scsi/aic7xxx/aic7xxx_osm.c | 4 ++-- drivers/scsi/arcmsr/arcmsr_hba.c | 6 +++--- drivers/scsi/atp870u.c | 2 +- drivers/scsi/fdomain.c | 4 ++-- drivers/scsi/imm.c | 2 +- drivers/scsi/initio.c | 4 ++-- drivers/scsi/ipr.c | 8 ++++---- drivers/scsi/ips.c | 2 +- drivers/scsi/ips.h | 2 +- drivers/scsi/libsas/sas_scsi_host.c | 2 +- drivers/scsi/megaraid.c | 4 ++-- drivers/scsi/megaraid.h | 2 +- drivers/scsi/megaraid/megaraid_sas_base.c | 4 ++-- drivers/scsi/mpi3mr/mpi3mr_os.c | 4 ++-- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 4 ++-- drivers/scsi/mvumi.c | 2 +- drivers/scsi/myrb.c | 2 +- drivers/scsi/pcmcia/sym53c500_cs.c | 2 +- drivers/scsi/ppa.c | 2 +- drivers/scsi/qla1280.c | 2 +- drivers/scsi/qlogicfas408.c | 2 +- drivers/scsi/qlogicfas408.h | 2 +- drivers/scsi/scsicam.c | 6 +++--- drivers/scsi/sd.c | 4 ++-- drivers/scsi/stex.c | 2 +- drivers/scsi/storvsc_drv.c | 2 +- drivers/scsi/wd719x.c | 2 +- include/linux/libata.h | 2 +- include/scsi/libsas.h | 2 +- include/scsi/scsi_host.h | 2 +- include/scsi/scsicam.h | 2 +- 46 files changed, 68 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/Documentation/scsi/scsi_mid_low_api.rst b/Documentation/scsi/scsi_mid_low_api.rst index 96c8230d638e..634f5c28a849 100644 --- a/Documentation/scsi/scsi_mid_low_api.rst +++ b/Documentation/scsi/scsi_mid_low_api.rst @@ -623,7 +623,7 @@ Details:: * bios_param - fetch head, sector, cylinder info for a disk * @sdev: pointer to scsi device context (defined in * include/scsi/scsi_device.h) - * @bdev: pointer to block device context (defined in fs.h) + * @disk: pointer to gendisk (defined in blkdev.h) * @capacity: device size (in 512 byte sectors) * @params: three element array to place output: * params[0] number of heads (max 255) @@ -643,7 +643,7 @@ Details:: * * Optionally defined in: LLD **/ - int bios_param(struct scsi_device * sdev, struct block_device *bdev, + int bios_param(struct scsi_device * sdev, struct gendisk *disk, sector_t capacity, int params[3]) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 57f674f51b0c..3603b430724b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -351,7 +351,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups); /** * ata_std_bios_param - generic bios head/sector/cylinder calculator used by sd. * @sdev: SCSI device for which BIOS geometry is to be determined - * @bdev: block device associated with @sdev + * @unused: gendisk associated with @sdev * @capacity: capacity of SCSI device * @geom: location to which geometry will be output * @@ -366,7 +366,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups); * RETURNS: * Zero. */ -int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, +int ata_std_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { geom[0] = 255; diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c index 3a64dc7a7e27..3304f8824cf7 100644 --- a/drivers/message/fusion/mptscsih.c +++ b/drivers/message/fusion/mptscsih.c @@ -2074,7 +2074,7 @@ mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, * This is anyones guess quite frankly. */ int -mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev, +mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads; diff --git a/drivers/message/fusion/mptscsih.h b/drivers/message/fusion/mptscsih.h index 8c2bb2331fc1..f9678d48100c 100644 --- a/drivers/message/fusion/mptscsih.h +++ b/drivers/message/fusion/mptscsih.h @@ -123,7 +123,7 @@ extern int mptscsih_abort(struct scsi_cmnd * SCpnt); extern int mptscsih_dev_reset(struct scsi_cmnd * SCpnt); extern int mptscsih_bus_reset(struct scsi_cmnd * SCpnt); extern int mptscsih_host_reset(struct scsi_cmnd *SCpnt); -extern int mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev, sector_t capacity, int geom[]); +extern int mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused, sector_t capacity, int geom[]); extern int mptscsih_io_done(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); extern int mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); extern int mptscsih_scandv_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 883d4a12a172..a377a6f6900a 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -1695,7 +1695,7 @@ out: } /* End twa_reset_sequence() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int twa_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) +static int twa_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index 8d4174c7107e..e319be7d369c 100644 --- a/drivers/scsi/3w-sas.c +++ b/drivers/scsi/3w-sas.c @@ -1404,7 +1404,7 @@ out: } /* End twl_reset_device_extension() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int twl_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) +static int twl_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors; diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 89bd56f78ef9..0306a228c702 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -1340,7 +1340,7 @@ static int tw_reset_device_extension(TW_Device_Extension *tw_dev) } /* End tw_reset_device_extension() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int tw_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int tw_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 743be2ef6d1a..8b17d8103e90 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -3240,7 +3240,7 @@ static int blogic_resetadapter(struct blogic_adapter *adapter, bool hard_reset) the BIOS, and a warning may be displayed. */ -static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, +static int blogic_diskparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *params) { struct blogic_adapter *adapter = @@ -3261,7 +3261,7 @@ static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, diskparam->sectors = 32; } diskparam->cylinders = (unsigned long) capacity / (diskparam->heads * diskparam->sectors); - buf = scsi_bios_ptable(dev->bd_disk); + buf = scsi_bios_ptable(disk); if (buf == NULL) return 0; /* diff --git a/drivers/scsi/BusLogic.h b/drivers/scsi/BusLogic.h index 61bf26d4fc10..79de815e33b0 100644 --- a/drivers/scsi/BusLogic.h +++ b/drivers/scsi/BusLogic.h @@ -1273,7 +1273,7 @@ static inline void blogic_incszbucket(unsigned int *cmdsz_buckets, static const char *blogic_drvr_info(struct Scsi_Host *); static int blogic_qcmd(struct Scsi_Host *h, struct scsi_cmnd *); -static int blogic_diskparam(struct scsi_device *, struct block_device *, sector_t, int *); +static int blogic_diskparam(struct scsi_device *, struct gendisk *, sector_t, int *); static int blogic_sdev_configure(struct scsi_device *, struct queue_limits *lim); static void blogic_qcompleted_ccb(struct blogic_ccb *); diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 2264a97d91a0..ea66196ef7c7 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -273,7 +273,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype) /** * aac_biosparm - return BIOS parameters for disk * @sdev: The scsi device corresponding to the disk - * @bdev: the block device corresponding to the disk + * @disk: the gendisk corresponding to the disk * @capacity: the sector capacity of the disk * @geom: geometry block to fill in * @@ -292,7 +292,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype) * be displayed. */ -static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, +static int aac_biosparm(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *geom) { struct diskparm *param = (struct diskparm *)geom; @@ -324,7 +324,7 @@ static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, * entry whose end_head matches one of the standard geometry * translations ( 64/32, 128/32, 255/63 ). */ - buf = scsi_bios_ptable(bdev->bd_disk); + buf = scsi_bios_ptable(disk); if (!buf) return 0; if (*(__le16 *)(buf + 0x40) == cpu_to_le16(MSDOS_LABEL_MAGIC)) { diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c index 3a2c336307c0..063e1b5818d3 100644 --- a/drivers/scsi/advansys.c +++ b/drivers/scsi/advansys.c @@ -7096,7 +7096,7 @@ static int advansys_reset(struct scsi_cmnd *scp) * ip[2]: cylinders */ static int -advansys_biosparam(struct scsi_device *sdev, struct block_device *bdev, +advansys_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { struct asc_board *boardp = shost_priv(sdev->host); diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c index e94c0a19c435..182aa80ec4c6 100644 --- a/drivers/scsi/aha152x.c +++ b/drivers/scsi/aha152x.c @@ -1246,7 +1246,7 @@ int aha152x_host_reset_host(struct Scsi_Host *shpnt) * Return the "logical geometry" * */ -static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int aha152x_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *info_array) { struct Scsi_Host *shpnt = sdev->host; @@ -1261,7 +1261,7 @@ static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev int info[3]; /* try to figure out the geometry from the partition table */ - if (scsicam_bios_param(bdev, capacity, info) < 0 || + if (scsicam_bios_param(disk, capacity, info) < 0 || !((info[0] == 64 && info[1] == 32) || (info[0] == 255 && info[1] == 63))) { if (EXT_TRANS) { printk(KERN_NOTICE diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c index 389499d3e00a..371e8300f029 100644 --- a/drivers/scsi/aha1542.c +++ b/drivers/scsi/aha1542.c @@ -992,7 +992,7 @@ static int aha1542_host_reset(struct scsi_cmnd *cmd) } static int aha1542_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int geom[]) + struct gendisk *unused, sector_t capacity, int geom[]) { struct aha1542_hostdata *aha1542 = shost_priv(sdev->host); diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c index be7ebbbb9ba8..b234621f6b37 100644 --- a/drivers/scsi/aha1740.c +++ b/drivers/scsi/aha1740.c @@ -510,7 +510,7 @@ static void aha1740_getconfig(unsigned int base, unsigned int *irq_level, } static int aha1740_biosparam(struct scsi_device *sdev, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int* ip) { int size = capacity; diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 2cff19e95fec..c3d1b9dd24ae 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -720,7 +720,7 @@ ahd_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * Return the disk geometry for the given SCSI device. */ static int -ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, +ahd_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { int heads; @@ -731,7 +731,7 @@ ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahd = *((struct ahd_softc **)sdev->host->hostdata); - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c index 05bdb73d1157..8b2b98666d61 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -683,7 +683,7 @@ ahc_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * Return the disk geometry for the given SCSI device. */ static int -ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, +ahc_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { int heads; @@ -696,7 +696,7 @@ ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahc = *((struct ahc_softc **)sdev->host->hostdata); channel = sdev_channel(sdev); - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 6968da9fb67c..f0c5a30ce51b 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -112,7 +112,7 @@ static int arcmsr_iop_confirm(struct AdapterControlBlock *acb); static int arcmsr_abort(struct scsi_cmnd *); static int arcmsr_bus_reset(struct scsi_cmnd *); static int arcmsr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int *info); + struct gendisk *disk, sector_t capacity, int *info); static int arcmsr_queue_command(struct Scsi_Host *h, struct scsi_cmnd *cmd); static int arcmsr_probe(struct pci_dev *pdev, const struct pci_device_id *id); @@ -377,11 +377,11 @@ static irqreturn_t arcmsr_do_interrupt(int irq, void *dev_id) } static int arcmsr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int *geom) + struct gendisk *disk, sector_t capacity, int *geom) { int heads, sectors, cylinders, total_capacity; - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; total_capacity = capacity; diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c index 401242912855..df6f40b51deb 100644 --- a/drivers/scsi/atp870u.c +++ b/drivers/scsi/atp870u.c @@ -1692,7 +1692,7 @@ static int atp870u_show_info(struct seq_file *m, struct Scsi_Host *HBAptr) } -static int atp870u_biosparam(struct scsi_device *disk, struct block_device *dev, +static int atp870u_biosparam(struct scsi_device *disk, struct gendisk *unused, sector_t capacity, int *ip) { int heads, sectors, cylinders; diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c index 4a3716dc644c..c0b2a980db34 100644 --- a/drivers/scsi/fdomain.c +++ b/drivers/scsi/fdomain.c @@ -469,10 +469,10 @@ static int fdomain_host_reset(struct scsi_cmnd *cmd) } static int fdomain_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, + struct gendisk *disk, sector_t capacity, int geom[]) { - unsigned char *p = scsi_bios_ptable(bdev->bd_disk); + unsigned char *p = scsi_bios_ptable(disk); if (p && p[65] == 0xaa && p[64] == 0x55 /* Partition table valid */ && p[4]) { /* Partition type */ diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c index 0821cf994b98..5c602c057798 100644 --- a/drivers/scsi/imm.c +++ b/drivers/scsi/imm.c @@ -954,7 +954,7 @@ static DEF_SCSI_QCMD(imm_queuecommand) * be done in sd.c. Even if it gets fixed there, this will still * work. */ -static int imm_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int imm_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { ip[0] = 0x40; diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c index 8648bd965287..ed34ad92c807 100644 --- a/drivers/scsi/initio.c +++ b/drivers/scsi/initio.c @@ -2645,7 +2645,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd) /** * i91u_biosparam - return the "logical geometry * @sdev: SCSI device - * @dev: Matching block device + * @unused: Matching gendisk * @capacity: Sector size of drive * @info_array: Return space for BIOS geometry * @@ -2655,7 +2655,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd) * FIXME: limited to 2^32 sector devices. */ -static int i91u_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int i91u_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int *info_array) { struct initio_host *host; /* Point to Host adapter control block */ diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index d06b79f03538..dd6754db7e4c 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -4644,10 +4644,10 @@ ATTRIBUTE_GROUPS(ipr_dev); /** * ipr_biosparam - Return the HSC mapping - * @sdev: scsi device struct - * @block_device: block device pointer + * @sdev: scsi device struct + * @unused: gendisk pointer * @capacity: capacity of the device - * @parm: Array containing returned HSC values. + * @parm: Array containing returned HSC values. * * This function generates the HSC parms that fdisk uses. * We want to make sure we return something that places partitions @@ -4657,7 +4657,7 @@ ATTRIBUTE_GROUPS(ipr_dev); * 0 on success **/ static int ipr_biosparam(struct scsi_device *sdev, - struct block_device *block_device, + struct gendisk *unused, sector_t capacity, int *parm) { int heads, sectors; diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 94adb6ac02a4..3393a288fd23 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -1123,7 +1123,7 @@ static DEF_SCSI_QCMD(ips_queue) /* Set bios geometry for the controller */ /* */ /****************************************************************************/ -static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { ips_ha_t *ha = (ips_ha_t *) sdev->host->hostdata; diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index 8ac932ec4444..30a4d4a580e9 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -398,7 +398,7 @@ /* * Scsi_Host Template */ - static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev, + static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]); static int ips_sdev_configure(struct scsi_device *SDptr, struct queue_limits *lim); diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 928723c90b75..ffa5b49aaf08 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -845,7 +845,7 @@ int sas_change_queue_depth(struct scsi_device *sdev, int depth) EXPORT_SYMBOL_GPL(sas_change_queue_depth); int sas_bios_param(struct scsi_device *scsi_dev, - struct block_device *bdev, + struct gendisk *unused, sector_t capacity, int *hsc) { hsc[0] = 255; diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index c7581c7829af..a00622c0c526 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -2780,7 +2780,7 @@ static inline void mega_create_proc_entry(int index, struct proc_dir_entry *pare * Return the disk geometry for a particular disk */ static int -megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, +megaraid_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { adapter_t *adapter; @@ -2813,7 +2813,7 @@ megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, geom[2] = cylinders; } else { - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; dev_info(&adapter->dev->dev, diff --git a/drivers/scsi/megaraid.h b/drivers/scsi/megaraid.h index 013fbfb911b9..d6bfd26a8843 100644 --- a/drivers/scsi/megaraid.h +++ b/drivers/scsi/megaraid.h @@ -975,7 +975,7 @@ static void mega_free_scb(adapter_t *, scb_t *); static int megaraid_abort(struct scsi_cmnd *); static int megaraid_reset(struct scsi_cmnd *); static int megaraid_abort_and_reset(adapter_t *, struct scsi_cmnd *, int); -static int megaraid_biosparam(struct scsi_device *, struct block_device *, +static int megaraid_biosparam(struct scsi_device *, struct gendisk *, sector_t, int []); static int mega_build_sglist (adapter_t *adapter, scb_t *scb, diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 615e06fd4ee8..abbbc4b36cd1 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -3137,12 +3137,12 @@ static int megasas_reset_target(struct scsi_cmnd *scmd) /** * megasas_bios_param - Returns disk geometry for a disk * @sdev: device handle - * @bdev: block device + * @unused: gendisk * @capacity: drive capacity * @geom: geometry parameters */ static int -megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev, +megasas_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads; diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index e467b56949e9..3df52a3b435b 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -4031,7 +4031,7 @@ out: /** * mpi3mr_bios_param - BIOS param callback * @sdev: SCSI device reference - * @bdev: Block device reference + * @unused: gendisk reference * @capacity: Capacity in logical sectors * @params: Parameter array * @@ -4040,7 +4040,7 @@ out: * Return: 0 always */ static int mpi3mr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int params[]) + struct gendisk *unused, sector_t capacity, int params[]) { int heads; int sectors; diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 967af259118e..7092d0debef3 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -2754,7 +2754,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) /** * scsih_bios_param - fetch head, sector, cylinder info for a disk * @sdev: scsi device struct - * @bdev: pointer to block device context + * @unused: pointer to gendisk * @capacity: device size (in 512 byte sectors) * @params: three element array to place output: * params[0] number of heads (max 255) @@ -2762,7 +2762,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * params[2] number of cylinders */ static int -scsih_bios_param(struct scsi_device *sdev, struct block_device *bdev, +scsih_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int params[]) { int heads; diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c index 96549e7f5705..bdc2f2f17753 100644 --- a/drivers/scsi/mvumi.c +++ b/drivers/scsi/mvumi.c @@ -2142,7 +2142,7 @@ static enum scsi_timeout_action mvumi_timed_out(struct scsi_cmnd *scmd) } static int -mvumi_bios_param(struct scsi_device *sdev, struct block_device *bdev, +mvumi_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors; diff --git a/drivers/scsi/myrb.c b/drivers/scsi/myrb.c index 486db5b2f05d..b8453c0333dc 100644 --- a/drivers/scsi/myrb.c +++ b/drivers/scsi/myrb.c @@ -1745,7 +1745,7 @@ static void myrb_sdev_destroy(struct scsi_device *sdev) kfree(sdev->hostdata); } -static int myrb_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int myrb_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { struct myrb_hba *cb = shost_priv(sdev->host); diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c index 278c78d066c4..a3b505240351 100644 --- a/drivers/scsi/pcmcia/sym53c500_cs.c +++ b/drivers/scsi/pcmcia/sym53c500_cs.c @@ -597,7 +597,7 @@ SYM53C500_host_reset(struct scsi_cmnd *SCpnt) static int SYM53C500_biosparm(struct scsi_device *disk, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int *info_array) { int size; diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c index 1ed3171f1797..ea682f3044b6 100644 --- a/drivers/scsi/ppa.c +++ b/drivers/scsi/ppa.c @@ -845,7 +845,7 @@ static DEF_SCSI_QCMD(ppa_queuecommand) * be done in sd.c. Even if it gets fixed there, this will still * work. */ -static int ppa_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int ppa_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { ip[0] = 0x40; diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 6af018f1ca22..ef841f643171 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -1023,7 +1023,7 @@ qla1280_eh_adapter_reset(struct scsi_cmnd *cmd) } static int -qla1280_biosparam(struct scsi_device *sdev, struct block_device *bdev, +qla1280_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/qlogicfas408.c b/drivers/scsi/qlogicfas408.c index 3e065d5fc80c..1ce469b7db99 100644 --- a/drivers/scsi/qlogicfas408.c +++ b/drivers/scsi/qlogicfas408.c @@ -492,7 +492,7 @@ DEF_SCSI_QCMD(qlogicfas408_queuecommand) * Return bios parameters */ -int qlogicfas408_biosparam(struct scsi_device *disk, struct block_device *dev, +int qlogicfas408_biosparam(struct scsi_device *disk, struct gendisk *unused, sector_t capacity, int ip[]) { /* This should mimic the DOS Qlogic driver's behavior exactly */ diff --git a/drivers/scsi/qlogicfas408.h b/drivers/scsi/qlogicfas408.h index a971db11d293..83ef86c71f2f 100644 --- a/drivers/scsi/qlogicfas408.h +++ b/drivers/scsi/qlogicfas408.h @@ -106,7 +106,7 @@ struct qlogicfas408_priv { irqreturn_t qlogicfas408_ihandl(int irq, void *dev_id); int qlogicfas408_queuecommand(struct Scsi_Host *h, struct scsi_cmnd * cmd); int qlogicfas408_biosparam(struct scsi_device * disk, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int ip[]); int qlogicfas408_abort(struct scsi_cmnd * cmd); extern int qlogicfas408_host_reset(struct scsi_cmnd *cmd); diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 3ff2cf51d5b0..887de505bcf9 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -205,7 +205,7 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds /** * scsicam_bios_param - Determine geometry of a disk in cylinders/heads/sectors. - * @bdev: which device + * @disk: which device * @capacity: size of the disk in sectors * @ip: return value: ip[0]=heads, ip[1]=sectors, ip[2]=cylinders * @@ -215,13 +215,13 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds * * Returns : -1 on failure, 0 on success. */ -int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip) +int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip) { u64 capacity64 = capacity; /* Suppress gcc warning */ int ret = 0; /* try to infer mapping from partition table */ - if (scsi_partsize(bdev->bd_disk, capacity, ip)) + if (scsi_partsize(disk, capacity, ip)) return 0; if (capacity64 < (1ULL << 32)) { diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 5b8668accf8e..3edcc43f180b 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1614,9 +1614,9 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) - host->hostt->bios_param(sdp, bdev, capacity, diskinfo); + host->hostt->bios_param(sdp, bdev->bd_disk, capacity, diskinfo); else - scsicam_bios_param(bdev, capacity, diskinfo); + scsicam_bios_param(bdev->bd_disk, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c index 63ed7f9aaa93..d8ad02c29320 100644 --- a/drivers/scsi/stex.c +++ b/drivers/scsi/stex.c @@ -1457,7 +1457,7 @@ static void stex_reset_work(struct work_struct *work) } static int stex_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int geom[]) + struct gendisk *unused, sector_t capacity, int geom[]) { int heads = 255, sectors = 63; diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index d9e59204a9c3..dc51ea352198 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1615,7 +1615,7 @@ static int storvsc_sdev_configure(struct scsi_device *sdevice, return 0; } -static int storvsc_get_chs(struct scsi_device *sdev, struct block_device * bdev, +static int storvsc_get_chs(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int *info) { sector_t nsect = capacity; diff --git a/drivers/scsi/wd719x.c b/drivers/scsi/wd719x.c index 5a380eecfc75..0c9987828774 100644 --- a/drivers/scsi/wd719x.c +++ b/drivers/scsi/wd719x.c @@ -544,7 +544,7 @@ static int wd719x_host_reset(struct scsi_cmnd *cmd) return wd719x_chip_init(wd) == 0 ? SUCCESS : FAILED; } -static int wd719x_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int wd719x_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { if (capacity >= 0x200000) { diff --git a/include/linux/libata.h b/include/linux/libata.h index 0620dd67369f..21de0935775d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1203,7 +1203,7 @@ extern void ata_qc_complete(struct ata_queued_cmd *qc); extern u64 ata_qc_get_active(struct ata_port *ap); extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); extern int ata_std_bios_param(struct scsi_device *sdev, - struct block_device *bdev, + struct gendisk *unused, sector_t capacity, int geom[]); extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev); extern int ata_scsi_sdev_init(struct scsi_device *sdev); diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index ba460b6c0374..9c6e90829dbd 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -685,7 +685,7 @@ extern int sas_queuecommand(struct Scsi_Host *, struct scsi_cmnd *); extern int sas_target_alloc(struct scsi_target *); int sas_sdev_configure(struct scsi_device *dev, struct queue_limits *lim); extern int sas_change_queue_depth(struct scsi_device *, int new_depth); -extern int sas_bios_param(struct scsi_device *, struct block_device *, +extern int sas_bios_param(struct scsi_device *, struct gendisk *, sector_t capacity, int *hsc); int sas_execute_internal_abort_single(struct domain_device *device, u16 tag, unsigned int qid, diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index c53812b9026f..f5a243261236 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -318,7 +318,7 @@ struct scsi_host_template { * * Status: OPTIONAL */ - int (* bios_param)(struct scsi_device *, struct block_device *, + int (* bios_param)(struct scsi_device *, struct gendisk *, sector_t, int []); /* diff --git a/include/scsi/scsicam.h b/include/scsi/scsicam.h index 67f4e8835bc8..1131f51ed2c8 100644 --- a/include/scsi/scsicam.h +++ b/include/scsi/scsicam.h @@ -14,7 +14,7 @@ #ifndef SCSICAM_H #define SCSICAM_H struct gendisk; -int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip); +int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip); bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]); unsigned char *scsi_bios_ptable(struct gendisk *disk); #endif /* def SCSICAM_H */ -- cgit v1.2.3 From 4fc8728aa34f54835b72e4db0f3db76a72948b65 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 May 2024 22:19:55 -0400 Subject: block: switch ->getgeo() to struct gendisk Instances are happier that way and it makes more sense anyway - the only part of the result that is related to partition we are given is the start sector, and that has been filled in by the caller. Everything else is a function of the disk. Only one instance (DASD) is ever looking at anything other than bdev->bd_disk and that one is trivial to adjust. Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: Al Viro --- Documentation/filesystems/locking.rst | 2 +- arch/m68k/emu/nfblock.c | 4 ++-- arch/um/drivers/ubd_kern.c | 6 +++--- block/ioctl.c | 4 ++-- block/partitions/ibm.c | 2 +- drivers/block/amiflop.c | 10 +++++----- drivers/block/aoe/aoeblk.c | 4 ++-- drivers/block/floppy.c | 4 ++-- drivers/block/mtip32xx/mtip32xx.c | 6 +++--- drivers/block/rnbd/rnbd-clt.c | 4 ++-- drivers/block/sunvdc.c | 3 +-- drivers/block/swim.c | 4 ++-- drivers/block/virtio_blk.c | 6 +++--- drivers/block/xen-blkfront.c | 4 ++-- drivers/md/dm.c | 4 ++-- drivers/md/md.c | 4 ++-- drivers/memstick/core/ms_block.c | 4 ++-- drivers/memstick/core/mspro_block.c | 4 ++-- drivers/mmc/core/block.c | 4 ++-- drivers/mtd/mtd_blkdevs.c | 4 ++-- drivers/mtd/ubi/block.c | 4 ++-- drivers/nvdimm/btt.c | 4 ++-- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/nvme.h | 2 +- drivers/s390/block/dasd.c | 7 ++++--- drivers/scsi/sd.c | 8 ++++---- include/linux/blkdev.h | 2 +- 27 files changed, 59 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index aa287ccdac2f..77704fde9845 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -443,7 +443,7 @@ prototypes:: int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); void (*unlock_native_capacity) (struct gendisk *); - int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*getgeo)(struct gendisk *, struct hd_geometry *); void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 83410f8184ec..94a4fadc651a 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -77,9 +77,9 @@ static void nfhd_submit_bio(struct bio *bio) bio_endio(bio); } -static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int nfhd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct nfhd_device *dev = bdev->bd_disk->private_data; + struct nfhd_device *dev = disk->private_data; geo->cylinders = dev->blocks >> (6 - dev->bshift); geo->heads = 4; diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 4de6613e7468..f2b2feeeb455 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -108,7 +108,7 @@ static DEFINE_MUTEX(ubd_lock); static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); -static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); +static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo); #define MAX_DEV (16) @@ -1324,9 +1324,9 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, return res; } -static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct ubd *ubd_dev = bdev->bd_disk->private_data; + struct ubd *ubd_dev = disk->private_data; geo->heads = 128; geo->sectors = 32; diff --git a/block/ioctl.c b/block/ioctl.c index f7b0006ca45d..a14304f737c5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -481,7 +481,7 @@ static int blkdev_getgeo(struct block_device *bdev, */ memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); + ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; if (copy_to_user(argp, &geo, sizeof(geo))) @@ -515,7 +515,7 @@ static int compat_hdio_getgeo(struct block_device *bdev, * want to override it. */ geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); + ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 82d9c4c3fb41..631291fbb356 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -358,7 +358,7 @@ int ibm_partition(struct parsed_partitions *state) goto out_nolab; /* set start if not filled by getgeo function e.g. virtblk */ geo->start = get_start_sect(bdev); - if (disk->fops->getgeo(bdev, geo)) + if (disk->fops->getgeo(disk, geo)) goto out_freeall; if (!fn || fn(disk, info)) { kfree(info); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 6357d86eafdc..2932b6653b6f 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1523,13 +1523,13 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - int drive = MINOR(bdev->bd_dev) & 3; + struct amiga_floppy_struct *p = disk->private_data; - geo->heads = unit[drive].type->heads; - geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult; - geo->cylinders = unit[drive].type->tracks; + geo->heads = p->type->heads; + geo->sectors = p->dtype->sects * p->type->sect_mult; + geo->cylinders = p->type->tracks; return 0; } diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 00b74a845328..34ead75e7e02 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -269,9 +269,9 @@ static blk_status_t aoeblk_queue_rq(struct blk_mq_hw_ctx *hctx, } static int -aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +aoeblk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct aoedev *d = bdev->bd_disk->private_data; + struct aoedev *d = disk->private_data; if ((d->flags & DEVFL_UP) == 0) { printk(KERN_ERR "aoe: disk not up\n"); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 24be0c2c4075..eb125a36ec24 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3363,9 +3363,9 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g) return 0; } -static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - int drive = (long)bdev->bd_disk->private_data; + int drive = (long)disk->private_data; int type = ITYPE(drive_state[drive].fd_device); struct floppy_struct *g; int ret; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 8fc7761397bd..567192e371a8 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3148,17 +3148,17 @@ static int mtip_block_compat_ioctl(struct block_device *dev, * that each partition is also 4KB aligned. Non-aligned partitions adversely * affects performance. * - * @dev Pointer to the block_device strucutre. + * @disk Pointer to the gendisk strucutre. * @geo Pointer to a hd_geometry structure. * * return value * 0 Operation completed successfully. * -ENOTTY An error occurred while reading the drive capacity. */ -static int mtip_block_getgeo(struct block_device *dev, +static int mtip_block_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct driver_data *dd = dev->bd_disk->private_data; + struct driver_data *dd = disk->private_data; sector_t capacity; if (!dd) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 15627417f12e..119b7e27023f 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -942,11 +942,11 @@ static void rnbd_client_release(struct gendisk *gen) rnbd_clt_put_dev(dev); } -static int rnbd_client_getgeo(struct block_device *block_device, +static int rnbd_client_getgeo(struct gendisk *disk, struct hd_geometry *geo) { u64 size; - struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; + struct rnbd_clt_dev *dev = disk->private_data; struct queue_limits *limit = &dev->queue->limits; size = dev->size * (limit->logical_block_size / SECTOR_SIZE); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 7af21fe67671..107751721178 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -119,9 +119,8 @@ static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr) return vio_dring_avail(dr, VDC_TX_RING_SIZE); } -static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int vdc_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct gendisk *disk = bdev->bd_disk; sector_t nsect = get_capacity(disk); sector_t cylinders = nsect; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index eda33c5eb5e2..416015947ae6 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -711,9 +711,9 @@ static int floppy_ioctl(struct block_device *bdev, blk_mode_t mode, return -ENOTTY; } -static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int floppy_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct floppy_state *fs = bdev->bd_disk->private_data; + struct floppy_state *fs = disk->private_data; struct floppy_struct *g; int ret; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index e649fa67bac1..85efa7e535f8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -829,9 +829,9 @@ out: } /* We provide getgeo only to please some old bootloader/partitioning tools */ -static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) +static int virtblk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct virtio_blk *vblk = bd->bd_disk->private_data; + struct virtio_blk *vblk = disk->private_data; int ret = 0; mutex_lock(&vblk->vdev_mutex); @@ -853,7 +853,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) /* some standard values, similar to sd */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; } out: mutex_unlock(&vblk->vdev_mutex); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5babe575c288..04fc6b552c04 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -493,11 +493,11 @@ static void blkif_restart_queue_callback(void *arg) schedule_work(&rinfo->work); } -static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) +static int blkif_getgeo(struct gendisk *disk, struct hd_geometry *hg) { /* We don't have real geometry info, but let's at least return values consistent with the size of the device */ - sector_t nsect = get_capacity(bd->bd_disk); + sector_t nsect = get_capacity(disk); sector_t cylinders = nsect; hg->heads = 0xff; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a44e8c2dccee..7bd6fa05b00a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -403,9 +403,9 @@ static void do_deferred_remove(struct work_struct *w) dm_deferred_remove(); } -static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mapped_device *md = bdev->bd_disk->private_data; + struct mapped_device *md = disk->private_data; return dm_get_geometry(md, geo); } diff --git a/drivers/md/md.c b/drivers/md/md.c index ac85ec73a409..236bb04f3e54 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7678,9 +7678,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ -static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mddev *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; geo->heads = 2; geo->sectors = 4; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index d34892782f6e..1af157ce0a63 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1953,10 +1953,10 @@ static void msb_data_clear(struct msb_data *msb) msb->card = NULL; } -static int msb_bd_getgeo(struct block_device *bdev, +static int msb_bd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct msb_data *msb = bdev->bd_disk->private_data; + struct msb_data *msb = disk->private_data; *geo = msb->geometry; return 0; } diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c9853d887d28..075519caa547 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -189,10 +189,10 @@ static void mspro_block_bd_free_disk(struct gendisk *disk) kfree(msb); } -static int mspro_block_bd_getgeo(struct block_device *bdev, +static int mspro_block_bd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mspro_block_data *msb = bdev->bd_disk->private_data; + struct mspro_block_data *msb = disk->private_data; geo->heads = msb->heads; geo->sectors = msb->sectors_per_track; diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 9cc47bf94804..d1f295af9713 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -435,9 +435,9 @@ static void mmc_blk_release(struct gendisk *disk) } static int -mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +mmc_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - geo->cylinders = get_capacity(bdev->bd_disk) / (4 * 16); + geo->cylinders = get_capacity(disk) / (4 * 16); geo->heads = 4; geo->sectors = 16; return 0; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 847c11542f02..28e09d080440 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -246,9 +246,9 @@ unlock: blktrans_dev_put(dev); } -static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int blktrans_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data; + struct mtd_blktrans_dev *dev = disk->private_data; int ret = -ENXIO; mutex_lock(&dev->lock); diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 39cc0a6a4d37..b53fd147fa65 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -282,12 +282,12 @@ static void ubiblock_release(struct gendisk *gd) mutex_unlock(&dev->dev_mutex); } -static int ubiblock_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int ubiblock_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* Some tools might require this information */ geo->heads = 1; geo->cylinders = 1; - geo->sectors = get_capacity(bdev->bd_disk); + geo->sectors = get_capacity(disk); geo->start = 0; return 0; } diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 2a1aa32e6693..a933db961ed7 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1478,12 +1478,12 @@ static void btt_submit_bio(struct bio *bio) bio_endio(bio); } -static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) +static int btt_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; return 0; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 812c1565114f..f96f74bff6ad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1803,12 +1803,12 @@ static void nvme_release(struct gendisk *disk) nvme_ns_release(disk->private_data); } -int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; return 0; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cfd2b5b90b91..102fae6a231c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -936,7 +936,7 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_id_ns **id); -int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); +int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo); int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_attr_groups[]; diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 506a947d00a5..582ac7e61b24 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3317,11 +3317,11 @@ static void dasd_release(struct gendisk *disk) /* * Return disk geometry. */ -static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dasd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { struct dasd_device *base; - base = dasd_device_from_gendisk(bdev->bd_disk); + base = dasd_device_from_gendisk(disk); if (!base) return -ENODEV; @@ -3331,7 +3331,8 @@ static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) return -EINVAL; } base->discipline->fill_geometry(base->block, geo); - geo->start = get_start_sect(bdev) >> base->block->s2b_shift; + // geo->start is left unchanged by the above + geo->start >>= base->block->s2b_shift; dasd_put_device(base); return 0; } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3edcc43f180b..00ad574ce61c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1599,9 +1599,9 @@ static void sd_release(struct gendisk *disk) scsi_device_put(sdev); } -static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int sd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); + struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; struct Scsi_Host *host = sdp->host; sector_t capacity = logical_to_sectors(sdp, sdkp->capacity); @@ -1614,9 +1614,9 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) - host->hostt->bios_param(sdp, bdev->bd_disk, capacity, diskinfo); + host->hostt->bios_param(sdp, disk, capacity, diskinfo); else - scsicam_bios_param(bdev->bd_disk, capacity, diskinfo); + scsicam_bios_param(disk, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 95886b404b16..fbc45121cd4f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1659,7 +1659,7 @@ struct block_device_operations { unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); - int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*getgeo)(struct gendisk *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); void (*free_disk)(struct gendisk *disk); /* this callback is with swap_lock and sometimes page table lock held */ -- cgit v1.2.3 From a892a3e74fb4f6ef040659297603abf11ccf29a7 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 30 Jun 2025 13:52:32 +0300 Subject: RDMA/sa_query: Support IB service records resolution Add an SA query API ib_sa_service_rec_get() to support building and sending SA query MADs that ask for service records with a specific name or ID, and receiving and parsing responses from the SM. Signed-off-by: Or Har-Toov Signed-off-by: Mark Zhang Reviewed-by: Vlad Dumitrescu Link: https://patch.msgid.link/9af6c82f3a3a9d975115a33235fb4ffc7c8edb21.1751279793.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/sa_query.c | 238 +++++++++++++++++++++++++++++++++++++ include/rdma/ib_mad.h | 1 + include/rdma/ib_sa.h | 37 ++++++ 3 files changed, 276 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 770e9f18349b..c0a7af1b4fe4 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -152,6 +152,13 @@ struct ib_sa_mcmember_query { struct ib_sa_query sa_query; }; +struct ib_sa_service_query { + void (*callback)(int status, struct sa_service_rec *rec, + unsigned int num_services, void *context); + void *context; + struct ib_sa_query sa_query; +}; + static LIST_HEAD(ib_nl_request_list); static DEFINE_SPINLOCK(ib_nl_request_lock); static atomic_t ib_nl_sa_request_seq; @@ -686,6 +693,58 @@ static const struct ib_field guidinfo_rec_table[] = { .size_bits = 512 }, }; +#define SERVICE_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct sa_service_rec, field), \ + .struct_size_bytes = sizeof_field(struct sa_service_rec, field), \ + .field_name = "sa_service_rec:" #field + +static const struct ib_field service_rec_table[] = { + { SERVICE_REC_FIELD(id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { SERVICE_REC_FIELD(gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(pkey), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 16 }, + { RESERVED, + .offset_words = 6, + .offset_bits = 16, + .size_bits = 16 }, + { SERVICE_REC_FIELD(lease), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 32 }, + { SERVICE_REC_FIELD(key), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(name), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 512 }, + { SERVICE_REC_FIELD(data_8), + .offset_words = 28, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_16), + .offset_words = 32, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_32), + .offset_words = 36, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_64), + .offset_words = 40, + .offset_bits = 0, + .size_bits = 128 }, +}; + #define RDMA_PRIMARY_PATH_MAX_REC_NUM 3 static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) @@ -1392,6 +1451,20 @@ void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute) } EXPORT_SYMBOL(ib_sa_pack_path); +void ib_sa_pack_service(struct sa_service_rec *rec, void *attribute) +{ + ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), rec, + attribute); +} +EXPORT_SYMBOL(ib_sa_pack_service); + +void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec) +{ + ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), attribute, + rec); +} +EXPORT_SYMBOL(ib_sa_unpack_service); + static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client, struct ib_sa_device *sa_dev, u32 port_num) @@ -1481,6 +1554,68 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, } } +#define IB_SA_DATA_OFFS 56 +#define IB_SERVICE_REC_SZ 176 + +static void ib_unpack_service_rmpp(struct sa_service_rec *rec, + struct ib_mad_recv_wc *mad_wc, + int num_services) +{ + unsigned int cp_sz, data_i, data_size, rec_i = 0, buf_i = 0; + struct ib_mad_recv_buf *mad_buf; + u8 buf[IB_SERVICE_REC_SZ]; + u8 *data; + + data_size = sizeof(((struct ib_sa_mad *) mad_buf->mad)->data); + + list_for_each_entry(mad_buf, &mad_wc->rmpp_list, list) { + data = ((struct ib_sa_mad *) mad_buf->mad)->data; + data_i = 0; + while (data_i < data_size && rec_i < num_services) { + cp_sz = min(IB_SERVICE_REC_SZ - buf_i, + data_size - data_i); + memcpy(buf + buf_i, data + data_i, cp_sz); + data_i += cp_sz; + buf_i += cp_sz; + if (buf_i == IB_SERVICE_REC_SZ) { + ib_sa_unpack_service(buf, rec + rec_i); + buf_i = 0; + rec_i++; + } + } + } +} + +static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status, + struct ib_mad_recv_wc *mad_wc) +{ + struct ib_sa_service_query *query = + container_of(sa_query, struct ib_sa_service_query, sa_query); + struct sa_service_rec *rec; + int num_services; + + if (!mad_wc || !mad_wc->recv_buf.mad) { + query->callback(status, NULL, 0, query->context); + return; + } + + num_services = (mad_wc->mad_len - IB_SA_DATA_OFFS) / IB_SERVICE_REC_SZ; + if (!num_services) { + query->callback(-ENODATA, NULL, 0, query->context); + return; + } + + rec = kmalloc_array(num_services, sizeof(*rec), GFP_KERNEL); + if (!rec) { + query->callback(-ENOMEM, NULL, 0, query->context); + return; + } + + ib_unpack_service_rmpp(rec, mad_wc, num_services); + query->callback(status, rec, num_services, query->context); + kfree(rec); +} + static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) { struct ib_sa_path_query *query = @@ -1490,6 +1625,14 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) kfree(query); } +static void ib_sa_service_rec_release(struct ib_sa_query *sa_query) +{ + struct ib_sa_service_query *query = + container_of(sa_query, struct ib_sa_service_query, sa_query); + + kfree(query); +} + /** * ib_sa_path_rec_get - Start a Path get query * @client:SA client @@ -1620,6 +1763,101 @@ err1: } EXPORT_SYMBOL(ib_sa_path_rec_get); +/** + * ib_sa_service_rec_get - Start a Service get query + * @client: SA client + * @device: device to send query on + * @port_num: port number to send query on + * @rec: Service Record to send in query + * @comp_mask: component mask to send in query + * @timeout_ms: time to wait for response + * @gfp_mask: GFP mask to use for internal allocations + * @callback: function called when query completes, times out or is + * canceled + * @context: opaque user context passed to callback + * @sa_query: query context, used to cancel query + * + * Send a Service Record Get query to the SA to look up a path. The + * callback function will be called when the query completes (or + * fails); status is 0 for a successful response, -EINTR if the query + * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error + * occurred sending the query. The resp parameter of the callback is + * only valid if status is 0. + * + * If the return value of ib_sa_service_rec_get() is negative, it is an + * error code. Otherwise it is a query ID that can be used to cancel + * the query. + */ +int ib_sa_service_rec_get(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, + struct sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct sa_service_rec *resp, + unsigned int num_services, + void *context), + void *context, struct ib_sa_query **sa_query) +{ + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_service_query *query; + struct ib_mad_agent *agent; + struct ib_sa_port *port; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.rmpp_callback = callback ? ib_sa_service_rec_callback : + NULL; + query->sa_query.release = ib_sa_service_rec_release; + mad->mad_hdr.method = IB_MGMT_METHOD_GET_TABLE; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_sa_pack_service(rec, mad->data); + + *sa_query = &query->sa_query; + query->sa_query.mad_buf->context[1] = rec; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_service_rec_get); + static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 3f1b58d8b4bf..8bd0e1eb393b 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -48,6 +48,7 @@ #define IB_MGMT_METHOD_REPORT 0x06 #define IB_MGMT_METHOD_REPORT_RESP 0x86 #define IB_MGMT_METHOD_TRAP_REPRESS 0x07 +#define IB_MGMT_METHOD_GET_TABLE 0x12 #define IB_MGMT_METHOD_RESP 0x80 #define IB_BM_ATTR_MOD_RESP cpu_to_be32(1) diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index b46353fc53bf..95e8924ad563 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -189,6 +189,20 @@ struct sa_path_rec { u32 flags; }; +struct sa_service_rec { + __be64 id; + __u8 gid[16]; + __be16 pkey; + __u8 reserved[2]; + __be32 lease; + __u8 key[16]; + __u8 name[64]; + __u8 data_8[16]; + __be16 data_16[8]; + __be32 data_32[4]; + __be64 data_64[2]; +}; + static inline enum ib_gid_type sa_conv_pathrec_to_gid_type(struct sa_path_rec *rec) { @@ -417,6 +431,17 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, unsigned int num_prs, void *context), void *context, struct ib_sa_query **query); +int ib_sa_service_rec_get(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, + struct sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct sa_service_rec *resp, + unsigned int num_services, + void *context), + void *context, struct ib_sa_query **sa_query); + struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; ib_sa_comp_mask comp_mask; @@ -508,6 +533,18 @@ int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num, */ void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute); +/** + * ib_sa_pack_service - Convert a service record from struct ib_sa_service_rec + * to IB MAD wire format. + */ +void ib_sa_pack_service(struct sa_service_rec *rec, void *attribute); + +/** + * ib_sa_unpack_service - Convert a service record from MAD format to struct + * ib_sa_service_rec. + */ +void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec); + /** * ib_sa_unpack_path - Convert a path record from MAD format to struct * ib_sa_path_rec. -- cgit v1.2.3 From a6404823fe20e06d4061bc63e0295b7165af4c14 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 30 Jun 2025 13:52:33 +0300 Subject: RDMA/cma: Support IB service record resolution Add new UCMA command and the corresponding CMA implementation. Userspace can send this command to request service resolution based on service name or ID. On a successful resolution, one or multiple service records are returned, the first one will be used as destination address by default. Two new CM events are added and returned to caller accordingly: - RDMA_CM_EVENT_ADDRINFO_RESOLVED: Resolve succeeded; - RDMA_CM_EVENT_ADDRINFO_ERROR: Resolve failed. Internally two new CM states are added: - RDMA_CM_ADDRINFO_QUERY: CM is in the process of IB service resolution; - RDMA_CM_ADDRINFO_RESOLVED: CM has finished the resolve process. With these new states, beside existing state transfer processes, 2 new processes are supported: 1. The default address is used: RDMA_CM_ADDR_BOUND -> RDMA_CM_ADDRINFO_QUERY -> RDMA_CM_ADDRINFO_RESOLVED -> RDMA_CM_ROUTE_QUERY 2. To use a different address: RDMA_CM_ADDR_BOUND -> RDMA_CM_ADDRINFO_QUERY-> RDMA_CM_ADDRINFO_RESOLVED -> RDMA_CM_ADDR_QUERY -> RDMA_CM_ADDR_RESOLVED -> RDMA_CM_ROUTE_QUERY In the 2nd case, resolve_addrinfo returns multiple records, a user could call rdma_resolve_addr() with the one that is not the first. Signed-off-by: Or Har-Toov Signed-off-by: Mark Zhang Reviewed-by: Vlad Dumitrescu Link: https://patch.msgid.link/b6e82ad75522a13b5efe4ff86da0e465aab04cc2.1751279794.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 136 ++++++++++++++++++++++++++++++++++++- drivers/infiniband/core/cma_priv.h | 4 +- drivers/infiniband/core/ucma.c | 30 +++++++- include/rdma/rdma_cm.h | 18 ++++- include/uapi/rdma/rdma_user_cm.h | 20 +++++- 5 files changed, 202 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9b471548e7ae..5b2d3ae3f9fc 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2076,6 +2076,7 @@ static void _destroy_id(struct rdma_id_private *id_priv, kfree(id_priv->id.route.path_rec); kfree(id_priv->id.route.path_rec_inbound); kfree(id_priv->id.route.path_rec_outbound); + kfree(id_priv->id.route.service_recs); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); @@ -3382,13 +3383,18 @@ err1: int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; + enum rdma_cm_state state; int ret; if (!timeout_ms) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) + state = id_priv->state; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_QUERY) && + !cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_RESOLVED, + RDMA_CM_ROUTE_QUERY)) return -EINVAL; cma_id_get(id_priv); @@ -3409,7 +3415,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) return 0; err: - cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); + cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, state); cma_id_put(id_priv); return ret; } @@ -5506,3 +5512,129 @@ static void __exit cma_cleanup(void) module_init(cma_init); module_exit(cma_cleanup); + +static void cma_query_ib_service_handler(int status, + struct sa_service_rec *recs, + unsigned int num_recs, void *context) +{ + struct cma_work *work = context; + struct rdma_id_private *id_priv = work->id; + struct sockaddr_ib *addr; + + if (status) + goto fail; + + if (!num_recs) { + status = -ENOENT; + goto fail; + } + + if (id_priv->id.route.service_recs) { + status = -EALREADY; + goto fail; + } + + id_priv->id.route.service_recs = + kmalloc_array(num_recs, sizeof(*recs), GFP_KERNEL); + if (!id_priv->id.route.service_recs) { + status = -ENOMEM; + goto fail; + } + + id_priv->id.route.num_service_recs = num_recs; + memcpy(id_priv->id.route.service_recs, recs, sizeof(*recs) * num_recs); + + addr = (struct sockaddr_ib *)&id_priv->id.route.addr.dst_addr; + addr->sib_family = AF_IB; + addr->sib_addr = *(struct ib_addr *)&recs->gid; + addr->sib_pkey = recs->pkey; + addr->sib_sid = recs->id; + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, + (union ib_gid *)&addr->sib_addr); + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, + ntohs(addr->sib_pkey)); + + queue_work(cma_wq, &work->work); + return; + +fail: + work->old_state = RDMA_CM_ADDRINFO_QUERY; + work->new_state = RDMA_CM_ADDR_BOUND; + work->event.event = RDMA_CM_EVENT_ADDRINFO_ERROR; + work->event.status = status; + pr_debug_ratelimited( + "RDMA CM: SERVICE_ERROR: failed to query service record. status %d\n", + status); + queue_work(cma_wq, &work->work); +} + +static int cma_resolve_ib_service(struct rdma_id_private *id_priv, + struct rdma_ucm_ib_service *ibs) +{ + struct sa_service_rec sr = {}; + ib_sa_comp_mask mask = 0; + struct cma_work *work; + + work = kzalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return -ENOMEM; + + cma_id_get(id_priv); + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDRINFO_QUERY; + work->new_state = RDMA_CM_ADDRINFO_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDRINFO_RESOLVED; + + if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_ID) { + sr.id = cpu_to_be64(ibs->service_id); + mask |= IB_SA_SERVICE_REC_SERVICE_ID; + } + if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_NAME) { + strscpy(sr.name, ibs->service_name, sizeof(sr.name)); + mask |= IB_SA_SERVICE_REC_SERVICE_NAME; + } + + id_priv->query_id = ib_sa_service_rec_get(&sa_client, + id_priv->id.device, + id_priv->id.port_num, + &sr, mask, + 2000, GFP_KERNEL, + cma_query_ib_service_handler, + work, &id_priv->query); + + if (id_priv->query_id < 0) { + cma_id_put(id_priv); + kfree(work); + return id_priv->query_id; + } + + return 0; +} + +int rdma_resolve_ib_service(struct rdma_cm_id *id, + struct rdma_ucm_ib_service *ibs) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cma_dev || + !cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDRINFO_QUERY)) + return -EINVAL; + + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_service(id_priv, ibs); + else + ret = -EOPNOTSUPP; + + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ib_service); diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index b7354c94cf1b..c604b601f4d9 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -47,7 +47,9 @@ enum rdma_cm_state { RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN, RDMA_CM_DEVICE_REMOVAL, - RDMA_CM_DESTROYING + RDMA_CM_DESTROYING, + RDMA_CM_ADDRINFO_QUERY, + RDMA_CM_ADDRINFO_RESOLVED }; struct rdma_id_private { diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 6e700b974033..1915f4e68308 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -282,6 +282,10 @@ static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, } uevent->resp.event = event->event; uevent->resp.status = event->status; + + if (event->event == RDMA_CM_EVENT_ADDRINFO_RESOLVED) + goto out; + if (ctx->cm_id->qp_type == IB_QPT_UD) ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, &event->param.ud); @@ -289,6 +293,7 @@ static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); +out: uevent->resp.ece.vendor_id = event->ece.vendor_id; uevent->resp.ece.attr_mod = event->ece.attr_mod; return uevent; @@ -728,6 +733,28 @@ static ssize_t ucma_resolve_addr(struct ucma_file *file, return ret; } +static ssize_t ucma_resolve_ib_service(struct ucma_file *file, + const char __user *inbuf, int in_len, + int out_len) +{ + struct rdma_ucm_resolve_ib_service cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_resolve_ib_service(ctx->cm_id, &cmd.ibs); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + static ssize_t ucma_resolve_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -1703,7 +1730,8 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, [RDMA_USER_CM_CMD_QUERY] = ucma_query, [RDMA_USER_CM_CMD_BIND] = ucma_bind, [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, - [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, + [RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE] = ucma_resolve_ib_service }; static ssize_t ucma_write(struct file *filp, const char __user *buf, diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index d1593ad47e28..72d1568e4cfb 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -33,7 +33,9 @@ enum rdma_cm_event_type { RDMA_CM_EVENT_MULTICAST_JOIN, RDMA_CM_EVENT_MULTICAST_ERROR, RDMA_CM_EVENT_ADDR_CHANGE, - RDMA_CM_EVENT_TIMEWAIT_EXIT + RDMA_CM_EVENT_TIMEWAIT_EXIT, + RDMA_CM_EVENT_ADDRINFO_RESOLVED, + RDMA_CM_EVENT_ADDRINFO_ERROR }; const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event); @@ -63,6 +65,9 @@ struct rdma_route { * 2 - Both primary and alternate path are available */ int num_pri_alt_paths; + + unsigned int num_service_recs; + struct sa_service_rec *service_recs; }; struct rdma_conn_param { @@ -197,6 +202,17 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, */ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms); +/** + * rdma_resolve_ib_service - Resolve the IB service record of the + * service with the given service ID or name. + * + * This function is optional in the rdma cm flow. It is called on the client + * side of a connection, before calling rdma_resolve_route. The resolution + * can be done once per rdma_cm_id. + */ +int rdma_resolve_ib_service(struct rdma_cm_id *id, + struct rdma_ucm_ib_service *ibs); + /** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA * identifier. diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 7cea03581f79..8799623bcba0 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -67,7 +67,8 @@ enum { RDMA_USER_CM_CMD_QUERY, RDMA_USER_CM_CMD_BIND, RDMA_USER_CM_CMD_RESOLVE_ADDR, - RDMA_USER_CM_CMD_JOIN_MCAST + RDMA_USER_CM_CMD_JOIN_MCAST, + RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE }; /* See IBTA Annex A11, servies ID bytes 4 & 5 */ @@ -338,4 +339,21 @@ struct rdma_ucm_migrate_resp { __u32 events_reported; }; +enum { + RDMA_USER_CM_IB_SERVICE_FLAG_ID = 1 << 0, + RDMA_USER_CM_IB_SERVICE_FLAG_NAME = 1 << 1, +}; + +#define RDMA_USER_CM_IB_SERVICE_NAME_SIZE 64 +struct rdma_ucm_ib_service { + __u64 service_id; + __u8 service_name[RDMA_USER_CM_IB_SERVICE_NAME_SIZE]; + __u32 flags; + __u32 reserved; +}; + +struct rdma_ucm_resolve_ib_service { + __u32 id; + struct rdma_ucm_ib_service ibs; +}; #endif /* RDMA_USER_CM_H */ -- cgit v1.2.3 From 810f874eda8e492701ba3623aa298caad61bb47c Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 30 Jun 2025 13:52:34 +0300 Subject: RDMA/ucma: Support query resolved service records Enable user-space to query resolved service records through a ucma command when a RDMA_CM_EVENT_ADDRINFO_RESOLVED event is received. Signed-off-by: Or Har-Toov Signed-off-by: Mark Zhang Reviewed-by: Vlad Dumitrescu Link: https://patch.msgid.link/1090ee7c00c3f8058c4f9e7557de983504a16715.1751279794.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/ucma.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/uapi/rdma/ib_user_sa.h | 14 ++++++++++++++ include/uapi/rdma/rdma_user_cm.h | 8 +++++++- 3 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 1915f4e68308..3b9ca6d7a21b 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1021,6 +1021,43 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, return ret; } +static ssize_t ucma_query_ib_service(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_ib_service_resp *resp; + int n, ret = 0; + + if (out_len < sizeof(struct rdma_ucm_query_ib_service_resp)) + return -ENOSPC; + + if (!ctx->cm_id->route.service_recs) + return -ENODATA; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_service_recs = ctx->cm_id->route.num_service_recs; + + n = (out_len - sizeof(struct rdma_ucm_query_ib_service_resp)) / + sizeof(struct ib_user_service_rec); + + if (!n) + goto out; + + if (n > ctx->cm_id->route.num_service_recs) + n = ctx->cm_id->route.num_service_recs; + + memcpy(resp->recs, ctx->cm_id->route.service_recs, + sizeof(*resp->recs) * n); + if (copy_to_user(response, resp, struct_size(resp, recs, n))) + ret = -EFAULT; + +out: + kfree(resp); + return ret; +} + static ssize_t ucma_query(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -1049,6 +1086,9 @@ static ssize_t ucma_query(struct ucma_file *file, case RDMA_USER_CM_QUERY_GID: ret = ucma_query_gid(ctx, response, out_len); break; + case RDMA_USER_CM_QUERY_IB_SERVICE: + ret = ucma_query_ib_service(ctx, response, out_len); + break; default: ret = -ENOSYS; break; diff --git a/include/uapi/rdma/ib_user_sa.h b/include/uapi/rdma/ib_user_sa.h index 435155d6e1c6..acfa20816bc6 100644 --- a/include/uapi/rdma/ib_user_sa.h +++ b/include/uapi/rdma/ib_user_sa.h @@ -74,4 +74,18 @@ struct ib_user_path_rec { __u8 preference; }; +struct ib_user_service_rec { + __be64 id; + __u8 gid[16]; + __be16 pkey; + __u8 reserved[2]; + __be32 lease; + __u8 key[16]; + __u8 name[64]; + __u8 data_8[16]; + __be16 data_16[8]; + __be32 data_32[4]; + __be64 data_64[2]; +}; + #endif /* IB_USER_SA_H */ diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 8799623bcba0..00501da0567e 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -148,7 +148,8 @@ struct rdma_ucm_resolve_route { enum { RDMA_USER_CM_QUERY_ADDR, RDMA_USER_CM_QUERY_PATH, - RDMA_USER_CM_QUERY_GID + RDMA_USER_CM_QUERY_GID, + RDMA_USER_CM_QUERY_IB_SERVICE }; struct rdma_ucm_query { @@ -188,6 +189,11 @@ struct rdma_ucm_query_path_resp { struct ib_path_rec_data path_data[]; }; +struct rdma_ucm_query_ib_service_resp { + __u32 num_service_recs; + struct ib_user_service_rec recs[]; +}; + struct rdma_ucm_conn_param { __u32 qp_num; __u32 qkey; -- cgit v1.2.3 From a3c9d0fcd3715541bbf97da2ddde9d032e2fe6d5 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 30 Jun 2025 13:52:35 +0300 Subject: RDMA/ucma: Support write an event into a CM Enable user-space to inject an event into a CM through it's event channel. Two new events are added and supported: RDMA_CM_EVENT_USER and RDMA_CM_EVENT_INTERNAL. With these 2 events a new event parameter "arg" is supported, which is passed from sender to receiver transparently. With this feature an application is able to write an event into a CM channel with a new user-space rdmacm API. For example thread T1 could write an event with the API: rdma_write_cm_event(cm_id, RDMA_CM_EVENT_USER, status, arg); and thread T2 could receive the event with rdma_get_cm_event(). Signed-off-by: Mark Zhang Reviewed-by: Vlad Dumitrescu Link: https://patch.msgid.link/fdf49d0b17a45933c5d8c1d90605c9447d9a3c73.1751279794.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/ucma.c | 52 +++++++++++++++++++++++++++++++++++++++- include/rdma/rdma_cm.h | 5 +++- include/uapi/rdma/rdma_user_cm.h | 16 ++++++++++++- 3 files changed, 70 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 3b9ca6d7a21b..f86ece701db6 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1745,6 +1745,55 @@ err_unlock: return ret; } +static ssize_t ucma_write_cm_event(struct ucma_file *file, + const char __user *inbuf, int in_len, + int out_len) +{ + struct rdma_ucm_write_cm_event cmd; + struct rdma_cm_event event = {}; + struct ucma_event *uevent; + struct ucma_context *ctx; + int ret = 0; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if ((cmd.event != RDMA_CM_EVENT_USER) && + (cmd.event != RDMA_CM_EVENT_INTERNAL)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + event.event = cmd.event; + event.status = cmd.status; + event.param.arg = cmd.param.arg; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) { + ret = -ENOMEM; + goto out; + } + + uevent->ctx = ctx; + uevent->resp.uid = ctx->uid; + uevent->resp.id = ctx->id; + uevent->resp.event = event.event; + uevent->resp.status = event.status; + memcpy(uevent->resp.param.arg32, &event.param.arg, + sizeof(event.param.arg)); + + mutex_lock(&ctx->file->mut); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); + +out: + ucma_put_ctx(ctx); + return ret; +} + static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { @@ -1771,7 +1820,8 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, [RDMA_USER_CM_CMD_BIND] = ucma_bind, [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, - [RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE] = ucma_resolve_ib_service + [RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE] = ucma_resolve_ib_service, + [RDMA_USER_CM_CMD_WRITE_CM_EVENT] = ucma_write_cm_event, }; static ssize_t ucma_write(struct file *filp, const char __user *buf, diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 72d1568e4cfb..9bd930a83e6e 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -35,7 +35,9 @@ enum rdma_cm_event_type { RDMA_CM_EVENT_ADDR_CHANGE, RDMA_CM_EVENT_TIMEWAIT_EXIT, RDMA_CM_EVENT_ADDRINFO_RESOLVED, - RDMA_CM_EVENT_ADDRINFO_ERROR + RDMA_CM_EVENT_ADDRINFO_ERROR, + RDMA_CM_EVENT_USER, + RDMA_CM_EVENT_INTERNAL, }; const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event); @@ -98,6 +100,7 @@ struct rdma_cm_event { union { struct rdma_conn_param conn; struct rdma_ud_param ud; + u64 arg; } param; struct rdma_ucm_ece ece; }; diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 00501da0567e..5ded174687ee 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -68,7 +68,8 @@ enum { RDMA_USER_CM_CMD_BIND, RDMA_USER_CM_CMD_RESOLVE_ADDR, RDMA_USER_CM_CMD_JOIN_MCAST, - RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE + RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE, + RDMA_USER_CM_CMD_WRITE_CM_EVENT, }; /* See IBTA Annex A11, servies ID bytes 4 & 5 */ @@ -304,6 +305,7 @@ struct rdma_ucm_event_resp { union { struct rdma_ucm_conn_param conn; struct rdma_ucm_ud_param ud; + __u32 arg32[2]; } param; __u32 reserved; struct rdma_ucm_ece ece; @@ -362,4 +364,16 @@ struct rdma_ucm_resolve_ib_service { __u32 id; struct rdma_ucm_ib_service ibs; }; + +struct rdma_ucm_write_cm_event { + __u32 id; + __u32 reserved; + __u32 event; + __u32 status; + union { + struct rdma_ucm_conn_param conn; + struct rdma_ucm_ud_param ud; + __u64 arg; + } param; +}; #endif /* RDMA_USER_CM_H */ -- cgit v1.2.3 From 7ecb662b717a34305beacf71e9fadd1559b4b9ee Mon Sep 17 00:00:00 2001 From: Mehdi Djait Date: Mon, 7 Jul 2025 16:32:53 +0200 Subject: media: v4l2-common: Add a helper for obtaining the clock producer Introduce a helper for v4l2 sensor drivers on both DT- and ACPI-based platforms to retrieve a reference to the clock producer from firmware. This helper behaves the same as devm_clk_get() except where there is no clock producer like in ACPI-based platforms. For ACPI-based platforms the function will read the "clock-frequency" ACPI _DSD property and register a fixed frequency clock with the frequency indicated in the property. This function also handles the special ACPI-based system case where: . The clock-frequency _DSD property is present. . A reference to the clock producer is present, where the clock is provided by a camera sensor PMIC driver (e.g. int3472/tps68470.c) In this case try to set the clock-frequency value to the provided clock. Reviewed-by: Laurent Pinchart Reviewed-by: Lad Prabhakar Signed-off-by: Mehdi Djait Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/v4l2-core/v4l2-common.c | 52 +++++++++++++++++++++++++++++++++++ include/media/v4l2-common.h | 27 ++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index 6e585bc76367..a5334aa35992 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -34,6 +34,9 @@ * Added Gerd Knorrs v4l1 enhancements (Justin Schoeman) */ +#include +#include +#include #include #include #include @@ -705,3 +708,52 @@ int v4l2_link_freq_to_bitmap(struct device *dev, const u64 *fw_link_freqs, return 0; } EXPORT_SYMBOL_GPL(v4l2_link_freq_to_bitmap); + +struct clk *devm_v4l2_sensor_clk_get(struct device *dev, const char *id) +{ + const char *clk_id __free(kfree) = NULL; + struct clk_hw *clk_hw; + struct clk *clk; + bool of_node; + u32 rate; + int ret; + + clk = devm_clk_get_optional(dev, id); + if (IS_ERR(clk)) + return clk; + + ret = device_property_read_u32(dev, "clock-frequency", &rate); + of_node = is_of_node(dev_fwnode(dev)); + + if (clk) { + if (!ret && !of_node) { + ret = clk_set_rate(clk, rate); + if (ret) { + dev_err(dev, "Failed to set clock rate: %u\n", + rate); + return ERR_PTR(ret); + } + } + return clk; + } + + if (!IS_ENABLED(CONFIG_COMMON_CLK) || of_node) + return ERR_PTR(-ENOENT); + + if (ret) + return ERR_PTR(ret == -EINVAL ? -EPROBE_DEFER : ret); + + if (!id) { + clk_id = kasprintf(GFP_KERNEL, "clk-%s", dev_name(dev)); + if (!clk_id) + return ERR_PTR(-ENOMEM); + id = clk_id; + } + + clk_hw = devm_clk_hw_register_fixed_rate(dev, id, NULL, 0, rate); + if (IS_ERR(clk_hw)) + return ERR_CAST(clk_hw); + + return clk_hw->clk; +} +EXPORT_SYMBOL_GPL(devm_v4l2_sensor_clk_get); diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index 0a43f56578bc..9d6c236e8f14 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -97,6 +97,7 @@ int v4l2_ctrl_query_fill(struct v4l2_queryctrl *qctrl, /* ------------------------------------------------------------------------- */ +struct clk; struct v4l2_device; struct v4l2_subdev; struct v4l2_subdev_ops; @@ -620,6 +621,32 @@ int v4l2_link_freq_to_bitmap(struct device *dev, const u64 *fw_link_freqs, unsigned int num_of_driver_link_freqs, unsigned long *bitmap); +/** + * devm_v4l2_sensor_clk_get - lookup and obtain a reference to a clock producer + * for a camera sensor. + * + * @dev: device for v4l2 sensor clock "consumer" + * @id: clock consumer ID + * + * This function behaves the same way as devm_clk_get() except where there + * is no clock producer like in ACPI-based platforms. + * + * For ACPI-based platforms, the function will read the "clock-frequency" + * ACPI _DSD property and register a fixed-clock with the frequency indicated + * in the property. + * + * This function also handles the special ACPI-based system case where: + * + * * The clock-frequency _DSD property is present. + * * A reference to the clock producer is present, where the clock is provided + * by a camera sensor PMIC driver (e.g. int3472/tps68470.c) + * + * In this case try to set the clock-frequency value to the provided clock. + * + * Returns a pointer to a struct clk on success or an error pointer on failure. + */ +struct clk *devm_v4l2_sensor_clk_get(struct device *dev, const char *id); + static inline u64 v4l2_buffer_get_timestamp(const struct v4l2_buffer *buf) { /* -- cgit v1.2.3 From 4c70fb2624ab1588faa58dcd407d4c61d64b288d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 13 Aug 2025 08:29:01 +0000 Subject: cpuset: remove redundant CS_ONLINE flag The CS_ONLINE flag was introduced prior to the CSS_ONLINE flag in the cpuset subsystem. Currently, the flag setting sequence is as follows: 1. cpuset_css_online() sets CS_ONLINE 2. css->flags gets CSS_ONLINE set ... 3. cgroup->kill_css sets CSS_DYING 4. cpuset_css_offline() clears CS_ONLINE 5. css->flags clears CSS_ONLINE The is_cpuset_online() check currently occurs between steps 1 and 3. However, it would be equally safe to perform this check between steps 2 and 3, as CSS_ONLINE provides the same synchronization guarantee as CS_ONLINE. Since CS_ONLINE is redundant with CSS_ONLINE and provides no additional synchronization benefits, we can safely remove it to simplify the code. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 +++++ kernel/cgroup/cpuset-internal.h | 3 +-- kernel/cgroup/cpuset.c | 4 +--- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..ae73dbb19165 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -354,6 +354,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return css->flags & CSS_DYING; } +static inline bool css_is_online(struct cgroup_subsys_state *css) +{ + return css->flags & CSS_ONLINE; +} + static inline bool css_is_self(struct cgroup_subsys_state *css) { if (css == &css->cgroup->self) { diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 383963e28ac6..75b3aef39231 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -38,7 +38,6 @@ enum prs_errcode { /* bits in struct cpuset flags field */ typedef enum { - CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); + return css_is_online(&cs->css) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 27adb04df675..3466ebbf1016 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -207,7 +207,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ static struct cpuset top_cpuset = { - .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, @@ -3496,7 +3496,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpus_read_lock(); mutex_lock(&cpuset_mutex); - set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -3571,7 +3570,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); -- cgit v1.2.3 From 2caa6b88e0ba0231fb4ff0ba8e73cedd5fb81fc8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 11 Aug 2025 14:08:04 +0200 Subject: bpf: Don't use %pK through printk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the past %pK was preferable to %p as it would not leak raw pointer values into the kernel log. Since commit ad67b74d2469 ("printk: hash addresses printed with %p") the regular %p has been improved to avoid this issue. Furthermore, restricted pointers ("%pK") were never meant to be used through printk(). They can still unintentionally leak raw pointers or acquire sleeping locks in atomic contexts. Switch to the regular pointer formatting which is safer and easier to reason about. Signed-off-by: Thomas Weißschuh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250811-restricted-pointers-bpf-v1-1-a1d7cc3cb9e7@linutronix.de --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e7fd3ee759e..52fecb7a1fe3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1296,7 +1296,7 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { - pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen, + pr_err("flen=%u proglen=%u pass=%u image=%p from=%s pid=%d\n", flen, proglen, pass, image, current->comm, task_pid_nr(current)); if (image) -- cgit v1.2.3 From 139235103f6039c2c77cb8f51cb2e7e610fe0114 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 11 Aug 2025 15:35:05 +0800 Subject: net: stmmac: Change first parameter of fix_soc_reset() In order to use netdev_err() to print message in the callback function of fix_soc_reset(), change fix_soc_reset() to have "struct stmmac_priv *" as its first parameter. This is preparation for later patch, no functionality change. Suggested-by: Andrew Lunn Signed-off-by: Tiezhu Yang Link: https://patch.msgid.link/20250811073506.27513-3-yangtiezhu@loongson.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 6 +++--- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 +- drivers/net/ethernet/stmicro/stmmac/hwif.c | 2 +- include/linux/stmmac.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index 889e2bb6f7f5..c2d9e89f0063 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -49,7 +49,7 @@ struct imx_dwmac_ops { u32 flags; bool mac_rgmii_txclk_auto_adj; - int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); + int (*fix_soc_reset)(struct stmmac_priv *priv, void __iomem *ioaddr); int (*set_intf_mode)(struct plat_stmmacenet_data *plat_dat); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); }; @@ -265,9 +265,9 @@ static void imx93_dwmac_fix_speed(void *priv, int speed, unsigned int mode) writel(old_ctrl, dwmac->base_addr + MAC_CTRL_REG); } -static int imx_dwmac_mx93_reset(void *priv, void __iomem *ioaddr) +static int imx_dwmac_mx93_reset(struct stmmac_priv *priv, void __iomem *ioaddr) { - struct plat_stmmacenet_data *plat_dat = priv; + struct plat_stmmacenet_data *plat_dat = priv->plat; u32 value = readl(ioaddr + DMA_BUS_MODE); /* DMA SW reset */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 5769165ee5ba..4c477d6ce1e6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -509,7 +509,7 @@ static int loongson_dwmac_acpi_config(struct pci_dev *pdev, } /* Loongson's DWMAC device may take nearly two seconds to complete DMA reset */ -static int loongson_dwmac_fix_reset(void *priv, void __iomem *ioaddr) +static int loongson_dwmac_fix_reset(struct stmmac_priv *priv, void __iomem *ioaddr) { u32 value = readl(ioaddr + DMA_BUS_MODE); diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index 99635b37044a..3f7c765dcb79 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -100,7 +100,7 @@ int stmmac_reset(struct stmmac_priv *priv, void __iomem *ioaddr) return -EINVAL; if (plat && plat->fix_soc_reset) - return plat->fix_soc_reset(plat, ioaddr); + return plat->fix_soc_reset(priv, ioaddr); return stmmac_do_callback(priv, dma, reset, ioaddr); } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 22c24dacbc65..e284f04964bf 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -238,7 +238,7 @@ struct plat_stmmacenet_data { int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); - int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); + int (*fix_soc_reset)(struct stmmac_priv *priv, void __iomem *ioaddr); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); int (*mac_finish)(struct net_device *ndev, -- cgit v1.2.3 From 96326447d466ca62b713f660cfc73ef7879151a0 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 12 Aug 2025 06:57:23 +0200 Subject: net: mediatek: wed: Introduce MT7992 WED support to MT7988 SoC Introduce the second WDMA RX ring in WED driver for MT7988 SoC since the Mediatek MT7992 WiFi chipset supports two separated WDMA rings. Add missing MT7988 configurations to properly support WED for MT7992 in MT76 driver. Co-developed-by: Rex Lu Signed-off-by: Rex Lu Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250812-mt7992-wed-support-v3-1-9ada78a819a4@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_wed.c | 33 +++++++++++++++++++----- drivers/net/ethernet/mediatek/mtk_wed.h | 2 +- drivers/net/wireless/mediatek/mt76/mt7915/mmio.c | 6 ++--- drivers/net/wireless/mediatek/mt76/mt7996/mmio.c | 12 ++++----- include/linux/soc/mediatek/mtk_wed.h | 2 +- 5 files changed, 38 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 0a80d8f8cff7..3dbb113b792c 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -59,7 +59,9 @@ struct mtk_wed_flow_block_priv { static const struct mtk_wed_soc_data mt7622_data = { .regmap = { .tx_bm_tkid = 0x088, - .wpdma_rx_ring0 = 0x770, + .wpdma_rx_ring = { + 0x770, + }, .reset_idx_tx_mask = GENMASK(3, 0), .reset_idx_rx_mask = GENMASK(17, 16), }, @@ -70,7 +72,9 @@ static const struct mtk_wed_soc_data mt7622_data = { static const struct mtk_wed_soc_data mt7986_data = { .regmap = { .tx_bm_tkid = 0x0c8, - .wpdma_rx_ring0 = 0x770, + .wpdma_rx_ring = { + 0x770, + }, .reset_idx_tx_mask = GENMASK(1, 0), .reset_idx_rx_mask = GENMASK(7, 6), }, @@ -81,7 +85,10 @@ static const struct mtk_wed_soc_data mt7986_data = { static const struct mtk_wed_soc_data mt7988_data = { .regmap = { .tx_bm_tkid = 0x0c8, - .wpdma_rx_ring0 = 0x7d0, + .wpdma_rx_ring = { + 0x7d0, + 0x7d8, + }, .reset_idx_tx_mask = GENMASK(1, 0), .reset_idx_rx_mask = GENMASK(7, 6), }, @@ -621,8 +628,8 @@ mtk_wed_amsdu_init(struct mtk_wed_device *dev) return ret; } - /* eagle E1 PCIE1 tx ring 22 flow control issue */ - if (dev->wlan.id == 0x7991) + /* Kite and Eagle E1 PCIE1 tx ring 22 flow control issue */ + if (dev->wlan.id == 0x7991 || dev->wlan.id == 0x7992) wed_clr(dev, MTK_WED_AMSDU_FIFO, MTK_WED_AMSDU_IS_PRIOR0_RING); wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_TX_AMSDU_EN); @@ -1239,7 +1246,11 @@ mtk_wed_set_wpdma(struct mtk_wed_device *dev) return; wed_w32(dev, MTK_WED_WPDMA_RX_GLO_CFG, dev->wlan.wpdma_rx_glo); - wed_w32(dev, dev->hw->soc->regmap.wpdma_rx_ring0, dev->wlan.wpdma_rx); + wed_w32(dev, dev->hw->soc->regmap.wpdma_rx_ring[0], + dev->wlan.wpdma_rx[0]); + if (mtk_wed_is_v3_or_greater(dev->hw)) + wed_w32(dev, dev->hw->soc->regmap.wpdma_rx_ring[1], + dev->wlan.wpdma_rx[1]); if (!dev->wlan.hw_rro) return; @@ -2323,6 +2334,16 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask) if (!dev->rx_wdma[i].desc) mtk_wed_wdma_rx_ring_setup(dev, i, 16, false); + if (dev->wlan.hw_rro) { + for (i = 0; i < MTK_WED_RX_PAGE_QUEUES; i++) { + u32 addr = MTK_WED_RRO_MSDU_PG_CTRL0(i) + + MTK_WED_RING_OFS_COUNT; + + if (!wed_r32(dev, addr)) + wed_w32(dev, addr, 1); + } + } + mtk_wed_hw_init(dev); mtk_wed_configure_irq(dev, irq_mask); diff --git a/drivers/net/ethernet/mediatek/mtk_wed.h b/drivers/net/ethernet/mediatek/mtk_wed.h index c1f0479d7a71..b49aee9a8b65 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.h +++ b/drivers/net/ethernet/mediatek/mtk_wed.h @@ -17,7 +17,7 @@ struct mtk_wed_wo; struct mtk_wed_soc_data { struct { u32 tx_bm_tkid; - u32 wpdma_rx_ring0; + u32 wpdma_rx_ring[MTK_WED_RX_QUEUES]; u32 reset_idx_tx_mask; u32 reset_idx_rx_mask; } regmap; diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c index 4a82f8e4c118..36488aa6cc20 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c @@ -664,8 +664,8 @@ int mt7915_mmio_wed_init(struct mt7915_dev *dev, void *pdev_ptr, MT_RXQ_WED_RING_BASE; wed->wlan.wpdma_rx_glo = pci_resource_start(pci_dev, 0) + MT_WPDMA_GLO_CFG; - wed->wlan.wpdma_rx = pci_resource_start(pci_dev, 0) + - MT_RXQ_WED_DATA_RING_BASE; + wed->wlan.wpdma_rx[0] = pci_resource_start(pci_dev, 0) + + MT_RXQ_WED_DATA_RING_BASE; } else { struct platform_device *plat_dev = pdev_ptr; struct resource *res; @@ -687,7 +687,7 @@ int mt7915_mmio_wed_init(struct mt7915_dev *dev, void *pdev_ptr, wed->wlan.wpdma_tx = res->start + MT_TXQ_WED_RING_BASE; wed->wlan.wpdma_txfree = res->start + MT_RXQ_WED_RING_BASE; wed->wlan.wpdma_rx_glo = res->start + MT_WPDMA_GLO_CFG; - wed->wlan.wpdma_rx = res->start + MT_RXQ_WED_DATA_RING_BASE; + wed->wlan.wpdma_rx[0] = res->start + MT_RXQ_WED_DATA_RING_BASE; } wed->wlan.nbuf = MT7915_HW_TOKEN_SIZE; wed->wlan.tx_tbit[0] = is_mt7915(&dev->mt76) ? 4 : 30; diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c index 30b40f4a91be..fb2428a9b877 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c @@ -503,9 +503,9 @@ int mt7996_mmio_wed_init(struct mt7996_dev *dev, void *pdev_ptr, } wed->wlan.wpdma_rx_glo = wed->wlan.phy_base + hif1_ofs + MT_WFDMA0_GLO_CFG; - wed->wlan.wpdma_rx = wed->wlan.phy_base + hif1_ofs + - MT_RXQ_RING_BASE(MT7996_RXQ_BAND0) + - MT7996_RXQ_BAND0 * MT_RING_SIZE; + wed->wlan.wpdma_rx[0] = wed->wlan.phy_base + hif1_ofs + + MT_RXQ_RING_BASE(MT7996_RXQ_BAND0) + + MT7996_RXQ_BAND0 * MT_RING_SIZE; wed->wlan.id = MT7996_DEVICE_ID_2; wed->wlan.tx_tbit[0] = ffs(MT_INT_TX_DONE_BAND2) - 1; @@ -518,9 +518,9 @@ int mt7996_mmio_wed_init(struct mt7996_dev *dev, void *pdev_ptr, wed->wlan.wpdma_rx_glo = wed->wlan.phy_base + MT_WFDMA0_GLO_CFG; - wed->wlan.wpdma_rx = wed->wlan.phy_base + - MT_RXQ_RING_BASE(MT7996_RXQ_BAND0) + - MT7996_RXQ_BAND0 * MT_RING_SIZE; + wed->wlan.wpdma_rx[0] = wed->wlan.phy_base + + MT_RXQ_RING_BASE(MT7996_RXQ_BAND0) + + MT7996_RXQ_BAND0 * MT_RING_SIZE; wed->wlan.wpdma_rx_rro[0] = wed->wlan.phy_base + MT_RXQ_RING_BASE(MT7996_RXQ_RRO_BAND0) + diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index d8949a4ed0dc..c4ff6bab176d 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -147,7 +147,7 @@ struct mtk_wed_device { u32 wpdma_tx; u32 wpdma_txfree; u32 wpdma_rx_glo; - u32 wpdma_rx; + u32 wpdma_rx[MTK_WED_RX_QUEUES]; u32 wpdma_rx_rro[MTK_WED_RX_QUEUES]; u32 wpdma_rx_pg; -- cgit v1.2.3 From c308bb4190a8f6f09270b541457d6ae905d2f7bc Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 13 Aug 2025 22:55:02 +0200 Subject: ALSA: hda: Use min() to simplify snd_hda_get_devices() Use min() to simplify snd_hda_get_devices() and improve its readability. Change the function parameter 'max_devices' from 'int' to 'unsigned int' to avoid a min() signedness error. Update all related local variables and the function's return type to 'unsigned int' accordingly. No functional changes intended. Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20250813205507.215658-2-thorsten.blum@linux.dev Signed-off-by: Takashi Iwai --- include/sound/hda_codec.h | 4 ++-- sound/hda/common/codec.c | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/sound/hda_codec.h b/include/sound/hda_codec.h index ddc9c392f93f..006d4e4a8195 100644 --- a/include/sound/hda_codec.h +++ b/include/sound/hda_codec.h @@ -360,8 +360,8 @@ int snd_hda_override_conn_list(struct hda_codec *codec, hda_nid_t nid, int nums, int snd_hda_get_conn_index(struct hda_codec *codec, hda_nid_t mux, hda_nid_t nid, int recursive); unsigned int snd_hda_get_num_devices(struct hda_codec *codec, hda_nid_t nid); -int snd_hda_get_devices(struct hda_codec *codec, hda_nid_t nid, - u8 *dev_list, int max_devices); +unsigned int snd_hda_get_devices(struct hda_codec *codec, hda_nid_t nid, + u8 *dev_list, unsigned int max_devices); int snd_hda_get_dev_select(struct hda_codec *codec, hda_nid_t nid); int snd_hda_set_dev_select(struct hda_codec *codec, hda_nid_t nid, int dev_id); diff --git a/sound/hda/common/codec.c b/sound/hda/common/codec.c index cab479111603..7a72d4c7ae91 100644 --- a/sound/hda/common/codec.c +++ b/sound/hda/common/codec.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -323,18 +324,16 @@ EXPORT_SYMBOL_GPL(snd_hda_get_num_devices); * Copy the device list. This info is dynamic and so not cached. * Currently called only from hda_proc.c, so not exported. */ -int snd_hda_get_devices(struct hda_codec *codec, hda_nid_t nid, - u8 *dev_list, int max_devices) +unsigned int snd_hda_get_devices(struct hda_codec *codec, hda_nid_t nid, + u8 *dev_list, unsigned int max_devices) { - unsigned int parm; - int i, dev_len, devices; + unsigned int parm, i, dev_len, devices; parm = snd_hda_get_num_devices(codec, nid); if (!parm) /* not multi-stream capable */ return 0; - dev_len = parm + 1; - dev_len = dev_len < max_devices ? dev_len : max_devices; + dev_len = min(parm + 1, max_devices); devices = 0; while (devices < dev_len) { -- cgit v1.2.3 From f22cc6f766f84496b260347d4f0d92cf95f30699 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 Aug 2025 16:42:09 -0700 Subject: net: ethtool: support including Flow Label in the flow hash for RSS Some modern NICs support including the IPv6 Flow Label in the flow hash for RSS queue selection. This is outside the old "Microsoft spec", but was included in the OCP NIC spec: [ ] RSS include flow label in the hash (configurable) https://www.opencompute.org/w/index.php?title=Core_Offloads#Receive_Side_Scaling RSS Flow Label hashing allows TCP Protective Load Balancing (PLB) to recover from receiver congestion / overload. Rx CPU/queue hotspots are relatively common for data ingest workloads, and so far we had to try to detect the condition at the RPC layer and reopen the connection. PLB lets us change the Flow Label and therefore Rx CPU on RTO, with minimal packet reordering. PLB reaction times are much faster, and can happen at any point in the connection, not just at RPC boundaries. Due to the nature of host processing (relatively long queues, other kernel subsystems masking IRQs for 100s of msecs) the risk of reordering within the host is higher than in the network. But for applications which need it - it is far preferable to potentially persistent overload of subset of queues. It is expected that the hash communicated to the host may change if the Flow Label changes. This may be surprising to some host software, but I don't expect the devices can compute two Toeplitz hashes, one with the Flow Label for queue selection and one without for the rx hash communicated to the host. Besides, changing the hash may potentially help to change the path thru host queues. User can disable NETIF_F_RXHASH if they require a stable flow hash. The name RXH_IP6_FL was chosen based on what we call Flow Label variables in IPv6 processing (fl). I prefer fl_lbl but that appears to be an fbnic-only spelling. We could spell out RXH_IP6_FLOW_LABEL but existing RXH_ defines are a lot more terse. Willem notes [1] that Flow Label is defined as identifying the flow and therefore including both the flow label _and_ the L4 header fields is not generally necessary. But it should not hurt so it's not explicitly prevented if the driver supports hashing on both at the same time. Link: https://lore.kernel.org/68483433b45e2_3cd66f29440@willemb.c.googlers.com.notmuch [1] Signed-off-by: Jakub Kicinski Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250811234212.580748-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 3 +++ include/uapi/linux/ethtool.h | 1 + net/ethtool/ioctl.c | 25 +++++++++++++++++++++++++ net/ethtool/rss.c | 27 ++++++++++++++------------- 4 files changed, 43 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 1bc1bd7d33c2..7a7594713f1f 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -204,6 +204,9 @@ definitions: doc: dst port in case of TCP/UDP/SCTP - name: gtp-teid + - + name: ip6-fl + doc: IPv6 Flow Label - name: discard value: 31 diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9e9afdd1238a..8bd5ea5469d9 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2380,6 +2380,7 @@ enum { #define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ #define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ #define RXH_GTP_TEID (1 << 8) /* teid in case of GTP */ +#define RXH_IP6_FL (1 << 9) /* IPv6 flow label */ #define RXH_DISCARD (1 << 31) #define RX_CLS_FLOW_DISC 0xffffffffffffffffULL diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 43a7854e784e..0b2a4d0573b3 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1014,6 +1014,28 @@ static bool flow_type_hashable(u32 flow_type) return false; } +static bool flow_type_v6(u32 flow_type) +{ + switch (flow_type) { + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV6_FLOW: + case GTPU_V6_FLOW: + case GTPC_V6_FLOW: + case GTPC_TEID_V6_FLOW: + case GTPU_EH_V6_FLOW: + case GTPU_UL_V6_FLOW: + case GTPU_DL_V6_FLOW: + return true; + } + + return false; +} + /* When adding a new type, update the assert and, if it's hashable, add it to * the flow_type_hashable switch case. */ @@ -1077,6 +1099,9 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr) if (rc) return rc; + if (info.data & RXH_IP6_FL && !flow_type_v6(info.flow_type)) + return -EINVAL; + if (info.flow_type & FLOW_RSS && info.rss_context && !ops->rxfh_per_ctx_fields) return -EINVAL; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 992e98abe9dd..202d95e8bf3e 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -536,35 +536,36 @@ void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context) #define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \ RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \ RXH_GTP_TEID | RXH_DISCARD) +#define RFH_MASKv6 (RFH_MASK | RXH_IP6_FL) static const struct nla_policy ethnl_rss_flows_policy[] = { [ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), }; const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = { -- cgit v1.2.3 From f443d7c9ed4642489d2f73a35e86df6228f65dfc Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Wed, 13 Aug 2025 16:17:15 +0800 Subject: dt-bindings: reset: thead,th1520-reset: add more VOSYS resets VOSYS contains more resets for a display pipeline, includes ones for the display controller (called DPU in the manual), the HDMI controller and 2 MIPI DSI controllers. Allocate IDs for these resets in the dt binding header file. Now all peripheral related VOSYS reset controls are here, only the bus matrix / IOPMP ones are missing, which shouldn't be messed with. Signed-off-by: Icenowy Zheng Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250813081716.2181843-2-uwu@icenowy.me Signed-off-by: Philipp Zabel --- include/dt-bindings/reset/thead,th1520-reset.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/reset/thead,th1520-reset.h b/include/dt-bindings/reset/thead,th1520-reset.h index 00459f160489..ee799286c175 100644 --- a/include/dt-bindings/reset/thead,th1520-reset.h +++ b/include/dt-bindings/reset/thead,th1520-reset.h @@ -12,5 +12,12 @@ #define TH1520_RESET_ID_NPU 2 #define TH1520_RESET_ID_WDT0 3 #define TH1520_RESET_ID_WDT1 4 +#define TH1520_RESET_ID_DPU_AHB 5 +#define TH1520_RESET_ID_DPU_AXI 6 +#define TH1520_RESET_ID_DPU_CORE 7 +#define TH1520_RESET_ID_DSI0_APB 8 +#define TH1520_RESET_ID_DSI1_APB 9 +#define TH1520_RESET_ID_HDMI 10 +#define TH1520_RESET_ID_HDMI_APB 11 #endif /* _DT_BINDINGS_TH1520_RESET_H */ -- cgit v1.2.3 From 2412f16c9afa7710778fc032139a6df38b68fd7c Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 14 Aug 2025 16:50:07 +0300 Subject: media: v4l2-common: Improve devm_v4l2_sensor_clk_get() documentation Remove the extra leading period and provide more elaborate explanation for why devm_v4l2_sensor_clk_get() is only allowed to be used on camera sensor devices. Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- include/media/v4l2-common.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index 9d6c236e8f14..39dd0c78d70f 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -623,7 +623,7 @@ int v4l2_link_freq_to_bitmap(struct device *dev, const u64 *fw_link_freqs, /** * devm_v4l2_sensor_clk_get - lookup and obtain a reference to a clock producer - * for a camera sensor. + * for a camera sensor * * @dev: device for v4l2 sensor clock "consumer" * @id: clock consumer ID @@ -643,6 +643,14 @@ int v4l2_link_freq_to_bitmap(struct device *dev, const u64 *fw_link_freqs, * * In this case try to set the clock-frequency value to the provided clock. * + * As the name indicates, this function may only be used on camera sensor + * devices. This is because generally only camera sensors do need a clock to + * query the frequency from, due to the requirement to configure the PLL for a + * given CSI-2 interface frequency where the sensor's external clock frequency + * is a factor. Additionally, the clock frequency tends to be available on ACPI + * firmware based systems for camera sensors specifically (if e.g. DisCo for + * Imaging compliant). + * * Returns a pointer to a struct clk on success or an error pointer on failure. */ struct clk *devm_v4l2_sensor_clk_get(struct device *dev, const char *id); -- cgit v1.2.3 From fb2f2a86f0cd9690357b9bb67af00d386a7e819f Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Mon, 16 Jun 2025 13:03:21 +0200 Subject: ice: cleanup capabilities evaluation When evaluating the capabilities field, the ICE_AQC_BIT_ROCEV2_LAG and ICE_AQC_BIT_SRIOV_LAG defines were both not using the BIT operator, instead simply setting a hex value that set the correct bits. While not inaccurate, this method is misleading, and when it is expanded in the following implementation it becomes even more confusing. Switch to using the BIT() operator to clarify what is being checked. Reviewed-by: Przemek Kitszel Reviewed-by: Aleksandr Loktionov Reviewed-by: Marcin Szycik Signed-off-by: Dave Ertman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- include/linux/net/intel/libie/adminq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h index 012b5d499c1a..dbe93f940ef0 100644 --- a/include/linux/net/intel/libie/adminq.h +++ b/include/linux/net/intel/libie/adminq.h @@ -192,8 +192,8 @@ LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_list_caps); #define LIBIE_AQC_CAPS_TX_SCHED_TOPO_COMP_MODE 0x0085 #define LIBIE_AQC_CAPS_NAC_TOPOLOGY 0x0087 #define LIBIE_AQC_CAPS_FW_LAG_SUPPORT 0x0092 -#define LIBIE_AQC_BIT_ROCEV2_LAG 0x01 -#define LIBIE_AQC_BIT_SRIOV_LAG 0x02 +#define LIBIE_AQC_BIT_ROCEV2_LAG BIT(0) +#define LIBIE_AQC_BIT_SRIOV_LAG BIT(1) #define LIBIE_AQC_CAPS_FLEX10 0x00F1 #define LIBIE_AQC_CAPS_CEM 0x00F2 -- cgit v1.2.3 From 37d1ade89606875c9cd6eb3b4ee416b7e1800fc4 Mon Sep 17 00:00:00 2001 From: Hans Zhang <18255117159@163.com> Date: Wed, 13 Aug 2025 22:45:24 +0800 Subject: PCI: Clean up __pci_find_next_cap_ttl() readability Refactor the __pci_find_next_cap_ttl() to improve code clarity: - Replace magic number 0x40 with PCI_STD_HEADER_SIZEOF. - Use ALIGN_DOWN() for position alignment instead of manual bitmask. - Extract PCI capability fields via FIELD_GET() with standardized masks. - Add necessary headers (linux/align.h). No functional changes intended. Signed-off-by: Hans Zhang <18255117159@163.com> Signed-off-by: Bjorn Helgaas Tested-by: Niklas Schnelle Acked-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20250813144529.303548-2-18255117159@163.com --- drivers/pci/pci.c | 9 +++++---- include/uapi/linux/pci_regs.h | 3 +++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b0f4d98036cd..40a5c87d9a6b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -432,17 +433,17 @@ static u8 __pci_find_next_cap_ttl(struct pci_bus *bus, unsigned int devfn, pci_bus_read_config_byte(bus, devfn, pos, &pos); while ((*ttl)--) { - if (pos < 0x40) + if (pos < PCI_STD_HEADER_SIZEOF) break; - pos &= ~3; + pos = ALIGN_DOWN(pos, 4); pci_bus_read_config_word(bus, devfn, pos, &ent); - id = ent & 0xff; + id = FIELD_GET(PCI_CAP_ID_MASK, ent); if (id == 0xff) break; if (id == cap) return pos; - pos = (ent >> 8); + pos = FIELD_GET(PCI_CAP_LIST_NEXT_MASK, ent); } return 0; } diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f5b17745de60..1bba99b46227 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -207,6 +207,9 @@ /* Capability lists */ +#define PCI_CAP_ID_MASK 0x00ff /* Capability ID mask */ +#define PCI_CAP_LIST_NEXT_MASK 0xff00 /* Next Capability Pointer mask */ + #define PCI_CAP_LIST_ID 0 /* Capability ID */ #define PCI_CAP_ID_PM 0x01 /* Power Management */ #define PCI_CAP_ID_AGP 0x02 /* Accelerated Graphics Port */ -- cgit v1.2.3 From 8d90041a0d285044b89629f539ca0685e156848b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 14 Aug 2025 11:22:13 -0400 Subject: dlm: handle release_option as unsigned Future patches will introduce a invalid argument check for undefined values. All values for release_option are positive integer values to not check on negative values as well we just change the parameter to unsigned int. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 8 ++++---- include/linux/dlm.h | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d986b7ef153d..8eadbf0fdf17 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -676,7 +676,7 @@ int dlm_new_user_lockspace(const char *name, const char *cluster, This is because there may be LKBs queued as ASTs that have been unlinked from their RSBs and are pending deletion once the AST has been delivered */ -static int lockspace_busy(struct dlm_ls *ls, int release_option) +static int lockspace_busy(struct dlm_ls *ls, unsigned int release_option) { struct dlm_lkb *lkb; unsigned long id; @@ -704,7 +704,7 @@ static int lockspace_busy(struct dlm_ls *ls, int release_option) return rv; } -static int release_lockspace(struct dlm_ls *ls, int release_option) +static int release_lockspace(struct dlm_ls *ls, unsigned int release_option) { int busy, rv; @@ -792,7 +792,7 @@ static int release_lockspace(struct dlm_ls *ls, int release_option) * See DLM_RELEASE defines for release_option values and their meaning. */ -int dlm_release_lockspace(void *lockspace, int force) +int dlm_release_lockspace(void *lockspace, unsigned int release_option) { struct dlm_ls *ls; int error; @@ -803,7 +803,7 @@ int dlm_release_lockspace(void *lockspace, int force) dlm_put_lockspace(ls); mutex_lock(&ls_lock); - error = release_lockspace(ls, force); + error = release_lockspace(ls, release_option); if (!error) ls_count--; if (!ls_count) diff --git a/include/linux/dlm.h b/include/linux/dlm.h index 108eb953eb18..34015a008b80 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -122,7 +122,8 @@ int dlm_new_lockspace(const char *name, const char *cluster, * release_option: see DLM_RELEASE values above. */ -int dlm_release_lockspace(dlm_lockspace_t *lockspace, int release_option); +int dlm_release_lockspace(dlm_lockspace_t *lockspace, + unsigned int release_option); /* * dlm_lock -- cgit v1.2.3 From 8e40210788636619404871df07445fa4590138b4 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 14 Aug 2025 11:22:14 -0400 Subject: dlm: check for undefined release_option values Checking on all undefined release_option values to return -EINVAL in case a user is providing them to dlm_release_lockspace(). Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 3 +++ include/linux/dlm.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 8eadbf0fdf17..ddaa76558706 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -797,6 +797,9 @@ int dlm_release_lockspace(void *lockspace, unsigned int release_option) struct dlm_ls *ls; int error; + if (release_option > __DLM_RELEASE_MAX) + return -EINVAL; + ls = dlm_find_lockspace_local(lockspace); if (!ls) return -EINVAL; diff --git a/include/linux/dlm.h b/include/linux/dlm.h index 34015a008b80..7e7b45b0d097 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -113,6 +113,7 @@ int dlm_new_lockspace(const char *name, const char *cluster, #define DLM_RELEASE_NORMAL 2 #define DLM_RELEASE_NO_EVENT 3 #define DLM_RELEASE_RECOVER 4 +#define __DLM_RELEASE_MAX DLM_RELEASE_RECOVER /* * dlm_release_lockspace -- cgit v1.2.3 From dab32f2576a39d5f54f3dbbbc718d92fa5109ce9 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 7 Aug 2025 15:55:39 +0200 Subject: s390/pci: Use pci_uevent_ers() in PCI recovery Issue uevents on s390 during PCI recovery using pci_uevent_ers() as done by EEH and AER PCIe recovery routines. Signed-off-by: Niklas Schnelle Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Wunner Link: https://patch.msgid.link/20250807-add_err_uevents-v5-2-adf85b0620b0@linux.ibm.com --- arch/s390/pci/pci_event.c | 3 +++ drivers/pci/pci-driver.c | 2 +- include/linux/pci.h | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index d930416d4c90..b95376041501 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -88,6 +88,7 @@ static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev, pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; ers_res = driver->err_handler->error_detected(pdev, pdev->error_state); + pci_uevent_ers(pdev, ers_res); if (ers_result_indicates_abort(ers_res)) pr_info("%s: Automatic recovery failed after initial reporting\n", pci_name(pdev)); else if (ers_res == PCI_ERS_RESULT_NEED_RESET) @@ -244,6 +245,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) ers_res = PCI_ERS_RESULT_RECOVERED; if (ers_res != PCI_ERS_RESULT_RECOVERED) { + pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT); pr_err("%s: Automatic recovery failed; operator intervention is required\n", pci_name(pdev)); status_str = "failed (driver can't recover)"; @@ -253,6 +255,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) pr_info("%s: The device is ready to resume operations\n", pci_name(pdev)); if (driver->err_handler->resume) driver->err_handler->resume(pdev); + pci_uevent_ers(pdev, PCI_ERS_RESULT_RECOVERED); out_unlock: pci_dev_unlock(pdev); zpci_report_status(zdev, "recovery", status_str); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 6405acdb5d0f..302d61783f6c 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1582,7 +1582,7 @@ static int pci_uevent(const struct device *dev, struct kobj_uevent_env *env) return 0; } -#if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH) +#if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH) || defined(CONFIG_S390) /** * pci_uevent_ers - emit a uevent during recovery path of PCI device * @pdev: PCI device undergoing error recovery diff --git a/include/linux/pci.h b/include/linux/pci.h index 59876de13860..7735acf6f349 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2764,7 +2764,7 @@ static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev) return false; } -#if defined(CONFIG_PCIEPORTBUS) || defined(CONFIG_EEH) +#if defined(CONFIG_PCIEPORTBUS) || defined(CONFIG_EEH) || defined(CONFIG_S390) void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type); #endif -- cgit v1.2.3 From 7fdc1d1b02e471c2ad4292705265706e003430a0 Mon Sep 17 00:00:00 2001 From: Raviteja Laggyshetty Date: Thu, 14 Aug 2025 14:54:19 +0000 Subject: dt-bindings: interconnect: document the RPMh Network-On-Chip interconnect in Glymur SoC Document the RPMh Network-On-Chip Interconnect in Glymur platform. Co-developed-by: Odelu Kukatla Signed-off-by: Odelu Kukatla Reviewed-by: "Rob Herring (Arm)" Signed-off-by: Raviteja Laggyshetty Link: https://lore.kernel.org/r/20250814-glymur-icc-v2-1-596cca6b6015@oss.qualcomm.com Signed-off-by: Georgi Djakov --- .../bindings/interconnect/qcom,glymur-rpmh.yaml | 172 +++++++++++++++++ .../dt-bindings/interconnect/qcom,glymur-rpmh.h | 205 +++++++++++++++++++++ 2 files changed, 377 insertions(+) create mode 100644 Documentation/devicetree/bindings/interconnect/qcom,glymur-rpmh.yaml create mode 100644 include/dt-bindings/interconnect/qcom,glymur-rpmh.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/interconnect/qcom,glymur-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,glymur-rpmh.yaml new file mode 100644 index 000000000000..d55a7bcf5591 --- /dev/null +++ b/Documentation/devicetree/bindings/interconnect/qcom,glymur-rpmh.yaml @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/interconnect/qcom,glymur-rpmh.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm RPMh Network-On-Chip Interconnect on GLYMUR + +maintainers: + - Raviteja Laggyshetty + +description: | + RPMh interconnect providers support system bandwidth requirements through + RPMh hardware accelerators known as Bus Clock Manager (BCM). The provider is + able to communicate with the BCM through the Resource State Coordinator (RSC) + associated with each execution environment. Provider nodes must point to at + least one RPMh device child node pertaining to their RSC and each provider + can map to multiple RPMh resources. + + See also: include/dt-bindings/interconnect/qcom,glymur-rpmh.h + +properties: + compatible: + enum: + - qcom,glymur-aggre1-noc + - qcom,glymur-aggre2-noc + - qcom,glymur-aggre3-noc + - qcom,glymur-aggre4-noc + - qcom,glymur-clk-virt + - qcom,glymur-cnoc-cfg + - qcom,glymur-cnoc-main + - qcom,glymur-hscnoc + - qcom,glymur-lpass-ag-noc + - qcom,glymur-lpass-lpiaon-noc + - qcom,glymur-lpass-lpicx-noc + - qcom,glymur-mc-virt + - qcom,glymur-mmss-noc + - qcom,glymur-nsinoc + - qcom,glymur-nsp-noc + - qcom,glymur-oobm-ss-noc + - qcom,glymur-pcie-east-anoc + - qcom,glymur-pcie-east-slv-noc + - qcom,glymur-pcie-west-anoc + - qcom,glymur-pcie-west-slv-noc + - qcom,glymur-system-noc + + reg: + maxItems: 1 + + clocks: + minItems: 1 + maxItems: 4 + +required: + - compatible + +allOf: + - $ref: qcom,rpmh-common.yaml# + - if: + properties: + compatible: + contains: + enum: + - qcom,glymur-clk-virt + - qcom,glymur-mc-virt + then: + properties: + reg: false + else: + required: + - reg + + - if: + properties: + compatible: + contains: + enum: + - qcom,glymur-pcie-west-anoc + then: + properties: + clocks: + items: + - description: aggre PCIE_3A WEST AXI clock + - description: aggre PCIE_3B WEST AXI clock + - description: aggre PCIE_4 WEST AXI clock + - description: aggre PCIE_6 WEST AXI clock + + - if: + properties: + compatible: + contains: + enum: + - qcom,glymur-pcie-east-anoc + then: + properties: + clocks: + items: + - description: aggre PCIE_5 EAST AXI clock + + - if: + properties: + compatible: + contains: + enum: + - qcom,glymur-aggre2-noc + then: + properties: + clocks: + items: + - description: aggre USB3 TERT AXI clock + - description: aggre USB4_2 AXI clock + - description: aggre UFS PHY AXI clock + + - if: + properties: + compatible: + contains: + enum: + - qcom,glymur-aggre4-noc + then: + properties: + clocks: + items: + - description: aggre USB3 PRIM AXI clock + - description: aggre USB3 SEC AXI clock + - description: aggre USB4_0 AXI clock + - description: aggre USB4_1 AXI clock + + - if: + properties: + compatible: + contains: + enum: + - qcom,glymur-pcie-west-anoc + - qcom,glymur-pcie-east-anoc + - qcom,glymur-aggre2-noc + - qcom,glymur-aggre4-noc + then: + required: + - clocks + else: + properties: + clocks: false + +unevaluatedProperties: false + +examples: + - | + #include + clk_virt: interconnect-0 { + compatible = "qcom,glymur-clk-virt"; + #interconnect-cells = <2>; + qcom,bcm-voters = <&apps_bcm_voter>; + }; + + aggre1_noc: interconnect@16e0000 { + compatible = "qcom,glymur-aggre1-noc"; + reg = <0x016e0000 0x14400>; + #interconnect-cells = <2>; + qcom,bcm-voters = <&apps_bcm_voter>; + }; + + aggre4_noc: interconnect@1740000 { + compatible = "qcom,glymur-aggre4-noc"; + reg = <0x01740000 0x14400>; + #interconnect-cells = <2>; + qcom,bcm-voters = <&apps_bcm_voter>; + clocks = <&gcc GCC_AGGRE_USB3_PRIM_AXI_CLK>, + <&gcc GCC_AGGRE_USB3_SEC_AXI_CLK>, + <&gcc GCC_AGGRE_USB4_0_AXI_CLK>, + <&gcc GCC_AGGRE_USB4_1_AXI_CLK>; + }; diff --git a/include/dt-bindings/interconnect/qcom,glymur-rpmh.h b/include/dt-bindings/interconnect/qcom,glymur-rpmh.h new file mode 100644 index 000000000000..6a0e754345e4 --- /dev/null +++ b/include/dt-bindings/interconnect/qcom,glymur-rpmh.h @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2025, Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef __DT_BINDINGS_INTERCONNECT_QCOM_GLYMUR_H +#define __DT_BINDINGS_INTERCONNECT_QCOM_GLYMUR_H + +#define MASTER_CRYPTO 0 +#define MASTER_SOCCP_PROC 1 +#define MASTER_QDSS_ETR 2 +#define MASTER_QDSS_ETR_1 3 +#define SLAVE_A1NOC_SNOC 4 + +#define MASTER_UFS_MEM 0 +#define MASTER_USB3_2 1 +#define MASTER_USB4_2 2 +#define SLAVE_A2NOC_SNOC 3 + +#define MASTER_QSPI_0 0 +#define MASTER_QUP_0 1 +#define MASTER_QUP_1 2 +#define MASTER_QUP_2 3 +#define MASTER_SP 4 +#define MASTER_SDCC_2 5 +#define MASTER_SDCC_4 6 +#define MASTER_USB2 7 +#define MASTER_USB3_MP 8 +#define SLAVE_A3NOC_SNOC 9 + +#define MASTER_USB3_0 0 +#define MASTER_USB3_1 1 +#define MASTER_USB4_0 2 +#define MASTER_USB4_1 3 +#define SLAVE_A4NOC_HSCNOC 4 + +#define MASTER_QUP_CORE_0 0 +#define MASTER_QUP_CORE_1 1 +#define MASTER_QUP_CORE_2 2 +#define SLAVE_QUP_CORE_0 3 +#define SLAVE_QUP_CORE_1 4 +#define SLAVE_QUP_CORE_2 5 + +#define MASTER_CNOC_CFG 0 +#define SLAVE_AHB2PHY_SOUTH 1 +#define SLAVE_AHB2PHY_NORTH 2 +#define SLAVE_AHB2PHY_2 3 +#define SLAVE_AHB2PHY_3 4 +#define SLAVE_AV1_ENC_CFG 5 +#define SLAVE_CAMERA_CFG 6 +#define SLAVE_CLK_CTL 7 +#define SLAVE_CRYPTO_0_CFG 8 +#define SLAVE_DISPLAY_CFG 9 +#define SLAVE_GFX3D_CFG 10 +#define SLAVE_IMEM_CFG 11 +#define SLAVE_PCIE_0_CFG 12 +#define SLAVE_PCIE_1_CFG 13 +#define SLAVE_PCIE_2_CFG 14 +#define SLAVE_PCIE_3A_CFG 15 +#define SLAVE_PCIE_3B_CFG 16 +#define SLAVE_PCIE_4_CFG 17 +#define SLAVE_PCIE_5_CFG 18 +#define SLAVE_PCIE_6_CFG 19 +#define SLAVE_PCIE_RSCC 20 +#define SLAVE_PDM 21 +#define SLAVE_PRNG 22 +#define SLAVE_QDSS_CFG 23 +#define SLAVE_QSPI_0 24 +#define SLAVE_QUP_0 25 +#define SLAVE_QUP_1 26 +#define SLAVE_QUP_2 27 +#define SLAVE_SDCC_2 28 +#define SLAVE_SDCC_4 29 +#define SLAVE_SMMUV3_CFG 30 +#define SLAVE_TCSR 31 +#define SLAVE_TLMM 32 +#define SLAVE_UFS_MEM_CFG 33 +#define SLAVE_USB2 34 +#define SLAVE_USB3_0 35 +#define SLAVE_USB3_1 36 +#define SLAVE_USB3_2 37 +#define SLAVE_USB3_MP 38 +#define SLAVE_USB4_0 39 +#define SLAVE_USB4_1 40 +#define SLAVE_USB4_2 41 +#define SLAVE_VENUS_CFG 42 +#define SLAVE_CNOC_PCIE_SLAVE_EAST_CFG 43 +#define SLAVE_CNOC_PCIE_SLAVE_WEST_CFG 44 +#define SLAVE_LPASS_QTB_CFG 45 +#define SLAVE_CNOC_MNOC_CFG 46 +#define SLAVE_NSP_QTB_CFG 47 +#define SLAVE_PCIE_EAST_ANOC_CFG 48 +#define SLAVE_PCIE_WEST_ANOC_CFG 49 +#define SLAVE_QDSS_STM 50 +#define SLAVE_TCU 51 + +#define MASTER_HSCNOC_CNOC 0 +#define SLAVE_AOSS 1 +#define SLAVE_IPC_ROUTER_CFG 2 +#define SLAVE_SOCCP 3 +#define SLAVE_TME_CFG 4 +#define SLAVE_APPSS 5 +#define SLAVE_CNOC_CFG 6 +#define SLAVE_BOOT_IMEM 7 +#define SLAVE_IMEM 8 + +#define MASTER_GPU_TCU 0 +#define MASTER_PCIE_TCU 1 +#define MASTER_SYS_TCU 2 +#define MASTER_APPSS_PROC 3 +#define MASTER_AGGRE_NOC_EAST 4 +#define MASTER_GFX3D 5 +#define MASTER_LPASS_GEM_NOC 6 +#define MASTER_MNOC_HF_MEM_NOC 7 +#define MASTER_MNOC_SF_MEM_NOC 8 +#define MASTER_COMPUTE_NOC 9 +#define MASTER_PCIE_EAST 10 +#define MASTER_PCIE_WEST 11 +#define MASTER_SNOC_SF_MEM_NOC 12 +#define MASTER_WLAN_Q6 13 +#define MASTER_GIC 14 +#define SLAVE_HSCNOC_CNOC 15 +#define SLAVE_LLCC 16 +#define SLAVE_PCIE_EAST 17 +#define SLAVE_PCIE_WEST 18 + +#define MASTER_LPIAON_NOC 0 +#define SLAVE_LPASS_GEM_NOC 1 + +#define MASTER_LPASS_LPINOC 0 +#define SLAVE_LPIAON_NOC_LPASS_AG_NOC 1 + +#define MASTER_LPASS_PROC 0 +#define SLAVE_LPICX_NOC_LPIAON_NOC 1 + +#define MASTER_LLCC 0 +#define SLAVE_EBI1 1 + +#define MASTER_AV1_ENC 0 +#define MASTER_CAMNOC_HF 1 +#define MASTER_CAMNOC_ICP 2 +#define MASTER_CAMNOC_SF 3 +#define MASTER_EVA 4 +#define MASTER_MDP 5 +#define MASTER_CDSP_HCP 6 +#define MASTER_VIDEO 7 +#define MASTER_VIDEO_CV_PROC 8 +#define MASTER_VIDEO_V_PROC 9 +#define MASTER_CNOC_MNOC_CFG 10 +#define SLAVE_MNOC_HF_MEM_NOC 11 +#define SLAVE_MNOC_SF_MEM_NOC 12 +#define SLAVE_SERVICE_MNOC 13 + +#define MASTER_CPUCP 0 +#define SLAVE_NSINOC_SYSTEM_NOC 1 +#define SLAVE_SERVICE_NSINOC 2 + +#define MASTER_CDSP_PROC 0 +#define SLAVE_NSP0_HSC_NOC 1 + +#define MASTER_OOBMSS_SP_PROC 0 +#define SLAVE_OOBMSS_SNOC 1 + +#define MASTER_PCIE_EAST_ANOC_CFG 0 +#define MASTER_PCIE_0 1 +#define MASTER_PCIE_1 2 +#define MASTER_PCIE_5 3 +#define SLAVE_PCIE_EAST_MEM_NOC 4 +#define SLAVE_SERVICE_PCIE_EAST_AGGRE_NOC 5 + +#define MASTER_HSCNOC_PCIE_EAST 0 +#define MASTER_CNOC_PCIE_EAST_SLAVE_CFG 1 +#define SLAVE_HSCNOC_PCIE_EAST_MS_MPU_CFG 2 +#define SLAVE_SERVICE_PCIE_EAST 3 +#define SLAVE_PCIE_0 4 +#define SLAVE_PCIE_1 5 +#define SLAVE_PCIE_5 6 + +#define MASTER_PCIE_WEST_ANOC_CFG 0 +#define MASTER_PCIE_2 1 +#define MASTER_PCIE_3A 2 +#define MASTER_PCIE_3B 3 +#define MASTER_PCIE_4 4 +#define MASTER_PCIE_6 5 +#define SLAVE_PCIE_WEST_MEM_NOC 6 +#define SLAVE_SERVICE_PCIE_WEST_AGGRE_NOC 7 + +#define MASTER_HSCNOC_PCIE_WEST 0 +#define MASTER_CNOC_PCIE_WEST_SLAVE_CFG 1 +#define SLAVE_HSCNOC_PCIE_WEST_MS_MPU_CFG 2 +#define SLAVE_SERVICE_PCIE_WEST 3 +#define SLAVE_PCIE_2 4 +#define SLAVE_PCIE_3A 5 +#define SLAVE_PCIE_3B 6 +#define SLAVE_PCIE_4 7 +#define SLAVE_PCIE_6 8 + +#define MASTER_A1NOC_SNOC 0 +#define MASTER_A2NOC_SNOC 1 +#define MASTER_A3NOC_SNOC 2 +#define MASTER_NSINOC_SNOC 3 +#define MASTER_OOBMSS 4 +#define SLAVE_SNOC_GEM_NOC_SF 5 + +#endif -- cgit v1.2.3 From f39494089aaa1022008eee245fb83ef1ae911b6d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Jul 2025 21:09:13 -0700 Subject: srcu: Move rcu_is_watching() checks to srcu_read_{,un}lock_fast() The rcu_is_watching() warnings are currently in the SRCU-tree implementations of __srcu_read_lock_fast() and __srcu_read_unlock_fast(). However, this makes it difficult to create _notrace variants of srcu_read_lock_fast() and srcu_read_unlock_fast(). This commit therefore moves these checks to srcu_read_lock_fast(), srcu_read_unlock_fast(), srcu_down_read_fast(), and srcu_up_read_fast(). Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: --- include/linux/srcu.h | 4 ++++ include/linux/srcutree.h | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f179700fecaf..478c73d067f7 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -275,6 +275,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * { struct srcu_ctr __percpu *retval; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); retval = __srcu_read_lock_fast(ssp); rcu_try_lock_acquire(&ssp->dep_map); @@ -295,6 +296,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_down_read_fast()."); srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); return __srcu_read_lock_fast(ssp); } @@ -389,6 +391,7 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock_fast(ssp, scp); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } /** @@ -405,6 +408,7 @@ static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); __srcu_read_unlock_fast(ssp, scp); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_up_read_fast()."); } /** diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index bf44d8d1e69e..043b5a67ef71 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -244,7 +244,6 @@ static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct { struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) this_cpu_inc(scp->srcu_locks.counter); /* Y */ else @@ -275,7 +274,6 @@ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ else atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); /* Z */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -- cgit v1.2.3 From 7e2a2d060da4860af37e1000dc62a30a1551d9e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Jul 2025 09:12:16 -0700 Subject: srcu: Add srcu_read_lock_fast_notrace() and srcu_read_unlock_fast_notrace() This commit adds no-trace variants of the srcu_read_lock_fast() and srcu_read_unlock_fast() functions for tracing use. [ paulmck: Apply notrace feedback from Joel Fernandes, Steven Rostedt, and Mathieu Desnoyers. ] [ paulmck: Apply excess-notrace feedback from Boqun Feng. ] Link: https://lore.kernel.org/all/20250721162433.10454-1-paulmck@kernel.org Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: --- include/linux/srcu.h | 25 +++++++++++++++++++++++++ include/linux/srcutree.h | 5 +++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 478c73d067f7..7a692bf8f99b 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -282,6 +282,20 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * return retval; } +/* + * Used by tracing, cannot be traced and cannot call lockdep. + * See srcu_read_lock_fast() for more information. + */ +static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_struct *ssp) + __acquires(ssp) +{ + struct srcu_ctr __percpu *retval; + + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + retval = __srcu_read_lock_fast(ssp); + return retval; +} + /** * srcu_down_read_fast - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. @@ -394,6 +408,17 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } +/* + * Used by tracing, cannot be traced and cannot call lockdep. + * See srcu_read_unlock_fast() for more information. + */ +static inline void srcu_read_unlock_fast_notrace(struct srcu_struct *ssp, + struct srcu_ctr __percpu *scp) __releases(ssp) +{ + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); + __srcu_read_unlock_fast(ssp, scp); +} + /** * srcu_up_read_fast - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 043b5a67ef71..4d2fee4d3828 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -240,7 +240,7 @@ static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ss * on architectures that support NMIs but do not supply NMI-safe * implementations of this_cpu_inc(). */ -static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) +static inline struct srcu_ctr __percpu notrace *__srcu_read_lock_fast(struct srcu_struct *ssp) { struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); @@ -267,7 +267,8 @@ static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct * on architectures that support NMIs but do not supply NMI-safe * implementations of this_cpu_inc(). */ -static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +static inline void notrace +__srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) { barrier(); /* Avoid leaking the critical section. */ if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) -- cgit v1.2.3 From cacadb630375b8c30ca4d0300812178bb884c0b0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Jul 2025 09:19:39 -0700 Subject: srcu: Add guards for notrace variants of SRCU-fast readers This adds the usual scoped_guard(srcu_fast_notrace, &my_srcu) and guard(srcu_fast_notrace)(&my_srcu). Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: --- include/linux/srcu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 7a692bf8f99b..ada65b58bc4c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -515,4 +515,9 @@ DEFINE_LOCK_GUARD_1(srcu_fast, struct srcu_struct, srcu_read_unlock_fast(_T->lock, _T->scp), struct srcu_ctr __percpu *scp) +DEFINE_LOCK_GUARD_1(srcu_fast_notrace, struct srcu_struct, + _T->scp = srcu_read_lock_fast_notrace(_T->lock), + srcu_read_unlock_fast_notrace(_T->lock, _T->scp), + struct srcu_ctr __percpu *scp) + #endif -- cgit v1.2.3 From 28f073b38372b99d8d33ff5e63897d28419bda20 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Mon, 16 Jun 2025 13:03:23 +0200 Subject: ice: Implement support for SRIOV VFs across Active/Active bonds This patch implements the software flows to handle SRIOV VF communication across an Active/Active link aggregate. The same restrictions apply as are in place for the support of Active/Backup bonds. - the two interfaces must be on the same NIC - the FW LLDP engine needs to be disabled - the DDP package that supports VF LAG must be loaded on device - the two interfaces must have the same QoS config - only the first interface added to the bond will have VF support - the interface with VFs must be in switchdev mode With the additional requirement of - the version of the FW on the NIC needs to have VF Active/Active support This requirement is indicated in the capabilities struct associated with the NVM loaded on the NIC. The balancing of traffic between the two interfaces is done on a queue basis. Taking the queues allocated to all of the VFs as a whole, one half of them will be distributed to each interface. When a link goes down, then the queues allocated to the down interface will migrate to the active port. When the down port comes back up, then the same queues as were originally assigned there will be moved back. Co-developed-by: Marcin Szycik Signed-off-by: Marcin Szycik Signed-off-by: Dave Ertman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 + drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 4 + drivers/net/ethernet/intel/ice/ice_common.c | 15 +- drivers/net/ethernet/intel/ice/ice_common.h | 2 +- drivers/net/ethernet/intel/ice/ice_lag.c | 772 +++++++++++++++++++++--- drivers/net/ethernet/intel/ice/ice_lag.h | 21 +- drivers/net/ethernet/intel/ice/ice_type.h | 1 + include/linux/net/intel/libie/adminq.h | 1 + 8 files changed, 713 insertions(+), 104 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 2098f00b3cd3..a83fdd22cbe4 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -203,6 +203,7 @@ enum ice_feature { ICE_F_GCS, ICE_F_ROCE_LAG, ICE_F_SRIOV_LAG, + ICE_F_SRIOV_AA_LAG, ICE_F_MBX_LIMIT, ICE_F_MAX }; diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 3bd3ea3af888..caae1780fd37 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -2060,6 +2060,10 @@ struct ice_aqc_cfg_txqs { #define ICE_AQC_Q_CFG_SRC_PRT_M 0x7 #define ICE_AQC_Q_CFG_DST_PRT_S 3 #define ICE_AQC_Q_CFG_DST_PRT_M (0x7 << ICE_AQC_Q_CFG_DST_PRT_S) +#define ICE_AQC_Q_CFG_MODE_M GENMASK(7, 6) +#define ICE_AQC_Q_CFG_MODE_SAME_PF 0x0 +#define ICE_AQC_Q_CFG_MODE_GIVE_OWN 0x1 +#define ICE_AQC_Q_CFG_MODE_KEEP_OWN 0x2 u8 time_out; #define ICE_AQC_Q_CFG_TIMEOUT_S 2 #define ICE_AQC_Q_CFG_TIMEOUT_M (0x1F << ICE_AQC_Q_CFG_TIMEOUT_S) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 209f42045fea..808870539667 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -2424,6 +2424,9 @@ ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, caps->sriov_lag = number & LIBIE_AQC_BIT_SRIOV_LAG; ice_debug(hw, ICE_DBG_INIT, "%s: sriov_lag = %u\n", prefix, caps->sriov_lag); + caps->sriov_aa_lag = number & LIBIE_AQC_BIT_SRIOV_AA_LAG; + ice_debug(hw, ICE_DBG_INIT, "%s: sriov_aa_lag = %u\n", + prefix, caps->sriov_aa_lag); break; case LIBIE_AQC_CAPS_TX_SCHED_TOPO_COMP_MODE: caps->tx_sched_topo_comp_mode_en = (number == 1); @@ -4712,24 +4715,24 @@ do_aq: } /** - * ice_aq_cfg_lan_txq + * ice_aq_cfg_lan_txq - send AQ command 0x0C32 to FW * @hw: pointer to the hardware structure * @buf: buffer for command * @buf_size: size of buffer in bytes * @num_qs: number of queues being configured * @oldport: origination lport * @newport: destination lport + * @mode: cmd_type for move to use * @cd: pointer to command details structure or NULL * * Move/Configure LAN Tx queue (0x0C32) * - * There is a better AQ command to use for moving nodes, so only coding - * this one for configuring the node. + * Return: Zero on success, associated error code on failure. */ int ice_aq_cfg_lan_txq(struct ice_hw *hw, struct ice_aqc_cfg_txqs_buf *buf, u16 buf_size, u16 num_qs, u8 oldport, u8 newport, - struct ice_sq_cd *cd) + u8 mode, struct ice_sq_cd *cd) { struct ice_aqc_cfg_txqs *cmd; struct libie_aq_desc desc; @@ -4742,10 +4745,12 @@ ice_aq_cfg_lan_txq(struct ice_hw *hw, struct ice_aqc_cfg_txqs_buf *buf, if (!buf) return -EINVAL; - cmd->cmd_type = ICE_AQC_Q_CFG_TC_CHNG; + cmd->cmd_type = mode; cmd->num_qs = num_qs; cmd->port_num_chng = (oldport & ICE_AQC_Q_CFG_SRC_PRT_M); cmd->port_num_chng |= FIELD_PREP(ICE_AQC_Q_CFG_DST_PRT_M, newport); + cmd->port_num_chng |= FIELD_PREP(ICE_AQC_Q_CFG_MODE_M, + ICE_AQC_Q_CFG_MODE_KEEP_OWN); cmd->time_out = FIELD_PREP(ICE_AQC_Q_CFG_TIMEOUT_M, 5); cmd->blocked_cgds = 0; diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index 60320cdf7804..dba15ad315a6 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -270,7 +270,7 @@ ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 q_handle, int ice_aq_cfg_lan_txq(struct ice_hw *hw, struct ice_aqc_cfg_txqs_buf *buf, u16 buf_size, u16 num_qs, u8 oldport, u8 newport, - struct ice_sq_cd *cd); + u8 mode, struct ice_sq_cd *cd); int ice_replay_vsi(struct ice_hw *hw, u16 vsi_handle); void ice_replay_post(struct ice_hw *hw); struct ice_q_ctx * diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index 72284555b98a..80312e1dcf7f 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -14,6 +14,9 @@ static const u8 lacp_train_pkt[ICE_TRAIN_PKT_LEN] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x88, 0x09, 0, 0 }; +static const u8 act_act_train_pkt[ICE_TRAIN_PKT_LEN] = { 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 }; #define ICE_RECIPE_LEN 64 #define ICE_LAG_SRIOV_CP_RECIPE 10 @@ -48,10 +51,10 @@ static void ice_lag_set_primary(struct ice_lag *lag) } /** - * ice_lag_set_backup - set PF LAG state to Backup + * ice_lag_set_bkup - set PF LAG state to Backup * @lag: LAG info struct */ -static void ice_lag_set_backup(struct ice_lag *lag) +static void ice_lag_set_bkup(struct ice_lag *lag) { struct ice_pf *pf = lag->pf; @@ -337,25 +340,15 @@ ice_lag_cfg_drop_fltr(struct ice_lag *lag, bool add) } /** - * ice_lag_cfg_pf_fltrs - set filters up for new active port + * ice_lag_cfg_pf_fltrs_act_bkup - set filters up for new active port * @lag: local interfaces lag struct - * @ptr: opaque data containing notifier event + * @bonding_info: netdev event bonding info */ static void -ice_lag_cfg_pf_fltrs(struct ice_lag *lag, void *ptr) +ice_lag_cfg_pf_fltrs_act_bkup(struct ice_lag *lag, + struct netdev_bonding_info *bonding_info) { - struct netdev_notifier_bonding_info *info = ptr; - struct netdev_bonding_info *bonding_info; - struct net_device *event_netdev; - struct device *dev; - - event_netdev = netdev_notifier_info_to_dev(ptr); - /* not for this netdev */ - if (event_netdev != lag->netdev) - return; - - bonding_info = &info->bonding_info; - dev = ice_pf_to_dev(lag->pf); + struct device *dev = ice_pf_to_dev(lag->pf); /* interface not active - remove old default VSI rule */ if (bonding_info->slave.state && lag->pf_rx_rule_id) { @@ -376,12 +369,13 @@ ice_lag_cfg_pf_fltrs(struct ice_lag *lag, void *ptr) } /** - * ice_lag_cfg_cp_fltr - configure filter for control packets + * ice_lag_cfg_lp_fltr - configure lport filters * @lag: local interface's lag struct * @add: add or remove rule + * @cp: control packet only or general PF lport rule */ static void -ice_lag_cfg_cp_fltr(struct ice_lag *lag, bool add) +ice_lag_cfg_lp_fltr(struct ice_lag *lag, bool add, bool cp) { struct ice_sw_rule_lkup_rx_tx *s_rule; struct ice_vsi *vsi = lag->pf->vsi[0]; @@ -395,8 +389,17 @@ ice_lag_cfg_cp_fltr(struct ice_lag *lag, bool add) } if (add) { - s_rule->hdr.type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); - s_rule->recipe_id = cpu_to_le16(ICE_LAG_SRIOV_CP_RECIPE); + if (cp) { + s_rule->recipe_id = + cpu_to_le16(ICE_LAG_SRIOV_CP_RECIPE); + memcpy(s_rule->hdr_data, lacp_train_pkt, + ICE_TRAIN_PKT_LEN); + } else { + s_rule->recipe_id = cpu_to_le16(lag->act_act_recipe); + memcpy(s_rule->hdr_data, act_act_train_pkt, + ICE_TRAIN_PKT_LEN); + } + s_rule->src = cpu_to_le16(vsi->port_info->lport); s_rule->act = cpu_to_le32(ICE_FWD_TO_VSI | ICE_SINGLE_ACT_LAN_ENABLE | @@ -404,27 +407,66 @@ ice_lag_cfg_cp_fltr(struct ice_lag *lag, bool add) FIELD_PREP(ICE_SINGLE_ACT_VSI_ID_M, vsi->vsi_num)); s_rule->hdr_len = cpu_to_le16(ICE_TRAIN_PKT_LEN); - memcpy(s_rule->hdr_data, lacp_train_pkt, ICE_TRAIN_PKT_LEN); + s_rule->hdr.type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); opc = ice_aqc_opc_add_sw_rules; } else { opc = ice_aqc_opc_remove_sw_rules; - s_rule->index = cpu_to_le16(lag->cp_rule_idx); + if (cp) + s_rule->index = cpu_to_le16(lag->cp_rule_idx); + else + s_rule->index = cpu_to_le16(lag->act_act_rule_idx); } if (ice_aq_sw_rules(&lag->pf->hw, s_rule, buf_len, 1, opc, NULL)) { - netdev_warn(lag->netdev, "Error %s CP rule for fail-over\n", - add ? "ADDING" : "REMOVING"); + netdev_warn(lag->netdev, "Error %s %s rule for aggregate\n", + add ? "ADDING" : "REMOVING", + cp ? "CONTROL PACKET" : "LPORT"); goto err_cp_free; } - if (add) - lag->cp_rule_idx = le16_to_cpu(s_rule->index); - else - lag->cp_rule_idx = 0; + if (add) { + if (cp) + lag->cp_rule_idx = le16_to_cpu(s_rule->index); + else + lag->act_act_rule_idx = le16_to_cpu(s_rule->index); + } else { + if (cp) + lag->cp_rule_idx = 0; + else + lag->act_act_rule_idx = 0; + } err_cp_free: kfree(s_rule); } +/** + * ice_lag_cfg_pf_fltrs - set filters up for PF traffic + * @lag: local interfaces lag struct + * @ptr: opaque data containing notifier event + */ +static void +ice_lag_cfg_pf_fltrs(struct ice_lag *lag, void *ptr) +{ + struct netdev_notifier_bonding_info *info = ptr; + struct netdev_bonding_info *bonding_info; + struct net_device *event_netdev; + + event_netdev = netdev_notifier_info_to_dev(ptr); + if (event_netdev != lag->netdev) + return; + + bonding_info = &info->bonding_info; + + if (lag->bond_aa) { + if (lag->need_fltr_cfg) { + ice_lag_cfg_lp_fltr(lag, true, false); + lag->need_fltr_cfg = false; + } + } else { + ice_lag_cfg_pf_fltrs_act_bkup(lag, bonding_info); + } +} + /** * ice_display_lag_info - print LAG info * @lag: LAG info struct @@ -648,7 +690,7 @@ ice_lag_move_vf_node_tc(struct ice_lag *lag, u8 oldport, u8 newport, } if (ice_aq_cfg_lan_txq(&lag->pf->hw, qbuf, qbuf_size, valq, oldport, - newport, NULL)) { + newport, ICE_AQC_Q_CFG_TC_CHNG, NULL)) { dev_warn(dev, "Failure to configure queues for LAG failover\n"); goto qbuf_err; } @@ -784,10 +826,17 @@ void ice_lag_move_new_vf_nodes(struct ice_vf *vf) if (lag->upper_netdev) ice_lag_build_netdev_list(lag, &ndlist); - if (ice_is_feature_supported(pf, ICE_F_SRIOV_LAG) && - lag->bonded && lag->primary && pri_port != act_port && - !list_empty(lag->netdev_head)) - ice_lag_move_single_vf_nodes(lag, pri_port, act_port, vsi->idx); + if (lag->bonded && lag->primary && !list_empty(lag->netdev_head)) { + if (lag->bond_aa && + ice_is_feature_supported(pf, ICE_F_SRIOV_AA_LAG)) + ice_lag_aa_failover(lag, ICE_LAGS_IDX, NULL); + + if (!lag->bond_aa && + ice_is_feature_supported(pf, ICE_F_SRIOV_LAG) && + pri_port != act_port) + ice_lag_move_single_vf_nodes(lag, pri_port, act_port, + vsi->idx); + } ice_lag_destroy_netdev_list(lag, &ndlist); @@ -851,11 +900,20 @@ u8 ice_lag_prepare_vf_reset(struct ice_lag *lag) u8 pri_prt, act_prt; if (lag && lag->bonded && lag->primary && lag->upper_netdev) { - pri_prt = lag->pf->hw.port_info->lport; - act_prt = lag->active_port; - if (act_prt != pri_prt && act_prt != ICE_LAG_INVALID_PORT) { - ice_lag_move_vf_nodes_cfg(lag, act_prt, pri_prt); - return act_prt; + if (!lag->bond_aa) { + pri_prt = lag->pf->hw.port_info->lport; + act_prt = lag->active_port; + if (act_prt != pri_prt && + act_prt != ICE_LAG_INVALID_PORT) { + ice_lag_move_vf_nodes_cfg(lag, act_prt, pri_prt); + return act_prt; + } + } else { + if (lag->port_bitmap & ICE_LAGS_M) { + lag->port_bitmap &= ~ICE_LAGS_M; + ice_lag_aa_failover(lag, ICE_LAGP_IDX, NULL); + lag->port_bitmap |= ICE_LAGS_M; + } } } @@ -873,10 +931,15 @@ void ice_lag_complete_vf_reset(struct ice_lag *lag, u8 act_prt) { u8 pri_prt; - if (lag && lag->bonded && lag->primary && - act_prt != ICE_LAG_INVALID_PORT) { - pri_prt = lag->pf->hw.port_info->lport; - ice_lag_move_vf_nodes_cfg(lag, pri_prt, act_prt); + if (lag && lag->bonded && lag->primary) { + if (!lag->bond_aa) { + pri_prt = lag->pf->hw.port_info->lport; + if (act_prt != ICE_LAG_INVALID_PORT) + ice_lag_move_vf_nodes_cfg(lag, pri_prt, + act_prt); + } else { + ice_lag_aa_failover(lag, ICE_LAGS_IDX, NULL); + } } } @@ -912,7 +975,7 @@ static void ice_lag_info_event(struct ice_lag *lag, void *ptr) } if (bonding_info->slave.state) - ice_lag_set_backup(lag); + ice_lag_set_bkup(lag); else ice_lag_set_primary(lag); @@ -920,6 +983,295 @@ lag_out: ice_display_lag_info(lag); } +/** + * ice_lag_aa_qbuf_recfg - fill a single queue buffer for recfg cmd + * @hw: HW struct that contains the queue context + * @qbuf: pointer to single queue buffer + * @vsi_num: index of the VF VSI in PF space + * @qnum: queue index + * + * Return: Zero on success, error code on failure. + */ +static int +ice_lag_aa_qbuf_recfg(struct ice_hw *hw, struct ice_aqc_cfg_txqs_buf *qbuf, + u16 vsi_num, int qnum) +{ + struct ice_pf *pf = hw->back; + struct ice_q_ctx *q_ctx; + u16 q_id; + + q_ctx = ice_get_lan_q_ctx(hw, vsi_num, 0, qnum); + if (!q_ctx) { + dev_dbg(ice_hw_to_dev(hw), "LAG queue %d no Q context\n", qnum); + return -ENOENT; + } + + if (q_ctx->q_teid == ICE_INVAL_TEID) { + dev_dbg(ice_hw_to_dev(hw), "LAG queue %d INVAL TEID\n", qnum); + return -EINVAL; + } + + if (q_ctx->q_handle == ICE_INVAL_Q_HANDLE) { + dev_dbg(ice_hw_to_dev(hw), "LAG queue %d INVAL Q HANDLE\n", qnum); + return -EINVAL; + } + + q_id = pf->vsi[vsi_num]->txq_map[q_ctx->q_handle]; + qbuf->queue_info[0].q_handle = cpu_to_le16(q_id); + qbuf->queue_info[0].tc = 0; + qbuf->queue_info[0].q_teid = cpu_to_le32(q_ctx->q_teid); + + return 0; +} + +/** + * ice_lag_aa_move_vf_qs - Move some/all VF queues to destination + * @lag: primary interface's lag struct + * @dest: index of destination port + * @vsi_num: index of VF VSI in PF space + * @all: if true move all queues to destination + * @odd: VF wide q indicator for odd/even + * @e_pf: PF struct for the event interface + * + * the parameter "all" is to control whether we are splitting the queues + * between two interfaces or moving them all to the destination interface + */ +static void ice_lag_aa_move_vf_qs(struct ice_lag *lag, u8 dest, u16 vsi_num, + bool all, bool *odd, struct ice_pf *e_pf) +{ + DEFINE_RAW_FLEX(struct ice_aqc_cfg_txqs_buf, qbuf, queue_info, 1); + struct ice_hw *old_hw, *new_hw, *pri_hw, *sec_hw; + struct device *dev = ice_pf_to_dev(lag->pf); + struct ice_vsi_ctx *pv_ctx, *sv_ctx; + struct ice_lag_netdev_list ndlist; + u16 num_q, qbuf_size, sec_vsi_num; + u8 pri_lport, sec_lport; + u32 pvf_teid, svf_teid; + u16 vf_id; + + vf_id = lag->pf->vsi[vsi_num]->vf->vf_id; + /* If sec_vf[] not defined, then no second interface to share with */ + if (lag->sec_vf[vf_id]) + sec_vsi_num = lag->sec_vf[vf_id]->idx; + else + return; + + pri_lport = lag->bond_lport_pri; + sec_lport = lag->bond_lport_sec; + + if (pri_lport == ICE_LAG_INVALID_PORT || + sec_lport == ICE_LAG_INVALID_PORT) + return; + + if (!e_pf) + ice_lag_build_netdev_list(lag, &ndlist); + + pri_hw = &lag->pf->hw; + if (e_pf && lag->pf != e_pf) + sec_hw = &e_pf->hw; + else + sec_hw = ice_lag_find_hw_by_lport(lag, sec_lport); + + if (!pri_hw || !sec_hw) + return; + + if (dest == ICE_LAGP_IDX) { + struct ice_vsi *vsi; + + vsi = ice_get_main_vsi(lag->pf); + if (!vsi) + return; + + old_hw = sec_hw; + new_hw = pri_hw; + ice_lag_config_eswitch(lag, vsi->netdev); + } else { + struct ice_pf *sec_pf = sec_hw->back; + struct ice_vsi *vsi; + + vsi = ice_get_main_vsi(sec_pf); + if (!vsi) + return; + + old_hw = pri_hw; + new_hw = sec_hw; + ice_lag_config_eswitch(lag, vsi->netdev); + } + + pv_ctx = ice_get_vsi_ctx(pri_hw, vsi_num); + if (!pv_ctx) { + dev_warn(dev, "Unable to locate primary VSI %d context for LAG failover\n", + vsi_num); + return; + } + + sv_ctx = ice_get_vsi_ctx(sec_hw, sec_vsi_num); + if (!sv_ctx) { + dev_warn(dev, "Unable to locate secondary VSI %d context for LAG failover\n", + vsi_num); + return; + } + + num_q = pv_ctx->num_lan_q_entries[0]; + qbuf_size = __struct_size(qbuf); + + /* Suspend traffic for primary VSI VF */ + pvf_teid = le32_to_cpu(pv_ctx->sched.vsi_node[0]->info.node_teid); + ice_sched_suspend_resume_elems(pri_hw, 1, &pvf_teid, true); + + /* Suspend traffic for secondary VSI VF */ + svf_teid = le32_to_cpu(sv_ctx->sched.vsi_node[0]->info.node_teid); + ice_sched_suspend_resume_elems(sec_hw, 1, &svf_teid, true); + + for (int i = 0; i < num_q; i++) { + struct ice_sched_node *n_prt, *q_node, *parent; + struct ice_port_info *pi, *new_pi; + struct ice_vsi_ctx *src_ctx; + struct ice_sched_node *p; + struct ice_q_ctx *q_ctx; + u16 dst_vsi_num; + + pi = old_hw->port_info; + new_pi = new_hw->port_info; + + *odd = !(*odd); + if ((dest == ICE_LAGP_IDX && *odd && !all) || + (dest == ICE_LAGS_IDX && !(*odd) && !all) || + lag->q_home[vf_id][i] == dest) + continue; + + if (dest == ICE_LAGP_IDX) + dst_vsi_num = vsi_num; + else + dst_vsi_num = sec_vsi_num; + + n_prt = ice_sched_get_free_qparent(new_hw->port_info, + dst_vsi_num, 0, + ICE_SCHED_NODE_OWNER_LAN); + if (!n_prt) + continue; + + q_ctx = ice_get_lan_q_ctx(pri_hw, vsi_num, 0, i); + if (!q_ctx) + continue; + + if (dest == ICE_LAGP_IDX) + src_ctx = sv_ctx; + else + src_ctx = pv_ctx; + + q_node = ice_sched_find_node_by_teid(src_ctx->sched.vsi_node[0], + q_ctx->q_teid); + if (!q_node) + continue; + + qbuf->src_parent_teid = q_node->info.parent_teid; + qbuf->dst_parent_teid = n_prt->info.node_teid; + + /* Move the node in the HW/FW */ + if (ice_lag_aa_qbuf_recfg(pri_hw, qbuf, vsi_num, i)) + continue; + + if (dest == ICE_LAGP_IDX) + ice_aq_cfg_lan_txq(pri_hw, qbuf, qbuf_size, 1, + sec_lport, pri_lport, + ICE_AQC_Q_CFG_MOVE_TC_CHNG, + NULL); + else + ice_aq_cfg_lan_txq(pri_hw, qbuf, qbuf_size, 1, + pri_lport, sec_lport, + ICE_AQC_Q_CFG_MOVE_TC_CHNG, + NULL); + + /* Move the node in the SW */ + parent = q_node->parent; + if (!parent) + continue; + + for (int n = 0; n < parent->num_children; n++) { + int j; + + if (parent->children[n] != q_node) + continue; + + for (j = n + 1; j < parent->num_children; + j++) { + parent->children[j - 1] = + parent->children[j]; + } + parent->children[j] = NULL; + parent->num_children--; + break; + } + + p = pi->sib_head[0][q_node->tx_sched_layer]; + while (p) { + if (p->sibling == q_node) { + p->sibling = q_node->sibling; + break; + } + p = p->sibling; + } + + if (pi->sib_head[0][q_node->tx_sched_layer] == q_node) + pi->sib_head[0][q_node->tx_sched_layer] = + q_node->sibling; + + q_node->parent = n_prt; + q_node->info.parent_teid = n_prt->info.node_teid; + q_node->sibling = NULL; + p = new_pi->sib_head[0][q_node->tx_sched_layer]; + if (p) { + while (p) { + if (!p->sibling) { + p->sibling = q_node; + break; + } + p = p->sibling; + } + } else { + new_pi->sib_head[0][q_node->tx_sched_layer] = + q_node; + } + + n_prt->children[n_prt->num_children++] = q_node; + lag->q_home[vf_id][i] = dest; + } + + ice_sched_suspend_resume_elems(pri_hw, 1, &pvf_teid, false); + ice_sched_suspend_resume_elems(sec_hw, 1, &svf_teid, false); + + if (!e_pf) + ice_lag_destroy_netdev_list(lag, &ndlist); +} + +/** + * ice_lag_aa_failover - move VF queues in A/A mode + * @lag: primary lag struct + * @dest: index of destination port + * @e_pf: PF struct for event port + */ +void ice_lag_aa_failover(struct ice_lag *lag, u8 dest, struct ice_pf *e_pf) +{ + bool odd = true, all = false; + int i; + + /* Primary can be a target if down (cleanup), but secondary can't */ + if (dest == ICE_LAGS_IDX && !(lag->port_bitmap & ICE_LAGS_M)) + return; + + /* Move all queues to a destination if only one port is active, + * or no ports are active and dest is primary. + */ + if ((lag->port_bitmap ^ (ICE_LAGP_M | ICE_LAGS_M)) || + (!lag->port_bitmap && dest == ICE_LAGP_IDX)) + all = true; + + ice_for_each_vsi(lag->pf, i) + if (lag->pf->vsi[i] && lag->pf->vsi[i]->type == ICE_VSI_VF) + ice_lag_aa_move_vf_qs(lag, dest, i, all, &odd, e_pf); +} + /** * ice_lag_reclaim_vf_tc - move scheduling nodes back to primary interface * @lag: primary interface lag struct @@ -982,7 +1334,7 @@ ice_lag_reclaim_vf_tc(struct ice_lag *lag, struct ice_hw *src_hw, u16 vsi_num, if (ice_aq_cfg_lan_txq(hw, qbuf, qbuf_size, numq, src_hw->port_info->lport, hw->port_info->lport, - NULL)) { + ICE_AQC_Q_CFG_TC_CHNG, NULL)) { dev_warn(dev, "Failure to configure queues for LAG failover\n"); goto reclaim_qerr; } @@ -1053,14 +1405,15 @@ static void ice_lag_link(struct ice_lag *lag) lag->bonded = true; lag->role = ICE_LAG_UNSET; + lag->need_fltr_cfg = true; netdev_info(lag->netdev, "Shared SR-IOV resources in bond are active\n"); } /** - * ice_lag_unlink - handle unlink event + * ice_lag_act_bkup_unlink - handle unlink event for A/B bond * @lag: LAG info struct */ -static void ice_lag_unlink(struct ice_lag *lag) +static void ice_lag_act_bkup_unlink(struct ice_lag *lag) { u8 pri_port, act_port, loc_port; struct ice_pf *pf = lag->pf; @@ -1096,10 +1449,32 @@ static void ice_lag_unlink(struct ice_lag *lag) } } } +} - lag->bonded = false; - lag->role = ICE_LAG_NONE; - lag->upper_netdev = NULL; +/** + * ice_lag_aa_unlink - handle unlink event for Active-Active bond + * @lag: LAG info struct + */ +static void ice_lag_aa_unlink(struct ice_lag *lag) +{ + struct ice_lag *pri_lag; + + if (lag->primary) { + pri_lag = lag; + lag->port_bitmap &= ~ICE_LAGP_M; + } else { + pri_lag = ice_lag_find_primary(lag); + if (pri_lag) + pri_lag->port_bitmap &= ICE_LAGS_M; + } + + if (pri_lag) { + ice_lag_aa_failover(pri_lag, ICE_LAGP_IDX, lag->pf); + if (lag->primary) + pri_lag->bond_lport_pri = ICE_LAG_INVALID_PORT; + else + pri_lag->bond_lport_sec = ICE_LAG_INVALID_PORT; + } } /** @@ -1115,10 +1490,20 @@ static void ice_lag_link_unlink(struct ice_lag *lag, void *ptr) if (netdev != lag->netdev) return; - if (info->linking) + if (info->linking) { ice_lag_link(lag); - else - ice_lag_unlink(lag); + } else { + if (lag->bond_aa) + ice_lag_aa_unlink(lag); + else + ice_lag_act_bkup_unlink(lag); + + lag->bonded = false; + lag->role = ICE_LAG_NONE; + lag->upper_netdev = NULL; + lag->bond_aa = false; + lag->need_fltr_cfg = false; + } } /** @@ -1320,6 +1705,11 @@ static void ice_lag_init_feature_support_flag(struct ice_pf *pf) ice_set_feature_support(pf, ICE_F_SRIOV_LAG); else ice_clear_feature_support(pf, ICE_F_SRIOV_LAG); + + if (caps->sriov_aa_lag && ice_pkg_has_lport_extract(&pf->hw)) + ice_set_feature_support(pf, ICE_F_SRIOV_AA_LAG); + else + ice_clear_feature_support(pf, ICE_F_SRIOV_AA_LAG); } /** @@ -1353,6 +1743,9 @@ static void ice_lag_changeupper_event(struct ice_lag *lag, void *ptr) /* Configure primary's SWID to be shared */ ice_lag_primary_swid(lag, true); primary_lag = lag; + lag->bond_lport_pri = lag->pf->hw.port_info->lport; + lag->bond_lport_sec = ICE_LAG_INVALID_PORT; + lag->port_bitmap = 0; } else { u16 swid; @@ -1362,16 +1755,29 @@ static void ice_lag_changeupper_event(struct ice_lag *lag, void *ptr) swid = primary_lag->pf->hw.port_info->sw_id; ice_lag_set_swid(swid, lag, true); ice_lag_add_prune_list(primary_lag, lag->pf); - ice_lag_cfg_drop_fltr(lag, true); + primary_lag->bond_lport_sec = + lag->pf->hw.port_info->lport; } /* add filter for primary control packets */ - ice_lag_cfg_cp_fltr(lag, true); + ice_lag_cfg_lp_fltr(lag, true, true); } else { if (!primary_lag && lag->primary) primary_lag = lag; + if (primary_lag) { + for (int i = 0; i < ICE_MAX_SRIOV_VFS; i++) { + if (primary_lag->sec_vf[i]) { + ice_vsi_release(primary_lag->sec_vf[i]); + primary_lag->sec_vf[i] = NULL; + } + } + } + if (!lag->primary) { ice_lag_set_swid(0, lag, false); + if (primary_lag) + primary_lag->bond_lport_sec = + ICE_LAG_INVALID_PORT; } else { if (primary_lag && lag->primary) { ice_lag_primary_swid(lag, false); @@ -1379,7 +1785,7 @@ static void ice_lag_changeupper_event(struct ice_lag *lag, void *ptr) } } /* remove filter for control packets */ - ice_lag_cfg_cp_fltr(lag, false); + ice_lag_cfg_lp_fltr(lag, false, !lag->bond_aa); } } @@ -1405,18 +1811,34 @@ static void ice_lag_monitor_link(struct ice_lag *lag, void *ptr) if (!netif_is_same_ice(lag->pf, event_netdev)) return; + if (info->upper_dev != lag->upper_netdev) + return; + + if (info->linking) + return; + pf = lag->pf; prim_hw = &pf->hw; prim_port = prim_hw->port_info->lport; - if (info->upper_dev != lag->upper_netdev) - return; - - if (!info->linking) { - /* Since there are only two interfaces allowed in SRIOV+LAG, if - * one port is leaving, then nodes need to be on primary - * interface. - */ + /* Since there are only two interfaces allowed in SRIOV+LAG, if + * one port is leaving, then nodes need to be on primary + * interface. + */ + if (lag->bond_aa) { + struct ice_netdev_priv *e_ndp; + struct ice_pf *e_pf; + + e_ndp = netdev_priv(event_netdev); + e_pf = e_ndp->vsi->back; + + if (lag->bond_lport_pri != ICE_LAG_INVALID_PORT && + lag->port_bitmap & ICE_LAGS_M) { + lag->port_bitmap &= ~ICE_LAGS_M; + ice_lag_aa_failover(lag, ICE_LAGP_IDX, e_pf); + lag->bond_lport_sec = ICE_LAG_INVALID_PORT; + } + } else { if (prim_port != lag->active_port && lag->active_port != ICE_LAG_INVALID_PORT) { active_hw = ice_lag_find_hw_by_lport(lag, @@ -1428,44 +1850,32 @@ static void ice_lag_monitor_link(struct ice_lag *lag, void *ptr) } /** - * ice_lag_monitor_active - main PF keep track of which port is active + * ice_lag_monitor_act_bkup - keep track of which port is active in A/B LAG * @lag: lag info struct - * @ptr: opaque data containing notifier event + * @b_info: bonding info + * @event_netdev: net_device got target netdev * * This function is for the primary PF to monitor changes in which port is * active and handle changes for SRIOV VF functionality */ -static void ice_lag_monitor_active(struct ice_lag *lag, void *ptr) +static void ice_lag_monitor_act_bkup(struct ice_lag *lag, + struct netdev_bonding_info *b_info, + struct net_device *event_netdev) { - struct netdev_notifier_bonding_info *info = ptr; - struct net_device *event_netdev, *event_upper; - struct netdev_bonding_info *bonding_info; struct ice_netdev_priv *event_np; struct ice_pf *pf, *event_pf; u8 prim_port, event_port; - if (!lag->primary) - return; - pf = lag->pf; if (!pf) return; - event_netdev = netdev_notifier_info_to_dev(ptr); - rcu_read_lock(); - event_upper = netdev_master_upper_dev_get_rcu(event_netdev); - rcu_read_unlock(); - if (!netif_is_ice(event_netdev) || event_upper != lag->upper_netdev) - return; - event_np = netdev_priv(event_netdev); event_pf = event_np->vsi->back; event_port = event_pf->hw.port_info->lport; prim_port = pf->hw.port_info->lport; - bonding_info = &info->bonding_info; - - if (!bonding_info->slave.state) { + if (!b_info->slave.state) { /* if no port is currently active, then nodes and filters exist * on primary port, check if we need to move them */ @@ -1501,6 +1911,128 @@ static void ice_lag_monitor_active(struct ice_lag *lag, void *ptr) } } +/** + * ice_lag_aa_clear_spoof - adjust the placeholder VSI spoofing for A/A LAG + * @vsi: placeholder VSI to adjust + */ +static void ice_lag_aa_clear_spoof(struct ice_vsi *vsi) +{ + ice_vsi_update_security(vsi, ice_vsi_ctx_clear_antispoof); +} + +/** + * ice_lag_monitor_act_act - Keep track of active ports in A/A LAG + * @lag: lag struct for primary interface + * @b_info: bonding_info for event + * @event_netdev: net_device for target netdev + */ +static void ice_lag_monitor_act_act(struct ice_lag *lag, + struct netdev_bonding_info *b_info, + struct net_device *event_netdev) +{ + struct ice_netdev_priv *event_np; + u8 prim_port, event_port; + struct ice_pf *event_pf; + + event_np = netdev_priv(event_netdev); + event_pf = event_np->vsi->back; + event_port = event_pf->hw.port_info->lport; + prim_port = lag->pf->hw.port_info->lport; + + if (b_info->slave.link == BOND_LINK_UP) { + /* Port is coming up */ + if (prim_port == event_port) { + /* Processing event for primary interface */ + if (lag->bond_lport_pri == ICE_LAG_INVALID_PORT) + return; + + if (!(lag->port_bitmap & ICE_LAGP_M)) { + /* Primary port was not marked up before, move + * some|all VF queues to it and mark as up + */ + lag->port_bitmap |= ICE_LAGP_M; + ice_lag_aa_failover(lag, ICE_LAGP_IDX, event_pf); + } + } else { + if (lag->bond_lport_sec == ICE_LAG_INVALID_PORT) + return; + + /* Create placeholder VSIs on secondary PF. + * The placeholder is necessary so that we have + * an element that represents the VF on the secondary + * interface's scheduling tree. This will be a tree + * root for scheduling nodes when they are moved to + * the secondary interface. + */ + if (!lag->sec_vf[0]) { + struct ice_vsi_cfg_params params = {}; + struct ice_vsi *nvsi; + struct ice_vf *vf; + unsigned int bkt; + + params.type = ICE_VSI_VF; + params.port_info = event_pf->hw.port_info; + params.flags = ICE_VSI_FLAG_INIT; + + ice_for_each_vf(lag->pf, bkt, vf) { + params.vf = vf; + nvsi = ice_vsi_setup(event_pf, + ¶ms); + ice_lag_aa_clear_spoof(nvsi); + lag->sec_vf[vf->vf_id] = nvsi; + } + } + + if (!(lag->port_bitmap & ICE_LAGS_M)) { + /* Secondary port was not marked up before, + * move some|all VF queues to it and mark as up + */ + lag->port_bitmap |= ICE_LAGS_M; + ice_lag_aa_failover(lag, ICE_LAGS_IDX, event_pf); + } + } + } else { + /* Port is going down */ + if (prim_port == event_port) { + lag->port_bitmap &= ~ICE_LAGP_M; + ice_lag_aa_failover(lag, ICE_LAGS_IDX, event_pf); + } else { + lag->port_bitmap &= ~ICE_LAGS_M; + ice_lag_aa_failover(lag, ICE_LAGP_IDX, event_pf); + } + } +} + +/** + * ice_lag_monitor_info - Calls relevant A/A or A/B monitoring function + * @lag: lag info struct + * @ptr: opaque data containing notifier event + * + * This function is for the primary PF to monitor changes in which port is + * active and handle changes for SRIOV VF functionality + */ +static void ice_lag_monitor_info(struct ice_lag *lag, void *ptr) +{ + struct netdev_notifier_bonding_info *info = ptr; + struct net_device *event_netdev, *event_upper; + struct netdev_bonding_info *bonding_info; + + if (!lag->primary) + return; + + event_netdev = netdev_notifier_info_to_dev(ptr); + bonding_info = &info->bonding_info; + rcu_read_lock(); + event_upper = netdev_master_upper_dev_get_rcu(event_netdev); + rcu_read_unlock(); + if (!netif_is_ice(event_netdev) || event_upper != lag->upper_netdev) + return; + + if (lag->bond_aa) + ice_lag_monitor_act_act(lag, bonding_info, event_netdev); + else + ice_lag_monitor_act_bkup(lag, bonding_info, event_netdev); +} /** * ice_lag_chk_comp - evaluate bonded interface for feature support * @lag: lag info struct @@ -1516,6 +2048,14 @@ ice_lag_chk_comp(struct ice_lag *lag, void *ptr) struct device *dev; int count = 0; + /* All members need to know if bond A/A or A/B */ + bonding_info = &info->bonding_info; + lag->bond_mode = bonding_info->master.bond_mode; + if (lag->bond_mode != BOND_MODE_ACTIVEBACKUP) + lag->bond_aa = true; + else + lag->bond_aa = false; + if (!lag->primary) return true; @@ -1536,12 +2076,9 @@ ice_lag_chk_comp(struct ice_lag *lag, void *ptr) return false; } - bonding_info = &info->bonding_info; - lag->bond_mode = bonding_info->master.bond_mode; - if (lag->bond_mode != BOND_MODE_ACTIVEBACKUP) { - dev_info(dev, "Bond Mode not ACTIVE-BACKUP - VF LAG disabled\n"); + if (lag->bond_aa && !ice_is_feature_supported(lag->pf, + ICE_F_SRIOV_AA_LAG)) return false; - } list_for_each(tmp, lag->netdev_head) { struct ice_dcbx_cfg *dcb_cfg, *peer_dcb_cfg; @@ -1699,6 +2236,26 @@ static void ice_lag_disable_sriov_bond(struct ice_lag *lag) struct ice_pf *pf = np->vsi->back; ice_clear_feature_support(pf, ICE_F_SRIOV_LAG); + ice_clear_feature_support(pf, ICE_F_SRIOV_AA_LAG); +} + +/** + * ice_lag_preset_drop_fltr - preset drop filter for A/B bonds + * @lag: local lag struct + * @ptr: opaque data containing event + * + * Sets the initial drop filter for secondary interface in an + * active-backup bond + */ +static void ice_lag_preset_drop_fltr(struct ice_lag *lag, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + if (netdev != lag->netdev || lag->primary || !lag->need_fltr_cfg) + return; + + ice_lag_cfg_drop_fltr(lag, true); + lag->need_fltr_cfg = false; } /** @@ -1739,10 +2296,12 @@ static void ice_lag_process_event(struct work_struct *work) ice_lag_unregister(lag_work->lag, netdev); goto lag_cleanup; } - ice_lag_monitor_active(lag_work->lag, - &lag_work->info.bonding_info); ice_lag_cfg_pf_fltrs(lag_work->lag, &lag_work->info.bonding_info); + ice_lag_preset_drop_fltr(lag_work->lag, + &lag_work->info.bonding_info); + ice_lag_monitor_info(lag_work->lag, + &lag_work->info.bonding_info); } ice_lag_info_event(lag_work->lag, &lag_work->info.bonding_info); break; @@ -1993,7 +2552,8 @@ ice_lag_move_vf_nodes_tc_sync(struct ice_lag *lag, struct ice_hw *dest_hw, } if (ice_aq_cfg_lan_txq(hw, qbuf, qbuf_size, numq, hw->port_info->lport, - dest_hw->port_info->lport, NULL)) { + dest_hw->port_info->lport, + ICE_AQC_Q_CFG_TC_CHNG, NULL)) { dev_warn(dev, "Failure to configure queues for LAG reset rebuild\n"); goto sync_qerr; } @@ -2089,9 +2649,13 @@ int ice_init_lag(struct ice_pf *pf) lag->netdev = vsi->netdev; lag->role = ICE_LAG_NONE; lag->active_port = ICE_LAG_INVALID_PORT; + lag->port_bitmap = 0x0; lag->bonded = false; + lag->bond_aa = false; + lag->need_fltr_cfg = false; lag->upper_netdev = NULL; lag->notif_block.notifier_call = NULL; + memset(lag->sec_vf, 0, sizeof(lag->sec_vf)); err = ice_register_lag_handler(lag); if (err) { @@ -2109,6 +2673,11 @@ int ice_init_lag(struct ice_pf *pf) if (err) goto free_rcp_res; + err = ice_create_lag_recipe(&pf->hw, &lag->act_act_recipe, + ice_lport_rcp, 1); + if (err) + goto free_lport_res; + /* associate recipes to profiles */ for (n = 0; n < ICE_PROFID_IPV6_GTPU_IPV6_TCP_INNER; n++) { err = ice_aq_get_recipe_to_profile(&pf->hw, n, @@ -2118,7 +2687,8 @@ int ice_init_lag(struct ice_pf *pf) if (recipe_bits & BIT(ICE_SW_LKUP_DFLT)) { recipe_bits |= BIT(lag->pf_recipe) | - BIT(lag->lport_recipe); + BIT(lag->lport_recipe) | + BIT(lag->act_act_recipe); ice_aq_map_recipe_to_profile(&pf->hw, n, recipe_bits, NULL); } @@ -2129,9 +2699,13 @@ int ice_init_lag(struct ice_pf *pf) dev_dbg(dev, "INIT LAG complete\n"); return 0; +free_lport_res: + ice_free_hw_res(&pf->hw, ICE_AQC_RES_TYPE_RECIPE, 1, + &lag->lport_recipe); + free_rcp_res: ice_free_hw_res(&pf->hw, ICE_AQC_RES_TYPE_RECIPE, 1, - &pf->lag->pf_recipe); + &lag->pf_recipe); lag_error: kfree(lag); pf->lag = NULL; @@ -2216,11 +2790,15 @@ void ice_lag_rebuild(struct ice_pf *pf) ice_lag_move_vf_nodes_sync(prim_lag, &pf->hw); } - ice_lag_cfg_cp_fltr(lag, true); + if (!lag->bond_aa) { + ice_lag_cfg_lp_fltr(lag, true, true); + if (lag->pf_rx_rule_id) + if (ice_lag_cfg_dflt_fltr(lag, true)) + dev_err(ice_pf_to_dev(pf), "Error adding default VSI rule in rebuild\n"); + } else { + ice_lag_cfg_lp_fltr(lag, true, false); + } - if (lag->pf_rx_rule_id) - if (ice_lag_cfg_dflt_fltr(lag, true)) - dev_err(ice_pf_to_dev(pf), "Error adding default VSI rule in rebuild\n"); ice_clear_rdma_cap(pf); lag_rebuild_out: diff --git a/drivers/net/ethernet/intel/ice/ice_lag.h b/drivers/net/ethernet/intel/ice/ice_lag.h index 69347d9f986b..e2a0a782bdd7 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.h +++ b/drivers/net/ethernet/intel/ice/ice_lag.h @@ -14,7 +14,11 @@ enum ice_lag_role { ICE_LAG_UNSET }; -#define ICE_LAG_INVALID_PORT 0xFF +#define ICE_LAG_INVALID_PORT 0xFF +#define ICE_LAGP_IDX 0 +#define ICE_LAGS_IDX 1 +#define ICE_LAGP_M 0x1 +#define ICE_LAGS_M 0x2 #define ICE_LAG_RESET_RETRIES 5 #define ICE_SW_DEFAULT_PROFILE 0 @@ -41,12 +45,26 @@ struct ice_lag { u8 active_port; /* lport value for the current active port */ u8 bonded:1; /* currently bonded */ u8 primary:1; /* this is primary */ + u8 bond_aa:1; /* is this bond active-active */ + u8 need_fltr_cfg:1; /* fltrs for A/A bond still need to be make */ + u8 port_bitmap:2; /* bitmap of active ports */ + u8 bond_lport_pri; /* lport values for primary PF */ + u8 bond_lport_sec; /* lport values for secondary PF */ + + /* q_home keeps track of which interface the q is currently on */ + u8 q_home[ICE_MAX_SRIOV_VFS][ICE_MAX_RSS_QS_PER_VF]; + + /* placeholder VSI for hanging VF queues from on secondary interface */ + struct ice_vsi *sec_vf[ICE_MAX_SRIOV_VFS]; + u16 pf_recipe; u16 lport_recipe; + u16 act_act_recipe; u16 pf_rx_rule_id; u16 pf_tx_rule_id; u16 cp_rule_idx; u16 lport_rule_idx; + u16 act_act_rule_idx; u8 role; }; @@ -65,6 +83,7 @@ struct ice_lag_work { }; void ice_lag_move_new_vf_nodes(struct ice_vf *vf); +void ice_lag_aa_failover(struct ice_lag *lag, u8 dest, struct ice_pf *e_pf); int ice_init_lag(struct ice_pf *pf); void ice_deinit_lag(struct ice_pf *pf); void ice_lag_rebuild(struct ice_pf *pf); diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 2b07d5e48847..8d19efc1df72 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -296,6 +296,7 @@ struct ice_hw_common_caps { bool roce_lag; bool sriov_lag; + bool sriov_aa_lag; bool nvm_update_pending_nvm; bool nvm_update_pending_orom; diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h index dbe93f940ef0..ba62f703df43 100644 --- a/include/linux/net/intel/libie/adminq.h +++ b/include/linux/net/intel/libie/adminq.h @@ -194,6 +194,7 @@ LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_list_caps); #define LIBIE_AQC_CAPS_FW_LAG_SUPPORT 0x0092 #define LIBIE_AQC_BIT_ROCEV2_LAG BIT(0) #define LIBIE_AQC_BIT_SRIOV_LAG BIT(1) +#define LIBIE_AQC_BIT_SRIOV_AA_LAG BIT(2) #define LIBIE_AQC_CAPS_FLEX10 0x00F1 #define LIBIE_AQC_CAPS_CEM 0x00F2 -- cgit v1.2.3 From 41a6e8ab18642741437da932c2f5762b185e928c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 13 Aug 2025 12:44:17 +0300 Subject: devlink/port: Check attributes early and constify Constify the devlink port attributes to indicate they are read only and does not depend on anything else. Therefore, validate it early before setting in the devlink port. Reviewed-by: Jiri Pirko Signed-off-by: Parav Pandit Link: https://patch.msgid.link/20250813094417.7269-3-parav@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 2 +- net/devlink/port.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index b32c9ceeb81d..3119d053bc4d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1743,7 +1743,7 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *devlink_port_attrs); + const struct devlink_port_attrs *attrs); void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external); void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, diff --git a/net/devlink/port.c b/net/devlink/port.c index aaca1b23aa5f..93d8a25bb920 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -1356,13 +1356,13 @@ static void __devlink_port_attrs_set(struct devlink_port *devlink_port, * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *attrs) + const struct devlink_port_attrs *attrs) { ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + WARN_ON(attrs->splittable && attrs->split); devlink_port->attrs = *attrs; __devlink_port_attrs_set(devlink_port, attrs->flavour); - WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); -- cgit v1.2.3 From 448f97fba9013ffa13f5dd82febd18836b189499 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Aug 2025 12:39:13 +0200 Subject: perf: Convert mmap() refcounts to refcount_t The recently fixed reference count leaks could have been detected by using refcount_t and refcount_t would have mitigated the potential overflow at least. Now that the code is properly structured, convert the mmap() related mmap_count variants over to refcount_t. No functional change intended. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104020.071507932@infradead.org --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 40 ++++++++++++++++++++-------------------- kernel/events/internal.h | 4 ++-- kernel/events/ring_buffer.c | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ec9d96025683..bfbf9ea53f25 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,7 +859,7 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; - atomic_t mmap_count; + refcount_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; diff --git a/kernel/events/core.c b/kernel/events/core.c index f6211ab18503..ea357044d780 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3968,7 +3968,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, */ static inline bool event_update_userpage(struct perf_event *event) { - if (likely(!atomic_read(&event->mmap_count))) + if (likely(!refcount_read(&event->mmap_count))) return false; perf_event_update_time(event); @@ -6704,11 +6704,11 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; mapped_f mapped = get_mapped(event, event_mapped); - atomic_inc(&event->mmap_count); - atomic_inc(&event->rb->mmap_count); + refcount_inc(&event->mmap_count); + refcount_inc(&event->rb->mmap_count); if (vma->vm_pgoff) - atomic_inc(&event->rb->aux_mmap_count); + refcount_inc(&event->rb->aux_mmap_count); if (mapped) mapped(event, vma->vm_mm); @@ -6743,7 +6743,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) * to avoid complications. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { + refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -6763,10 +6763,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&rb->aux_mutex); } - if (atomic_dec_and_test(&rb->mmap_count)) + if (refcount_dec_and_test(&rb->mmap_count)) detach_rest = true; - if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); @@ -6992,19 +6992,19 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (data_page_nr(event->rb) != nr_pages) return -EINVAL; - if (atomic_inc_not_zero(&event->rb->mmap_count)) { + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* * Success -- managed to mmap() the same buffer * multiple times. */ perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + refcount_inc(&event->mmap_count); return 0; } /* * Raced against perf_mmap_close()'s - * atomic_dec_and_mutex_lock() remove the + * refcount_dec_and_mutex_lock() remove the * event and continue as if !event->rb */ ring_buffer_attach(event, NULL); @@ -7023,7 +7023,7 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (!rb) return -ENOMEM; - atomic_set(&rb->mmap_count, 1); + refcount_set(&rb->mmap_count, 1); rb->mmap_user = get_current_user(); rb->mmap_locked = extra; @@ -7034,7 +7034,7 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, perf_event_update_userpage(event); perf_mmap_account(vma, user_extra, extra); - atomic_set(&event->mmap_count, 1); + refcount_set(&event->mmap_count, 1); return 0; } @@ -7081,15 +7081,15 @@ static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, if (!is_power_of_2(nr_pages)) return -EINVAL; - if (!atomic_inc_not_zero(&rb->mmap_count)) + if (!refcount_inc_not_zero(&rb->mmap_count)) return -EINVAL; if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); + refcount_inc(&rb->aux_mmap_count); } else { if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - atomic_dec(&rb->mmap_count); + refcount_dec(&rb->mmap_count); return -EPERM; } @@ -7101,16 +7101,16 @@ static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, rb_flags); if (ret) { - atomic_dec(&rb->mmap_count); + refcount_dec(&rb->mmap_count); return ret; } - atomic_set(&rb->aux_mmap_count, 1); + refcount_set(&rb->aux_mmap_count, 1); rb->aux_mmap_locked = extra; } perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + refcount_inc(&event->mmap_count); return 0; } @@ -13254,7 +13254,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) + if (refcount_read(&event->mmap_count)) goto unlock; if (output_event) { @@ -13267,7 +13267,7 @@ set: goto unlock; /* did we race against perf_mmap_close() */ - if (!atomic_read(&rb->mmap_count)) { + if (!refcount_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 249288d82b8d..d9cc57083091 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,7 +35,7 @@ struct perf_buffer { spinlock_t event_lock; struct list_head event_list; - atomic_t mmap_count; + refcount_t mmap_count; unsigned long mmap_locked; struct user_struct *mmap_user; @@ -47,7 +47,7 @@ struct perf_buffer { unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; - atomic_t aux_mmap_count; + refcount_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index aa9a759e824f..20a905023736 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). */ - if (!atomic_read(&rb->aux_mmap_count)) + if (!refcount_read(&rb->aux_mmap_count)) goto err; if (!refcount_inc_not_zero(&rb->aux_refcount)) -- cgit v1.2.3 From 2335b3f56690f76ac34b972fcaef368bab1f76f2 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 16 Jun 2025 23:41:53 -0700 Subject: net/mlx5: mlx5_ifc, Add hardware definitions needed for adjacent vports Next patches will implement the discovery and creation of adjacent functions vports, this patch introduces the hardware structures definitions needed for the driver implementation. Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch Reviewed-by: Parav Pandit Reviewed-by: Jack Morgenstein Signed-off-by: Alexei Lazar --- include/linux/mlx5/mlx5_ifc.h | 133 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 129 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8360d9011d4f..44d497272162 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -189,6 +189,9 @@ enum { MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS = 0x727, MLX5_CMD_OP_RELEASE_XRQ_ERROR = 0x729, MLX5_CMD_OP_MODIFY_XRQ = 0x72a, + MLX5_CMD_OPCODE_QUERY_DELEGATED_VHCA = 0x732, + MLX5_CMD_OPCODE_CREATE_ESW_VPORT = 0x733, + MLX5_CMD_OPCODE_DESTROY_ESW_VPORT = 0x734, MLX5_CMD_OP_QUERY_ESW_FUNCTIONS = 0x740, MLX5_CMD_OP_QUERY_VPORT_STATE = 0x750, MLX5_CMD_OP_MODIFY_VPORT_STATE = 0x751, @@ -2207,7 +2210,19 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_440[0x8]; u8 max_num_eqs_24b[0x18]; - u8 reserved_at_460[0x3a0]; + + u8 reserved_at_460[0x160]; + + u8 query_adjacent_functions_id[0x1]; + u8 ingress_egress_esw_vport_connect[0x1]; + u8 function_id_type_vhca_id[0x1]; + u8 reserved_at_5c3[0xd]; + u8 delegate_vhca_management_profiles[0x10]; + + u8 delegated_vhca_max[0x10]; + u8 delegate_vhca_max[0x10]; + + u8 reserved_at_600[0x200]; }; enum mlx5_ifc_flow_destination_type { @@ -5159,7 +5174,9 @@ struct mlx5_ifc_set_hca_cap_in_bits { u8 other_function[0x1]; u8 ec_vf_function[0x1]; - u8 reserved_at_42[0xe]; + u8 reserved_at_42[0x1]; + u8 function_id_type[0x1]; + u8 reserved_at_44[0xc]; u8 function_id[0x10]; u8 reserved_at_60[0x20]; @@ -6357,7 +6374,9 @@ struct mlx5_ifc_query_hca_cap_in_bits { u8 other_function[0x1]; u8 ec_vf_function[0x1]; - u8 reserved_at_42[0xe]; + u8 reserved_at_42[0x1]; + u8 function_id_type[0x1]; + u8 reserved_at_44[0xc]; u8 function_id[0x10]; u8 reserved_at_60[0x20]; @@ -6983,6 +7002,28 @@ struct mlx5_ifc_query_esw_vport_context_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_destroy_esw_vport_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; +}; + +struct mlx5_ifc_destroy_esw_vport_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 vport_num[0x10]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_modify_esw_vport_context_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7484,6 +7525,85 @@ struct mlx5_ifc_query_adapter_in_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_function_vhca_rid_info_reg_bits { + u8 host_number[0x8]; + u8 host_pci_device_function[0x8]; + u8 host_pci_bus[0x8]; + u8 reserved_at_18[0x3]; + u8 pci_bus_assigned[0x1]; + u8 function_type[0x4]; + + u8 parent_pci_device_function[0x8]; + u8 parent_pci_bus[0x8]; + u8 vhca_id[0x10]; + + u8 reserved_at_40[0x10]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_delegated_function_vhca_rid_info_bits { + struct mlx5_ifc_function_vhca_rid_info_reg_bits function_vhca_rid_info; + + u8 reserved_at_80[0x18]; + u8 manage_profile[0x8]; + + u8 reserved_at_a0[0x60]; +}; + +struct mlx5_ifc_query_delegated_vhca_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 functions_count[0x10]; + + u8 reserved_at_80[0x80]; + + struct mlx5_ifc_delegated_function_vhca_rid_info_bits + delegated_function_vhca_rid_info[]; +}; + +struct mlx5_ifc_query_delegated_vhca_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_esw_vport_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 vport_num[0x10]; +}; + +struct mlx5_ifc_create_esw_vport_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 managed_vhca_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_qp_2rst_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7611,7 +7731,12 @@ struct mlx5_ifc_modify_vport_state_in_bits { u8 reserved_at_41[0xf]; u8 vport_number[0x10]; - u8 reserved_at_60[0x18]; + u8 reserved_at_60[0x10]; + u8 ingress_connect[0x1]; + u8 egress_connect[0x1]; + u8 ingress_connect_valid[0x1]; + u8 egress_connect_valid[0x1]; + u8 reserved_at_74[0x4]; u8 admin_state[0x4]; u8 reserved_at_7c[0x4]; }; -- cgit v1.2.3 From 40653f280b2640e5caa94eeedee43e0f1df97704 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 16 Jun 2025 17:28:20 -0700 Subject: {rdma,net}/mlx5: export mlx5_vport_get_vhca_id vhca id is already cached in the vport structure no need to query on every mlx5 layer, use the mlx5_vport_get_vhca_id, where possible. Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch Reviewed-by: Parav Pandit Signed-off-by: Alexei Lazar Reviewed-by: Feng Liu Reviewed-by: Tariq Toukan --- drivers/infiniband/hw/mlx5/std_types.c | 27 ++++------------------ .../mellanox/mlx5/core/diag/reporter_vnic.c | 2 ++ .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 -- .../ethernet/mellanox/mlx5/core/steering/hws/cmd.c | 16 +++++++++---- .../mellanox/mlx5/core/steering/sws/dr_cmd.c | 16 +++++++++---- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 5 +++- include/linux/mlx5/vport.h | 2 ++ 7 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index bdb568411091..2fcf553044e1 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -83,33 +83,14 @@ static int fill_vport_icm_addr(struct mlx5_core_dev *mdev, u16 vport, static int fill_vport_vhca_id(struct mlx5_core_dev *mdev, u16 vport, struct mlx5_ib_uapi_query_port *info) { - size_t out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); - u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; - void *out; - int err; - - out = kzalloc(out_sz, GFP_KERNEL); - if (!out) - return -ENOMEM; + int err = mlx5_vport_get_vhca_id(mdev, vport, &info->vport_vhca_id); - MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); - MLX5_SET(query_hca_cap_in, in, other_function, true); - MLX5_SET(query_hca_cap_in, in, function_id, vport); - MLX5_SET(query_hca_cap_in, in, op_mod, - MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | - HCA_CAP_OPMOD_GET_CUR); - - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, out_sz); if (err) - goto out; - - info->vport_vhca_id = MLX5_GET(query_hca_cap_out, out, - capability.cmd_hca_cap.vhca_id); + return err; info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID; -out: - kfree(out); - return err; + + return 0; } static int fill_multiport_info(struct mlx5_ib_dev *dev, u32 port_num, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c index 86253a89c24c..32bb769f1829 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. */ +#include + #include "reporter_vnic.h" #include "en_stats.h" #include "devlink.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index b6d53db27cd5..81857c6f6bf7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -447,8 +447,6 @@ int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap #define mlx5_vport_get_other_func_general_cap(dev, vport, out) \ mlx5_vport_get_other_func_cap(dev, vport, out, MLX5_CAP_GENERAL) -int mlx5_vport_get_vhca_id(struct mlx5_core_dev *dev, u16 vport, u16 *vhca_id); - static inline u32 mlx5_sriov_get_vf_total_msix(struct pci_dev *pdev) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index 9c83753e4592..d447574d86fe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -1199,22 +1199,28 @@ out: int mlx5hws_cmd_query_gvmi(struct mlx5_core_dev *mdev, bool other_function, u16 vport_number, u16 *gvmi) { - bool ec_vf_func = other_function ? mlx5_core_is_ec_vf_vport(mdev, vport_number) : false; u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; int out_size; void *out; int err; + if (other_function) { + err = mlx5_vport_get_vhca_id(mdev, vport_number, gvmi); + if (!err) + return 0; + + mlx5_core_err(mdev, "Failed to get vport vhca id for vport %d\n", + vport_number); + return err; + } + + /* get vhca_id for `this` function */ out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); out = kzalloc(out_size, GFP_KERNEL); if (!out) return -ENOMEM; MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); - MLX5_SET(query_hca_cap_in, in, other_function, other_function); - MLX5_SET(query_hca_cap_in, in, function_id, - mlx5_vport_to_func_id(mdev, vport_number, ec_vf_func)); - MLX5_SET(query_hca_cap_in, in, ec_vf_function, ec_vf_func); MLX5_SET(query_hca_cap_in, in, op_mod, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | HCA_CAP_OPMOD_GET_CUR); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c index baefb9a3fa05..bf99b933fd14 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c @@ -2,6 +2,7 @@ /* Copyright (c) 2019 Mellanox Technologies. */ #include "dr_types.h" +#include "eswitch.h" int mlx5dr_cmd_query_esw_vport_context(struct mlx5_core_dev *mdev, bool other_vport, @@ -34,21 +35,28 @@ int mlx5dr_cmd_query_esw_vport_context(struct mlx5_core_dev *mdev, int mlx5dr_cmd_query_gvmi(struct mlx5_core_dev *mdev, bool other_vport, u16 vport_number, u16 *gvmi) { - bool ec_vf_func = other_vport ? mlx5_core_is_ec_vf_vport(mdev, vport_number) : false; u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; int out_size; void *out; int err; + if (other_vport) { + err = mlx5_vport_get_vhca_id(mdev, vport_number, gvmi); + if (!err) + return 0; + + mlx5_core_err(mdev, "Failed to get vport vhca id for vport %d\n", + vport_number); + return err; + } + + /* get vhca_id for `this` function */ out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); out = kzalloc(out_size, GFP_KERNEL); if (!out) return -ENOMEM; MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); - MLX5_SET(query_hca_cap_in, in, other_function, other_vport); - MLX5_SET(query_hca_cap_in, in, function_id, mlx5_vport_to_func_id(mdev, vport_number, ec_vf_func)); - MLX5_SET(query_hca_cap_in, in, ec_vf_function, ec_vf_func); MLX5_SET(query_hca_cap_in, in, op_mod, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | HCA_CAP_OPMOD_GET_CUR); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 231bedc6a252..2ed2e530b07d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -1239,7 +1239,9 @@ int mlx5_vport_get_vhca_id(struct mlx5_core_dev *dev, u16 vport, u16 *vhca_id) void *hca_caps; int err; - *vhca_id = 0; + /* try get vhca_id via eswitch */ + if (mlx5_esw_vport_vhca_id(dev->priv.eswitch, vport, vhca_id)) + return 0; query_ctx = kzalloc(query_out_sz, GFP_KERNEL); if (!query_ctx) @@ -1256,6 +1258,7 @@ out_free: kfree(query_ctx); return err; } +EXPORT_SYMBOL_GPL(mlx5_vport_get_vhca_id); int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap, u16 vport, u16 opmod) diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index c36cc6d82926..c87b9507cfa1 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -135,4 +135,6 @@ int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev); u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev); int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 vport, void *out, u16 opmod); +int mlx5_vport_get_vhca_id(struct mlx5_core_dev *dev, u16 vport, u16 *vhca_id); + #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From 450bbe43ef90a213d66fac1def64050d9d9ada8e Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 21 Jul 2025 14:12:32 +0000 Subject: crypto: ccp - New bit-field definitions for SNP_PLATFORM_STATUS command Define new bit-field definitions returned by SNP_PLATFORM_STATUS command such as new capabilities like SNP_FEATURE_INFO command availability, ciphertext hiding enabled and capability. Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- include/uapi/linux/psp-sev.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index eeb20dfb1fda..c2fd324623c4 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -185,6 +185,10 @@ struct sev_user_data_get_id2 { * @mask_chip_id: whether chip id is present in attestation reports or not * @mask_chip_key: whether attestation reports are signed or not * @vlek_en: VLEK (Version Loaded Endorsement Key) hashstick is loaded + * @feature_info: whether SNP_FEATURE_INFO command is available + * @rapl_dis: whether RAPL is disabled + * @ciphertext_hiding_cap: whether platform has ciphertext hiding capability + * @ciphertext_hiding_en: whether ciphertext hiding is enabled * @rsvd1: reserved * @guest_count: the number of guest currently managed by the firmware * @current_tcb_version: current TCB version @@ -200,7 +204,11 @@ struct sev_user_data_snp_status { __u32 mask_chip_id:1; /* Out */ __u32 mask_chip_key:1; /* Out */ __u32 vlek_en:1; /* Out */ - __u32 rsvd1:29; + __u32 feature_info:1; /* Out */ + __u32 rapl_dis:1; /* Out */ + __u32 ciphertext_hiding_cap:1; /* Out */ + __u32 ciphertext_hiding_en:1; /* Out */ + __u32 rsvd1:25; __u32 guest_count; /* Out */ __u64 current_tcb_version; /* Out */ __u64 reported_tcb_version; /* Out */ -- cgit v1.2.3 From 33cfb80d1910b41d1a25cef89b159c945aff0f24 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 21 Jul 2025 14:13:10 +0000 Subject: crypto: ccp - Add support for SNP_FEATURE_INFO command The FEATURE_INFO command provides hypervisors with a programmatic means to learn about the supported features of the currently loaded firmware. This command mimics the CPUID instruction relative to sub-leaf input and the four unsigned integer output values. To obtain information regarding the features present in the currently loaded SEV firmware, use the SNP_FEATURE_INFO command. Cache the SNP platform status and feature information from CPUID 0x8000_0024 in the sev_device structure. If SNP is enabled, utilize this cached SNP platform status for the API major, minor and build version. Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 72 ++++++++++++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/sev-dev.h | 3 ++ include/linux/psp-sev.h | 29 ++++++++++++++++++ 3 files changed, 104 insertions(+) (limited to 'include') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 528013be1c0a..a3941254d61f 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -233,6 +233,7 @@ static int sev_cmd_buffer_len(int cmd) case SEV_CMD_SNP_GUEST_REQUEST: return sizeof(struct sev_data_snp_guest_request); case SEV_CMD_SNP_CONFIG: return sizeof(struct sev_user_data_snp_config); case SEV_CMD_SNP_COMMIT: return sizeof(struct sev_data_snp_commit); + case SEV_CMD_SNP_FEATURE_INFO: return sizeof(struct sev_data_snp_feature_info); default: return 0; } @@ -1073,6 +1074,67 @@ static void snp_set_hsave_pa(void *arg) wrmsrq(MSR_VM_HSAVE_PA, 0); } +static int snp_get_platform_data(struct sev_device *sev, int *error) +{ + struct sev_data_snp_feature_info snp_feat_info; + struct snp_feature_info *feat_info; + struct sev_data_snp_addr buf; + struct page *page; + int rc; + + /* + * This function is expected to be called before SNP is + * initialized. + */ + if (sev->snp_initialized) + return -EINVAL; + + buf.address = __psp_pa(&sev->snp_plat_status); + rc = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, error); + if (rc) { + dev_err(sev->dev, "SNP PLATFORM_STATUS command failed, ret = %d, error = %#x\n", + rc, *error); + return rc; + } + + sev->api_major = sev->snp_plat_status.api_major; + sev->api_minor = sev->snp_plat_status.api_minor; + sev->build = sev->snp_plat_status.build_id; + + /* + * Do feature discovery of the currently loaded firmware, + * and cache feature information from CPUID 0x8000_0024, + * sub-function 0. + */ + if (!sev->snp_plat_status.feature_info) + return 0; + + /* + * Use dynamically allocated structure for the SNP_FEATURE_INFO + * command to ensure structure is 8-byte aligned, and does not + * cross a page boundary. + */ + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + feat_info = page_address(page); + snp_feat_info.length = sizeof(snp_feat_info); + snp_feat_info.ecx_in = 0; + snp_feat_info.feature_info_paddr = __psp_pa(feat_info); + + rc = sev_do_cmd(SEV_CMD_SNP_FEATURE_INFO, &snp_feat_info, error); + if (!rc) + sev->snp_feat_info_0 = *feat_info; + else + dev_err(sev->dev, "SNP FEATURE_INFO command failed, ret = %d, error = %#x\n", + rc, *error); + + __free_page(page); + + return rc; +} + static int snp_filter_reserved_mem_regions(struct resource *rs, void *arg) { struct sev_data_range_list *range_list = arg; @@ -1599,6 +1661,16 @@ static int sev_get_api_version(void) struct sev_user_data_status status; int error = 0, ret; + /* + * Cache SNP platform status and SNP feature information + * if SNP is available. + */ + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) { + ret = snp_get_platform_data(sev, &error); + if (ret) + return 1; + } + ret = sev_platform_status(&status, &error); if (ret) { dev_err(sev->dev, diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h index 24dd8ff8afaa..5aed2595c9ae 100644 --- a/drivers/crypto/ccp/sev-dev.h +++ b/drivers/crypto/ccp/sev-dev.h @@ -58,6 +58,9 @@ struct sev_device { bool snp_initialized; struct sev_user_data_status sev_plat_status; + + struct sev_user_data_snp_status snp_plat_status; + struct snp_feature_info snp_feat_info_0; }; int sev_dev_init(struct psp_device *psp); diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 0f5f94137f6d..5fb6ae0f51cc 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -107,6 +107,7 @@ enum sev_cmd { SEV_CMD_SNP_DOWNLOAD_FIRMWARE_EX = 0x0CA, SEV_CMD_SNP_COMMIT = 0x0CB, SEV_CMD_SNP_VLEK_LOAD = 0x0CD, + SEV_CMD_SNP_FEATURE_INFO = 0x0CE, SEV_CMD_MAX, }; @@ -814,6 +815,34 @@ struct sev_data_snp_commit { u32 len; } __packed; +/** + * struct sev_data_snp_feature_info - SEV_SNP_FEATURE_INFO structure + * + * @length: len of the command buffer read by the PSP + * @ecx_in: subfunction index + * @feature_info_paddr : System Physical Address of the FEATURE_INFO structure + */ +struct sev_data_snp_feature_info { + u32 length; + u32 ecx_in; + u64 feature_info_paddr; +} __packed; + +/** + * struct feature_info - FEATURE_INFO structure + * + * @eax: output of SNP_FEATURE_INFO command + * @ebx: output of SNP_FEATURE_INFO command + * @ecx: output of SNP_FEATURE_INFO command + * #edx: output of SNP_FEATURE_INFO command + */ +struct snp_feature_info { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +} __packed; + #ifdef CONFIG_CRYPTO_DEV_SP_PSP /** -- cgit v1.2.3 From 45d59bd4a3e0f0475b3646e8b9936d34794e503d Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 21 Jul 2025 14:13:27 +0000 Subject: crypto: ccp - Introduce new API interface to indicate SEV-SNP Ciphertext hiding feature Implement an API that checks the overall feature support for SEV-SNP ciphertext hiding. This API verifies both the support of the SEV firmware for the feature and its enablement in the platform's BIOS. Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 21 +++++++++++++++++++++ include/linux/psp-sev.h | 5 +++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index a3941254d61f..58c9e040e9ac 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -1074,6 +1074,27 @@ static void snp_set_hsave_pa(void *arg) wrmsrq(MSR_VM_HSAVE_PA, 0); } +bool sev_is_snp_ciphertext_hiding_supported(void) +{ + struct psp_device *psp = psp_master; + struct sev_device *sev; + + if (!psp || !psp->sev_data) + return false; + + sev = psp->sev_data; + + /* + * Feature information indicates if CipherTextHiding feature is + * supported by the SEV firmware and additionally platform status + * indicates if CipherTextHiding feature is enabled in the + * Platform BIOS. + */ + return ((sev->snp_feat_info_0.ecx & SNP_CIPHER_TEXT_HIDING_SUPPORTED) && + sev->snp_plat_status.ciphertext_hiding_cap); +} +EXPORT_SYMBOL_GPL(sev_is_snp_ciphertext_hiding_supported); + static int snp_get_platform_data(struct sev_device *sev, int *error) { struct sev_data_snp_feature_info snp_feat_info; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 5fb6ae0f51cc..d83185b4268b 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -843,6 +843,8 @@ struct snp_feature_info { u32 edx; } __packed; +#define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) + #ifdef CONFIG_CRYPTO_DEV_SP_PSP /** @@ -986,6 +988,7 @@ void *psp_copy_user_blob(u64 uaddr, u32 len); void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); void sev_platform_shutdown(void); +bool sev_is_snp_ciphertext_hiding_supported(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ @@ -1022,6 +1025,8 @@ static inline void snp_free_firmware_page(void *addr) { } static inline void sev_platform_shutdown(void) { } +static inline bool sev_is_snp_ciphertext_hiding_supported(void) { return false; } + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ #endif /* __PSP_SEV_H__ */ -- cgit v1.2.3 From c9760b0fca6bfa250c02e14bfe81c542f3626a72 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 21 Jul 2025 14:13:55 +0000 Subject: crypto: ccp - Add support to enable CipherTextHiding on SNP_INIT_EX To enable ciphertext hiding, it must be specified in the SNP_INIT_EX command as part of SNP initialization. Modify the sev_platform_init_args structure, which is used as input to sev_platform_init(), to include a field that, when non-zero, indicates that ciphertext hiding should be enabled and specifies the maximum ASID that can be used for an SEV-SNP guest. Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 12 +++++++++--- include/linux/psp-sev.h | 10 ++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 58c9e040e9ac..334405461657 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -1186,7 +1186,7 @@ static int snp_filter_reserved_mem_regions(struct resource *rs, void *arg) return 0; } -static int __sev_snp_init_locked(int *error) +static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid) { struct psp_device *psp = psp_master; struct sev_data_snp_init_ex data; @@ -1247,6 +1247,12 @@ static int __sev_snp_init_locked(int *error) } memset(&data, 0, sizeof(data)); + + if (max_snp_asid) { + data.ciphertext_hiding_en = 1; + data.max_snp_asid = max_snp_asid; + } + data.init_rmp = 1; data.list_paddr_en = 1; data.list_paddr = __psp_pa(snp_range_list); @@ -1433,7 +1439,7 @@ static int _sev_platform_init_locked(struct sev_platform_init_args *args) if (sev->sev_plat_status.state == SEV_STATE_INIT) return 0; - rc = __sev_snp_init_locked(&args->error); + rc = __sev_snp_init_locked(&args->error, args->max_snp_asid); if (rc && rc != -ENODEV) return rc; @@ -1516,7 +1522,7 @@ static int snp_move_to_init_state(struct sev_issue_cmd *argp, bool *shutdown_req { int error, rc; - rc = __sev_snp_init_locked(&error); + rc = __sev_snp_init_locked(&error, 0); if (rc) { argp->error = SEV_RET_INVALID_PLATFORM_STATE; return rc; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index d83185b4268b..e0dbcb4b4fd9 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -748,10 +748,13 @@ struct sev_data_snp_guest_request { struct sev_data_snp_init_ex { u32 init_rmp:1; u32 list_paddr_en:1; - u32 rsvd:30; + u32 rapl_dis:1; + u32 ciphertext_hiding_en:1; + u32 rsvd:28; u32 rsvd1; u64 list_paddr; - u8 rsvd2[48]; + u16 max_snp_asid; + u8 rsvd2[46]; } __packed; /** @@ -800,10 +803,13 @@ struct sev_data_snp_shutdown_ex { * @probe: True if this is being called as part of CCP module probe, which * will defer SEV_INIT/SEV_INIT_EX firmware initialization until needed * unless psp_init_on_probe module param is set + * @max_snp_asid: When non-zero, enable ciphertext hiding and specify the + * maximum ASID that can be used for an SEV-SNP guest. */ struct sev_platform_init_args { int error; bool probe; + unsigned int max_snp_asid; }; /** -- cgit v1.2.3 From b76c739c3d11d1dacc8efe7fa873bee28ac991f1 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 22 Jul 2025 16:52:38 -0500 Subject: iio: fix iio_push_to_buffers_with_ts() typo Replace iio_push_to_buffer_with_ts() with iio_push_to_buffers_with_ts() in some documentation comments in iio.h. The latter is the correct name of the function, the former doesn't exist. Signed-off-by: David Lechner Link: https://patch.msgid.link/20250722-iio-fix-iio_push_to_buffer_with_ts-typo-v1-1-6ac9efb856d3@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index d11668f14a3e..2f5560646ee4 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -779,7 +779,7 @@ static inline void *iio_device_get_drvdata(const struct iio_dev *indio_dev) * them safe for use with non-coherent DMA. * * A number of drivers also use this on buffers that include a 64-bit timestamp - * that is used with iio_push_to_buffer_with_ts(). Therefore, in the case where + * that is used with iio_push_to_buffers_with_ts(). Therefore, in the case where * DMA alignment is not sufficient for proper timestamp alignment, we align to * 8 bytes instead. */ @@ -794,7 +794,7 @@ static inline void *iio_device_get_drvdata(const struct iio_dev *indio_dev) * @name: identifier name of the buffer * @count: number of elements in the buffer * - * Declares a buffer that is safe to use with iio_push_to_buffer_with_ts(). In + * Declares a buffer that is safe to use with iio_push_to_buffers_with_ts(). In * addition to allocating enough space for @count elements of @type, it also * allocates space for a s64 timestamp at the end of the buffer and ensures * proper alignment of the timestamp. -- cgit v1.2.3 From 4847d1187402a5027d9a04393f12d52a5a1d7f98 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 14 Aug 2025 09:24:41 +0200 Subject: console: introduce console_lock guard()s Having this, guards like these work: guard(console_lock)(); or scoped_guard(console_lock) { ... } See e.g. "vc_screen: use guard()s" later in this series. Signed-off-by: "Jiri Slaby (SUSE)" Link: https://lore.kernel.org/r/20250814072456.182853-2-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index 8f10d0a85bb4..031a58dc2b91 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -666,6 +666,8 @@ void vcs_remove_sysfs(int index); */ extern atomic_t ignore_console_lock_warning; +DEFINE_LOCK_GUARD_0(console_lock, console_lock(), console_unlock()); + extern void console_init(void); /* For deferred console takeover */ -- cgit v1.2.3 From e8398b8aed50382c21fcec77e80a5314e7c45c25 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 14 Aug 2025 09:24:42 +0200 Subject: tty: introduce tty_port_tty guard() Having this, guards like these work: scoped_guard(tty_port_tty, port) tty_wakeup(scoped_tty()); See e.g. "tty_port: use scoped_guard()" later in this series. The definitions depend on CONFIG_TTY. It's due to tty_kref_put(). On !CONFIG_TTY, it is an inline and its declaration would conflict. The guards are not needed in that case, of course. Signed-off-by: "Jiri Slaby (SUSE)" Link: https://lore.kernel.org/r/20250814072456.182853-3-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_port.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 332ddb93603e..660c254f1efe 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -270,4 +270,18 @@ static inline void tty_port_tty_vhangup(struct tty_port *port) __tty_port_tty_hangup(port, false, false); } +#ifdef CONFIG_TTY +void tty_kref_put(struct tty_struct *tty); +__DEFINE_CLASS_IS_CONDITIONAL(tty_port_tty, true); +__DEFINE_UNLOCK_GUARD(tty_port_tty, struct tty_struct, tty_kref_put(_T->lock)); +static inline class_tty_port_tty_t class_tty_port_tty_constructor(struct tty_port *tport) +{ + class_tty_port_tty_t _t = { + .lock = tty_port_tty_get(tport), + }; + return _t; +} +#define scoped_tty() ((struct tty_struct *)(__guard_ptr(tty_port_tty)(&scope))) +#endif + #endif -- cgit v1.2.3 From 0fd60b689b0dacce659253ec15cb3d3bf660e30b Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 14 Aug 2025 09:24:43 +0200 Subject: serial: introduce uart_port_lock() guard()s Having this, guards like these work: guard(uart_port_lock_irq)(&up->port); or scoped_guard(uart_port_lock_irqsave, port) { ... } See e.g. "serial: 8250: use guard()s" later in this series. Signed-off-by: "Jiri Slaby (SUSE)" Link: https://lore.kernel.org/r/20250814072456.182853-4-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 84b4648ead7e..666430b47899 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -788,6 +788,19 @@ static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned lo spin_unlock_irqrestore(&up->lock, flags); } +DEFINE_GUARD(uart_port_lock, struct uart_port *, uart_port_lock(_T), uart_port_unlock(_T)); +DEFINE_GUARD_COND(uart_port_lock, _try, uart_port_trylock(_T)); + +DEFINE_GUARD(uart_port_lock_irq, struct uart_port *, uart_port_lock_irq(_T), + uart_port_unlock_irq(_T)); + +DEFINE_LOCK_GUARD_1(uart_port_lock_irqsave, struct uart_port, + uart_port_lock_irqsave(_T->lock, &_T->flags), + uart_port_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags); +DEFINE_LOCK_GUARD_1_COND(uart_port_lock_irqsave, _try, + uart_port_trylock_irqsave(_T->lock, &_T->flags)); + static inline int serial_port_in(struct uart_port *up, int offset) { return up->serial_in(up, offset); -- cgit v1.2.3 From 292cb391479d50f4379a0abab34324de92c82a92 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 13 Aug 2025 23:03:52 -0700 Subject: software node: Constify node_group in registration functions The software_node_register_node_group() and software_node_unregister_node_group() functions take in essence an array of pointers to software_node structs. Since the functions do not modify the array declare the argument as constant, so that static arrays can be declared as const and annotated as __initconst. Signed-off-by: Dmitry Torokhov Link: https://lore.kernel.org/r/2zny5grbgtwbplynxffxg6dkgjgqf45aigwmgxio5stesdr3wi@gf2zamk5amic Signed-off-by: Greg Kroah-Hartman --- drivers/base/swnode.c | 5 ++--- include/linux/property.h | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index deda7f35a059..be1e9e61a7bf 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -844,7 +844,7 @@ swnode_register(const struct software_node *node, struct swnode *parent, * of this function or by ordering the array such that parent comes before * child. */ -int software_node_register_node_group(const struct software_node **node_group) +int software_node_register_node_group(const struct software_node * const *node_group) { unsigned int i; int ret; @@ -877,8 +877,7 @@ EXPORT_SYMBOL_GPL(software_node_register_node_group); * remove the nodes individually, in the correct order (child before * parent). */ -void software_node_unregister_node_group( - const struct software_node **node_group) +void software_node_unregister_node_group(const struct software_node * const *node_group) { unsigned int i = 0; diff --git a/include/linux/property.h b/include/linux/property.h index 82f0cb3abd1e..d1e80b3c9918 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -574,8 +574,8 @@ const struct software_node * software_node_find_by_name(const struct software_node *parent, const char *name); -int software_node_register_node_group(const struct software_node **node_group); -void software_node_unregister_node_group(const struct software_node **node_group); +int software_node_register_node_group(const struct software_node * const *node_group); +void software_node_unregister_node_group(const struct software_node * const *node_group); int software_node_register(const struct software_node *node); void software_node_unregister(const struct software_node *node); -- cgit v1.2.3 From 5576d8098052952a6c95af86ad3dcb341554ac75 Mon Sep 17 00:00:00 2001 From: Inbaraj E Date: Thu, 14 Aug 2025 19:39:32 +0530 Subject: dt-bindings: clock: Add CAM_CSI clock macro for FSD CAM_CSI block has ACLK, PCLK and PLL clocks. PCLK id is already assigned. To use PCLK and PLL clock in driver add id macro for CAM_CSI_PLL and CAM_CSI_PCLK. Signed-off-by: Inbaraj E Link: https://lore.kernel.org/r/20250814140943.22531-2-inbaraj.e@samsung.com Signed-off-by: Krzysztof Kozlowski --- include/dt-bindings/clock/fsd-clk.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/fsd-clk.h b/include/dt-bindings/clock/fsd-clk.h index 3f7b64d93558..58fdec8f4c2a 100644 --- a/include/dt-bindings/clock/fsd-clk.h +++ b/include/dt-bindings/clock/fsd-clk.h @@ -139,5 +139,18 @@ #define CAM_CSI2_1_IPCLKPORT_I_ACLK 10 #define CAM_CSI2_2_IPCLKPORT_I_ACLK 11 #define CAM_CSI2_3_IPCLKPORT_I_ACLK 12 +#define CAM_CSI_PLL 13 +#define CAM_CSI0_0_IPCLKPORT_I_PCLK 14 +#define CAM_CSI0_1_IPCLKPORT_I_PCLK 15 +#define CAM_CSI0_2_IPCLKPORT_I_PCLK 16 +#define CAM_CSI0_3_IPCLKPORT_I_PCLK 17 +#define CAM_CSI1_0_IPCLKPORT_I_PCLK 18 +#define CAM_CSI1_1_IPCLKPORT_I_PCLK 19 +#define CAM_CSI1_2_IPCLKPORT_I_PCLK 20 +#define CAM_CSI1_3_IPCLKPORT_I_PCLK 21 +#define CAM_CSI2_0_IPCLKPORT_I_PCLK 22 +#define CAM_CSI2_1_IPCLKPORT_I_PCLK 23 +#define CAM_CSI2_2_IPCLKPORT_I_PCLK 24 +#define CAM_CSI2_3_IPCLKPORT_I_PCLK 25 #endif /*_DT_BINDINGS_CLOCK_FSD_H */ -- cgit v1.2.3 From 894af4a1cde61c3401f237184fb770f72ff12df8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 12 Apr 2025 13:56:01 +0200 Subject: objtool: Validate kCFI calls Validate that all indirect calls adhere to kCFI rules. Notably doing nocfi indirect call to a cfi function is broken. Apparently some Rust 'core' code violates this and explodes when ran with FineIBT. All the ANNOTATE_NOCFI_SYM sites are prime targets for attackers. - runtime EFI is especially henous because it also needs to disable IBT. Basically calling unknown code without CFI protection at runtime is a massice security issue. - Kexec image handover; if you can exploit this, you get to keep it :-) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Acked-by: Sean Christopherson Link: https://lkml.kernel.org/r/20250714103441.496787279@infradead.org --- arch/x86/kernel/machine_kexec_64.c | 4 ++++ arch/x86/kvm/vmx/vmenter.S | 4 ++++ arch/x86/platform/efi/efi_stub_64.S | 4 ++++ drivers/misc/lkdtm/perms.c | 5 +++++ include/linux/objtool.h | 10 +++++++++ include/linux/objtool_types.h | 1 + tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 42 +++++++++++++++++++++++++++++++++++++ tools/objtool/include/objtool/elf.h | 1 + 9 files changed, 72 insertions(+) (limited to 'include') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 697fb99406e6..8593760c255a 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -453,6 +453,10 @@ void __nocfi machine_kexec(struct kimage *image) __ftrace_enabled_restore(save_ftrace_enabled); } +/* + * Handover to the next kernel, no CFI concern. + */ +ANNOTATE_NOCFI_SYM(machine_kexec); /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 0a6cf5bff2aa..bc255d709d8a 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -361,6 +361,10 @@ SYM_FUNC_END(vmread_error_trampoline) .section .text, "ax" +#ifndef CONFIG_X86_FRED + SYM_FUNC_START(vmx_do_interrupt_irqoff) VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 SYM_FUNC_END(vmx_do_interrupt_irqoff) + +#endif diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 2206b8bc47b8..f0a5fba0717e 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -11,6 +11,10 @@ #include SYM_FUNC_START(__efi_call) + /* + * The EFI code doesn't have any CFI, annotate away the CFI violation. + */ + ANNOTATE_NOCFI_SYM pushq %rbp movq %rsp, %rbp and $~0xf, %rsp diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c index 6c24426104ba..e1f5e9abb301 100644 --- a/drivers/misc/lkdtm/perms.c +++ b/drivers/misc/lkdtm/perms.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,10 @@ static noinline __nocfi void execute_location(void *dst, bool write) func(); pr_err("FAIL: func returned\n"); } +/* + * Explicitly doing the wrong thing for testing. + */ +ANNOTATE_NOCFI_SYM(execute_location); static void execute_user_location(void *dst) { diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 366ad004d794..46ebaa46e6c5 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -184,6 +184,15 @@ * WARN using UD2. */ #define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) +/* + * This should not be used; it annotates away CFI violations. There are a few + * valid use cases like kexec handover to the next kernel image, and there is + * no security concern there. + * + * There are also a few real issues annotated away, like EFI because we can't + * control the EFI code. + */ +#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI)) #else #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR @@ -194,6 +203,7 @@ #define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL #define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN #define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE +#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI #endif #if defined(CONFIG_NOINSTR_VALIDATION) && \ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index df5d9fa84dba..aceac94632c8 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -65,5 +65,6 @@ struct unwind_hint { #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALL 7 #define ANNOTYPE_REACHABLE 8 +#define ANNOTYPE_NOCFI 9 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index df5d9fa84dba..aceac94632c8 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -65,5 +65,6 @@ struct unwind_hint { #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALL 7 #define ANNOTYPE_REACHABLE 8 +#define ANNOTYPE_NOCFI 9 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d14f20ef1db1..79eab61cd944 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2392,6 +2392,8 @@ static int __annotate_ifc(struct objtool_file *file, int type, struct instructio static int __annotate_late(struct objtool_file *file, int type, struct instruction *insn) { + struct symbol *sym; + switch (type) { case ANNOTYPE_NOENDBR: /* early */ @@ -2433,6 +2435,15 @@ static int __annotate_late(struct objtool_file *file, int type, struct instructi insn->dead_end = false; break; + case ANNOTYPE_NOCFI: + sym = insn->sym; + if (!sym) { + ERROR_INSN(insn, "dodgy NOCFI annotation"); + return -1; + } + insn->sym->nocfi = 1; + break; + default: ERROR_INSN(insn, "Unknown annotation type: %d", type); return -1; @@ -4002,6 +4013,37 @@ static int validate_retpoline(struct objtool_file *file) warnings++; } + if (!opts.cfi) + return warnings; + + /* + * kCFI call sites look like: + * + * movl $(-0x12345678), %r10d + * addl -4(%r11), %r10d + * jz 1f + * ud2 + * 1: cs call __x86_indirect_thunk_r11 + * + * Verify all indirect calls are kCFI adorned by checking for the + * UD2. Notably, doing __nocfi calls to regular (cfi) functions is + * broken. + */ + list_for_each_entry(insn, &file->retpoline_call_list, call_node) { + struct symbol *sym = insn->sym; + + if (sym && (sym->type == STT_NOTYPE || + sym->type == STT_FUNC) && !sym->nocfi) { + struct instruction *prev = + prev_insn_same_sym(file, insn); + + if (!prev || prev->type != INSN_BUG) { + WARN_INSN(insn, "no-cfi indirect call!"); + warnings++; + } + } + } + return warnings; } diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 0a2fa3ac0079..df8434d3b744 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -70,6 +70,7 @@ struct symbol { u8 local_label : 1; u8 frame_pointer : 1; u8 ignore : 1; + u8 nocfi : 1; struct list_head pv_target; struct reloc *relocs; struct section *group_sec; -- cgit v1.2.3 From 89d912e494f786e79f69ed9d567a8842c71dbb03 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:27 +0200 Subject: bpf: Add dynptr type for skb metadata Add a dynptr type, similar to skb dynptr, but for the skb metadata access. The dynptr provides an alternative to __sk_buff->data_meta for accessing the custom metadata area allocated using the bpf_xdp_adjust_meta() helper. More importantly, it abstracts away the fact where the storage for the custom metadata lives, which opens up the way to persist the metadata by relocating it as the skb travels through the network stack layers. Writes to skb metadata invalidate any existing skb payload and metadata slices. While this is more restrictive that needed at the moment, it leaves the door open to reallocating the metadata on writes, and should be only a minor inconvenience to the users. Only the program types which can access __sk_buff->data_meta today are allowed to create a dynptr for skb metadata at the moment. We need to modify the network stack to persist the metadata across layers before opening up access to other BPF hooks. Once more BPF hooks gain access to skb_meta dynptr, we will also need to add a read-only variant of the helper similar to bpf_dynptr_from_skb_rdonly. skb_meta dynptr ops are stubbed out and implemented by subsequent changes. Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-1-8a39e636e0fb@cloudflare.com --- include/linux/bpf.h | 7 ++++++- kernel/bpf/helpers.c | 7 +++++++ kernel/bpf/log.c | 2 ++ kernel/bpf/verifier.c | 15 +++++++++++++-- net/core/filter.c | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 69 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc700925b802..ec527b476dba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -767,12 +767,15 @@ enum bpf_type_flag { */ MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ + DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ - | DYNPTR_TYPE_XDP) + | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1358,6 +1361,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */ BPF_DYNPTR_TYPE_XDP, + /* Points to skb_metadata_end()-skb_metadata_len() */ + BPF_DYNPTR_TYPE_SKB_META, }; int bpf_dynptr_check_size(u32 size); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6b4877e85a68..9552b32208c5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1780,6 +1780,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); + case BPF_DYNPTR_TYPE_SKB_META: + return -EOPNOTSUPP; /* not implemented */ default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1836,6 +1838,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); + case BPF_DYNPTR_TYPE_SKB_META: + return -EOPNOTSUPP; /* not implemented */ default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -1882,6 +1886,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: + case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: @@ -2710,6 +2715,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); return buffer__opt; } + case BPF_DYNPTR_TYPE_SKB_META: + return NULL; /* not implemented */ default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 38050f4ee400..e4983c1303e7 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -498,6 +498,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type) return "skb"; case BPF_DYNPTR_TYPE_XDP: return "xdp"; + case BPF_DYNPTR_TYPE_SKB_META: + return "skb_meta"; case BPF_DYNPTR_TYPE_INVALID: return ""; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af6..5964bed40ffb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -674,6 +674,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_SKB; case DYNPTR_TYPE_XDP: return BPF_DYNPTR_TYPE_XDP; + case DYNPTR_TYPE_SKB_META: + return BPF_DYNPTR_TYPE_SKB_META; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -690,6 +692,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_SKB; case BPF_DYNPTR_TYPE_XDP: return DYNPTR_TYPE_XDP; + case BPF_DYNPTR_TYPE_SKB_META: + return DYNPTR_TYPE_SKB_META; default: return 0; } @@ -2274,7 +2278,8 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg) { return base_type(reg->type) == PTR_TO_MEM && - (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP); + (reg->type & + (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META)); } /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ @@ -11641,7 +11646,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; - if (dynptr_type == BPF_DYNPTR_TYPE_SKB) + if (dynptr_type == BPF_DYNPTR_TYPE_SKB || + dynptr_type == BPF_DYNPTR_TYPE_SKB_META) /* this will trigger clear_all_pkt_pointers(), which will * invalidate all dynptr slices associated with the skb */ @@ -12228,6 +12234,7 @@ enum special_kfunc_type { KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, + KF_bpf_dynptr_from_skb_meta, KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, @@ -12277,9 +12284,11 @@ BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_from_skb_meta) #else BTF_ID_UNUSED BTF_ID_UNUSED +BTF_ID_UNUSED #endif BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) @@ -13253,6 +13262,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_SKB; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { + dynptr_arg_type |= DYNPTR_TYPE_SKB_META; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; diff --git a/net/core/filter.c b/net/core/filter.c index da391e2b0788..31b4b50dbadf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12007,6 +12007,36 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, return 0; } +/** + * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. + * @skb_: socket buffer carrying the metadata + * @flags: future use, must be zero + * @ptr__uninit: dynptr to initialize + * + * Set up a dynptr for access to the metadata area earlier allocated from the + * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to + * &__sk_buff->data_meta. + * + * Return: + * * %0 - dynptr ready to use + * * %-EINVAL - invalid flags, dynptr set to null + */ +__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)skb_; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + + return 0; +} + __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { @@ -12181,6 +12211,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) +BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) + BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) @@ -12202,6 +12236,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .set = &bpf_kfunc_check_set_skb, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb_meta, +}; + static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, @@ -12237,6 +12276,8 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); -- cgit v1.2.3 From 6877cd392baecf816c2ba896a9d42874628004a5 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:28 +0200 Subject: bpf: Enable read/write access to skb metadata through a dynptr Now that we can create a dynptr to skb metadata, make reads to the metadata area possible with bpf_dynptr_read() or through a bpf_dynptr_slice(), and make writes to the metadata area possible with bpf_dynptr_write() or through a bpf_dynptr_slice_rdwr(). Note that for cloned skbs which share data with the original, we limit the skb metadata dynptr to be read-only since we don't unclone on a bpf_dynptr_write to metadata. Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-2-8a39e636e0fb@cloudflare.com --- include/linux/filter.h | 6 ++++++ kernel/bpf/helpers.c | 10 +++++++--- net/core/filter.c | 16 ++++++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e7fd3ee759e..9ed21b65e2e9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1784,6 +1784,7 @@ int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1818,6 +1819,11 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi unsigned long len, bool flush) { } + +static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return NULL; +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9552b32208c5..cdffd74ddbe6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1781,7 +1781,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_SKB_META: - return -EOPNOTSUPP; /* not implemented */ + memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1839,7 +1840,10 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: - return -EOPNOTSUPP; /* not implemented */ + if (flags) + return -EINVAL; + memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -2716,7 +2720,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, return buffer__opt; } case BPF_DYNPTR_TYPE_SKB_META: - return NULL; /* not implemented */ + return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; diff --git a/net/core/filter.c b/net/core/filter.c index 31b4b50dbadf..63f3baee2daf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11990,6 +11990,16 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return func; } +/** + * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. + * @skb: socket buffer carrying the metadata + * @offset: offset into the metadata area, must be <= skb_metadata_len() + */ +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) @@ -12017,6 +12027,9 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to * &__sk_buff->data_meta. * + * If passed @skb_ is a clone which shares the data with the original, the + * dynptr will be read-only. This limitation may be lifted in the future. + * * Return: * * %0 - dynptr ready to use * * %-EINVAL - invalid flags, dynptr set to null @@ -12034,6 +12047,9 @@ __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + if (skb_cloned(skb)) + bpf_dynptr_set_rdonly(ptr); + return 0; } -- cgit v1.2.3 From e5eb72c92eb724aa14c50c7d92d1a576dd50d7e6 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 14 Aug 2025 19:32:18 +0200 Subject: scsi: libsas: Add dev_parent_is_expander() helper Many libsas drivers check if the parent of the device is an expander. Create a helper that the libsas drivers will use in follow up commits. Suggested-by: Damien Le Moal Signed-off-by: Niklas Cassel Link: https://lore.kernel.org/r/20250814173215.1765055-15-cassel@kernel.org Reviewed-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Jason Yan Signed-off-by: Martin K. Petersen --- drivers/scsi/libsas/sas_expander.c | 5 +---- include/scsi/libsas.h | 8 ++++++++ 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 869b5d4db44c..d953225f6cc2 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -1313,10 +1313,7 @@ static int sas_check_parent_topology(struct domain_device *child) int i; int res = 0; - if (!child->parent) - return 0; - - if (!dev_is_expander(child->parent->dev_type)) + if (!dev_parent_is_expander(child)) return 0; parent_ex = &child->parent->ex_dev; diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index ba460b6c0374..8d38565e99fa 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -203,6 +203,14 @@ static inline bool dev_is_expander(enum sas_device_type type) type == SAS_FANOUT_EXPANDER_DEVICE; } +static inline bool dev_parent_is_expander(struct domain_device *dev) +{ + if (!dev->parent) + return false; + + return dev_is_expander(dev->parent->dev_type); +} + static inline void INIT_SAS_WORK(struct sas_work *sw, void (*fn)(struct work_struct *)) { INIT_WORK(&sw->work, fn); -- cgit v1.2.3 From dcb8d01b65fb5a891ddbbedcbe6eff0b8ec37867 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 18 Jul 2025 19:13:39 +0300 Subject: dt-bindings: power: qcom-rpmpd: split RPMh domains definitions Historically both RPM and RPMh domain definitions were a part of the same, qcom-rpmpd.h header. Now as we have a separate header for RPMh definitions, qcom,rpmhpd.h, move all RPMh power domain definitions to that header. Signed-off-by: Dmitry Baryshkov Acked-by: Rob Herring (Arm) Reviewed-by: Konrad Dybcio Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20250718-rework-rpmhpd-rpmpd-v1-1-eedca108e540@oss.qualcomm.com Signed-off-by: Ulf Hansson --- include/dt-bindings/power/qcom,rpmhpd.h | 233 ++++++++++++++++++++++++++++++++ include/dt-bindings/power/qcom-rpmpd.h | 228 +------------------------------ 2 files changed, 234 insertions(+), 227 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/power/qcom,rpmhpd.h b/include/dt-bindings/power/qcom,rpmhpd.h index e54ffa361451..73cceb88953f 100644 --- a/include/dt-bindings/power/qcom,rpmhpd.h +++ b/include/dt-bindings/power/qcom,rpmhpd.h @@ -29,4 +29,237 @@ #define RPMHPD_NSP2 19 #define RPMHPD_GMXC 20 +/* RPMh Power Domain performance levels */ +#define RPMH_REGULATOR_LEVEL_RETENTION 16 +#define RPMH_REGULATOR_LEVEL_MIN_SVS 48 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D3 50 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D2 52 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D1 56 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D0 60 +#define RPMH_REGULATOR_LEVEL_LOW_SVS 64 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_P1 72 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_L1 80 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_L2 96 +#define RPMH_REGULATOR_LEVEL_SVS 128 +#define RPMH_REGULATOR_LEVEL_SVS_L0 144 +#define RPMH_REGULATOR_LEVEL_SVS_L1 192 +#define RPMH_REGULATOR_LEVEL_SVS_L2 224 +#define RPMH_REGULATOR_LEVEL_NOM 256 +#define RPMH_REGULATOR_LEVEL_NOM_L0 288 +#define RPMH_REGULATOR_LEVEL_NOM_L1 320 +#define RPMH_REGULATOR_LEVEL_NOM_L2 336 +#define RPMH_REGULATOR_LEVEL_TURBO 384 +#define RPMH_REGULATOR_LEVEL_TURBO_L0 400 +#define RPMH_REGULATOR_LEVEL_TURBO_L1 416 +#define RPMH_REGULATOR_LEVEL_TURBO_L2 432 +#define RPMH_REGULATOR_LEVEL_TURBO_L3 448 +#define RPMH_REGULATOR_LEVEL_TURBO_L4 452 +#define RPMH_REGULATOR_LEVEL_TURBO_L5 456 +#define RPMH_REGULATOR_LEVEL_SUPER_TURBO 464 +#define RPMH_REGULATOR_LEVEL_SUPER_TURBO_NO_CPR 480 + +/* + * Platform-specific power domain bindings. Don't add new entries here, use + * RPMHPD_* above. + */ + +/* SA8775P Power Domain Indexes */ +#define SA8775P_CX 0 +#define SA8775P_CX_AO 1 +#define SA8775P_DDR 2 +#define SA8775P_EBI 3 +#define SA8775P_GFX 4 +#define SA8775P_LCX 5 +#define SA8775P_LMX 6 +#define SA8775P_MMCX 7 +#define SA8775P_MMCX_AO 8 +#define SA8775P_MSS 9 +#define SA8775P_MX 10 +#define SA8775P_MX_AO 11 +#define SA8775P_MXC 12 +#define SA8775P_MXC_AO 13 +#define SA8775P_NSP0 14 +#define SA8775P_NSP1 15 +#define SA8775P_XO 16 + +/* SDM670 Power Domain Indexes */ +#define SDM670_MX 0 +#define SDM670_MX_AO 1 +#define SDM670_CX 2 +#define SDM670_CX_AO 3 +#define SDM670_LMX 4 +#define SDM670_LCX 5 +#define SDM670_GFX 6 +#define SDM670_MSS 7 + +/* SDM845 Power Domain Indexes */ +#define SDM845_EBI 0 +#define SDM845_MX 1 +#define SDM845_MX_AO 2 +#define SDM845_CX 3 +#define SDM845_CX_AO 4 +#define SDM845_LMX 5 +#define SDM845_LCX 6 +#define SDM845_GFX 7 +#define SDM845_MSS 8 + +/* SDX55 Power Domain Indexes */ +#define SDX55_MSS 0 +#define SDX55_MX 1 +#define SDX55_CX 2 + +/* SDX65 Power Domain Indexes */ +#define SDX65_MSS 0 +#define SDX65_MX 1 +#define SDX65_MX_AO 2 +#define SDX65_CX 3 +#define SDX65_CX_AO 4 +#define SDX65_MXC 5 + +/* SM6350 Power Domain Indexes */ +#define SM6350_CX 0 +#define SM6350_GFX 1 +#define SM6350_LCX 2 +#define SM6350_LMX 3 +#define SM6350_MSS 4 +#define SM6350_MX 5 + +/* SM8150 Power Domain Indexes */ +#define SM8150_MSS 0 +#define SM8150_EBI 1 +#define SM8150_LMX 2 +#define SM8150_LCX 3 +#define SM8150_GFX 4 +#define SM8150_MX 5 +#define SM8150_MX_AO 6 +#define SM8150_CX 7 +#define SM8150_CX_AO 8 +#define SM8150_MMCX 9 +#define SM8150_MMCX_AO 10 + +/* SA8155P is a special case, kept for backwards compatibility */ +#define SA8155P_CX SM8150_CX +#define SA8155P_CX_AO SM8150_CX_AO +#define SA8155P_EBI SM8150_EBI +#define SA8155P_GFX SM8150_GFX +#define SA8155P_MSS SM8150_MSS +#define SA8155P_MX SM8150_MX +#define SA8155P_MX_AO SM8150_MX_AO + +/* SM8250 Power Domain Indexes */ +#define SM8250_CX 0 +#define SM8250_CX_AO 1 +#define SM8250_EBI 2 +#define SM8250_GFX 3 +#define SM8250_LCX 4 +#define SM8250_LMX 5 +#define SM8250_MMCX 6 +#define SM8250_MMCX_AO 7 +#define SM8250_MX 8 +#define SM8250_MX_AO 9 + +/* SM8350 Power Domain Indexes */ +#define SM8350_CX 0 +#define SM8350_CX_AO 1 +#define SM8350_EBI 2 +#define SM8350_GFX 3 +#define SM8350_LCX 4 +#define SM8350_LMX 5 +#define SM8350_MMCX 6 +#define SM8350_MMCX_AO 7 +#define SM8350_MX 8 +#define SM8350_MX_AO 9 +#define SM8350_MXC 10 +#define SM8350_MXC_AO 11 +#define SM8350_MSS 12 + +/* SM8450 Power Domain Indexes */ +#define SM8450_CX 0 +#define SM8450_CX_AO 1 +#define SM8450_EBI 2 +#define SM8450_GFX 3 +#define SM8450_LCX 4 +#define SM8450_LMX 5 +#define SM8450_MMCX 6 +#define SM8450_MMCX_AO 7 +#define SM8450_MX 8 +#define SM8450_MX_AO 9 +#define SM8450_MXC 10 +#define SM8450_MXC_AO 11 +#define SM8450_MSS 12 + +/* SM8550 Power Domain Indexes */ +#define SM8550_CX 0 +#define SM8550_CX_AO 1 +#define SM8550_EBI 2 +#define SM8550_GFX 3 +#define SM8550_LCX 4 +#define SM8550_LMX 5 +#define SM8550_MMCX 6 +#define SM8550_MMCX_AO 7 +#define SM8550_MX 8 +#define SM8550_MX_AO 9 +#define SM8550_MXC 10 +#define SM8550_MXC_AO 11 +#define SM8550_MSS 12 +#define SM8550_NSP 13 + +/* QDU1000/QRU1000 Power Domain Indexes */ +#define QDU1000_EBI 0 +#define QDU1000_MSS 1 +#define QDU1000_CX 2 +#define QDU1000_MX 3 + +/* SC7180 Power Domain Indexes */ +#define SC7180_CX 0 +#define SC7180_CX_AO 1 +#define SC7180_GFX 2 +#define SC7180_MX 3 +#define SC7180_MX_AO 4 +#define SC7180_LMX 5 +#define SC7180_LCX 6 +#define SC7180_MSS 7 + +/* SC7280 Power Domain Indexes */ +#define SC7280_CX 0 +#define SC7280_CX_AO 1 +#define SC7280_EBI 2 +#define SC7280_GFX 3 +#define SC7280_MX 4 +#define SC7280_MX_AO 5 +#define SC7280_LMX 6 +#define SC7280_LCX 7 +#define SC7280_MSS 8 + +/* SC8180X Power Domain Indexes */ +#define SC8180X_CX 0 +#define SC8180X_CX_AO 1 +#define SC8180X_EBI 2 +#define SC8180X_GFX 3 +#define SC8180X_LCX 4 +#define SC8180X_LMX 5 +#define SC8180X_MMCX 6 +#define SC8180X_MMCX_AO 7 +#define SC8180X_MSS 8 +#define SC8180X_MX 9 +#define SC8180X_MX_AO 10 + +/* SC8280XP Power Domain Indexes */ +#define SC8280XP_CX 0 +#define SC8280XP_CX_AO 1 +#define SC8280XP_DDR 2 +#define SC8280XP_EBI 3 +#define SC8280XP_GFX 4 +#define SC8280XP_LCX 5 +#define SC8280XP_LMX 6 +#define SC8280XP_MMCX 7 +#define SC8280XP_MMCX_AO 8 +#define SC8280XP_MSS 9 +#define SC8280XP_MX 10 +#define SC8280XP_MXC 12 +#define SC8280XP_MX_AO 11 +#define SC8280XP_NSP 13 +#define SC8280XP_QPHY 14 +#define SC8280XP_XO 15 + #endif diff --git a/include/dt-bindings/power/qcom-rpmpd.h b/include/dt-bindings/power/qcom-rpmpd.h index f15bcee7c928..d303b3b37f18 100644 --- a/include/dt-bindings/power/qcom-rpmpd.h +++ b/include/dt-bindings/power/qcom-rpmpd.h @@ -4,66 +4,7 @@ #ifndef _DT_BINDINGS_POWER_QCOM_RPMPD_H #define _DT_BINDINGS_POWER_QCOM_RPMPD_H -/* SA8775P Power Domain Indexes */ -#define SA8775P_CX 0 -#define SA8775P_CX_AO 1 -#define SA8775P_DDR 2 -#define SA8775P_EBI 3 -#define SA8775P_GFX 4 -#define SA8775P_LCX 5 -#define SA8775P_LMX 6 -#define SA8775P_MMCX 7 -#define SA8775P_MMCX_AO 8 -#define SA8775P_MSS 9 -#define SA8775P_MX 10 -#define SA8775P_MX_AO 11 -#define SA8775P_MXC 12 -#define SA8775P_MXC_AO 13 -#define SA8775P_NSP0 14 -#define SA8775P_NSP1 15 -#define SA8775P_XO 16 - -/* SDM670 Power Domain Indexes */ -#define SDM670_MX 0 -#define SDM670_MX_AO 1 -#define SDM670_CX 2 -#define SDM670_CX_AO 3 -#define SDM670_LMX 4 -#define SDM670_LCX 5 -#define SDM670_GFX 6 -#define SDM670_MSS 7 - -/* SDM845 Power Domain Indexes */ -#define SDM845_EBI 0 -#define SDM845_MX 1 -#define SDM845_MX_AO 2 -#define SDM845_CX 3 -#define SDM845_CX_AO 4 -#define SDM845_LMX 5 -#define SDM845_LCX 6 -#define SDM845_GFX 7 -#define SDM845_MSS 8 - -/* SDX55 Power Domain Indexes */ -#define SDX55_MSS 0 -#define SDX55_MX 1 -#define SDX55_CX 2 - -/* SDX65 Power Domain Indexes */ -#define SDX65_MSS 0 -#define SDX65_MX 1 -#define SDX65_MX_AO 2 -#define SDX65_CX 3 -#define SDX65_CX_AO 4 -#define SDX65_MXC 5 - -/* SM6350 Power Domain Indexes */ -#define SM6350_CX 0 -#define SM6350_GFX 1 -#define SM6350_LCX 2 -#define SM6350_LMX 3 -#define SM6350_MSS 4 -#define SM6350_MX 5 +#include /* SM6375 Power Domain Indexes */ #define SM6375_VDDCX 0 @@ -77,173 +18,6 @@ #define SM6375_VDD_LPI_CX 8 #define SM6375_VDD_LPI_MX 9 -/* SM8150 Power Domain Indexes */ -#define SM8150_MSS 0 -#define SM8150_EBI 1 -#define SM8150_LMX 2 -#define SM8150_LCX 3 -#define SM8150_GFX 4 -#define SM8150_MX 5 -#define SM8150_MX_AO 6 -#define SM8150_CX 7 -#define SM8150_CX_AO 8 -#define SM8150_MMCX 9 -#define SM8150_MMCX_AO 10 - -/* SA8155P is a special case, kept for backwards compatibility */ -#define SA8155P_CX SM8150_CX -#define SA8155P_CX_AO SM8150_CX_AO -#define SA8155P_EBI SM8150_EBI -#define SA8155P_GFX SM8150_GFX -#define SA8155P_MSS SM8150_MSS -#define SA8155P_MX SM8150_MX -#define SA8155P_MX_AO SM8150_MX_AO - -/* SM8250 Power Domain Indexes */ -#define SM8250_CX 0 -#define SM8250_CX_AO 1 -#define SM8250_EBI 2 -#define SM8250_GFX 3 -#define SM8250_LCX 4 -#define SM8250_LMX 5 -#define SM8250_MMCX 6 -#define SM8250_MMCX_AO 7 -#define SM8250_MX 8 -#define SM8250_MX_AO 9 - -/* SM8350 Power Domain Indexes */ -#define SM8350_CX 0 -#define SM8350_CX_AO 1 -#define SM8350_EBI 2 -#define SM8350_GFX 3 -#define SM8350_LCX 4 -#define SM8350_LMX 5 -#define SM8350_MMCX 6 -#define SM8350_MMCX_AO 7 -#define SM8350_MX 8 -#define SM8350_MX_AO 9 -#define SM8350_MXC 10 -#define SM8350_MXC_AO 11 -#define SM8350_MSS 12 - -/* SM8450 Power Domain Indexes */ -#define SM8450_CX 0 -#define SM8450_CX_AO 1 -#define SM8450_EBI 2 -#define SM8450_GFX 3 -#define SM8450_LCX 4 -#define SM8450_LMX 5 -#define SM8450_MMCX 6 -#define SM8450_MMCX_AO 7 -#define SM8450_MX 8 -#define SM8450_MX_AO 9 -#define SM8450_MXC 10 -#define SM8450_MXC_AO 11 -#define SM8450_MSS 12 - -/* SM8550 Power Domain Indexes */ -#define SM8550_CX 0 -#define SM8550_CX_AO 1 -#define SM8550_EBI 2 -#define SM8550_GFX 3 -#define SM8550_LCX 4 -#define SM8550_LMX 5 -#define SM8550_MMCX 6 -#define SM8550_MMCX_AO 7 -#define SM8550_MX 8 -#define SM8550_MX_AO 9 -#define SM8550_MXC 10 -#define SM8550_MXC_AO 11 -#define SM8550_MSS 12 -#define SM8550_NSP 13 - -/* QDU1000/QRU1000 Power Domain Indexes */ -#define QDU1000_EBI 0 -#define QDU1000_MSS 1 -#define QDU1000_CX 2 -#define QDU1000_MX 3 - -/* SC7180 Power Domain Indexes */ -#define SC7180_CX 0 -#define SC7180_CX_AO 1 -#define SC7180_GFX 2 -#define SC7180_MX 3 -#define SC7180_MX_AO 4 -#define SC7180_LMX 5 -#define SC7180_LCX 6 -#define SC7180_MSS 7 - -/* SC7280 Power Domain Indexes */ -#define SC7280_CX 0 -#define SC7280_CX_AO 1 -#define SC7280_EBI 2 -#define SC7280_GFX 3 -#define SC7280_MX 4 -#define SC7280_MX_AO 5 -#define SC7280_LMX 6 -#define SC7280_LCX 7 -#define SC7280_MSS 8 - -/* SC8180X Power Domain Indexes */ -#define SC8180X_CX 0 -#define SC8180X_CX_AO 1 -#define SC8180X_EBI 2 -#define SC8180X_GFX 3 -#define SC8180X_LCX 4 -#define SC8180X_LMX 5 -#define SC8180X_MMCX 6 -#define SC8180X_MMCX_AO 7 -#define SC8180X_MSS 8 -#define SC8180X_MX 9 -#define SC8180X_MX_AO 10 - -/* SC8280XP Power Domain Indexes */ -#define SC8280XP_CX 0 -#define SC8280XP_CX_AO 1 -#define SC8280XP_DDR 2 -#define SC8280XP_EBI 3 -#define SC8280XP_GFX 4 -#define SC8280XP_LCX 5 -#define SC8280XP_LMX 6 -#define SC8280XP_MMCX 7 -#define SC8280XP_MMCX_AO 8 -#define SC8280XP_MSS 9 -#define SC8280XP_MX 10 -#define SC8280XP_MXC 12 -#define SC8280XP_MX_AO 11 -#define SC8280XP_NSP 13 -#define SC8280XP_QPHY 14 -#define SC8280XP_XO 15 - -/* SDM845 Power Domain performance levels */ -#define RPMH_REGULATOR_LEVEL_RETENTION 16 -#define RPMH_REGULATOR_LEVEL_MIN_SVS 48 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D3 50 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D2 52 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D1 56 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D0 60 -#define RPMH_REGULATOR_LEVEL_LOW_SVS 64 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_P1 72 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_L1 80 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_L2 96 -#define RPMH_REGULATOR_LEVEL_SVS 128 -#define RPMH_REGULATOR_LEVEL_SVS_L0 144 -#define RPMH_REGULATOR_LEVEL_SVS_L1 192 -#define RPMH_REGULATOR_LEVEL_SVS_L2 224 -#define RPMH_REGULATOR_LEVEL_NOM 256 -#define RPMH_REGULATOR_LEVEL_NOM_L0 288 -#define RPMH_REGULATOR_LEVEL_NOM_L1 320 -#define RPMH_REGULATOR_LEVEL_NOM_L2 336 -#define RPMH_REGULATOR_LEVEL_TURBO 384 -#define RPMH_REGULATOR_LEVEL_TURBO_L0 400 -#define RPMH_REGULATOR_LEVEL_TURBO_L1 416 -#define RPMH_REGULATOR_LEVEL_TURBO_L2 432 -#define RPMH_REGULATOR_LEVEL_TURBO_L3 448 -#define RPMH_REGULATOR_LEVEL_TURBO_L4 452 -#define RPMH_REGULATOR_LEVEL_TURBO_L5 456 -#define RPMH_REGULATOR_LEVEL_SUPER_TURBO 464 -#define RPMH_REGULATOR_LEVEL_SUPER_TURBO_NO_CPR 480 - /* MDM9607 Power Domains */ #define MDM9607_VDDCX 0 #define MDM9607_VDDCX_AO 1 -- cgit v1.2.3 From e6e1e3b6b8f9b9b78aa0dccdde431145cefb05f5 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 18 Jul 2025 19:13:40 +0300 Subject: dt-bindings: power: qcom-rpmpd: sort out entries After removing RPMh PD indices, it becomes obvious that several entries don't follow the alphabetic sorting order. Move them in order to keep the file sorted. Signed-off-by: Dmitry Baryshkov Acked-by: Rob Herring (Arm) Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20250718-rework-rpmhpd-rpmpd-v1-2-eedca108e540@oss.qualcomm.com Signed-off-by: Ulf Hansson --- include/dt-bindings/power/qcom-rpmpd.h | 42 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/power/qcom-rpmpd.h b/include/dt-bindings/power/qcom-rpmpd.h index d303b3b37f18..65f7d5ecc352 100644 --- a/include/dt-bindings/power/qcom-rpmpd.h +++ b/include/dt-bindings/power/qcom-rpmpd.h @@ -6,18 +6,6 @@ #include -/* SM6375 Power Domain Indexes */ -#define SM6375_VDDCX 0 -#define SM6375_VDDCX_AO 1 -#define SM6375_VDDCX_VFL 2 -#define SM6375_VDDMX 3 -#define SM6375_VDDMX_AO 4 -#define SM6375_VDDMX_VFL 5 -#define SM6375_VDDGX 6 -#define SM6375_VDDGX_AO 7 -#define SM6375_VDD_LPI_CX 8 -#define SM6375_VDD_LPI_MX 9 - /* MDM9607 Power Domains */ #define MDM9607_VDDCX 0 #define MDM9607_VDDCX_AO 1 @@ -130,6 +118,16 @@ #define MSM8998_SSCMX 8 #define MSM8998_SSCMX_VFL 9 +/* QCM2290 Power Domains */ +#define QCM2290_VDDCX 0 +#define QCM2290_VDDCX_AO 1 +#define QCM2290_VDDCX_VFL 2 +#define QCM2290_VDDMX 3 +#define QCM2290_VDDMX_AO 4 +#define QCM2290_VDDMX_VFL 5 +#define QCM2290_VDD_LPI_CX 6 +#define QCM2290_VDD_LPI_MX 7 + /* QCS404 Power Domains */ #define QCS404_VDDMX 0 #define QCS404_VDDMX_AO 1 @@ -169,15 +167,17 @@ #define SM6125_VDDMX_AO 4 #define SM6125_VDDMX_VFL 5 -/* QCM2290 Power Domains */ -#define QCM2290_VDDCX 0 -#define QCM2290_VDDCX_AO 1 -#define QCM2290_VDDCX_VFL 2 -#define QCM2290_VDDMX 3 -#define QCM2290_VDDMX_AO 4 -#define QCM2290_VDDMX_VFL 5 -#define QCM2290_VDD_LPI_CX 6 -#define QCM2290_VDD_LPI_MX 7 +/* SM6375 Power Domain Indexes */ +#define SM6375_VDDCX 0 +#define SM6375_VDDCX_AO 1 +#define SM6375_VDDCX_VFL 2 +#define SM6375_VDDMX 3 +#define SM6375_VDDMX_AO 4 +#define SM6375_VDDMX_VFL 5 +#define SM6375_VDDGX 6 +#define SM6375_VDDGX_AO 7 +#define SM6375_VDD_LPI_CX 8 +#define SM6375_VDD_LPI_MX 9 /* RPM SMD Power Domain performance levels */ #define RPM_SMD_LEVEL_RETENTION 16 -- cgit v1.2.3 From 94838f383a050e124c044e74a954777b8f2e6c17 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 18 Jul 2025 19:13:41 +0300 Subject: dt-bindings: power: qcom-rpmpd: add generic bindings for RPM power domains Some of the Qualcomm RPM PD controllers use a common set of indices for power domains. Add generic indices for Qualcomm RPM power domain controllers. Signed-off-by: Dmitry Baryshkov Acked-by: Rob Herring (Arm) Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20250718-rework-rpmhpd-rpmpd-v1-3-eedca108e540@oss.qualcomm.com Signed-off-by: Ulf Hansson --- include/dt-bindings/power/qcom-rpmpd.h | 121 +++++++++++++++++++-------------- 1 file changed, 70 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/power/qcom-rpmpd.h b/include/dt-bindings/power/qcom-rpmpd.h index 65f7d5ecc352..4371ac941f29 100644 --- a/include/dt-bindings/power/qcom-rpmpd.h +++ b/include/dt-bindings/power/qcom-rpmpd.h @@ -6,18 +6,37 @@ #include +/* Generic RPM Power Domain Indexes */ +#define RPMPD_VDDCX 0 +#define RPMPD_VDDCX_AO 1 +/* VFC and VFL are mutually exclusive and can not be present on the same platform */ +#define RPMPD_VDDCX_VFC 2 +#define RPMPD_VDDCX_VFL 2 +#define RPMPD_VDDMX 3 +#define RPMPD_VDDMX_AO 4 +#define RPMPD_VDDMX_VFL 5 +#define RPMPD_SSCCX 6 +#define RPMPD_SSCCX_VFL 7 +#define RPMPD_SSCMX 8 +#define RPMPD_SSCMX_VFL 9 + +/* + * Platform-specific power domain bindings. Don't add new entries here, use + * RPMPD_* above. + */ + /* MDM9607 Power Domains */ -#define MDM9607_VDDCX 0 -#define MDM9607_VDDCX_AO 1 -#define MDM9607_VDDCX_VFL 2 -#define MDM9607_VDDMX 3 -#define MDM9607_VDDMX_AO 4 -#define MDM9607_VDDMX_VFL 5 +#define MDM9607_VDDCX RPMPD_VDDCX +#define MDM9607_VDDCX_AO RPMPD_VDDCX_AO +#define MDM9607_VDDCX_VFL RPMPD_VDDCX_VFL +#define MDM9607_VDDMX RPMPD_VDDMX +#define MDM9607_VDDMX_AO RPMPD_VDDMX_AO +#define MDM9607_VDDMX_VFL RPMPD_VDDMX_VFL /* MSM8226 Power Domain Indexes */ -#define MSM8226_VDDCX 0 -#define MSM8226_VDDCX_AO 1 -#define MSM8226_VDDCX_VFC 2 +#define MSM8226_VDDCX RPMPD_VDDCX +#define MSM8226_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8226_VDDCX_VFC RPMPD_VDDCX_VFC /* MSM8939 Power Domains */ #define MSM8939_VDDMDCX 0 @@ -30,11 +49,11 @@ #define MSM8939_VDDMX_AO 7 /* MSM8916 Power Domain Indexes */ -#define MSM8916_VDDCX 0 -#define MSM8916_VDDCX_AO 1 -#define MSM8916_VDDCX_VFC 2 -#define MSM8916_VDDMX 3 -#define MSM8916_VDDMX_AO 4 +#define MSM8916_VDDCX RPMPD_VDDCX +#define MSM8916_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8916_VDDCX_VFC RPMPD_VDDCX_VFC +#define MSM8916_VDDMX RPMPD_VDDMX +#define MSM8916_VDDMX_AO RPMPD_VDDMX_AO /* MSM8909 Power Domain Indexes */ #define MSM8909_VDDCX MSM8916_VDDCX @@ -44,11 +63,11 @@ #define MSM8909_VDDMX_AO MSM8916_VDDMX_AO /* MSM8917 Power Domain Indexes */ -#define MSM8917_VDDCX 0 -#define MSM8917_VDDCX_AO 1 -#define MSM8917_VDDCX_VFL 2 -#define MSM8917_VDDMX 3 -#define MSM8917_VDDMX_AO 4 +#define MSM8917_VDDCX RPMPD_VDDCX +#define MSM8917_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8917_VDDCX_VFL RPMPD_VDDCX_VFL +#define MSM8917_VDDMX RPMPD_VDDMX +#define MSM8917_VDDMX_AO RPMPD_VDDMX_AO /* MSM8937 Power Domain Indexes */ #define MSM8937_VDDCX MSM8917_VDDCX @@ -81,12 +100,12 @@ #define MSM8974_VDDGFX_VFC 4 /* MSM8976 Power Domain Indexes */ -#define MSM8976_VDDCX 0 -#define MSM8976_VDDCX_AO 1 -#define MSM8976_VDDCX_VFL 2 -#define MSM8976_VDDMX 3 -#define MSM8976_VDDMX_AO 4 -#define MSM8976_VDDMX_VFL 5 +#define MSM8976_VDDCX RPMPD_VDDCX +#define MSM8976_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8976_VDDCX_VFL RPMPD_VDDCX_VFL +#define MSM8976_VDDMX RPMPD_VDDMX +#define MSM8976_VDDMX_AO RPMPD_VDDMX_AO +#define MSM8976_VDDMX_VFL RPMPD_VDDMX_VFL /* MSM8994 Power Domain Indexes */ #define MSM8994_VDDCX 0 @@ -107,16 +126,16 @@ #define MSM8996_VDDSSCX_VFC 6 /* MSM8998 Power Domain Indexes */ -#define MSM8998_VDDCX 0 -#define MSM8998_VDDCX_AO 1 -#define MSM8998_VDDCX_VFL 2 -#define MSM8998_VDDMX 3 -#define MSM8998_VDDMX_AO 4 -#define MSM8998_VDDMX_VFL 5 -#define MSM8998_SSCCX 6 -#define MSM8998_SSCCX_VFL 7 -#define MSM8998_SSCMX 8 -#define MSM8998_SSCMX_VFL 9 +#define MSM8998_VDDCX RPMPD_VDDCX +#define MSM8998_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8998_VDDCX_VFL RPMPD_VDDCX_VFL +#define MSM8998_VDDMX RPMPD_VDDMX +#define MSM8998_VDDMX_AO RPMPD_VDDMX_AO +#define MSM8998_VDDMX_VFL RPMPD_VDDMX_VFL +#define MSM8998_SSCCX RPMPD_SSCCX +#define MSM8998_SSCCX_VFL RPMPD_SSCCX_VFL +#define MSM8998_SSCMX RPMPD_SSCMX +#define MSM8998_SSCMX_VFL RPMPD_SSCMX_VFL /* QCM2290 Power Domains */ #define QCM2290_VDDCX 0 @@ -138,16 +157,16 @@ #define QCS404_LPIMX_VFL 6 /* SDM660 Power Domains */ -#define SDM660_VDDCX 0 -#define SDM660_VDDCX_AO 1 -#define SDM660_VDDCX_VFL 2 -#define SDM660_VDDMX 3 -#define SDM660_VDDMX_AO 4 -#define SDM660_VDDMX_VFL 5 -#define SDM660_SSCCX 6 -#define SDM660_SSCCX_VFL 7 -#define SDM660_SSCMX 8 -#define SDM660_SSCMX_VFL 9 +#define SDM660_VDDCX RPMPD_VDDCX +#define SDM660_VDDCX_AO RPMPD_VDDCX_AO +#define SDM660_VDDCX_VFL RPMPD_VDDCX_VFL +#define SDM660_VDDMX RPMPD_VDDMX +#define SDM660_VDDMX_AO RPMPD_VDDMX_AO +#define SDM660_VDDMX_VFL RPMPD_VDDMX_VFL +#define SDM660_SSCCX RPMPD_SSCCX +#define SDM660_SSCCX_VFL RPMPD_SSCCX_VFL +#define SDM660_SSCMX RPMPD_SSCMX +#define SDM660_SSCMX_VFL RPMPD_SSCMX_VFL /* SM6115 Power Domains */ #define SM6115_VDDCX 0 @@ -160,12 +179,12 @@ #define SM6115_VDD_LPI_MX 7 /* SM6125 Power Domains */ -#define SM6125_VDDCX 0 -#define SM6125_VDDCX_AO 1 -#define SM6125_VDDCX_VFL 2 -#define SM6125_VDDMX 3 -#define SM6125_VDDMX_AO 4 -#define SM6125_VDDMX_VFL 5 +#define SM6125_VDDCX RPMPD_VDDCX +#define SM6125_VDDCX_AO RPMPD_VDDCX_AO +#define SM6125_VDDCX_VFL RPMPD_VDDCX_VFL +#define SM6125_VDDMX RPMPD_VDDMX +#define SM6125_VDDMX_AO RPMPD_VDDMX_AO +#define SM6125_VDDMX_VFL RPMPD_VDDMX_VFL /* SM6375 Power Domain Indexes */ #define SM6375_VDDCX 0 -- cgit v1.2.3 From 807221d3c5ff6e3c91ff57bc82a0b7a541462e20 Mon Sep 17 00:00:00 2001 From: Ricky Wu Date: Tue, 12 Aug 2025 14:35:21 +0800 Subject: misc: rtsx_pci: Add separate CD/WP pin polarity reversal support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the Card Detect (CD) and Write Protect (WP) pins shared the same reverse polarity setting in the configuration space. This meant both signals were reversed together, without the ability to configure them individually. This patch introduces two new parameters: sd_cd_reverse_en – enable reverse polarity for the CD pin. sd_wp_reverse_en – enable reverse polarity for the WP pin. With this change, the controller can now support: 1.Reversing both CD and WP pins together (original behavior). 2.Reversing CD and WP pins separately (newly added behavior), if supported by the configuration space. This provides greater flexibility when dealing with devices that have independent polarity requirements for CD and WP pins. Signed-off-by: Ricky Wu Link: https://lore.kernel.org/r/20250812063521.2427696-1-ricky_wu@realtek.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rts5227.c | 13 ++++++++++--- drivers/misc/cardreader/rts5228.c | 12 ++++++++++-- drivers/misc/cardreader/rts5249.c | 16 +++++++++++++--- drivers/misc/cardreader/rts5264.c | 20 ++++++++++++++++---- drivers/misc/cardreader/rts5264.h | 1 + drivers/misc/cardreader/rtsx_pcr.h | 2 ++ include/linux/rtsx_pci.h | 2 ++ 7 files changed, 54 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/misc/cardreader/rts5227.c b/drivers/misc/cardreader/rts5227.c index cd512284bfb3..46444bb47f65 100644 --- a/drivers/misc/cardreader/rts5227.c +++ b/drivers/misc/cardreader/rts5227.c @@ -79,6 +79,10 @@ static void rts5227_fetch_vendor_settings(struct rtsx_pcr *pcr) pcr->sd30_drive_sel_3v3 = rtsx_reg_to_sd30_drive_sel_3v3(reg); if (rtsx_reg_check_reverse_socket(reg)) pcr->flags |= PCR_REVERSE_SOCKET; + if (rtsx_reg_check_cd_reverse(reg)) + pcr->option.sd_cd_reverse_en = 1; + if (rtsx_reg_check_wp_reverse(reg)) + pcr->option.sd_wp_reverse_en = 1; } static void rts5227_init_from_cfg(struct rtsx_pcr *pcr) @@ -127,8 +131,10 @@ static int rts5227_extra_init_hw(struct rtsx_pcr *pcr) /* Configure force_clock_req */ if (pcr->flags & PCR_REVERSE_SOCKET) rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x30, 0x30); - else - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x30, 0x00); + else { + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x20, option->sd_cd_reverse_en << 5); + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x10, option->sd_wp_reverse_en << 4); + } if (CHK_PCI_PID(pcr, 0x522A)) rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, RTS522A_AUTOLOAD_CFG1, @@ -350,6 +356,8 @@ void rts5227_init_params(struct rtsx_pcr *pcr) pcr->ms_pull_ctl_disable_tbl = rts5227_ms_pull_ctl_disable_tbl; pcr->reg_pm_ctrl3 = PM_CTRL3; + pcr->option.sd_cd_reverse_en = 0; + pcr->option.sd_wp_reverse_en = 0; } static int rts522a_optimize_phy(struct rtsx_pcr *pcr) @@ -508,5 +516,4 @@ void rts522a_init_params(struct rtsx_pcr *pcr) pcr->hw_param.interrupt_en |= SD_OC_INT_EN; pcr->hw_param.ocp_glitch = SD_OCP_GLITCH_10M; pcr->option.sd_800mA_ocp_thd = RTS522A_OCP_THD_800; - } diff --git a/drivers/misc/cardreader/rts5228.c b/drivers/misc/cardreader/rts5228.c index 0c7f10bcf6f1..db7e735ac24f 100644 --- a/drivers/misc/cardreader/rts5228.c +++ b/drivers/misc/cardreader/rts5228.c @@ -84,6 +84,10 @@ static void rtsx5228_fetch_vendor_settings(struct rtsx_pcr *pcr) pcr->sd30_drive_sel_3v3 = rtsx_reg_to_sd30_drive_sel_3v3(reg); if (rtsx_reg_check_reverse_socket(reg)) pcr->flags |= PCR_REVERSE_SOCKET; + if (rtsx_reg_check_cd_reverse(reg)) + pcr->option.sd_cd_reverse_en = 1; + if (rtsx_reg_check_wp_reverse(reg)) + pcr->option.sd_wp_reverse_en = 1; } static int rts5228_optimize_phy(struct rtsx_pcr *pcr) @@ -432,8 +436,10 @@ static int rts5228_extra_init_hw(struct rtsx_pcr *pcr) if (pcr->flags & PCR_REVERSE_SOCKET) rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x30); - else - rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x00); + else { + rtsx_pci_write_register(pcr, PETXCFG, 0x20, option->sd_cd_reverse_en << 5); + rtsx_pci_write_register(pcr, PETXCFG, 0x10, option->sd_wp_reverse_en << 4); + } /* * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced @@ -720,4 +726,6 @@ void rts5228_init_params(struct rtsx_pcr *pcr) hw_param->interrupt_en |= SD_OC_INT_EN; hw_param->ocp_glitch = SD_OCP_GLITCH_800U; option->sd_800mA_ocp_thd = RTS5228_LDO1_OCP_THD_930; + option->sd_cd_reverse_en = 0; + option->sd_wp_reverse_en = 0; } diff --git a/drivers/misc/cardreader/rts5249.c b/drivers/misc/cardreader/rts5249.c index 6c81040e18be..38aefd8db452 100644 --- a/drivers/misc/cardreader/rts5249.c +++ b/drivers/misc/cardreader/rts5249.c @@ -60,6 +60,7 @@ static void rtsx_base_fetch_vendor_settings(struct rtsx_pcr *pcr) pci_read_config_dword(pdev, PCR_SETTING_REG1, ®); pcr_dbg(pcr, "Cfg 0x%x: 0x%x\n", PCR_SETTING_REG1, reg); + pci_write_config_dword(pdev, 0x718, 0x0007C000); if (!rtsx_vendor_setting_valid(reg)) { pcr_dbg(pcr, "skip fetch vendor setting\n"); @@ -82,6 +83,10 @@ static void rtsx_base_fetch_vendor_settings(struct rtsx_pcr *pcr) pcr->sd30_drive_sel_3v3 = rtsx_reg_to_sd30_drive_sel_3v3(reg); if (rtsx_reg_check_reverse_socket(reg)) pcr->flags |= PCR_REVERSE_SOCKET; + if (rtsx_reg_check_cd_reverse(reg)) + pcr->option.sd_cd_reverse_en = 1; + if (rtsx_reg_check_wp_reverse(reg)) + pcr->option.sd_wp_reverse_en = 1; } static void rts5249_init_from_cfg(struct rtsx_pcr *pcr) @@ -254,9 +259,11 @@ static int rts5249_extra_init_hw(struct rtsx_pcr *pcr) /* Configure driving */ rts5249_fill_driving(pcr, OUTPUT_3V3); if (pcr->flags & PCR_REVERSE_SOCKET) - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0xB0, 0xB0); - else - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0xB0, 0x80); + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x30, 0x30); + else { + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x20, option->sd_cd_reverse_en << 5); + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x10, option->sd_wp_reverse_en << 4); + } rtsx_pci_send_cmd(pcr, CMD_TIMEOUT_DEF); @@ -572,6 +579,9 @@ void rts5249_init_params(struct rtsx_pcr *pcr) option->ltr_l1off_sspwrgate = LTR_L1OFF_SSPWRGATE_5249_DEF; option->ltr_l1off_snooze_sspwrgate = LTR_L1OFF_SNOOZE_SSPWRGATE_5249_DEF; + + option->sd_cd_reverse_en = 0; + option->sd_wp_reverse_en = 0; } static int rts524a_write_phy(struct rtsx_pcr *pcr, u8 addr, u16 val) diff --git a/drivers/misc/cardreader/rts5264.c b/drivers/misc/cardreader/rts5264.c index d050c9fff7ac..99a2d5ea6421 100644 --- a/drivers/misc/cardreader/rts5264.c +++ b/drivers/misc/cardreader/rts5264.c @@ -527,8 +527,16 @@ static void rts5264_init_from_hw(struct rtsx_pcr *pcr) pcr->rtd3_en = rts5264_reg_to_rtd3(lval2); - if (rts5264_reg_check_reverse_socket(lval2)) - pcr->flags |= PCR_REVERSE_SOCKET; + if (rts5264_reg_check_reverse_socket(lval2)) { + if (is_version_higher_than(pcr, PID_5264, RTS5264_IC_VER_B)) + pcr->option.sd_cd_reverse_en = 1; + else + pcr->flags |= PCR_REVERSE_SOCKET; + } + + if (rts5264_reg_check_wp_reverse(lval2) && + is_version_higher_than(pcr, PID_5264, RTS5264_IC_VER_B)) + pcr->option.sd_wp_reverse_en = 1; pci_read_config_dword(pdev, setting_reg1, &lval1); pcr_dbg(pcr, "Cfg 0x%x: 0x%x\n", setting_reg1, lval1); @@ -622,8 +630,10 @@ static int rts5264_extra_init_hw(struct rtsx_pcr *pcr) if (pcr->flags & PCR_REVERSE_SOCKET) rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x30); - else - rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x00); + else { + rtsx_pci_write_register(pcr, PETXCFG, 0x20, option->sd_cd_reverse_en << 5); + rtsx_pci_write_register(pcr, PETXCFG, 0x10, option->sd_wp_reverse_en << 4); + } /* * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced @@ -957,4 +967,6 @@ void rts5264_init_params(struct rtsx_pcr *pcr) hw_param->interrupt_en |= (SD_OC_INT_EN | SD_OVP_INT_EN); hw_param->ocp_glitch = SD_OCP_GLITCH_800U | SDVIO_OCP_GLITCH_800U; option->sd_800mA_ocp_thd = RTS5264_LDO1_OCP_THD_1150; + option->sd_cd_reverse_en = 0; + option->sd_wp_reverse_en = 0; } diff --git a/drivers/misc/cardreader/rts5264.h b/drivers/misc/cardreader/rts5264.h index f3e81daa708d..611ee253367c 100644 --- a/drivers/misc/cardreader/rts5264.h +++ b/drivers/misc/cardreader/rts5264.h @@ -14,6 +14,7 @@ #define rts5264_reg_to_aspm(reg) \ (((~(reg) >> 28) & 0x02) | (((reg) >> 28) & 0x01)) #define rts5264_reg_check_reverse_socket(reg) ((reg) & 0x04) +#define rts5264_reg_check_wp_reverse(reg) ((reg) & 0x8000) #define rts5264_reg_to_sd30_drive_sel_1v8(reg) (((reg) >> 22) & 0x03) #define rts5264_reg_to_sd30_drive_sel_3v3(reg) (((reg) >> 16) & 0x03) #define rts5264_reg_to_rtd3(reg) ((reg) & 0x08) diff --git a/drivers/misc/cardreader/rtsx_pcr.h b/drivers/misc/cardreader/rtsx_pcr.h index 8e5951b61143..40562ff2be13 100644 --- a/drivers/misc/cardreader/rtsx_pcr.h +++ b/drivers/misc/cardreader/rtsx_pcr.h @@ -100,6 +100,8 @@ static inline u8 map_sd_drive(int idx) #define rtsx_reg_to_sd30_drive_sel_3v3(reg) (((reg) >> 5) & 0x03) #define rtsx_reg_to_card_drive_sel(reg) ((((reg) >> 25) & 0x01) << 6) #define rtsx_reg_check_reverse_socket(reg) ((reg) & 0x4000) +#define rtsx_reg_check_cd_reverse(reg) ((reg) & 0x800000) +#define rtsx_reg_check_wp_reverse(reg) ((reg) & 0x400000) #define rts5209_reg_to_aspm(reg) (((reg) >> 5) & 0x03) #define rts5209_reg_check_ms_pmos(reg) (!((reg) & 0x08)) #define rts5209_reg_to_sd30_drive_sel_1v8(reg) (((reg) >> 3) & 0x07) diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index 3b4c36705a9b..3c5689356004 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -1160,6 +1160,8 @@ struct rtsx_cr_option { bool ocp_en; u8 sd_400mA_ocp_thd; u8 sd_800mA_ocp_thd; + u8 sd_cd_reverse_en; + u8 sd_wp_reverse_en; }; /* -- cgit v1.2.3 From f5597840ac907858ad2a462b00e4a68fd199121e Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 14 Jul 2025 23:34:14 +0800 Subject: char: misc: Disallow registering miscdevice whose minor > MISC_DYNAMIC_MINOR Currently, It is allowed to register miscdevice with minor > 255 which is defined by macro MISC_DYNAMIC_MINOR, and cause: - Chaos regarding division and management of minor codes. - Registering failure if the minor was allocated to other dynamic request. Fortunately, in-kernel users have not had such usage yet. Fix by refusing to register miscdevice whose minor > 255. Also bring in a very simple minor code space division and management: < 255 : Fixed minor code == 255 : Indicator to request dynamic minor code > 255 : Dynamic minor code requested, 1048320 minor codes totally And all fixed minors allocated should be registered in 'linux/miscdevice.h' Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250714-rfc_miscdev-v6-3-2ed949665bde@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/misc.c | 6 ++++++ include/linux/miscdevice.h | 8 ++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/drivers/char/misc.c b/drivers/char/misc.c index 558302a64dd9..b8e66466184f 100644 --- a/drivers/char/misc.c +++ b/drivers/char/misc.c @@ -210,6 +210,12 @@ int misc_register(struct miscdevice *misc) int err = 0; bool is_dynamic = (misc->minor == MISC_DYNAMIC_MINOR); + if (misc->minor > MISC_DYNAMIC_MINOR) { + pr_err("Invalid fixed minor %d for miscdevice '%s'\n", + misc->minor, misc->name); + return -EINVAL; + } + INIT_LIST_HEAD(&misc->list); mutex_lock(&misc_mtx); diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 3e6deb00fc85..565b88efeb23 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -71,6 +71,14 @@ #define USERIO_MINOR 240 #define VHOST_VSOCK_MINOR 241 #define RFKILL_MINOR 242 + +/* + * Misc char device minor code space division related to below macro: + * + * < 255 : Fixed minor code + * == 255 : Indicator to request dynamic minor code + * > 255 : Dynamic minor code requested, 1048320 minor codes totally. + */ #define MISC_DYNAMIC_MINOR 255 struct miscdevice { -- cgit v1.2.3 From d7f8d0758b975db8406c91cf242d46cd9611ba3e Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 14 Jul 2025 23:34:18 +0800 Subject: char: misc: Register fixed minor EISA_EEPROM_MINOR in linux/miscdevice.h Move fixed minor EISA_EEPROM_MINOR definition to linux/miscdevice.h. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250714-rfc_miscdev-v6-7-2ed949665bde@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/parisc/eisa_eeprom.c | 2 -- include/linux/miscdevice.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/parisc/eisa_eeprom.c b/drivers/parisc/eisa_eeprom.c index 443b15422fc1..601cbb22574f 100644 --- a/drivers/parisc/eisa_eeprom.c +++ b/drivers/parisc/eisa_eeprom.c @@ -15,8 +15,6 @@ #include #include -#define EISA_EEPROM_MINOR 241 - static loff_t eisa_eeprom_llseek(struct file *file, loff_t offset, int origin) { return fixed_size_llseek(file, offset, origin, HPEE_MAX_LENGTH); diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 565b88efeb23..7d0aa718499c 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -70,6 +70,7 @@ #define UHID_MINOR 239 #define USERIO_MINOR 240 #define VHOST_VSOCK_MINOR 241 +#define EISA_EEPROM_MINOR 241 #define RFKILL_MINOR 242 /* -- cgit v1.2.3 From 63740349eba78f242bcbf60d5244d7f2b2600853 Mon Sep 17 00:00:00 2001 From: Li Li Date: Sun, 27 Jul 2025 18:29:06 +0000 Subject: binder: introduce transaction reports via netlink Introduce a generic netlink multicast event to report binder transaction failures to userspace. This allows subscribers to monitor these events and take appropriate actions, such as stopping a misbehaving application that is spamming a service with huge amount of transactions. The multicast event contains full details of the failed transactions, including the sender/target PIDs, payload size and specific error code. This interface is defined using a YAML spec, from which the UAPI and kernel headers and source are auto-generated. Signed-off-by: Li Li Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20250727182932.2499194-4-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- Documentation/netlink/specs/binder.yaml | 93 +++++++++++++++++++++++++++++ MAINTAINERS | 1 + drivers/android/Kconfig | 1 + drivers/android/Makefile | 2 +- drivers/android/binder.c | 85 ++++++++++++++++++++++++-- drivers/android/binder_netlink.c | 31 ++++++++++ drivers/android/binder_netlink.h | 20 +++++++ include/uapi/linux/android/binder_netlink.h | 37 ++++++++++++ 8 files changed, 265 insertions(+), 5 deletions(-) create mode 100644 Documentation/netlink/specs/binder.yaml create mode 100644 drivers/android/binder_netlink.c create mode 100644 drivers/android/binder_netlink.h create mode 100644 include/uapi/linux/android/binder_netlink.h (limited to 'include') diff --git a/Documentation/netlink/specs/binder.yaml b/Documentation/netlink/specs/binder.yaml new file mode 100644 index 000000000000..140b77a6afee --- /dev/null +++ b/Documentation/netlink/specs/binder.yaml @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# Copyright 2025 Google LLC +# +--- +name: binder +protocol: genetlink +uapi-header: linux/android/binder_netlink.h +doc: Binder interface over generic netlink + +attribute-sets: + - + name: report + doc: | + Attributes included within a transaction failure report. The elements + correspond directly with the specific transaction that failed, along + with the error returned to the sender e.g. BR_DEAD_REPLY. + + attributes: + - + name: error + type: u32 + doc: The enum binder_driver_return_protocol returned to the sender. + - + name: context + type: string + doc: The binder context where the transaction occurred. + - + name: from_pid + type: u32 + doc: The PID of the sender process. + - + name: from_tid + type: u32 + doc: The TID of the sender thread. + - + name: to_pid + type: u32 + doc: | + The PID of the recipient process. This attribute may not be present + if the target could not be determined. + - + name: to_tid + type: u32 + doc: | + The TID of the recipient thread. This attribute may not be present + if the target could not be determined. + - + name: is_reply + type: flag + doc: When present, indicates the failed transaction is a reply. + - + name: flags + type: u32 + doc: The bitmask of enum transaction_flags from the transaction. + - + name: code + type: u32 + doc: The application-defined code from the transaction. + - + name: data_size + type: u32 + doc: The transaction payload size in bytes. + +operations: + list: + - + name: report + doc: | + A multicast event sent to userspace subscribers to notify them about + binder transaction failures. The generated report provides the full + details of the specific transaction that failed. The intention is for + programs to monitor these events and react to the failures as needed. + + attribute-set: report + mcgrp: report + event: + attributes: + - error + - context + - from_pid + - from_tid + - to_pid + - to_tid + - is_reply + - flags + - code + - data_size + +mcast-groups: + list: + - + name: report diff --git a/MAINTAINERS b/MAINTAINERS index daf520a13bdf..febc92cca659 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1790,6 +1790,7 @@ M: Suren Baghdasaryan L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git +F: Documentation/netlink/specs/binder.yaml F: drivers/android/ ANDROID GOLDFISH PIC DRIVER diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 5b3b8041f827..75af3cf472c8 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -4,6 +4,7 @@ menu "Android" config ANDROID_BINDER_IPC bool "Android Binder IPC Driver" depends on MMU + depends on NET default n help Binder is used in Android for both communication between processes, diff --git a/drivers/android/Makefile b/drivers/android/Makefile index c5d47be0276c..f422f91e026b 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -2,5 +2,5 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o -obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o +obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o binder_netlink.o obj-$(CONFIG_ANDROID_BINDER_ALLOC_KUNIT_TEST) += tests/ diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e4f9cbc6c93f..edfa15d39d6f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -74,6 +74,7 @@ #include +#include "binder_netlink.h" #include "binder_internal.h" #include "binder_trace.h" @@ -2993,6 +2994,67 @@ static void binder_set_txn_from_error(struct binder_transaction *t, int id, binder_thread_dec_tmpref(from); } +/** + * binder_netlink_report() - report a transaction failure via netlink + * @proc: the binder proc sending the transaction + * @t: the binder transaction that failed + * @data_size: the user provided data size for the transaction + * @error: enum binder_driver_return_protocol returned to sender + */ +static void binder_netlink_report(struct binder_proc *proc, + struct binder_transaction *t, + u32 data_size, + u32 error) +{ + const char *context = proc->context->name; + struct sk_buff *skb; + void *hdr; + + if (!genl_has_listeners(&binder_nl_family, &init_net, + BINDER_NLGRP_REPORT)) + return; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return; + + hdr = genlmsg_put(skb, 0, 0, &binder_nl_family, 0, BINDER_CMD_REPORT); + if (!hdr) + goto free_skb; + + if (nla_put_u32(skb, BINDER_A_REPORT_ERROR, error) || + nla_put_string(skb, BINDER_A_REPORT_CONTEXT, context) || + nla_put_u32(skb, BINDER_A_REPORT_FROM_PID, t->from_pid) || + nla_put_u32(skb, BINDER_A_REPORT_FROM_TID, t->from_tid)) + goto cancel_skb; + + if (t->to_proc && + nla_put_u32(skb, BINDER_A_REPORT_TO_PID, t->to_proc->pid)) + goto cancel_skb; + + if (t->to_thread && + nla_put_u32(skb, BINDER_A_REPORT_TO_TID, t->to_thread->pid)) + goto cancel_skb; + + if (t->is_reply && nla_put_flag(skb, BINDER_A_REPORT_IS_REPLY)) + goto cancel_skb; + + if (nla_put_u32(skb, BINDER_A_REPORT_FLAGS, t->flags) || + nla_put_u32(skb, BINDER_A_REPORT_CODE, t->code) || + nla_put_u32(skb, BINDER_A_REPORT_DATA_SIZE, data_size)) + goto cancel_skb; + + genlmsg_end(skb, hdr); + genlmsg_multicast(&binder_nl_family, skb, 0, BINDER_NLGRP_REPORT, + GFP_KERNEL); + return; + +cancel_skb: + genlmsg_cancel(skb, hdr); +free_skb: + nlmsg_free(skb); +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -3679,10 +3741,13 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_copy_data_failed; } - if (t->buffer->oneway_spam_suspect) + if (t->buffer->oneway_spam_suspect) { tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT; - else + binder_netlink_report(proc, t, tr->data_size, + BR_ONEWAY_SPAM_SUSPECT); + } else { tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; + } if (reply) { binder_enqueue_thread_work(thread, tcomplete); @@ -3730,8 +3795,11 @@ static void binder_transaction(struct binder_proc *proc, * process and is put in a pending queue, waiting for the target * process to be unfrozen. */ - if (return_error == BR_TRANSACTION_PENDING_FROZEN) + if (return_error == BR_TRANSACTION_PENDING_FROZEN) { tcomplete->type = BINDER_WORK_TRANSACTION_PENDING; + binder_netlink_report(proc, t, tr->data_size, + return_error); + } binder_enqueue_thread_work(thread, tcomplete); if (return_error && return_error != BR_TRANSACTION_PENDING_FROZEN) @@ -3789,6 +3857,8 @@ err_invalid_target_handle: binder_dec_node(target_node, 1, 0); binder_dec_node_tmpref(target_node); } + + binder_netlink_report(proc, t, tr->data_size, return_error); kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); err_alloc_t_failed: @@ -7059,12 +7129,19 @@ static int __init binder_init(void) } } - ret = init_binderfs(); + ret = genl_register_family(&binder_nl_family); if (ret) goto err_init_binder_device_failed; + ret = init_binderfs(); + if (ret) + goto err_init_binderfs_failed; + return ret; +err_init_binderfs_failed: + genl_unregister_family(&binder_nl_family); + err_init_binder_device_failed: hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { misc_deregister(&device->miscdev); diff --git a/drivers/android/binder_netlink.c b/drivers/android/binder_netlink.c new file mode 100644 index 000000000000..d05397a50ca6 --- /dev/null +++ b/drivers/android/binder_netlink.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/binder.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "binder_netlink.h" + +#include + +/* Ops table for binder */ +static const struct genl_split_ops binder_nl_ops[] = { +}; + +static const struct genl_multicast_group binder_nl_mcgrps[] = { + [BINDER_NLGRP_REPORT] = { "report", }, +}; + +struct genl_family binder_nl_family __ro_after_init = { + .name = BINDER_FAMILY_NAME, + .version = BINDER_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = binder_nl_ops, + .n_split_ops = ARRAY_SIZE(binder_nl_ops), + .mcgrps = binder_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(binder_nl_mcgrps), +}; diff --git a/drivers/android/binder_netlink.h b/drivers/android/binder_netlink.h new file mode 100644 index 000000000000..882c7a6b537e --- /dev/null +++ b/drivers/android/binder_netlink.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/binder.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_BINDER_GEN_H +#define _LINUX_BINDER_GEN_H + +#include +#include + +#include + +enum { + BINDER_NLGRP_REPORT, +}; + +extern struct genl_family binder_nl_family; + +#endif /* _LINUX_BINDER_GEN_H */ diff --git a/include/uapi/linux/android/binder_netlink.h b/include/uapi/linux/android/binder_netlink.h new file mode 100644 index 000000000000..b218f96d6668 --- /dev/null +++ b/include/uapi/linux/android/binder_netlink.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/binder.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_ANDROID_BINDER_NETLINK_H +#define _UAPI_LINUX_ANDROID_BINDER_NETLINK_H + +#define BINDER_FAMILY_NAME "binder" +#define BINDER_FAMILY_VERSION 1 + +enum { + BINDER_A_REPORT_ERROR = 1, + BINDER_A_REPORT_CONTEXT, + BINDER_A_REPORT_FROM_PID, + BINDER_A_REPORT_FROM_TID, + BINDER_A_REPORT_TO_PID, + BINDER_A_REPORT_TO_TID, + BINDER_A_REPORT_IS_REPLY, + BINDER_A_REPORT_FLAGS, + BINDER_A_REPORT_CODE, + BINDER_A_REPORT_DATA_SIZE, + + __BINDER_A_REPORT_MAX, + BINDER_A_REPORT_MAX = (__BINDER_A_REPORT_MAX - 1) +}; + +enum { + BINDER_CMD_REPORT = 1, + + __BINDER_CMD_MAX, + BINDER_CMD_MAX = (__BINDER_CMD_MAX - 1) +}; + +#define BINDER_MCGRP_REPORT "report" + +#endif /* _UAPI_LINUX_ANDROID_BINDER_NETLINK_H */ -- cgit v1.2.3 From 1d6249c1ce826fcf03c695973095eb4a50fb7fd2 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 11 Aug 2025 11:13:35 +0200 Subject: sysfs: remove bin_attribute::read_new/write_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These transitional fields are now unused and unnecessary. Remove them and their logic in the sysfs core. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250811-sysfs-const-bin_attr-final-v4-1-7b6053fd58bb@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 22 +++++----------------- include/linux/sysfs.h | 4 ---- 2 files changed, 5 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1ca143d2f22a..3825e780cc58 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -97,12 +97,9 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, count = size - pos; } - if (!battr->read && !battr->read_new) + if (!battr->read) return -EIO; - if (battr->read_new) - return battr->read_new(of->file, kobj, battr, buf, pos, count); - return battr->read(of->file, kobj, battr, buf, pos, count); } @@ -161,12 +158,9 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, if (!count) return 0; - if (!battr->write && !battr->write_new) + if (!battr->write) return -EIO; - if (battr->write_new) - return battr->write_new(of->file, kobj, battr, buf, pos, count); - return battr->write(of->file, kobj, battr, buf, pos, count); } @@ -335,19 +329,13 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, const struct kernfs_ops *ops; struct kernfs_node *kn; - if (battr->read && battr->read_new) - return -EINVAL; - - if (battr->write && battr->write_new) - return -EINVAL; - if (battr->mmap) ops = &sysfs_bin_kfops_mmap; - else if ((battr->read || battr->read_new) && (battr->write || battr->write_new)) + else if (battr->read && battr->write) ops = &sysfs_bin_kfops_rw; - else if (battr->read || battr->read_new) + else if (battr->read) ops = &sysfs_bin_kfops_ro; - else if (battr->write || battr->write_new) + else if (battr->write) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index f418aae4f113..7544f6d81c05 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -308,12 +308,8 @@ struct bin_attribute { struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); - ssize_t (*read_new)(struct file *, struct kobject *, const struct bin_attribute *, - char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); - ssize_t (*write_new)(struct file *, struct kobject *, - const struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, -- cgit v1.2.3 From 44d454fcffa8b08d6d66df132121c1d387fa85db Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 11 Aug 2025 11:13:36 +0200 Subject: sysfs: remove attribute_group::bin_attrs_new MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This transitional field is now unused and unnecessary. Remove it. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250811-sysfs-const-bin_attr-final-v4-2-7b6053fd58bb@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 7544f6d81c05..9a25a2911652 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -106,10 +106,7 @@ struct attribute_group { const struct bin_attribute *, int); struct attribute **attrs; - union { - const struct bin_attribute *const *bin_attrs; - const struct bin_attribute *const *bin_attrs_new; - }; + const struct bin_attribute *const *bin_attrs; }; #define SYSFS_PREALLOC 010000 @@ -293,7 +290,7 @@ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ - .bin_attrs_new = _name##_attrs, \ + .bin_attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) -- cgit v1.2.3 From 12cc0ff3cdd95f2bc0ffdc63bcd9da231eb33199 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 19 Aug 2025 11:01:46 +0100 Subject: ASoC: qcom: audioreach: deprecate AR_TKN_U32_MODULE_[IN/OUT]_PORTS Deprecate usage of AR_TKN_U32_MODULE_IN_PORTS and AR_TKN_U32_MODULE_OUT_PORTS as the connectivity of modules is taken care by AR_TKN_U32_MODULE_SRC_OP_PORT_ID* and AR_TKN_U32_MODULE_DST_IN_PORT_ID* Also this property is never used in the drivers. Signed-off-by: Srinivas Kandagatla Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250819100151.1294047-2-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- include/uapi/sound/snd_ar_tokens.h | 4 ++-- sound/soc/qcom/qdsp6/audioreach.h | 3 --- sound/soc/qcom/qdsp6/topology.c | 10 +--------- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/uapi/sound/snd_ar_tokens.h b/include/uapi/sound/snd_ar_tokens.h index b9b9093b4396..bc0b1bede00c 100644 --- a/include/uapi/sound/snd_ar_tokens.h +++ b/include/uapi/sound/snd_ar_tokens.h @@ -184,8 +184,8 @@ enum ar_event_types { #define AR_TKN_U32_MODULE_INSTANCE_ID 201 #define AR_TKN_U32_MODULE_MAX_IP_PORTS 202 #define AR_TKN_U32_MODULE_MAX_OP_PORTS 203 -#define AR_TKN_U32_MODULE_IN_PORTS 204 -#define AR_TKN_U32_MODULE_OUT_PORTS 205 +#define AR_TKN_U32_MODULE_IN_PORTS 204 /* deprecated */ +#define AR_TKN_U32_MODULE_OUT_PORTS 205 /* deprecated */ #define AR_TKN_U32_MODULE_SRC_OP_PORT_ID 206 #define AR_TKN_U32_MODULE_DST_IN_PORT_ID 207 #define AR_TKN_U32_MODULE_SRC_INSTANCE_ID 208 diff --git a/sound/soc/qcom/qdsp6/audioreach.h b/sound/soc/qcom/qdsp6/audioreach.h index 61a69df4f50f..9b30177463e6 100644 --- a/sound/soc/qcom/qdsp6/audioreach.h +++ b/sound/soc/qcom/qdsp6/audioreach.h @@ -707,9 +707,6 @@ struct audioreach_module { uint32_t max_ip_port; uint32_t max_op_port; - uint32_t in_port; - uint32_t out_port; - uint32_t num_connections; /* Connections */ uint32_t src_mod_inst_id; diff --git a/sound/soc/qcom/qdsp6/topology.c b/sound/soc/qcom/qdsp6/topology.c index 83319a928f29..a3b0f558260c 100644 --- a/sound/soc/qcom/qdsp6/topology.c +++ b/sound/soc/qcom/qdsp6/topology.c @@ -412,7 +412,7 @@ static struct audioreach_module *audioreach_parse_common_tokens(struct q6apm *ap struct snd_soc_tplg_private *private, struct snd_soc_dapm_widget *w) { - uint32_t max_ip_port = 0, max_op_port = 0, in_port = 0, out_port = 0; + uint32_t max_ip_port = 0, max_op_port = 0; uint32_t src_mod_op_port_id[AR_MAX_MOD_LINKS] = { 0, }; uint32_t dst_mod_inst_id[AR_MAX_MOD_LINKS] = { 0, }; uint32_t dst_mod_ip_port_id[AR_MAX_MOD_LINKS] = { 0, }; @@ -455,12 +455,6 @@ static struct audioreach_module *audioreach_parse_common_tokens(struct q6apm *ap case AR_TKN_U32_MODULE_MAX_OP_PORTS: max_op_port = le32_to_cpu(mod_elem->value); break; - case AR_TKN_U32_MODULE_IN_PORTS: - in_port = le32_to_cpu(mod_elem->value); - break; - case AR_TKN_U32_MODULE_OUT_PORTS: - out_port = le32_to_cpu(mod_elem->value); - break; case AR_TKN_U32_MODULE_SRC_INSTANCE_ID: src_mod_inst_id = le32_to_cpu(mod_elem->value); break; @@ -550,8 +544,6 @@ static struct audioreach_module *audioreach_parse_common_tokens(struct q6apm *ap mod->module_id = module_id; mod->max_ip_port = max_ip_port; mod->max_op_port = max_op_port; - mod->in_port = in_port; - mod->out_port = out_port; mod->src_mod_inst_id = src_mod_inst_id; for (pn = 0; pn < mod->max_op_port; pn++) { if (src_mod_op_port_id[pn] && dst_mod_inst_id[pn] && -- cgit v1.2.3 From f07b81b573b28e5cae5c1482001ad0d6c0b7c051 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 19 Aug 2025 11:01:47 +0100 Subject: ASoC: qcom: audioreach: add documentation for i2s interface type Add documentation of possible values for I2S interface types, currently this is only documented for DMA module. Signed-off-by: Srinivas Kandagatla Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250819100151.1294047-3-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- include/uapi/sound/snd_ar_tokens.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/sound/snd_ar_tokens.h b/include/uapi/sound/snd_ar_tokens.h index bc0b1bede00c..92cf72a6fdd4 100644 --- a/include/uapi/sound/snd_ar_tokens.h +++ b/include/uapi/sound/snd_ar_tokens.h @@ -118,6 +118,12 @@ enum ar_event_types { * LPAIF_WSA = 2, * LPAIF_VA = 3, * LPAIF_AXI = 4 + * Possible values for MI2S + * I2S_INTF_TYPE_PRIMARY = 0, + * I2S_INTF_TYPE_SECONDARY = 1, + * I2S_INTF_TYPE_TERTIARY = 2, + * I2S_INTF_TYPE_QUATERNARY = 3, + * I2S_INTF_TYPE_QUINARY = 4, * * %AR_TKN_U32_MODULE_FMT_INTERLEAVE: PCM Interleaving * PCM_INTERLEAVED = 1, -- cgit v1.2.3 From c7ed4c2debfd192f6071f4ab33c092d419abb941 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 19 Aug 2025 11:01:48 +0100 Subject: ASoC: qcom: audioreach: add support for static calibration This change adds support for static calibration data via ASoC topology file. This static calibration data could include binary blob of data that is required by specific module and is not part of topology tokens. Reason for adding this support is to allow loading module specific data that can not be part of the tplg tokens, example, Echo and Noise cancelling module needs a blob of calibration data to function correctly. This support is also one of the building block for adding speaker protection support. Tested this with Single Mic ECNS(Echo and Noise Cancellation). tplg can now contain this calibration data like: SectionWidget."stream2.SMECNS_V224" { ... data [ ... "stream2.SMECNS_V224_cfg_data" ] } SectionData."stream2.SMECNS_V224_cfg_data" { words "0x00000330, 0x01001006,0x00000000,0x00000000, 0x00004145,0x08001026,0x00000004,0x00000000, ..." } } Signed-off-by: Srinivas Kandagatla Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250819100151.1294047-4-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- include/uapi/sound/snd_ar_tokens.h | 10 ++++++++++ sound/soc/qcom/qdsp6/audioreach.h | 2 ++ sound/soc/qcom/qdsp6/topology.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) (limited to 'include') diff --git a/include/uapi/sound/snd_ar_tokens.h b/include/uapi/sound/snd_ar_tokens.h index 92cf72a6fdd4..6b8102eaa121 100644 --- a/include/uapi/sound/snd_ar_tokens.h +++ b/include/uapi/sound/snd_ar_tokens.h @@ -3,6 +3,8 @@ #ifndef __SND_AR_TOKENS_H__ #define __SND_AR_TOKENS_H__ +#include + #define APM_SUB_GRAPH_PERF_MODE_LOW_POWER 0x1 #define APM_SUB_GRAPH_PERF_MODE_LOW_LATENCY 0x2 @@ -238,4 +240,12 @@ enum ar_event_types { #define AR_TKN_U32_MODULE_LOG_TAP_POINT_ID 260 #define AR_TKN_U32_MODULE_LOG_MODE 261 +#define SND_SOC_AR_TPLG_MODULE_CFG_TYPE 0x01001006 +struct audioreach_module_priv_data { + __le32 size; /* size in bytes of the array, including all elements */ + __le32 type; /* SND_SOC_AR_TPLG_MODULE_CFG_TYPE */ + __le32 priv[2]; /* Private data for future expansion */ + __le32 data[0]; /* config data */ +}; + #endif /* __SND_AR_TOKENS_H__ */ diff --git a/sound/soc/qcom/qdsp6/audioreach.h b/sound/soc/qcom/qdsp6/audioreach.h index 9b30177463e6..617bda051cf8 100644 --- a/sound/soc/qcom/qdsp6/audioreach.h +++ b/sound/soc/qcom/qdsp6/audioreach.h @@ -4,6 +4,7 @@ #define __AUDIOREACH_H__ #include #include +#include #include struct q6apm; struct q6apm_graph; @@ -742,6 +743,7 @@ struct audioreach_module { struct list_head node; struct audioreach_container *container; struct snd_soc_dapm_widget *widget; + struct audioreach_module_priv_data *data; }; struct audioreach_module_config { diff --git a/sound/soc/qcom/qdsp6/topology.c b/sound/soc/qcom/qdsp6/topology.c index a3b0f558260c..ec51fabd98cb 100644 --- a/sound/soc/qcom/qdsp6/topology.c +++ b/sound/soc/qcom/qdsp6/topology.c @@ -305,6 +305,34 @@ static struct snd_soc_tplg_vendor_array *audioreach_get_module_array( return NULL; } +static struct audioreach_module_priv_data *audioreach_get_module_priv_data( + struct snd_soc_tplg_private *private) +{ + int sz; + + for (sz = 0; sz < le32_to_cpu(private->size); ) { + struct snd_soc_tplg_vendor_array *mod_array; + + mod_array = (struct snd_soc_tplg_vendor_array *)((u8 *)private->array + sz); + if (mod_array->type == SND_SOC_AR_TPLG_MODULE_CFG_TYPE) { + struct audioreach_module_priv_data *pdata; + + pdata = kzalloc(struct_size(pdata, data, le32_to_cpu(mod_array->size)), + GFP_KERNEL); + if (!pdata) + return ERR_PTR(-ENOMEM); + + memcpy(pdata, ((u8 *)private->data + sz), struct_size(pdata, data, + le32_to_cpu(mod_array->size))); + return pdata; + } + + sz = sz + le32_to_cpu(mod_array->size); + } + + return NULL; +} + static struct audioreach_sub_graph *audioreach_parse_sg_tokens(struct q6apm *apm, struct snd_soc_tplg_private *private) { @@ -582,6 +610,8 @@ static int audioreach_widget_load_module_common(struct snd_soc_component *compon if (IS_ERR(mod)) return PTR_ERR(mod); + mod->data = audioreach_get_module_priv_data(&tplg_w->priv); + dobj = &w->dobj; dobj->private = mod; @@ -939,6 +969,7 @@ static int audioreach_widget_unload(struct snd_soc_component *scomp, cont->num_modules--; list_del(&mod->node); + kfree(mod->data); kfree(mod); /* Graph Info has N sub-graphs, sub-graph has N containers, Container has N Modules */ if (list_empty(&cont->modules_list)) { /* if no modules in the container then remove it */ -- cgit v1.2.3 From 74f44ad07d1063933c237a7db16f6a4036643d60 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 30 Jul 2025 17:46:14 +0100 Subject: mmc: tmio: Add 64-bit read/write support for SD_BUF0 in polling mode As per the RZ/{G2L,G3E} HW manual SD_BUF0 can be accessed by 16/32/64 bits. Most of the data transfer in SD/SDIO/eMMC mode is more than 8 bytes. During testing it is found that, if the DMA buffer is not aligned to 128 bit it fallback to PIO mode. In such cases, 64-bit access is much more efficient than the current 16-bit. Tested-by: Wolfram Sang Reviewed-by: Wolfram Sang Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20250730164618.233117-2-biju.das.jz@bp.renesas.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc.h | 15 +++++++++++++++ drivers/mmc/host/tmio_mmc_core.c | 33 +++++++++++++++++++++++++++++++++ include/linux/platform_data/tmio.h | 3 +++ 3 files changed, 51 insertions(+) (limited to 'include') diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index d730b7633ae1..c8cdb1c0722e 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -242,6 +243,20 @@ static inline void sd_ctrl_read32_rep(struct tmio_mmc_host *host, int addr, ioread32_rep(host->ctl + (addr << host->bus_shift), buf, count); } +#ifdef CONFIG_64BIT +static inline void sd_ctrl_read64_rep(struct tmio_mmc_host *host, int addr, + u64 *buf, int count) +{ + readsq(host->ctl + (addr << host->bus_shift), buf, count); +} + +static inline void sd_ctrl_write64_rep(struct tmio_mmc_host *host, int addr, + const u64 *buf, int count) +{ + writesq(host->ctl + (addr << host->bus_shift), buf, count); +} +#endif + static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val) { diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 21c2f9095bac..775e0d9353d5 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -349,6 +349,39 @@ static void tmio_mmc_transfer_data(struct tmio_mmc_host *host, /* * Transfer the data */ +#ifdef CONFIG_64BIT + if (host->pdata->flags & TMIO_MMC_64BIT_DATA_PORT) { + u64 *buf64 = (u64 *)buf; + u64 data = 0; + + if (count >= 8) { + if (is_read) + sd_ctrl_read64_rep(host, CTL_SD_DATA_PORT, + buf64, count >> 3); + else + sd_ctrl_write64_rep(host, CTL_SD_DATA_PORT, + buf64, count >> 3); + } + + /* if count was multiple of 8 */ + if (!(count & 0x7)) + return; + + buf64 += count >> 3; + count %= 8; + + if (is_read) { + sd_ctrl_read64_rep(host, CTL_SD_DATA_PORT, &data, 1); + memcpy(buf64, &data, count); + } else { + memcpy(&data, buf64, count); + sd_ctrl_write64_rep(host, CTL_SD_DATA_PORT, &data, 1); + } + + return; + } +#endif + if (host->pdata->flags & TMIO_MMC_32BIT_DATA_PORT) { u32 data = 0; u32 *buf32 = (u32 *)buf; diff --git a/include/linux/platform_data/tmio.h b/include/linux/platform_data/tmio.h index b060124ba1ae..426291713b83 100644 --- a/include/linux/platform_data/tmio.h +++ b/include/linux/platform_data/tmio.h @@ -47,6 +47,9 @@ /* Some controllers have a CBSY bit */ #define TMIO_MMC_HAVE_CBSY BIT(11) +/* Some controllers have a 64-bit wide data port register */ +#define TMIO_MMC_64BIT_DATA_PORT BIT(12) + struct tmio_mmc_data { void *chan_priv_tx; void *chan_priv_rx; -- cgit v1.2.3 From d2e6fb2c31a07f34e5e7533df11431cb0d2ecf9f Mon Sep 17 00:00:00 2001 From: Ricky Wu Date: Tue, 12 Aug 2025 11:08:11 +0800 Subject: misc: rtsx: usb card reader: add OCP support This patch adds support for Over Current Protection (OCP) to the Realtek USB card reader driver. The OCP mechanism protects the hardware by detecting and handling current overload conditions. This implementation includes: - Register configurations to enable OCP monitoring. - Handling of OCP interrupt events and associated error reporting. - Card power management changes in response to OCP triggers. This enhancement improves the robustness of the driver when operating in environments where electrical anomalies may occur, particularly with SD and MS card interfaces. Signed-off-by: Ricky Wu Link: https://lore.kernel.org/r/20250812030811.2426112-1-ricky_wu@realtek.com Signed-off-by: Ulf Hansson --- drivers/memstick/host/rtsx_usb_ms.c | 5 ++++- drivers/misc/cardreader/rtsx_usb.c | 7 +++++++ drivers/mmc/host/rtsx_usb_sdmmc.c | 33 ++++++++++++++++++++++++++++++--- include/linux/rtsx_usb.h | 11 +++++++++++ 4 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/memstick/host/rtsx_usb_ms.c b/drivers/memstick/host/rtsx_usb_ms.c index 3878136227e4..9389e9643c24 100644 --- a/drivers/memstick/host/rtsx_usb_ms.c +++ b/drivers/memstick/host/rtsx_usb_ms.c @@ -216,7 +216,10 @@ static int ms_power_off(struct rtsx_usb_ms *host) rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_CLK_EN, MS_CLK_EN, 0); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_OE, MS_OUTPUT_EN, 0); - + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, + POWER_MASK, POWER_OFF); + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, + POWER_MASK | LDO3318_PWR_MASK, POWER_OFF | LDO_SUSPEND); err = rtsx_usb_send_cmd(ucr, MODE_C, 100); if (err < 0) return err; diff --git a/drivers/misc/cardreader/rtsx_usb.c b/drivers/misc/cardreader/rtsx_usb.c index d007a4455ce5..1830e9ed2521 100644 --- a/drivers/misc/cardreader/rtsx_usb.c +++ b/drivers/misc/cardreader/rtsx_usb.c @@ -552,6 +552,10 @@ static int rtsx_usb_reset_chip(struct rtsx_ucr *ucr) ret = rtsx_usb_send_cmd(ucr, MODE_C, 100); if (ret) return ret; + /* config OCP */ + rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_DETECT_EN, MS_OCP_DETECT_EN); + rtsx_usb_write_register(ucr, OCPPARA1, 0xF0, 0x50); + rtsx_usb_write_register(ucr, OCPPARA2, 0x7, 0x3); /* config non-crystal mode */ rtsx_usb_read_register(ucr, CFG_MODE, &val); @@ -722,6 +726,9 @@ static int rtsx_usb_suspend(struct usb_interface *intf, pm_message_t message) if (val & (SD_CD | MS_CD)) { device_for_each_child(&intf->dev, NULL, rtsx_usb_resume_child); return -EAGAIN; + } else { + /* if the card does not exists, clear OCP status */ + rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_CLEAR, MS_OCP_CLEAR); } } else { /* There is an ongoing operation*/ diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c index c5f6b9df066b..e1ed39c657c3 100644 --- a/drivers/mmc/host/rtsx_usb_sdmmc.c +++ b/drivers/mmc/host/rtsx_usb_sdmmc.c @@ -48,7 +48,7 @@ struct rtsx_usb_sdmmc { bool ddr_mode; unsigned char power_mode; - + u16 ocp_stat; #ifdef RTSX_USB_USE_LEDS_CLASS struct led_classdev led; char led_name[32]; @@ -785,6 +785,9 @@ static int sdmmc_get_cd(struct mmc_host *mmc) mutex_unlock(&ucr->dev_mutex); + /* get OCP status */ + host->ocp_stat = (val >> 4) & 0x03; + /* Treat failed detection as non-exist */ if (err) goto no_card; @@ -795,6 +798,11 @@ static int sdmmc_get_cd(struct mmc_host *mmc) } no_card: + /* clear OCP status */ + if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) { + rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_CLEAR, MS_OCP_CLEAR); + host->ocp_stat = 0; + } host->card_exist = false; return 0; } @@ -818,7 +826,11 @@ static void sdmmc_request(struct mmc_host *mmc, struct mmc_request *mrq) cmd->error = -ENOMEDIUM; goto finish_detect_card; } - + /* check OCP stat */ + if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) { + cmd->error = -ENOMEDIUM; + goto finish_detect_card; + } mutex_lock(&ucr->dev_mutex); mutex_lock(&host->host_mutex); @@ -952,6 +964,10 @@ static int sd_power_on(struct rtsx_usb_sdmmc *host) struct rtsx_ucr *ucr = host->ucr; int err; + if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) { + dev_dbg(sdmmc_dev(host), "over current\n"); + return -EIO; + } dev_dbg(sdmmc_dev(host), "%s\n", __func__); rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_SELECT, 0x07, SD_MOD_SEL); @@ -977,9 +993,19 @@ static int sd_power_on(struct rtsx_usb_sdmmc *host) usleep_range(800, 1000); + rtsx_usb_init_cmd(ucr); + /* WA OCP issue: after OCP, there were problems with reopen card power */ + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, POWER_MASK, POWER_ON); + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, FPDCTL, SSC_POWER_MASK, SSC_POWER_DOWN); + err = rtsx_usb_send_cmd(ucr, MODE_C, 100); + if (err) + return err; + msleep(20); + rtsx_usb_write_register(ucr, FPDCTL, SSC_POWER_MASK, SSC_POWER_ON); + usleep_range(180, 200); rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, - POWER_MASK|LDO3318_PWR_MASK, POWER_ON|LDO_ON); + LDO3318_PWR_MASK, LDO_ON); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_OE, SD_OUTPUT_EN, SD_OUTPUT_EN); @@ -1332,6 +1358,7 @@ static void rtsx_usb_init_host(struct rtsx_usb_sdmmc *host) mmc->max_req_size = 524288; host->power_mode = MMC_POWER_OFF; + host->ocp_stat = 0; } static int rtsx_usb_sdmmc_drv_probe(struct platform_device *pdev) diff --git a/include/linux/rtsx_usb.h b/include/linux/rtsx_usb.h index f267a06c6b1e..276b509c03e3 100644 --- a/include/linux/rtsx_usb.h +++ b/include/linux/rtsx_usb.h @@ -99,6 +99,17 @@ extern int rtsx_usb_card_exclusive_check(struct rtsx_ucr *ucr, int card); #define CD_MASK (SD_CD | MS_CD | XD_CD) #define SD_WP 0x08 +/* OCPCTL */ +#define MS_OCP_DETECT_EN 0x08 +#define MS_OCP_INT_EN 0x04 +#define MS_OCP_INT_CLR 0x02 +#define MS_OCP_CLEAR 0x01 + +/* OCPSTAT */ +#define MS_OCP_DETECT 0x80 +#define MS_OCP_NOW 0x02 +#define MS_OCP_EVER 0x01 + /* reader command field offset & parameters */ #define READ_REG_CMD 0 #define WRITE_REG_CMD 1 -- cgit v1.2.3 From 99e6cc80d5ce5af5781f84d20e4f3478d66ee8ee Mon Sep 17 00:00:00 2001 From: Benoît Monin Date: Mon, 18 Aug 2025 16:02:50 +0200 Subject: mmc: core: add mmc_read_tuning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide a function to the MMC hosts to read some blocks of data as part of their tuning. This function only returns the status of the read operation, not the data read. Signed-off-by: Benoît Monin Link: https://lore.kernel.org/r/20250818-mobileye-emmc-for-upstream-4-v4-5-34ecb3995e96@bootlin.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc_ops.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mmc/host.h | 1 + 2 files changed, 73 insertions(+) (limited to 'include') diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index 66283825513c..a952cc8265af 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -1077,3 +1077,75 @@ int mmc_sanitize(struct mmc_card *card, unsigned int timeout_ms) return err; } EXPORT_SYMBOL_GPL(mmc_sanitize); + +/** + * mmc_read_tuning() - read data blocks from the mmc + * @host: mmc host doing the read + * @blksz: data block size + * @blocks: number of blocks to read + * + * Read one or more blocks of data from the beginning of the mmc. This is a + * low-level helper for tuning operation. It is assumed that CMD23 can be used + * for multi-block read if the host supports it. + * + * Note: Allocate and free a temporary buffer to store the data read. The data + * is not available outside of the function, only the status of the read + * operation. + * + * Return: 0 in case of success, otherwise -EIO / -ENOMEM / -E2BIG + */ +int mmc_read_tuning(struct mmc_host *host, unsigned int blksz, unsigned int blocks) +{ + struct mmc_request mrq = {}; + struct mmc_command sbc = {}; + struct mmc_command cmd = {}; + struct mmc_command stop = {}; + struct mmc_data data = {}; + struct scatterlist sg; + void *buf; + unsigned int len; + + if (blocks > 1) { + if (mmc_host_can_cmd23(host)) { + mrq.sbc = &sbc; + sbc.opcode = MMC_SET_BLOCK_COUNT; + sbc.arg = blocks; + sbc.flags = MMC_RSP_R1 | MMC_CMD_AC; + } + cmd.opcode = MMC_READ_MULTIPLE_BLOCK; + mrq.stop = &stop; + stop.opcode = MMC_STOP_TRANSMISSION; + stop.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC; + } else { + cmd.opcode = MMC_READ_SINGLE_BLOCK; + } + + mrq.cmd = &cmd; + cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC; + + mrq.data = &data; + data.flags = MMC_DATA_READ; + data.blksz = blksz; + data.blocks = blocks; + data.blk_addr = 0; + data.sg = &sg; + data.sg_len = 1; + data.timeout_ns = 1000000000; + + if (check_mul_overflow(blksz, blocks, &len)) + return -E2BIG; + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + sg_init_one(&sg, buf, len); + + mmc_wait_for_req(host, &mrq); + kfree(buf); + + if (sbc.error || cmd.error || data.error) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_GPL(mmc_read_tuning); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 68f09a955a90..5ed5d203de23 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -743,5 +743,6 @@ int mmc_send_status(struct mmc_card *card, u32 *status); int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error); int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode); int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd); +int mmc_read_tuning(struct mmc_host *host, unsigned int blksz, unsigned int blocks); #endif /* LINUX_MMC_HOST_H */ -- cgit v1.2.3 From 730ff06d3f5cc2ce0348414b78c10528b767d4a3 Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Thu, 14 Aug 2025 07:04:10 -0700 Subject: net: mana: Use page pool fragments for RX buffers instead of full pages to improve memory efficiency. This patch enhances RX buffer handling in the mana driver by allocating pages from a page pool and slicing them into MTU-sized fragments, rather than dedicating a full page per packet. This approach is especially beneficial on systems with large base page sizes like 64KB. Key improvements: - Proper integration of page pool for RX buffer allocations. - MTU-sized buffer slicing to improve memory utilization. - Reduce overall per Rx queue memory footprint. - Automatic fallback to full-page buffers when: * Jumbo frames are enabled (MTU > PAGE_SIZE / 2). * The XDP path is active, to avoid complexities with fragment reuse. Testing on VMs with 64KB pages shows around 200% throughput improvement. Memory efficiency is significantly improved due to reduced wastage in page allocations. Example: We are now able to fit 35 rx buffers in a single 64kb page for MTU size of 1500, instead of 1 rx buffer per page previously. Tested: - iperf3, iperf2, and nttcp benchmarks. - Jumbo frames with MTU 9000. - Native XDP programs (XDP_PASS, XDP_DROP, XDP_TX, XDP_REDIRECT) for testing the XDP path in driver. - Memory leak detection (kmemleak). - Driver load/unload, reboot, and stress scenarios. Reviewed-by: Jacob Keller Reviewed-by: Saurabh Sengar Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20250814140410.GA22089@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_bpf.c | 46 +++++++- drivers/net/ethernet/microsoft/mana/mana_en.c | 151 +++++++++++++++++-------- include/net/mana/mana.h | 4 + 3 files changed, 150 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c index d30721d4516f..7697c9b52ed3 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c +++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c @@ -174,6 +174,7 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, struct mana_port_context *apc = netdev_priv(ndev); struct bpf_prog *old_prog; struct gdma_context *gc; + int err; gc = apc->ac->gdma_dev->gdma_context; @@ -195,11 +196,45 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, */ apc->bpf_prog = prog; - if (old_prog) - bpf_prog_put(old_prog); + if (apc->port_is_up) { + /* Re-create rxq's after xdp prog was loaded or unloaded. + * Ex: re create rxq's to switch from full pages to smaller + * size page fragments when xdp prog is unloaded and + * vice-versa. + */ + + /* Pre-allocate buffers to prevent failure in mana_attach */ + err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "XDP: Insufficient memory for tx/rx re-config"); + return err; + } + + err = mana_detach(ndev, false); + if (err) { + netdev_err(ndev, + "mana_detach failed at xdp set: %d\n", err); + NL_SET_ERR_MSG_MOD(extack, + "XDP: Re-config failed at detach"); + goto err_dealloc_rxbuffs; + } + + err = mana_attach(ndev); + if (err) { + netdev_err(ndev, + "mana_attach failed at xdp set: %d\n", err); + NL_SET_ERR_MSG_MOD(extack, + "XDP: Re-config failed at attach"); + goto err_dealloc_rxbuffs; + } - if (apc->port_is_up) mana_chn_setxdp(apc, prog); + mana_pre_dealloc_rxbufs(apc); + } + + if (old_prog) + bpf_prog_put(old_prog); if (prog) ndev->max_mtu = MANA_XDP_MTU_MAX; @@ -207,6 +242,11 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, ndev->max_mtu = gc->adapter_mtu - ETH_HLEN; return 0; + +err_dealloc_rxbuffs: + apc->bpf_prog = old_prog; + mana_pre_dealloc_rxbufs(apc); + return err; } int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 550843e2164b..f4fc86f20213 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -57,6 +57,15 @@ static bool mana_en_need_log(struct mana_port_context *apc, int err) return true; } +static void mana_put_rx_page(struct mana_rxq *rxq, struct page *page, + bool from_pool) +{ + if (from_pool) + page_pool_put_full_page(rxq->page_pool, page, false); + else + put_page(page); +} + /* Microsoft Azure Network Adapter (MANA) functions */ static int mana_open(struct net_device *ndev) @@ -630,21 +639,40 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da) } /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */ -static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size, - u32 *headroom) +static void mana_get_rxbuf_cfg(struct mana_port_context *apc, + int mtu, u32 *datasize, u32 *alloc_size, + u32 *headroom, u32 *frag_count) { - if (mtu > MANA_XDP_MTU_MAX) - *headroom = 0; /* no support for XDP */ - else - *headroom = XDP_PACKET_HEADROOM; + u32 len, buf_size; - *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); + /* Calculate datasize first (consistent across all cases) */ + *datasize = mtu + ETH_HLEN; - /* Using page pool in this case, so alloc_size is PAGE_SIZE */ - if (*alloc_size < PAGE_SIZE) - *alloc_size = PAGE_SIZE; + /* For xdp and jumbo frames make sure only one packet fits per page */ + if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) { + if (mana_xdp_get(apc)) { + *headroom = XDP_PACKET_HEADROOM; + *alloc_size = PAGE_SIZE; + } else { + *headroom = 0; /* no support for XDP */ + *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + + *headroom); + } - *datasize = mtu + ETH_HLEN; + *frag_count = 1; + return; + } + + /* Standard MTU case - optimize for multiple packets per page */ + *headroom = 0; + + /* Calculate base buffer size needed */ + len = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); + buf_size = ALIGN(len, MANA_RX_FRAG_ALIGNMENT); + + /* Calculate how many packets can fit in a page */ + *frag_count = PAGE_SIZE / buf_size; + *alloc_size = buf_size; } int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_queues) @@ -656,8 +684,9 @@ int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_qu void *va; int i; - mana_get_rxbuf_cfg(new_mtu, &mpc->rxbpre_datasize, - &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom); + mana_get_rxbuf_cfg(mpc, new_mtu, &mpc->rxbpre_datasize, + &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom, + &mpc->rxbpre_frag_count); dev = mpc->ac->gdma_dev->gdma_context->dev; @@ -1842,8 +1871,11 @@ drop_xdp: drop: if (from_pool) { - page_pool_recycle_direct(rxq->page_pool, - virt_to_head_page(buf_va)); + if (rxq->frag_count == 1) + page_pool_recycle_direct(rxq->page_pool, + virt_to_head_page(buf_va)); + else + page_pool_free_va(rxq->page_pool, buf_va, true); } else { WARN_ON_ONCE(rxq->xdp_save_va); /* Save for reuse */ @@ -1859,33 +1891,46 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev, dma_addr_t *da, bool *from_pool) { struct page *page; + u32 offset; void *va; - *from_pool = false; - /* Reuse XDP dropped page if available */ - if (rxq->xdp_save_va) { - va = rxq->xdp_save_va; - rxq->xdp_save_va = NULL; - } else { - page = page_pool_dev_alloc_pages(rxq->page_pool); - if (!page) + /* Don't use fragments for jumbo frames or XDP where it's 1 fragment + * per page. + */ + if (rxq->frag_count == 1) { + /* Reuse XDP dropped page if available */ + if (rxq->xdp_save_va) { + va = rxq->xdp_save_va; + page = virt_to_head_page(va); + rxq->xdp_save_va = NULL; + } else { + page = page_pool_dev_alloc_pages(rxq->page_pool); + if (!page) + return NULL; + + *from_pool = true; + va = page_to_virt(page); + } + + *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize, + DMA_FROM_DEVICE); + if (dma_mapping_error(dev, *da)) { + mana_put_rx_page(rxq, page, *from_pool); return NULL; + } - *from_pool = true; - va = page_to_virt(page); + return va; } - *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize, - DMA_FROM_DEVICE); - if (dma_mapping_error(dev, *da)) { - if (*from_pool) - page_pool_put_full_page(rxq->page_pool, page, false); - else - put_page(virt_to_head_page(va)); - + page = page_pool_dev_alloc_frag(rxq->page_pool, &offset, + rxq->alloc_size); + if (!page) return NULL; - } + + va = page_to_virt(page) + offset; + *da = page_pool_get_dma_addr(page) + offset + rxq->headroom; + *from_pool = true; return va; } @@ -1902,9 +1947,9 @@ static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq, va = mana_get_rxfrag(rxq, dev, &da, &from_pool); if (!va) return; - - dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize, - DMA_FROM_DEVICE); + if (!rxoob->from_pool || rxq->frag_count == 1) + dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize, + DMA_FROM_DEVICE); *old_buf = rxoob->buf_va; *old_fp = rxoob->from_pool; @@ -2315,15 +2360,15 @@ static void mana_destroy_rxq(struct mana_port_context *apc, if (!rx_oob->buf_va) continue; - dma_unmap_single(dev, rx_oob->sgl[0].address, - rx_oob->sgl[0].size, DMA_FROM_DEVICE); - page = virt_to_head_page(rx_oob->buf_va); - if (rx_oob->from_pool) - page_pool_put_full_page(rxq->page_pool, page, false); - else - put_page(page); + if (rxq->frag_count == 1 || !rx_oob->from_pool) { + dma_unmap_single(dev, rx_oob->sgl[0].address, + rx_oob->sgl[0].size, DMA_FROM_DEVICE); + mana_put_rx_page(rxq, page, rx_oob->from_pool); + } else { + page_pool_free_va(rxq->page_pool, rx_oob->buf_va, true); + } rx_oob->buf_va = NULL; } @@ -2429,11 +2474,22 @@ static int mana_create_page_pool(struct mana_rxq *rxq, struct gdma_context *gc) struct page_pool_params pprm = {}; int ret; - pprm.pool_size = mpc->rx_queue_size; + pprm.pool_size = mpc->rx_queue_size / rxq->frag_count + 1; pprm.nid = gc->numa_node; pprm.napi = &rxq->rx_cq.napi; pprm.netdev = rxq->ndev; pprm.order = get_order(rxq->alloc_size); + pprm.queue_idx = rxq->rxq_idx; + pprm.dev = gc->dev; + + /* Let the page pool do the dma map when page sharing with multiple + * fragments enabled for rx buffers. + */ + if (rxq->frag_count > 1) { + pprm.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pprm.max_len = PAGE_SIZE; + pprm.dma_dir = DMA_FROM_DEVICE; + } rxq->page_pool = page_pool_create(&pprm); @@ -2472,9 +2528,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, rxq->rxq_idx = rxq_idx; rxq->rxobj = INVALID_MANA_HANDLE; - mana_get_rxbuf_cfg(ndev->mtu, &rxq->datasize, &rxq->alloc_size, - &rxq->headroom); - + mana_get_rxbuf_cfg(apc, ndev->mtu, &rxq->datasize, &rxq->alloc_size, + &rxq->headroom, &rxq->frag_count); /* Create page pool for RX queue */ err = mana_create_page_pool(rxq, gc); if (err) { diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index e1030a7d2daa..0921485565c0 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -65,6 +65,8 @@ enum TRI_STATE { #define MANA_STATS_RX_COUNT 5 #define MANA_STATS_TX_COUNT 11 +#define MANA_RX_FRAG_ALIGNMENT 64 + struct mana_stats_rx { u64 packets; u64 bytes; @@ -328,6 +330,7 @@ struct mana_rxq { u32 datasize; u32 alloc_size; u32 headroom; + u32 frag_count; mana_handle_t rxobj; @@ -510,6 +513,7 @@ struct mana_port_context { u32 rxbpre_datasize; u32 rxbpre_alloc_size; u32 rxbpre_headroom; + u32 rxbpre_frag_count; struct bpf_prog *bpf_prog; -- cgit v1.2.3 From c3f0c02997c7f8489fec259e28e0e04e9811edac Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 18 Aug 2025 08:40:26 -0700 Subject: net: Add skb_dstref_steal and skb_dstref_restore Going forward skb_dst_set will assert that skb dst_entry is empty during skb_dst_set to prevent potential leaks. There are few places that still manually manage dst_entry not using the helpers. Convert them to the following new helpers: - skb_dstref_steal that resets dst_entry and returns previous dst_entry value - skb_dstref_restore that restores dst_entry previously reset via skb_dstref_steal Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250818154032.3173645-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 14b923ddb6df..7538ca507ee9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1159,6 +1159,38 @@ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } +/** + * skb_dstref_steal() - return current dst_entry value and clear it + * @skb: buffer + * + * Resets skb dst_entry without adjusting its reference count. Useful in + * cases where dst_entry needs to be temporarily reset and restored. + * Note that the returned value cannot be used directly because it + * might contain SKB_DST_NOREF bit. + * + * When in doubt, prefer skb_dst_drop() over skb_dstref_steal() to correctly + * handle dst_entry reference counting. + * + * Returns: original skb dst_entry. + */ +static inline unsigned long skb_dstref_steal(struct sk_buff *skb) +{ + unsigned long refdst = skb->_skb_refdst; + + skb->_skb_refdst = 0; + return refdst; +} + +/** + * skb_dstref_restore() - restore skb dst_entry removed via skb_dstref_steal() + * @skb: buffer + * @refdst: dst entry from a call to skb_dstref_steal() + */ +static inline void skb_dstref_restore(struct sk_buff *skb, unsigned long refdst) +{ + skb->_skb_refdst = refdst; +} + /** * skb_dst_set - sets skb dst * @skb: buffer -- cgit v1.2.3 From a890348adcc993f48d1ae38f1174dc8de4c3c5ac Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 18 Aug 2025 08:40:32 -0700 Subject: net: Add skb_dst_check_unset To prevent dst_entry leaks, add warning when the non-NULL dst_entry is rewritten. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250818154032.3173645-8-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7538ca507ee9..ca8be45dd8be 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1159,6 +1159,12 @@ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } +static inline void skb_dst_check_unset(struct sk_buff *skb) +{ + DEBUG_NET_WARN_ON_ONCE((skb->_skb_refdst & SKB_DST_PTRMASK) && + !(skb->_skb_refdst & SKB_DST_NOREF)); +} + /** * skb_dstref_steal() - return current dst_entry value and clear it * @skb: buffer @@ -1188,6 +1194,7 @@ static inline unsigned long skb_dstref_steal(struct sk_buff *skb) */ static inline void skb_dstref_restore(struct sk_buff *skb, unsigned long refdst) { + skb_dst_check_unset(skb); skb->_skb_refdst = refdst; } @@ -1201,6 +1208,7 @@ static inline void skb_dstref_restore(struct sk_buff *skb, unsigned long refdst) */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { + skb_dst_check_unset(skb); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst; } @@ -1217,6 +1225,7 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) */ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { + skb_dst_check_unset(skb); WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; -- cgit v1.2.3 From 68889dfd547bd8eabc5a98b58475d7b901cf5129 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:09 +0000 Subject: mptcp: Fix up subflow's memcg when CONFIG_SOCK_CGROUP_DATA=n. When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets sk->sk_memcg based on the current task. MPTCP subflow socket creation is triggered from userspace or an in-kernel worker. In the latter case, sk->sk_memcg is not what we want. So, we fix it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup(). Although the code is placed under #ifdef CONFIG_MEMCG, it is buried under #ifdef CONFIG_SOCK_CGROUP_DATA. The two configs are orthogonal. If CONFIG_MEMCG is enabled without CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged correctly. Let's move the code out of the wrong ifdef guard. Note that sk->sk_memcg is freed in sk_prot_free() and the parent sk holds the refcnt of memcg->css here, so we don't need to use css_tryget(). Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Matthieu Baerts (NGI0) Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 13 +++++++++++++ net/mptcp/subflow.c | 11 +++-------- 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 785173aa0739..25921fbec685 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1604,6 +1604,7 @@ extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); +void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk); #if BITS_PER_LONG < 64 static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg) @@ -1661,6 +1662,11 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg); #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; static inline void mem_cgroup_sk_free(struct sock *sk) { }; + +static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) +{ +} + static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8dd7fbed5a94..46713b9ece06 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5024,6 +5024,19 @@ void mem_cgroup_sk_free(struct sock *sk) css_put(&sk->sk_memcg->css); } +void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) +{ + if (sk->sk_memcg == newsk->sk_memcg) + return; + + mem_cgroup_sk_free(newsk); + + if (sk->sk_memcg) + css_get(&sk->sk_memcg->css); + + newsk->sk_memcg = sk->sk_memcg; +} + /** * mem_cgroup_charge_skmem - charge socket memory * @memcg: memcg to charge diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3f1b62a9fe88..c8a7e4b59db1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1717,19 +1717,14 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) /* only the additional subflows created by kworkers have to be modified */ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != cgroup_id(sock_cgroup_ptr(child_skcd))) { -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = parent->sk_memcg; - - mem_cgroup_sk_free(child); - if (memcg && css_tryget(&memcg->css)) - child->sk_memcg = memcg; -#endif /* CONFIG_MEMCG */ - cgroup_sk_free(child_skcd); *child_skcd = *parent_skcd; cgroup_sk_clone(child_skcd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ + + if (mem_cgroup_sockets_enabled) + mem_cgroup_sk_inherit(parent, child); } static void mptcp_subflow_ops_override(struct sock *ssk) -- cgit v1.2.3 From f7161b234f2ec7f18999009c4becc04eeb6b12a7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:14 +0000 Subject: net-memcg: Introduce mem_cgroup_from_sk(). We will store a flag in the lowest bit of sk->sk_memcg. Then, directly dereferencing sk->sk_memcg will be illegal, and we do not want to allow touching the raw sk->sk_memcg in many places. Let's introduce mem_cgroup_from_sk(). Other places accessing the raw sk->sk_memcg will be converted later. Note that we cannot define the helper as an inline function in memcontrol.h as we cannot access any fields of struct sock there due to circular dependency, so it is placed in sock.h. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 12 ++++++++++++ mm/memcontrol.c | 13 +++++++++---- net/ipv4/inet_connection_sock.c | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index c8a4b283df6f..811f95ea8d00 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2594,6 +2594,18 @@ static inline gfp_t gfp_memcg_charge(void) return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return sk->sk_memcg; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return NULL; +} +#endif + static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46713b9ece06..d8a52d1d08fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5020,19 +5020,24 @@ out: void mem_cgroup_sk_free(struct sock *sk) { - if (sk->sk_memcg) - css_put(&sk->sk_memcg->css); + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + + if (memcg) + css_put(&memcg->css); } void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) { + struct mem_cgroup *memcg; + if (sk->sk_memcg == newsk->sk_memcg) return; mem_cgroup_sk_free(newsk); - if (sk->sk_memcg) - css_get(&sk->sk_memcg->css); + memcg = mem_cgroup_from_sk(sk); + if (memcg) + css_get(&memcg->css); newsk->sk_memcg = sk->sk_memcg; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 724bd9ed6cd4..93569bbe00f4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -718,7 +718,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) lock_sock(newsk); mem_cgroup_sk_alloc(newsk); - if (newsk->sk_memcg) { + if (mem_cgroup_from_sk(newsk)) { /* The socket has not been accepted yet, no need * to look at newsk->sk_wmem_queued. */ -- cgit v1.2.3 From 43049b0db03823c2cd003ca7d3dddcd3924da8dc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:15 +0000 Subject: net-memcg: Introduce mem_cgroup_sk_enabled(). The socket memcg feature is enabled by a static key and only works for non-root cgroup. We check both conditions in many places. Let's factorise it as a helper function. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-8-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/proto_memory.h | 2 +- include/net/sock.h | 10 ++++++++++ include/net/tcp.h | 2 +- net/core/sock.c | 6 +++--- net/ipv4/tcp_output.c | 2 +- 5 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h index a6ab2f4f5e28..859e63de81c4 100644 --- a/include/net/proto_memory.h +++ b/include/net/proto_memory.h @@ -31,7 +31,7 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) if (!sk->sk_prot->memory_pressure) return false; - if (mem_cgroup_sockets_enabled && sk->sk_memcg && + if (mem_cgroup_sk_enabled(sk) && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; diff --git a/include/net/sock.h b/include/net/sock.h index 811f95ea8d00..3efdf680401d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2599,11 +2599,21 @@ static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) { return sk->sk_memcg; } + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk); +} #else static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) { return NULL; } + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return false; +} #endif static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) diff --git a/include/net/tcp.h b/include/net/tcp.h index 526a26e7a150..9f01b6be6444 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -275,7 +275,7 @@ extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { - if (mem_cgroup_sockets_enabled && sk->sk_memcg && + if (mem_cgroup_sk_enabled(sk) && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; diff --git a/net/core/sock.c b/net/core/sock.c index 000940ecf360..ab658fe23e1e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1032,7 +1032,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) bool charged; int pages; - if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) + if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) @@ -3271,7 +3271,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) { + if (mem_cgroup_sk_enabled(sk)) { memcg = sk->sk_memcg; charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); if (!charged) @@ -3398,7 +3398,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { sk_memory_allocated_sub(sk, amount); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) + if (mem_cgroup_sk_enabled(sk)) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_global_memory_pressure(sk) && diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index caf11920a878..37fb320e6f70 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3578,7 +3578,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) sk_forward_alloc_add(sk, amt << PAGE_SHIFT); sk_memory_allocated_add(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) + if (mem_cgroup_sk_enabled(sk)) mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } -- cgit v1.2.3 From bb178c6bc08525d758a57775458d644304011bf8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:16 +0000 Subject: net-memcg: Pass struct sock to mem_cgroup_sk_(un)?charge(). We will store a flag in the lowest bit of sk->sk_memcg. Then, we cannot pass the raw pointer to mem_cgroup_charge_skmem() and mem_cgroup_uncharge_skmem(). Let's pass struct sock to the functions. While at it, they are renamed to match other functions starting with mem_cgroup_sk_. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-9-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/memcontrol.h | 29 ++++++++++++++++++++++++----- mm/memcontrol.c | 18 +++++++++++------- net/core/sock.c | 24 +++++++++++------------- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp_output.c | 3 +-- 5 files changed, 48 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 25921fbec685..0837d3de3a68 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1596,15 +1596,16 @@ static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, - gfp_t gfp_mask); -void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) + void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk); +bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages, + gfp_t gfp_mask); +void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages); #if BITS_PER_LONG < 64 static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg) @@ -1660,13 +1661,31 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 -static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; -static inline void mem_cgroup_sk_free(struct sock *sk) { }; + +static inline void mem_cgroup_sk_alloc(struct sock *sk) +{ +} + +static inline void mem_cgroup_sk_free(struct sock *sk) +{ +} static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) { } +static inline bool mem_cgroup_sk_charge(const struct sock *sk, + unsigned int nr_pages, + gfp_t gfp_mask) +{ + return false; +} + +static inline void mem_cgroup_sk_uncharge(const struct sock *sk, + unsigned int nr_pages) +{ +} + static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d8a52d1d08fa..df3e9205c9e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5043,17 +5043,19 @@ void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) } /** - * mem_cgroup_charge_skmem - charge socket memory - * @memcg: memcg to charge + * mem_cgroup_sk_charge - charge socket memory + * @sk: socket in memcg to charge * @nr_pages: number of pages to charge * @gfp_mask: reclaim mode * * Charges @nr_pages to @memcg. Returns %true if the charge fit within * @memcg's configured limit, %false if it doesn't. */ -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, - gfp_t gfp_mask) +bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages, + gfp_t gfp_mask) { + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return memcg1_charge_skmem(memcg, nr_pages, gfp_mask); @@ -5066,12 +5068,14 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, } /** - * mem_cgroup_uncharge_skmem - uncharge socket memory - * @memcg: memcg to uncharge + * mem_cgroup_sk_uncharge - uncharge socket memory + * @sk: socket in memcg to uncharge * @nr_pages: number of pages to uncharge */ -void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages) { + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { memcg1_uncharge_skmem(memcg, nr_pages); return; diff --git a/net/core/sock.c b/net/core/sock.c index ab658fe23e1e..5537ca263858 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1041,8 +1041,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes) pages = sk_mem_pages(bytes); /* pre-charge to memcg */ - charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + charged = mem_cgroup_sk_charge(sk, pages, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; @@ -1054,7 +1054,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); - mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); + mem_cgroup_sk_uncharge(sk, pages); return -ENOMEM; } sk_forward_alloc_add(sk, pages << PAGE_SHIFT); @@ -3263,17 +3263,16 @@ EXPORT_SYMBOL(sk_wait_data); */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { + bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; - struct mem_cgroup *memcg = NULL; - bool charged = false; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (mem_cgroup_sk_enabled(sk)) { - memcg = sk->sk_memcg; - charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); + memcg_enabled = true; + charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); if (!charged) goto suppress_allocation; } @@ -3347,10 +3346,9 @@ suppress_allocation: */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ - if (memcg && !charged) { - mem_cgroup_charge_skmem(memcg, amt, - gfp_memcg_charge() | __GFP_NOFAIL); - } + if (memcg_enabled && !charged) + mem_cgroup_sk_charge(sk, amt, + gfp_memcg_charge() | __GFP_NOFAIL); return 1; } } @@ -3360,7 +3358,7 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); if (charged) - mem_cgroup_uncharge_skmem(memcg, amt); + mem_cgroup_sk_uncharge(sk, amt); return 0; } @@ -3399,7 +3397,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); if (mem_cgroup_sk_enabled(sk)) - mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + mem_cgroup_sk_uncharge(sk, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 93569bbe00f4..0ef1eacd539d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -727,7 +727,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) } if (amt) - mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp); + mem_cgroup_sk_charge(newsk, amt, gfp); kmem_cache_charge(newsk, gfp); release_sock(newsk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 37fb320e6f70..dfbac0876d96 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3579,8 +3579,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) sk_memory_allocated_add(sk, amt); if (mem_cgroup_sk_enabled(sk)) - mem_cgroup_charge_skmem(sk->sk_memcg, amt, - gfp_memcg_charge() | __GFP_NOFAIL); + mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); } /* Send a FIN. The caller locks the socket for us. -- cgit v1.2.3 From b2ffd10cddde47cc6830e4981e91e3215def62b1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:17 +0000 Subject: net-memcg: Pass struct sock to mem_cgroup_sk_under_memory_pressure(). We will store a flag in the lowest bit of sk->sk_memcg. Then, we cannot pass the raw pointer to mem_cgroup_under_socket_pressure(). Let's pass struct sock to it and rename the function to match other functions starting with mem_cgroup_sk_. Note that the helper is moved to sock.h to use mem_cgroup_from_sk(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-10-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/memcontrol.h | 18 ------------------ include/net/proto_memory.h | 2 +- include/net/sock.h | 22 ++++++++++++++++++++++ include/net/tcp.h | 2 +- 4 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0837d3de3a68..fb27e3d2fdac 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1642,19 +1642,6 @@ static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg) } #endif -static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) -{ -#ifdef CONFIG_MEMCG_V1 - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - return !!memcg->tcpmem_pressure; -#endif /* CONFIG_MEMCG_V1 */ - do { - if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) - return true; - } while ((memcg = parent_mem_cgroup(memcg))); - return false; -} - int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); @@ -1686,11 +1673,6 @@ static inline void mem_cgroup_sk_uncharge(const struct sock *sk, { } -static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) -{ - return false; -} - static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h index 859e63de81c4..8e91a8fa31b5 100644 --- a/include/net/proto_memory.h +++ b/include/net/proto_memory.h @@ -32,7 +32,7 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) return false; if (mem_cgroup_sk_enabled(sk) && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) + mem_cgroup_sk_under_memory_pressure(sk)) return true; return !!READ_ONCE(*sk->sk_prot->memory_pressure); diff --git a/include/net/sock.h b/include/net/sock.h index 3efdf680401d..3bc4d566f7d0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2604,6 +2604,23 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk) { return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk); } + +static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) +{ + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + +#ifdef CONFIG_MEMCG_V1 + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return !!memcg->tcpmem_pressure; +#endif /* CONFIG_MEMCG_V1 */ + + do { + if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) + return true; + } while ((memcg = parent_mem_cgroup(memcg))); + + return false; +} #else static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) { @@ -2614,6 +2631,11 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk) { return false; } + +static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) +{ + return false; +} #endif static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) diff --git a/include/net/tcp.h b/include/net/tcp.h index 9f01b6be6444..2936b8175950 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -276,7 +276,7 @@ extern unsigned long tcp_memory_pressure; static inline bool tcp_under_memory_pressure(const struct sock *sk) { if (mem_cgroup_sk_enabled(sk) && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) + mem_cgroup_sk_under_memory_pressure(sk)) return true; return READ_ONCE(tcp_memory_pressure); -- cgit v1.2.3 From bf64002c94fc330b996bc438f3d1b6bd3d781659 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:18 +0000 Subject: net: Define sk_memcg under CONFIG_MEMCG. Except for sk_clone_lock(), all accesses to sk->sk_memcg is done under CONFIG_MEMCG. As a bonus, let's define sk->sk_memcg under CONFIG_MEMCG. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-11-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 ++ net/core/sock.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 3bc4d566f7d0..1c49ea13af4a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -443,7 +443,9 @@ struct sock { __cacheline_group_begin(sock_read_rxtx); int sk_err; struct socket *sk_socket; +#ifdef CONFIG_MEMCG struct mem_cgroup *sk_memcg; +#endif #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif diff --git a/net/core/sock.c b/net/core/sock.c index 5537ca263858..ab6953d295df 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2512,8 +2512,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); +#ifdef CONFIG_MEMCG /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; +#endif cgroup_sk_clone(&newsk->sk_cgrp_data); @@ -4452,7 +4454,9 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); +#ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); +#endif CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); -- cgit v1.2.3 From b5940feda3dc7a12133c6589e463d2b3b6c7fe96 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 19 Aug 2025 08:43:49 -0700 Subject: scsi: ufs: core: Reduce the size of struct ufshcd_lrb The size of the data structures that are used in the hot path matters for performance (IOPS). Hence this patch that reduces the size of struct ufshcd_lrb on 64-bit systems by 16 bytes. The size of this data structure is reduced from 152 to 136 bytes. Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20250819154356.2256952-1-bvanassche@acm.org Reviewed-by: Peter Wang Signed-off-by: Martin K. Petersen --- include/ufs/ufshcd.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 1d3943777584..30ff169878dc 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -167,13 +167,13 @@ struct ufs_pm_lvl_states { * @task_tag: Task tag of the command * @lun: LUN of the command * @intr_cmd: Interrupt command (doesn't participate in interrupt aggregation) + * @req_abort_skip: skip request abort task flag * @issue_time_stamp: time stamp for debug purposes (CLOCK_MONOTONIC) * @issue_time_stamp_local_clock: time stamp for debug purposes (local_clock) * @compl_time_stamp: time stamp for statistics (CLOCK_MONOTONIC) * @compl_time_stamp_local_clock: time stamp for debug purposes (local_clock) * @crypto_key_slot: the key slot to use for inline crypto (-1 if none) * @data_unit_num: the data unit number for the first block for inline crypto - * @req_abort_skip: skip request abort task flag */ struct ufshcd_lrb { struct utp_transfer_req_desc *utr_descriptor_ptr; @@ -193,6 +193,7 @@ struct ufshcd_lrb { int task_tag; u8 lun; /* UPIU LUN id field is only 8-bit wide */ bool intr_cmd; + bool req_abort_skip; ktime_t issue_time_stamp; u64 issue_time_stamp_local_clock; ktime_t compl_time_stamp; @@ -201,8 +202,6 @@ struct ufshcd_lrb { int crypto_key_slot; u64 data_unit_num; #endif - - bool req_abort_skip; }; /** -- cgit v1.2.3 From bf40785fa437c1752117df2edb3220e9c37d98a6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Aug 2025 13:54:24 -0700 Subject: sctp: Use HMAC-SHA1 and HMAC-SHA256 library for chunk authentication For SCTP chunk authentication, use the HMAC-SHA1 and HMAC-SHA256 library functions instead of crypto_shash. This is simpler and faster. There's no longer any need to pre-allocate 'crypto_shash' objects; the SCTP code now simply calls into the HMAC code directly. As part of this, make SCTP always support both HMAC-SHA1 and HMAC-SHA256. Previously, it only guaranteed support for HMAC-SHA1. However, HMAC-SHA256 tended to be supported too anyway, as it was supported if CONFIG_CRYPTO_SHA256 was enabled elsewhere in the kconfig. Acked-by: Xin Long Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250818205426.30222-4-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/sctp/auth.h | 17 ++--- include/net/sctp/constants.h | 4 -- include/net/sctp/structs.h | 5 -- net/sctp/Kconfig | 16 +++-- net/sctp/auth.c | 166 ++++++++----------------------------------- net/sctp/chunk.c | 3 +- net/sctp/sm_make_chunk.c | 2 +- net/sctp/sm_statefuns.c | 2 +- net/sctp/socket.c | 10 --- 9 files changed, 48 insertions(+), 177 deletions(-) (limited to 'include') diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index d4b3b2dcd15b..3d5879e08e78 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -22,16 +22,11 @@ struct sctp_endpoint; struct sctp_association; struct sctp_authkey; struct sctp_hmacalgo; -struct crypto_shash; -/* - * Define a generic struct that will hold all the info - * necessary for an HMAC transform - */ +/* Defines an HMAC algorithm supported by SCTP chunk authentication */ struct sctp_hmac { - __u16 hmac_id; /* one of the above ids */ - char *hmac_name; /* name for loading */ - __u16 hmac_len; /* length of the signature */ + __u16 hmac_id; /* one of SCTP_AUTH_HMAC_ID_* */ + __u16 hmac_len; /* length of the HMAC value in bytes */ }; /* This is generic structure that containst authentication bytes used @@ -78,9 +73,9 @@ int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, struct sctp_association *asoc, gfp_t gfp); int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp); -void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]); -struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); -struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); +const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); +const struct sctp_hmac * +sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs); int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 5859e0a16a58..8e0f4c4f7750 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -417,16 +417,12 @@ enum { SCTP_AUTH_HMAC_ID_RESERVED_0, SCTP_AUTH_HMAC_ID_SHA1, SCTP_AUTH_HMAC_ID_RESERVED_2, -#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE) SCTP_AUTH_HMAC_ID_SHA256, -#endif __SCTP_AUTH_HMAC_MAX }; #define SCTP_AUTH_HMAC_ID_MAX __SCTP_AUTH_HMAC_MAX - 1 #define SCTP_AUTH_NUM_HMACS __SCTP_AUTH_HMAC_MAX -#define SCTP_SHA1_SIG_SIZE 20 -#define SCTP_SHA256_SIG_SIZE 32 /* SCTP-AUTH, Section 3.2 * The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH chunks diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8a540ad9b509..6be6aec25731 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1329,11 +1329,6 @@ struct sctp_endpoint { /* rcvbuf acct. policy. */ __u32 rcvbuf_policy; - /* SCTP AUTH: array of the HMACs that will be allocated - * we need this per association so that we don't serialize - */ - struct crypto_shash **auth_hmacs; - /* SCTP-AUTH: hmacs for the endpoint encoded into parameter */ struct sctp_hmac_algo_param *auth_hmacs_list; diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 24d5a35ce894..09c77b4d161b 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -7,9 +7,9 @@ menuconfig IP_SCTP tristate "The SCTP Protocol" depends on INET depends on IPV6 || IPV6=n - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS select NET_CRC32C select NET_UDP_TUNNEL help @@ -79,15 +79,17 @@ config SCTP_COOKIE_HMAC_MD5 bool "Enable optional MD5 hmac cookie generation" help Enable optional MD5 hmac based SCTP cookie generation - select CRYPTO_HMAC if SCTP_COOKIE_HMAC_MD5 - select CRYPTO_MD5 if SCTP_COOKIE_HMAC_MD5 + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 config SCTP_COOKIE_HMAC_SHA1 bool "Enable optional SHA1 hmac cookie generation" help Enable optional SHA1 hmac based SCTP cookie generation - select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1 - select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1 + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_SHA1 config INET_SCTP_DIAG depends on INET_DIAG diff --git a/net/sctp/auth.c b/net/sctp/auth.c index c58fffc86a0c..82aad477590e 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -12,36 +12,37 @@ * Vlad Yasevich */ -#include +#include +#include #include #include -#include #include #include -static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { +static const struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { { /* id 0 is reserved. as all 0 */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA1, - .hmac_name = "hmac(sha1)", - .hmac_len = SCTP_SHA1_SIG_SIZE, + .hmac_len = SHA1_DIGEST_SIZE, }, { /* id 2 is reserved as well */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, }, -#if IS_ENABLED(CONFIG_CRYPTO_SHA256) { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, - .hmac_name = "hmac(sha256)", - .hmac_len = SCTP_SHA256_SIG_SIZE, + .hmac_len = SHA256_DIGEST_SIZE, } -#endif }; +static bool sctp_hmac_supported(__u16 hmac_id) +{ + return hmac_id < ARRAY_SIZE(sctp_hmac_list) && + sctp_hmac_list[hmac_id].hmac_len != 0; +} void sctp_auth_key_put(struct sctp_auth_bytes *key) { @@ -444,76 +445,7 @@ struct sctp_shared_key *sctp_auth_get_shkey( return NULL; } -/* - * Initialize all the possible digest transforms that we can use. Right - * now, the supported digests are SHA1 and SHA256. We do this here once - * because of the restrictiong that transforms may only be allocated in - * user context. This forces us to pre-allocated all possible transforms - * at the endpoint init time. - */ -int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) -{ - struct crypto_shash *tfm = NULL; - __u16 id; - - /* If the transforms are already allocated, we are done */ - if (ep->auth_hmacs) - return 0; - - /* Allocated the array of pointers to transorms */ - ep->auth_hmacs = kcalloc(SCTP_AUTH_NUM_HMACS, - sizeof(struct crypto_shash *), - gfp); - if (!ep->auth_hmacs) - return -ENOMEM; - - for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) { - - /* See is we support the id. Supported IDs have name and - * length fields set, so that we can allocated and use - * them. We can safely just check for name, for without the - * name, we can't allocate the TFM. - */ - if (!sctp_hmac_list[id].hmac_name) - continue; - - /* If this TFM has been allocated, we are all set */ - if (ep->auth_hmacs[id]) - continue; - - /* Allocate the ID */ - tfm = crypto_alloc_shash(sctp_hmac_list[id].hmac_name, 0, 0); - if (IS_ERR(tfm)) - goto out_err; - - ep->auth_hmacs[id] = tfm; - } - - return 0; - -out_err: - /* Clean up any successful allocations */ - sctp_auth_destroy_hmacs(ep->auth_hmacs); - ep->auth_hmacs = NULL; - return -ENOMEM; -} - -/* Destroy the hmac tfm array */ -void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]) -{ - int i; - - if (!auth_hmacs) - return; - - for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) { - crypto_free_shash(auth_hmacs[i]); - } - kfree(auth_hmacs); -} - - -struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) +const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) { return &sctp_hmac_list[hmac_id]; } @@ -521,7 +453,8 @@ struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) /* Get an hmac description information that we can use to build * the AUTH chunk */ -struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) +const struct sctp_hmac * +sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) { struct sctp_hmac_algo_param *hmacs; __u16 n_elt; @@ -543,26 +476,10 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) sizeof(struct sctp_paramhdr)) >> 1; for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); - - /* Check the id is in the supported range. And - * see if we support the id. Supported IDs have name and - * length fields set, so that we can allocate and use - * them. We can safely just check for name, for without the - * name, we can't allocate the TFM. - */ - if (id > SCTP_AUTH_HMAC_ID_MAX || - !sctp_hmac_list[id].hmac_name) { - id = 0; - continue; - } - - break; + if (sctp_hmac_supported(id)) + return &sctp_hmac_list[id]; } - - if (id == 0) - return NULL; - - return &sctp_hmac_list[id]; + return NULL; } static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id) @@ -606,7 +523,6 @@ int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs) { - struct sctp_endpoint *ep; __u16 id; int i; int n_params; @@ -617,16 +533,9 @@ void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, n_params = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; - ep = asoc->ep; for (i = 0; i < n_params; i++) { id = ntohs(hmacs->hmac_ids[i]); - - /* Check the id is in the supported range */ - if (id > SCTP_AUTH_HMAC_ID_MAX) - continue; - - /* If this TFM has been allocated, use this id */ - if (ep->auth_hmacs[id]) { + if (sctp_hmac_supported(id)) { asoc->default_hmac_id = id; break; } @@ -709,10 +618,9 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, struct sctp_shared_key *ep_key, gfp_t gfp) { struct sctp_auth_bytes *asoc_key; - struct crypto_shash *tfm; __u16 key_id, hmac_id; - unsigned char *end; int free_key = 0; + size_t data_len; __u8 *digest; /* Extract the info we need: @@ -733,19 +641,17 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, free_key = 1; } - /* set up scatter list */ - end = skb_tail_pointer(skb); - - tfm = asoc->ep->auth_hmacs[hmac_id]; - + data_len = skb_tail_pointer(skb) - (unsigned char *)auth; digest = (u8 *)(&auth->auth_hdr + 1); - if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len)) - goto free; - - crypto_shash_tfm_digest(tfm, (u8 *)auth, end - (unsigned char *)auth, - digest); + if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) { + hmac_sha1_usingrawkey(asoc_key->data, asoc_key->len, + (const u8 *)auth, data_len, digest); + } else { + WARN_ON_ONCE(hmac_id != SCTP_AUTH_HMAC_ID_SHA256); + hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len, + (const u8 *)auth, data_len, digest); + } -free: if (free_key) sctp_auth_key_put(asoc_key); } @@ -788,14 +694,11 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, for (i = 0; i < hmacs->shmac_num_idents; i++) { id = hmacs->shmac_idents[i]; - if (id > SCTP_AUTH_HMAC_ID_MAX) + if (!sctp_hmac_supported(id)) return -EOPNOTSUPP; if (SCTP_AUTH_HMAC_ID_SHA1 == id) has_sha1 = 1; - - if (!sctp_hmac_list[id].hmac_name) - return -EOPNOTSUPP; } if (!has_sha1) @@ -1021,8 +924,6 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep, int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) { - int err = -ENOMEM; - /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. @@ -1060,13 +961,6 @@ int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) ep->auth_chunk_list = auth_chunks; } - /* Allocate and initialize transorms arrays for supported - * HMACs. - */ - err = sctp_auth_init_hmacs(ep, gfp); - if (err) - goto nomem; - return 0; nomem: @@ -1075,7 +969,7 @@ nomem: kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; - return err; + return -ENOMEM; } void sctp_auth_free(struct sctp_endpoint *ep) @@ -1084,6 +978,4 @@ void sctp_auth_free(struct sctp_endpoint *ep) kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; - sctp_auth_destroy_hmacs(ep->auth_hmacs); - ep->auth_hmacs = NULL; } diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index fd4f8243cc35..c655b571ca01 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -184,7 +184,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, * DATA. */ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { - struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); + const struct sctp_hmac *hmac_desc = + sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index d099b605e44a..a1a3c8494c5d 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1320,7 +1320,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id) { struct sctp_authhdr auth_hdr; - struct sctp_hmac *hmac_desc; + const struct sctp_hmac *hmac_desc; struct sctp_chunk *retval; /* Get the first hmac that the peer told us to use */ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d4d5b14b49b3..4cb8f393434d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4362,7 +4362,7 @@ static enum sctp_ierror sctp_sf_authenticate( struct sctp_shared_key *sh_key = NULL; struct sctp_authhdr *auth_hdr; __u8 *save_digest, *digest; - struct sctp_hmac *hmac; + const struct sctp_hmac *hmac; unsigned int sig_len; __u16 key_id; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 4921416434f9..0292881a847c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9581,16 +9581,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, if (err) return err; - /* New ep's auth_hmacs should be set if old ep's is set, in case - * that net->sctp.auth_enable has been changed to 0 by users and - * new ep's auth_hmacs couldn't be set in sctp_endpoint_init(). - */ - if (oldsp->ep->auth_hmacs) { - err = sctp_auth_init_hmacs(newsp->ep, GFP_KERNEL); - if (err) - return err; - } - sctp_auto_asconf_init(newsp); /* Move any messages in the old socket's receive queue that are for the -- cgit v1.2.3 From 2f3dd6ec901f29aef5fff3d7a63b1371d67c1760 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Aug 2025 13:54:25 -0700 Subject: sctp: Convert cookie authentication to use HMAC-SHA256 Convert SCTP cookies to use HMAC-SHA256, instead of the previous choice of the legacy algorithms HMAC-MD5 and HMAC-SHA1. Simplify and optimize the code by using the HMAC-SHA256 library instead of crypto_shash, and by preparing the HMAC key when it is generated instead of per-operation. This doesn't break compatibility, since the cookie format is an implementation detail, not part of the SCTP protocol itself. Note that the cookie size doesn't change either. The HMAC field was already 32 bytes, even though previously at most 20 bytes were actually compared. 32 bytes exactly fits an untruncated HMAC-SHA256 value. So, although we could safely truncate the MAC to something slightly shorter, for now just keep the cookie size the same. I also considered SipHash, but that would generate only 8-byte MACs. An 8-byte MAC *might* suffice here. However, there's quite a lot of information in the SCTP cookies: more than in TCP SYN cookies. So absent an analysis that occasional forgeries of all that information is okay in SCTP, I errored on the side of caution. Remove HMAC-MD5 and HMAC-SHA1 as options, since the new HMAC-SHA256 option is just better. It's faster as well as more secure. For example, benchmarking on x86_64, cookie authentication is now nearly 3x as fast as the previous default choice and implementation of HMAC-MD5. Also just make the kernel always support cookie authentication if SCTP is supported at all, rather than making it optional in the build. (It was sort of optional before, but it didn't really work properly. E.g., a kernel with CONFIG_SCTP_COOKIE_HMAC_MD5=n still supported HMAC-MD5 cookie authentication if CONFIG_CRYPTO_HMAC and CONFIG_CRYPTO_MD5 happened to be enabled in the kconfig for other reasons.) Acked-by: Xin Long Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250818205426.30222-5-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 11 +++---- include/net/netns/sctp.h | 4 +-- include/net/sctp/constants.h | 5 ++- include/net/sctp/structs.h | 30 +++++------------- net/sctp/Kconfig | 43 ++++++------------------- net/sctp/endpointola.c | 23 +++++++------- net/sctp/protocol.c | 11 ++----- net/sctp/sm_make_chunk.c | 57 ++++++++++------------------------ net/sctp/socket.c | 31 +----------------- net/sctp/sysctl.c | 51 +++++++++++++----------------- 10 files changed, 79 insertions(+), 187 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 9756d16e3df1..3d6782683eee 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -3508,16 +3508,13 @@ cookie_hmac_alg - STRING a listening sctp socket to a connecting client in the INIT-ACK chunk. Valid values are: - * md5 - * sha1 + * sha256 * none - Ability to assign md5 or sha1 as the selected alg is predicated on the - configuration of those algorithms at build time (CONFIG_CRYPTO_MD5 and - CONFIG_CRYPTO_SHA1). + md5 and sha1 are also accepted for backwards compatibility, but cause + sha256 to be selected. - Default: Dependent on configuration. MD5 if available, else SHA1 if - available, else none. + Default: sha256 rcvbuf_policy - INTEGER Determines if the receive buffer is attributed to the socket or to diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index d25cd7a9c5ff..c0f97f36389e 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -75,8 +75,8 @@ struct netns_sctp { /* Whether Cookie Preservative is enabled(1) or not(0) */ int cookie_preserve_enable; - /* The namespace default hmac alg */ - char *sctp_hmac_alg; + /* Whether cookie authentication is enabled(1) or not(0) */ + int cookie_auth_enable; /* Valid.Cookie.Life - 60 seconds */ unsigned int valid_cookie_life; diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 8e0f4c4f7750..ae3376ba0b99 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -296,9 +296,8 @@ enum { SCTP_MAX_GABS = 16 }; */ #define SCTP_DEFAULT_MINSEGMENT 512 /* MTU size ... if no mtu disc */ -#define SCTP_SECRET_SIZE 32 /* Number of octets in a 256 bits. */ - -#define SCTP_SIGNATURE_SIZE 20 /* size of a SLA-1 signature */ +#define SCTP_COOKIE_KEY_SIZE 32 /* size of cookie HMAC key */ +#define SCTP_COOKIE_MAC_SIZE 32 /* size of HMAC field in cookies */ #define SCTP_COOKIE_MULTIPLE 32 /* Pad out our cookie to make our hash * functions simpler to write. diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 6be6aec25731..2ae390219efd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -32,6 +32,7 @@ #ifndef __sctp_structs_h__ #define __sctp_structs_h__ +#include #include #include #include @@ -68,7 +69,6 @@ struct sctp_outq; struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; -struct crypto_shash; struct sctp_stream; @@ -155,10 +155,6 @@ struct sctp_sock { /* PF_ family specific functions. */ struct sctp_pf *pf; - /* Access to HMAC transform. */ - struct crypto_shash *hmac; - char *sctp_hmac_alg; - /* What is our base endpointer? */ struct sctp_endpoint *ep; @@ -227,7 +223,8 @@ struct sctp_sock { frag_interleave:1, recvrcvinfo:1, recvnxtinfo:1, - data_ready_signalled:1; + data_ready_signalled:1, + cookie_auth_enable:1; atomic_t pd_mode; @@ -335,7 +332,7 @@ struct sctp_cookie { /* The format of our cookie that we send to our peer. */ struct sctp_signed_cookie { - __u8 signature[SCTP_SECRET_SIZE]; + __u8 mac[SCTP_COOKIE_MAC_SIZE]; __u32 __pad; /* force sctp_cookie alignment to 64 bits */ struct sctp_cookie c; } __packed; @@ -1307,22 +1304,9 @@ struct sctp_endpoint { /* This is really a list of struct sctp_association entries. */ struct list_head asocs; - /* Secret Key: A secret key used by this endpoint to compute - * the MAC. This SHOULD be a cryptographic quality - * random number with a sufficient length. - * Discussion in [RFC1750] can be helpful in - * selection of the key. - */ - __u8 secret_key[SCTP_SECRET_SIZE]; - - /* digest: This is a digest of the sctp cookie. This field is - * only used on the receive path when we try to validate - * that the cookie has not been tampered with. We put - * this here so we pre-allocate this once and can re-use - * on every receive. - */ - __u8 *digest; - + /* Cookie authentication key used by this endpoint */ + struct hmac_sha256_key cookie_auth_key; + /* sendbuf acct. policy. */ __u32 sndbuf_policy; diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 09c77b4d161b..e947646a380c 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -49,48 +49,25 @@ config SCTP_DBG_OBJCNT 'cat /proc/net/sctp/sctp_dbg_objcnt' If unsure, say N + choice - prompt "Default SCTP cookie HMAC encoding" - default SCTP_DEFAULT_COOKIE_HMAC_MD5 + prompt "Default SCTP cookie authentication method" + default SCTP_DEFAULT_COOKIE_HMAC_SHA256 help - This option sets the default sctp cookie hmac algorithm - when in doubt select 'md5' + This option sets the default SCTP cookie authentication method, for + when a method hasn't been explicitly selected via the + net.sctp.cookie_hmac_alg sysctl. -config SCTP_DEFAULT_COOKIE_HMAC_MD5 - bool "Enable optional MD5 hmac cookie generation" - help - Enable optional MD5 hmac based SCTP cookie generation - select SCTP_COOKIE_HMAC_MD5 + If unsure, choose the default (HMAC-SHA256). -config SCTP_DEFAULT_COOKIE_HMAC_SHA1 - bool "Enable optional SHA1 hmac cookie generation" - help - Enable optional SHA1 hmac based SCTP cookie generation - select SCTP_COOKIE_HMAC_SHA1 +config SCTP_DEFAULT_COOKIE_HMAC_SHA256 + bool "HMAC-SHA256" config SCTP_DEFAULT_COOKIE_HMAC_NONE - bool "Use no hmac alg in SCTP cookie generation" - help - Use no hmac algorithm in SCTP cookie generation + bool "None" endchoice -config SCTP_COOKIE_HMAC_MD5 - bool "Enable optional MD5 hmac cookie generation" - help - Enable optional MD5 hmac based SCTP cookie generation - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_MD5 - -config SCTP_COOKIE_HMAC_SHA1 - bool "Enable optional SHA1 hmac cookie generation" - help - Enable optional SHA1 hmac based SCTP cookie generation - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 - config INET_SCTP_DIAG depends on INET_DIAG def_tristate INET_DIAG diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 7e77b450697c..31e989dfe846 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -35,6 +35,15 @@ /* Forward declarations for internal helpers. */ static void sctp_endpoint_bh_rcv(struct work_struct *work); +static void gen_cookie_auth_key(struct hmac_sha256_key *key) +{ + u8 raw_key[SCTP_COOKIE_KEY_SIZE]; + + get_random_bytes(raw_key, sizeof(raw_key)); + hmac_sha256_preparekey(key, raw_key, sizeof(raw_key)); + memzero_explicit(raw_key, sizeof(raw_key)); +} + /* * Initialize the base fields of the endpoint structure. */ @@ -45,10 +54,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct net *net = sock_net(sk); struct sctp_shared_key *null_key; - ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp); - if (!ep->digest) - return NULL; - ep->asconf_enable = net->sctp.addip_enable; ep->auth_enable = net->sctp.auth_enable; if (ep->auth_enable) { @@ -90,8 +95,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Get the receive buffer policy for this endpoint */ ep->rcvbuf_policy = net->sctp.rcvbuf_policy; - /* Initialize the secret key used with cookie. */ - get_random_bytes(ep->secret_key, sizeof(ep->secret_key)); + /* Generate the cookie authentication key. */ + gen_cookie_auth_key(&ep->cookie_auth_key); /* SCTP-AUTH extensions*/ INIT_LIST_HEAD(&ep->endpoint_shared_keys); @@ -118,7 +123,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, nomem_shkey: sctp_auth_free(ep); nomem: - kfree(ep->digest); return NULL; } @@ -205,9 +209,6 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) return; } - /* Free the digest buffer */ - kfree(ep->digest); - /* SCTP-AUTH: Free up AUTH releated data such as shared keys * chunks and hmacs arrays that were allocated */ @@ -218,7 +219,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) sctp_inq_free(&ep->base.inqueue); sctp_bind_addr_free(&ep->base.bind_addr); - memset(ep->secret_key, 0, sizeof(ep->secret_key)); + memzero_explicit(&ep->cookie_auth_key, sizeof(ep->cookie_auth_key)); sk = ep->base.sk; /* Remove and free the port */ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index a5ccada55f2b..3b2373b3bd5d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1334,14 +1334,9 @@ static int __net_init sctp_defaults_init(struct net *net) /* Whether Cookie Preservative is enabled(1) or not(0) */ net->sctp.cookie_preserve_enable = 1; - /* Default sctp sockets to use md5 as their hmac alg */ -#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) - net->sctp.sctp_hmac_alg = "md5"; -#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1) - net->sctp.sctp_hmac_alg = "sha1"; -#else - net->sctp.sctp_hmac_alg = NULL; -#endif + /* Whether cookie authentication is enabled(1) or not(0) */ + net->sctp.cookie_auth_enable = + !IS_ENABLED(CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE); /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index a1a3c8494c5d..2c0017d058d4 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -30,7 +30,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include @@ -1675,8 +1674,10 @@ static struct sctp_cookie_param *sctp_pack_cookie( * out on the network. */ retval = kzalloc(*cookie_len, GFP_ATOMIC); - if (!retval) - goto nodata; + if (!retval) { + *cookie_len = 0; + return NULL; + } cookie = (struct sctp_signed_cookie *) retval->body; @@ -1707,26 +1708,14 @@ static struct sctp_cookie_param *sctp_pack_cookie( memcpy((__u8 *)(cookie + 1) + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); - if (sctp_sk(ep->base.sk)->hmac) { - struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; - int err; - - /* Sign the message. */ - err = crypto_shash_setkey(tfm, ep->secret_key, - sizeof(ep->secret_key)) ?: - crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize, - cookie->signature); - if (err) - goto free_cookie; + /* Sign the cookie, if cookie authentication is enabled. */ + if (sctp_sk(ep->base.sk)->cookie_auth_enable) { + static_assert(sizeof(cookie->mac) == SHA256_DIGEST_SIZE); + hmac_sha256(&ep->cookie_auth_key, (const u8 *)&cookie->c, + bodysize, cookie->mac); } return retval; - -free_cookie: - kfree(retval); -nodata: - *cookie_len = 0; - return NULL; } /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ @@ -1741,7 +1730,6 @@ struct sctp_association *sctp_unpack_cookie( struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; - __u8 *digest = ep->digest; enum sctp_scope scope; unsigned int len; ktime_t kt; @@ -1771,30 +1759,19 @@ struct sctp_association *sctp_unpack_cookie( cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; - if (!sctp_sk(ep->base.sk)->hmac) - goto no_hmac; + /* Verify the cookie's MAC, if cookie authentication is enabled. */ + if (sctp_sk(ep->base.sk)->cookie_auth_enable) { + u8 mac[SHA256_DIGEST_SIZE]; - /* Check the signature. */ - { - struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; - int err; - - err = crypto_shash_setkey(tfm, ep->secret_key, - sizeof(ep->secret_key)) ?: - crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize, - digest); - if (err) { - *error = -SCTP_IERROR_NOMEM; + hmac_sha256(&ep->cookie_auth_key, (const u8 *)bear_cookie, + bodysize, mac); + static_assert(sizeof(cookie->mac) == sizeof(mac)); + if (crypto_memneq(mac, cookie->mac, sizeof(mac))) { + *error = -SCTP_IERROR_BAD_SIG; goto fail; } } - if (crypto_memneq(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { - *error = -SCTP_IERROR_BAD_SIG; - goto fail; - } - -no_hmac: /* IG Section 2.35.2: * 3) Compare the port numbers and the verification tag contained * within the COOKIE ECHO chunk to the actual port numbers and the diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0292881a847c..ed8293a34240 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -37,7 +37,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include @@ -4987,7 +4986,7 @@ static int sctp_init_sock(struct sock *sk) sp->default_rcv_context = 0; sp->max_burst = net->sctp.max_burst; - sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg; + sp->cookie_auth_enable = net->sctp.cookie_auth_enable; /* Initialize default setup parameters. These parameters * can be modified with the SCTP_INITMSG socket option or @@ -5079,8 +5078,6 @@ static int sctp_init_sock(struct sock *sk) if (!sp->ep) return -ENOMEM; - sp->hmac = NULL; - sk->sk_destruct = sctp_destruct_sock; SCTP_DBG_OBJCNT_INC(sock); @@ -5117,18 +5114,8 @@ static void sctp_destroy_sock(struct sock *sk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } -/* Triggered when there are no references on the socket anymore */ -static void sctp_destruct_common(struct sock *sk) -{ - struct sctp_sock *sp = sctp_sk(sk); - - /* Free up the HMAC transform. */ - crypto_free_shash(sp->hmac); -} - static void sctp_destruct_sock(struct sock *sk) { - sctp_destruct_common(sk); inet_sock_destruct(sk); } @@ -8530,22 +8517,8 @@ static int sctp_listen_start(struct sock *sk, int backlog) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; - struct crypto_shash *tfm = NULL; - char alg[32]; int err; - /* Allocate HMAC for generating cookie. */ - if (!sp->hmac && sp->sctp_hmac_alg) { - sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg); - tfm = crypto_alloc_shash(alg, 0, 0); - if (IS_ERR(tfm)) { - net_info_ratelimited("failed to load transform for %s: %ld\n", - sp->sctp_hmac_alg, PTR_ERR(tfm)); - return -ENOSYS; - } - sctp_sk(sk)->hmac = tfm; - } - /* * If a bind() or sctp_bindx() is not called prior to a listen() * call that allows new associations to be accepted, the system @@ -9561,7 +9534,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, * copy. */ newsp->ep = newep; - newsp->hmac = NULL; /* Hook this new socket in to the bind_hash list. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk), @@ -9713,7 +9685,6 @@ struct proto sctp_prot = { static void sctp_v6_destruct_sock(struct sock *sk) { - sctp_destruct_common(sk); inet6_sock_destruct(sk); } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index ee3eac338a9d..19acc57c3ed9 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -174,7 +174,7 @@ static struct ctl_table sctp_net_table[] = { }, { .procname = "cookie_hmac_alg", - .data = &init_net.sctp.sctp_hmac_alg, + .data = &init_net.sctp.cookie_auth_enable, .maxlen = 8, .mode = 0644, .proc_handler = proc_sctp_do_hmac_alg, @@ -388,10 +388,8 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, - sctp.sctp_hmac_alg); + sctp.cookie_auth_enable); struct ctl_table tbl; - bool changed = false; - char *none = "none"; char tmp[8] = {0}; int ret; @@ -399,35 +397,28 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, if (write) { tbl.data = tmp; - tbl.maxlen = sizeof(tmp); - } else { - tbl.data = net->sctp.sctp_hmac_alg ? : none; - tbl.maxlen = strlen(tbl.data); - } - - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); - if (write && ret == 0) { -#ifdef CONFIG_CRYPTO_MD5 - if (!strncmp(tmp, "md5", 3)) { - net->sctp.sctp_hmac_alg = "md5"; - changed = true; + tbl.maxlen = sizeof(tmp) - 1; + ret = proc_dostring(&tbl, 1, buffer, lenp, ppos); + if (ret) + return ret; + if (!strcmp(tmp, "sha256") || + /* for backwards compatibility */ + !strcmp(tmp, "md5") || !strcmp(tmp, "sha1")) { + net->sctp.cookie_auth_enable = 1; + return 0; } -#endif -#ifdef CONFIG_CRYPTO_SHA1 - if (!strncmp(tmp, "sha1", 4)) { - net->sctp.sctp_hmac_alg = "sha1"; - changed = true; + if (!strcmp(tmp, "none")) { + net->sctp.cookie_auth_enable = 0; + return 0; } -#endif - if (!strncmp(tmp, "none", 4)) { - net->sctp.sctp_hmac_alg = NULL; - changed = true; - } - if (!changed) - ret = -EINVAL; + return -EINVAL; } - - return ret; + if (net->sctp.cookie_auth_enable) + tbl.data = (char *)"sha256"; + else + tbl.data = (char *)"none"; + tbl.maxlen = strlen(tbl.data); + return proc_dostring(&tbl, 0, buffer, lenp, ppos); } static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, -- cgit v1.2.3 From 8fc6056dcf79937c46c97fa4996cda65956437a9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 7 Aug 2025 10:44:31 +0800 Subject: f2fs: fix to detect potential corrupted nid in free_nid_list As reported, on-disk footer.ino and footer.nid is the same and out-of-range, let's add sanity check on f2fs_alloc_nid() to detect any potential corruption in free_nid_list. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 17 ++++++++++++++++- include/linux/f2fs_fs.h | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 27743b93e186..286e8a53f8e7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -27,12 +27,17 @@ static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; static struct kmem_cache *fsync_node_entry_slab; +static inline bool is_invalid_nid(struct f2fs_sb_info *sbi, nid_t nid) +{ + return nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid; +} + /* * Check whether the given nid is within node id range. */ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + if (unlikely(is_invalid_nid(sbi, nid))) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); @@ -2634,6 +2639,16 @@ retry: f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); + + if (unlikely(is_invalid_nid(sbi, i->nid))) { + spin_unlock(&nm_i->nid_list_lock); + f2fs_err(sbi, "Corrupted nid %u in free_nid_list", + i->nid); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_NID); + return false; + } + *nid = i->nid; __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 2f8b8bfc0e73..6afb4a13b81d 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -79,6 +79,7 @@ enum stop_cp_reason { STOP_CP_REASON_FLUSH_FAIL, STOP_CP_REASON_NO_SEGMENT, STOP_CP_REASON_CORRUPTED_FREE_BITMAP, + STOP_CP_REASON_CORRUPTED_NID, STOP_CP_REASON_MAX, }; -- cgit v1.2.3 From 5f8a4f34f6dcf4b64c3cbcfd4aa5a8d7dbcd268d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 19 Aug 2025 09:39:15 -0700 Subject: bnxt_en: hsi: Update FW interface to 1.10.3.133 The major change is struct pcie_ctx_hw_stats_v2 which has new latency histograms added. Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250819163919.104075-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- include/linux/bnxt/hsi.h | 315 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 253 insertions(+), 62 deletions(-) (limited to 'include') diff --git a/include/linux/bnxt/hsi.h b/include/linux/bnxt/hsi.h index 549231703bce..8c5dac3b3ef3 100644 --- a/include/linux/bnxt/hsi.h +++ b/include/linux/bnxt/hsi.h @@ -276,6 +276,10 @@ struct cmd_nums { #define HWRM_REG_POWER_QUERY 0xe1UL #define HWRM_CORE_FREQUENCY_QUERY 0xe2UL #define HWRM_REG_POWER_HISTOGRAM 0xe3UL + #define HWRM_MONITOR_PAX_HISTOGRAM_START 0xe4UL + #define HWRM_MONITOR_PAX_HISTOGRAM_COLLECT 0xe5UL + #define HWRM_STAT_QUERY_ROCE_STATS 0xe6UL + #define HWRM_STAT_QUERY_ROCE_STATS_EXT 0xe7UL #define HWRM_WOL_FILTER_ALLOC 0xf0UL #define HWRM_WOL_FILTER_FREE 0xf1UL #define HWRM_WOL_FILTER_QCFG 0xf2UL @@ -407,9 +411,8 @@ struct cmd_nums { #define HWRM_FUNC_LAG_UPDATE 0x1b1UL #define HWRM_FUNC_LAG_FREE 0x1b2UL #define HWRM_FUNC_LAG_QCFG 0x1b3UL - #define HWRM_FUNC_TIMEDTX_PACING_RATE_ADD 0x1c2UL - #define HWRM_FUNC_TIMEDTX_PACING_RATE_DELETE 0x1c3UL - #define HWRM_FUNC_TIMEDTX_PACING_RATE_QUERY 0x1c4UL + #define HWRM_FUNC_TTX_PACING_RATE_PROF_QUERY 0x1c3UL + #define HWRM_FUNC_TTX_PACING_RATE_QUERY 0x1c4UL #define HWRM_SELFTEST_QLIST 0x200UL #define HWRM_SELFTEST_EXEC 0x201UL #define HWRM_SELFTEST_IRQ 0x202UL @@ -441,6 +444,7 @@ struct cmd_nums { #define HWRM_MFG_WRITE_CERT_NVM 0x21cUL #define HWRM_PORT_POE_CFG 0x230UL #define HWRM_PORT_POE_QCFG 0x231UL + #define HWRM_PORT_PHY_FDRSTAT 0x232UL #define HWRM_UDCC_QCAPS 0x258UL #define HWRM_UDCC_CFG 0x259UL #define HWRM_UDCC_QCFG 0x25aUL @@ -453,6 +457,8 @@ struct cmd_nums { #define HWRM_QUEUE_PFCWD_TIMEOUT_QCAPS 0x261UL #define HWRM_QUEUE_PFCWD_TIMEOUT_CFG 0x262UL #define HWRM_QUEUE_PFCWD_TIMEOUT_QCFG 0x263UL + #define HWRM_QUEUE_ADPTV_QOS_RX_QCFG 0x264UL + #define HWRM_QUEUE_ADPTV_QOS_TX_QCFG 0x265UL #define HWRM_TF 0x2bcUL #define HWRM_TF_VERSION_GET 0x2bdUL #define HWRM_TF_SESSION_OPEN 0x2c6UL @@ -551,6 +557,8 @@ struct cmd_nums { #define HWRM_DBG_COREDUMP_CAPTURE 0xff2cUL #define HWRM_DBG_PTRACE 0xff2dUL #define HWRM_DBG_SIM_CABLE_STATE 0xff2eUL + #define HWRM_DBG_TOKEN_QUERY_AUTH_IDS 0xff2fUL + #define HWRM_DBG_TOKEN_CFG 0xff30UL #define HWRM_NVM_GET_VPD_FIELD_INFO 0xffeaUL #define HWRM_NVM_SET_VPD_FIELD_INFO 0xffebUL #define HWRM_NVM_DEFRAG 0xffecUL @@ -632,8 +640,8 @@ struct hwrm_err_output { #define HWRM_VERSION_MAJOR 1 #define HWRM_VERSION_MINOR 10 #define HWRM_VERSION_UPDATE 3 -#define HWRM_VERSION_RSVD 97 -#define HWRM_VERSION_STR "1.10.3.97" +#define HWRM_VERSION_RSVD 133 +#define HWRM_VERSION_STR "1.10.3.133" /* hwrm_ver_get_input (size:192b/24B) */ struct hwrm_ver_get_input { @@ -688,6 +696,7 @@ struct hwrm_ver_get_output { #define VER_GET_RESP_DEV_CAPS_CFG_CFA_TRUFLOW_SUPPORTED 0x4000UL #define VER_GET_RESP_DEV_CAPS_CFG_SECURE_BOOT_CAPABLE 0x8000UL #define VER_GET_RESP_DEV_CAPS_CFG_SECURE_SOC_CAPABLE 0x10000UL + #define VER_GET_RESP_DEV_CAPS_CFG_DEBUG_TOKEN_SUPPORTED 0x20000UL u8 roce_fw_maj_8b; u8 roce_fw_min_8b; u8 roce_fw_bld_8b; @@ -872,7 +881,8 @@ struct hwrm_async_event_cmpl { #define ASYNC_EVENT_CMPL_EVENT_ID_REPRESENTOR_PAIR_CHANGE 0x4eUL #define ASYNC_EVENT_CMPL_EVENT_ID_VF_STAT_CHANGE 0x4fUL #define ASYNC_EVENT_CMPL_EVENT_ID_HOST_COREDUMP 0x50UL - #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID 0x51UL + #define ASYNC_EVENT_CMPL_EVENT_ID_ADPTV_QOS 0x51UL + #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID 0x52UL #define ASYNC_EVENT_CMPL_EVENT_ID_FW_TRACE_MSG 0xfeUL #define ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR 0xffUL #define ASYNC_EVENT_CMPL_EVENT_ID_LAST ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR @@ -1344,7 +1354,8 @@ struct hwrm_async_event_cmpl_dbg_buf_producer { #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CA2_TRACE 0x9UL #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_RIGP1_TRACE 0xaUL #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_AFM_KONG_HWRM_TRACE 0xbUL - #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_LAST ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_AFM_KONG_HWRM_TRACE + #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_ERR_QPC_TRACE 0xcUL + #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_LAST ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_ERR_QPC_TRACE }; /* hwrm_async_event_cmpl_hwrm_error (size:128b/16B) */ @@ -1401,7 +1412,11 @@ struct hwrm_async_event_cmpl_error_report_base { #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD 0x4UL #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_THERMAL_THRESHOLD 0x5UL #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED 0x6UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_LAST ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUP_UDCC_SES 0x7UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DB_DROP 0x8UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_MD_TEMP 0x9UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_VNIC_ERR 0xaUL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_LAST ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_VNIC_ERR }; /* hwrm_async_event_cmpl_error_report_pause_storm (size:128b/16B) */ @@ -1914,6 +1929,12 @@ struct hwrm_func_qcaps_output { #define FUNC_QCAPS_RESP_FLAGS_EXT3_RX_RATE_PROFILE_SEL_SUPPORTED 0x8UL #define FUNC_QCAPS_RESP_FLAGS_EXT3_BIDI_OPT_SUPPORTED 0x10UL #define FUNC_QCAPS_RESP_FLAGS_EXT3_MIRROR_ON_ROCE_SUPPORTED 0x20UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_ROCE_VF_DYN_ALLOC_SUPPORT 0x40UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_CHANGE_UDP_SRCPORT_SUPPORT 0x80UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_PCIE_COMPLIANCE_SUPPORTED 0x100UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_MULTI_L2_DB_SUPPORTED 0x200UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_PCIE_SECURE_ATS_SUPPORTED 0x400UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_MBUF_STATS_SUPPORTED 0x800UL __le16 max_roce_vfs; __le16 max_crypto_rx_flow_filters; u8 unused_3[3]; @@ -1931,7 +1952,7 @@ struct hwrm_func_qcfg_input { u8 unused_0[6]; }; -/* hwrm_func_qcfg_output (size:1344b/168B) */ +/* hwrm_func_qcfg_output (size:1408b/176B) */ struct hwrm_func_qcfg_output { __le16 error_code; __le16 req_type; @@ -2124,7 +2145,43 @@ struct hwrm_func_qcfg_output { #define FUNC_QCFG_RESP_XID_PARTITION_CFG_TX_CK 0x1UL #define FUNC_QCFG_RESP_XID_PARTITION_CFG_RX_CK 0x2UL __le16 mirror_vnic_id; - u8 unused_7[7]; + u8 max_link_width; + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_UNKNOWN 0x0UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_X1 0x1UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_X2 0x2UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_X4 0x4UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_X8 0x8UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_X16 0x10UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_LAST FUNC_QCFG_RESP_MAX_LINK_WIDTH_X16 + u8 max_link_speed; + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_UNKNOWN 0x0UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_G1 0x1UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_G2 0x2UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_G3 0x3UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_G4 0x4UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_G5 0x5UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_LAST FUNC_QCFG_RESP_MAX_LINK_SPEED_G5 + u8 negotiated_link_width; + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_UNKNOWN 0x0UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X1 0x1UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X2 0x2UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X4 0x4UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X8 0x8UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X16 0x10UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_LAST FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X16 + u8 negotiated_link_speed; + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_UNKNOWN 0x0UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G1 0x1UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G2 0x2UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G3 0x3UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G4 0x4UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G5 0x5UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_LAST FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G5 + u8 unused_7[2]; + u8 pcie_compliance; + u8 unused_8; + __le16 l2_db_multi_page_size_kb; + u8 unused_9[5]; u8 valid; }; @@ -2322,6 +2379,7 @@ struct hwrm_func_cfg_input { #define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_GID_PER_VF 0x200UL #define FUNC_CFG_REQ_ENABLES2_XID_PARTITION_CFG 0x400UL #define FUNC_CFG_REQ_ENABLES2_PHYSICAL_SLOT_NUMBER 0x800UL + #define FUNC_CFG_REQ_ENABLES2_PCIE_COMPLIANCE 0x1000UL u8 port_kdnet_mode; #define FUNC_CFG_REQ_PORT_KDNET_MODE_DISABLED 0x0UL #define FUNC_CFG_REQ_PORT_KDNET_MODE_ENABLED 0x1UL @@ -2353,7 +2411,8 @@ struct hwrm_func_cfg_input { __le16 xid_partition_cfg; #define FUNC_CFG_REQ_XID_PARTITION_CFG_TX_CK 0x1UL #define FUNC_CFG_REQ_XID_PARTITION_CFG_RX_CK 0x2UL - __le16 unused_2; + u8 pcie_compliance; + u8 unused_2; }; /* hwrm_func_cfg_output (size:128b/16B) */ @@ -2370,11 +2429,41 @@ struct hwrm_func_cfg_output { struct hwrm_func_cfg_cmd_err { u8 code; #define FUNC_CFG_CMD_ERR_CODE_UNKNOWN 0x0UL - #define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_BW_RANGE 0x1UL - #define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_MORE_THAN_MAX 0x2UL - #define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_BW_UNSUPPORTED 0x3UL - #define FUNC_CFG_CMD_ERR_CODE_PARTITION_BW_PERCENT 0x4UL - #define FUNC_CFG_CMD_ERR_CODE_LAST FUNC_CFG_CMD_ERR_CODE_PARTITION_BW_PERCENT + #define FUNC_CFG_CMD_ERR_CODE_PARTITION_BW_OUT_OF_RANGE 0x1UL + #define FUNC_CFG_CMD_ERR_CODE_NPAR_PARTITION_DOWN_FAILED 0x2UL + #define FUNC_CFG_CMD_ERR_CODE_TPID_SET_DFLT_VLAN_NOT_SET 0x3UL + #define FUNC_CFG_CMD_ERR_CODE_RES_ARRAY_ALLOC_FAILED 0x4UL + #define FUNC_CFG_CMD_ERR_CODE_TX_RING_ASSET_TEST_FAILED 0x5UL + #define FUNC_CFG_CMD_ERR_CODE_TX_RING_RES_UPDATE_FAILED 0x6UL + #define FUNC_CFG_CMD_ERR_CODE_APPLY_MAX_BW_FAILED 0x7UL + #define FUNC_CFG_CMD_ERR_CODE_ENABLE_EVB_FAILED 0x8UL + #define FUNC_CFG_CMD_ERR_CODE_RSS_CTXT_ASSET_TEST_FAILED 0x9UL + #define FUNC_CFG_CMD_ERR_CODE_RSS_CTXT_RES_UPDATE_FAILED 0xaUL + #define FUNC_CFG_CMD_ERR_CODE_CMPL_RING_ASSET_TEST_FAILED 0xbUL + #define FUNC_CFG_CMD_ERR_CODE_CMPL_RING_RES_UPDATE_FAILED 0xcUL + #define FUNC_CFG_CMD_ERR_CODE_NQ_ASSET_TEST_FAILED 0xdUL + #define FUNC_CFG_CMD_ERR_CODE_NQ_RES_UPDATE_FAILED 0xeUL + #define FUNC_CFG_CMD_ERR_CODE_RX_RING_ASSET_TEST_FAILED 0xfUL + #define FUNC_CFG_CMD_ERR_CODE_RX_RING_RES_UPDATE_FAILED 0x10UL + #define FUNC_CFG_CMD_ERR_CODE_VNIC_ASSET_TEST_FAILED 0x11UL + #define FUNC_CFG_CMD_ERR_CODE_VNIC_RES_UPDATE_FAILED 0x12UL + #define FUNC_CFG_CMD_ERR_CODE_FAILED_TO_START_STATS_THREAD 0x13UL + #define FUNC_CFG_CMD_ERR_CODE_RDMA_SRIOV_DISABLED 0x14UL + #define FUNC_CFG_CMD_ERR_CODE_TX_KTLS_DISABLED 0x15UL + #define FUNC_CFG_CMD_ERR_CODE_TX_KTLS_ASSET_TEST_FAILED 0x16UL + #define FUNC_CFG_CMD_ERR_CODE_TX_KTLS_RES_UPDATE_FAILED 0x17UL + #define FUNC_CFG_CMD_ERR_CODE_RX_KTLS_DISABLED 0x18UL + #define FUNC_CFG_CMD_ERR_CODE_RX_KTLS_ASSET_TEST_FAILED 0x19UL + #define FUNC_CFG_CMD_ERR_CODE_RX_KTLS_RES_UPDATE_FAILED 0x1aUL + #define FUNC_CFG_CMD_ERR_CODE_TX_QUIC_DISABLED 0x1bUL + #define FUNC_CFG_CMD_ERR_CODE_TX_QUIC_ASSET_TEST_FAILED 0x1cUL + #define FUNC_CFG_CMD_ERR_CODE_TX_QUIC_RES_UPDATE_FAILED 0x1dUL + #define FUNC_CFG_CMD_ERR_CODE_RX_QUIC_DISABLED 0x1eUL + #define FUNC_CFG_CMD_ERR_CODE_RX_QUIC_ASSET_TEST_FAILED 0x1fUL + #define FUNC_CFG_CMD_ERR_CODE_RX_QUIC_RES_UPDATE_FAILED 0x20UL + #define FUNC_CFG_CMD_ERR_CODE_INVALID_KDNET_MODE 0x21UL + #define FUNC_CFG_CMD_ERR_CODE_SCHQ_CFG_FAIL 0x22UL + #define FUNC_CFG_CMD_ERR_CODE_LAST FUNC_CFG_CMD_ERR_CODE_SCHQ_CFG_FAIL u8 unused_0[7]; }; @@ -3780,6 +3869,7 @@ struct hwrm_func_backing_store_cfg_v2_input { #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CA2_TRACE 0x28UL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RIGP1_TRACE 0x29UL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_ERR_QPC_TRACE 0x2bUL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_INVALID 0xffffUL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_LAST FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_INVALID __le16 instance; @@ -3865,6 +3955,7 @@ struct hwrm_func_backing_store_qcfg_v2_input { #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CA2_TRACE 0x28UL #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RIGP1_TRACE 0x29UL #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL + #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_ERR_QPC_TRACE 0x2bUL #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_INVALID 0xffffUL #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_LAST FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_INVALID __le16 instance; @@ -3904,6 +3995,7 @@ struct hwrm_func_backing_store_qcfg_v2_output { #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CA1_TRACE 0x27UL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CA2_TRACE 0x28UL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RIGP1_TRACE 0x29UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_ERR_QPC_TRACE 0x2aUL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_INVALID 0xffffUL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_LAST FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_INVALID __le16 instance; @@ -4027,6 +4119,7 @@ struct hwrm_func_backing_store_qcaps_v2_input { #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CA2_TRACE 0x28UL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RIGP1_TRACE 0x29UL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_ERR_QPC_TRACE 0x2bUL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_INVALID 0xffffUL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_LAST FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_INVALID u8 rsvd[6]; @@ -4070,6 +4163,7 @@ struct hwrm_func_backing_store_qcaps_v2_output { #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CA2_TRACE 0x28UL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RIGP1_TRACE 0x29UL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_ERR_QPC_TRACE 0x2bUL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_INVALID 0xffffUL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_LAST FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_INVALID __le16 entry_size; @@ -4216,6 +4310,10 @@ struct hwrm_port_phy_cfg_input { #define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_1XN_DISABLE 0x100000UL #define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_IEEE_ENABLE 0x200000UL #define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_IEEE_DISABLE 0x400000UL + #define PORT_PHY_CFG_REQ_FLAGS_LINK_TRAINING_ENABLE 0x800000UL + #define PORT_PHY_CFG_REQ_FLAGS_LINK_TRAINING_DISABLE 0x1000000UL + #define PORT_PHY_CFG_REQ_FLAGS_PRECODING_ENABLE 0x2000000UL + #define PORT_PHY_CFG_REQ_FLAGS_PRECODING_DISABLE 0x4000000UL __le32 enables; #define PORT_PHY_CFG_REQ_ENABLES_AUTO_MODE 0x1UL #define PORT_PHY_CFG_REQ_ENABLES_AUTO_DUPLEX 0x2UL @@ -4703,6 +4801,8 @@ struct hwrm_port_phy_qcfg_output { #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_MEDIA_AUTO_DETECT 0x1UL #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_SIGNAL_MODE_KNOWN 0x2UL #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_SPEEDS2_SUPPORTED 0x4UL + #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_LINK_TRAINING 0x8UL + #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_PRECODING 0x10UL char phy_vendor_name[16]; char phy_vendor_partnumber[16]; __le16 support_pam4_speeds; @@ -4725,6 +4825,10 @@ struct hwrm_port_phy_qcfg_output { u8 link_down_reason; #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_RF 0x1UL #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_OTP_SPEED_VIOLATION 0x2UL + #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_CABLE_REMOVED 0x4UL + #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_MODULE_FAULT 0x8UL + #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_BMC_REQUEST 0x10UL + #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_TX_LASER_DISABLED 0x20UL __le16 support_speeds2; #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_1GB 0x1UL #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_10GB 0x2UL @@ -5882,9 +5986,10 @@ struct hwrm_port_led_qcaps_output { #define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_SUPPORTED 0x8UL #define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL __le16 led0_color_caps; - #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_RSVD 0x1UL - #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_AMBER_SUPPORTED 0x2UL - #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_GRNAMB_SUPPORTED 0x8UL u8 led1_id; u8 led1_type; #define PORT_LED_QCAPS_RESP_LED1_TYPE_SPEED 0x0UL @@ -5900,9 +6005,10 @@ struct hwrm_port_led_qcaps_output { #define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_BLINK_SUPPORTED 0x8UL #define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL __le16 led1_color_caps; - #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_RSVD 0x1UL - #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_AMBER_SUPPORTED 0x2UL - #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_GRNAMB_SUPPORTED 0x8UL u8 led2_id; u8 led2_type; #define PORT_LED_QCAPS_RESP_LED2_TYPE_SPEED 0x0UL @@ -5918,9 +6024,10 @@ struct hwrm_port_led_qcaps_output { #define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_BLINK_SUPPORTED 0x8UL #define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL __le16 led2_color_caps; - #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_RSVD 0x1UL - #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_AMBER_SUPPORTED 0x2UL - #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_GRNAMB_SUPPORTED 0x8UL u8 led3_id; u8 led3_type; #define PORT_LED_QCAPS_RESP_LED3_TYPE_SPEED 0x0UL @@ -5936,9 +6043,10 @@ struct hwrm_port_led_qcaps_output { #define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_BLINK_SUPPORTED 0x8UL #define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL __le16 led3_color_caps; - #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_RSVD 0x1UL - #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_AMBER_SUPPORTED 0x2UL - #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_GRNAMB_SUPPORTED 0x8UL u8 unused_4[3]; u8 valid; }; @@ -7036,9 +7144,22 @@ struct hwrm_vnic_rss_cfg_output { /* hwrm_vnic_rss_cfg_cmd_err (size:64b/8B) */ struct hwrm_vnic_rss_cfg_cmd_err { u8 code; - #define VNIC_RSS_CFG_CMD_ERR_CODE_UNKNOWN 0x0UL - #define VNIC_RSS_CFG_CMD_ERR_CODE_INTERFACE_NOT_READY 0x1UL - #define VNIC_RSS_CFG_CMD_ERR_CODE_LAST VNIC_RSS_CFG_CMD_ERR_CODE_INTERFACE_NOT_READY + #define VNIC_RSS_CFG_CMD_ERR_CODE_UNKNOWN 0x0UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_INTERFACE_NOT_READY 0x1UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_UNABLE_TO_GET_RSS_CFG 0x2UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_HASH_TYPE_UNSUPPORTED 0x3UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_HASH_TYPE_ERR 0x4UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_HASH_MODE_FAIL 0x5UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_RING_GRP_TABLE_ALLOC_ERR 0x6UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_HASH_KEY_ALLOC_ERR 0x7UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_DMA_FAILED 0x8UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_RX_RING_ALLOC_ERR 0x9UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_CMPL_RING_ALLOC_ERR 0xaUL + #define VNIC_RSS_CFG_CMD_ERR_CODE_HW_SET_RSS_FAILED 0xbUL + #define VNIC_RSS_CFG_CMD_ERR_CODE_CTX_INVALID 0xcUL + #define VNIC_RSS_CFG_CMD_ERR_CODE_VNIC_INVALID 0xdUL + #define VNIC_RSS_CFG_CMD_ERR_CODE_VNIC_RING_TABLE_PAIR_INVALID 0xeUL + #define VNIC_RSS_CFG_CMD_ERR_CODE_LAST VNIC_RSS_CFG_CMD_ERR_CODE_VNIC_RING_TABLE_PAIR_INVALID u8 unused_0[7]; }; @@ -7177,7 +7298,7 @@ struct hwrm_vnic_rss_cos_lb_ctx_free_output { u8 valid; }; -/* hwrm_ring_alloc_input (size:704b/88B) */ +/* hwrm_ring_alloc_input (size:768b/96B) */ struct hwrm_ring_alloc_input { __le16 req_type; __le16 cmpl_ring; @@ -7195,6 +7316,7 @@ struct hwrm_ring_alloc_input { #define RING_ALLOC_REQ_ENABLES_MPC_CHNLS_TYPE 0x400UL #define RING_ALLOC_REQ_ENABLES_STEERING_TAG_VALID 0x800UL #define RING_ALLOC_REQ_ENABLES_RX_RATE_PROFILE_VALID 0x1000UL + #define RING_ALLOC_REQ_ENABLES_DPI_VALID 0x2000UL u8 ring_type; #define RING_ALLOC_REQ_RING_TYPE_L2_CMPL 0x0UL #define RING_ALLOC_REQ_RING_TYPE_TX 0x1UL @@ -7287,6 +7409,8 @@ struct hwrm_ring_alloc_input { #define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_LAST RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_POLL_MODE u8 unused_4; __le64 cq_handle; + __le16 dpi; + __le16 unused_5[3]; }; /* hwrm_ring_alloc_output (size:128b/16B) */ @@ -7776,7 +7900,10 @@ struct hwrm_cfa_l2_set_rx_mask_cmd_err { u8 code; #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_UNKNOWN 0x0UL #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_NTUPLE_FILTER_CONFLICT_ERR 0x1UL - #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_LAST CFA_L2_SET_RX_MASK_CMD_ERR_CODE_NTUPLE_FILTER_CONFLICT_ERR + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_MAX_VLAN_TAGS 0x2UL + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_INVALID_VNIC_ID 0x3UL + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_INVALID_ACTION 0x4UL + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_LAST CFA_L2_SET_RX_MASK_CMD_ERR_CODE_INVALID_ACTION u8 unused_0[7]; }; @@ -8109,9 +8236,38 @@ struct hwrm_cfa_ntuple_filter_alloc_output { /* hwrm_cfa_ntuple_filter_alloc_cmd_err (size:64b/8B) */ struct hwrm_cfa_ntuple_filter_alloc_cmd_err { u8 code; - #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_UNKNOWN 0x0UL - #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_RX_MASK_VLAN_CONFLICT_ERR 0x1UL - #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_LAST CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_RX_MASK_VLAN_CONFLICT_ERR + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_UNKNOWN 0x0UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_ZERO_MAC 0x65UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_BC_MC_MAC 0x66UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_VNIC 0x67UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_PF_FID 0x68UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_L2_CTXT_ID 0x69UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_NULL_L2_CTXT_CFG 0x6aUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_NULL_L2_DATA_FLD 0x6bUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_CFA_LAYOUT 0x6cUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_L2_CTXT_ALLOC_FAIL 0x6dUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_ROCE_FLOW_ERR 0x6eUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_OWNER_FID 0x6fUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_ZERO_REF_CNT 0x70UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_FLOW_TYPE 0x71UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_IVLAN 0x72UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_MAX_VLAN_ID 0x73UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_TNL_REQ 0x74UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_L2_ADDR 0x75UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_L2_IVLAN 0x76UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_L3_ADDR 0x77UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_L3_ADDR_TYPE 0x78UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_T_L3_ADDR_TYPE 0x79UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_DST_VNIC_ID 0x7aUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_VNI 0x7bUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_DST_ID 0x7cUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_FAIL_ROCE_L2_FLOW 0x7dUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_NPAR_VLAN 0x7eUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_ATSP_ADD 0x7fUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_DFLT_VLAN_FAIL 0x80UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_L3_TYPE 0x81UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_VAL_FAIL_TNL_FLOW 0x82UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_LAST CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_VAL_FAIL_TNL_FLOW u8 unused_0[7]; }; @@ -9181,7 +9337,7 @@ struct pcie_ctx_hw_stats { __le64 pcie_recovery_histogram; }; -/* pcie_ctx_hw_stats_v2 (size:4096b/512B) */ +/* pcie_ctx_hw_stats_v2 (size:4544b/568B) */ struct pcie_ctx_hw_stats_v2 { __le64 pcie_pl_signal_integrity; __le64 pcie_dl_signal_integrity; @@ -9212,6 +9368,9 @@ struct pcie_ctx_hw_stats_v2 { __le64 pcie_other_packet_count; __le64 pcie_blocked_packet_count; __le64 pcie_cmpl_packet_count; + __le32 pcie_rd_latency_histogram[12]; + __le32 pcie_rd_latency_all_normal_count; + __le32 unused_2; }; /* hwrm_stat_generic_qstats_input (size:256b/32B) */ @@ -9406,7 +9565,8 @@ struct hwrm_struct_hdr { #define STRUCT_HDR_STRUCT_ID_MSIX_PER_VF 0xc8UL #define STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_COUNT 0x12cUL #define STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_BOUND 0x12dUL - #define STRUCT_HDR_STRUCT_ID_LAST STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_BOUND + #define STRUCT_HDR_STRUCT_ID_DBG_TOKEN_CLAIMS 0x190UL + #define STRUCT_HDR_STRUCT_ID_LAST STRUCT_HDR_STRUCT_ID_DBG_TOKEN_CLAIMS __le16 len; u8 version; #define STRUCT_HDR_VERSION_0 0x0UL @@ -9459,11 +9619,13 @@ struct hwrm_fw_set_structured_data_output { /* hwrm_fw_set_structured_data_cmd_err (size:64b/8B) */ struct hwrm_fw_set_structured_data_cmd_err { u8 code; - #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_UNKNOWN 0x0UL - #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_HDR_CNT 0x1UL - #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_FMT 0x2UL - #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID 0x3UL - #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_LAST FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_UNKNOWN 0x0UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_HDR_CNT 0x1UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_FMT 0x2UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID 0x3UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_ALREADY_ADDED 0x4UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_INST_IN_PROG 0x5UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_LAST FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_INST_IN_PROG u8 unused_0[7]; }; @@ -9487,7 +9649,9 @@ struct hwrm_fw_get_structured_data_input { #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_PEER 0x201UL #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_OPERATIONAL 0x202UL #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_HOST_OPERATIONAL 0x300UL - #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_LAST FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_HOST_OPERATIONAL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_CLAIMS_SUPPORTED 0x320UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_CLAIMS_ACTIVE 0x321UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_LAST FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_CLAIMS_ACTIVE u8 count; u8 unused_0; }; @@ -10172,7 +10336,8 @@ struct hwrm_dbg_log_buffer_flush_input { #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CA2_TRACE 0x9UL #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_RIGP1_TRACE 0xaUL #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_AFM_KONG_HWRM_TRACE 0xbUL - #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_LAST DBG_LOG_BUFFER_FLUSH_REQ_TYPE_AFM_KONG_HWRM_TRACE + #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_ERR_QPC_TRACE 0xcUL + #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_LAST DBG_LOG_BUFFER_FLUSH_REQ_TYPE_ERR_QPC_TRACE u8 unused_1[2]; __le32 flags; #define DBG_LOG_BUFFER_FLUSH_REQ_FLAGS_FLUSH_ALL_BUFFERS 0x1UL @@ -10295,10 +10460,15 @@ struct hwrm_nvm_write_output { /* hwrm_nvm_write_cmd_err (size:64b/8B) */ struct hwrm_nvm_write_cmd_err { u8 code; - #define NVM_WRITE_CMD_ERR_CODE_UNKNOWN 0x0UL - #define NVM_WRITE_CMD_ERR_CODE_FRAG_ERR 0x1UL - #define NVM_WRITE_CMD_ERR_CODE_NO_SPACE 0x2UL - #define NVM_WRITE_CMD_ERR_CODE_LAST NVM_WRITE_CMD_ERR_CODE_NO_SPACE + #define NVM_WRITE_CMD_ERR_CODE_UNKNOWN 0x0UL + #define NVM_WRITE_CMD_ERR_CODE_FRAG_ERR 0x1UL + #define NVM_WRITE_CMD_ERR_CODE_NO_SPACE 0x2UL + #define NVM_WRITE_CMD_ERR_CODE_WRITE_FAILED 0x3UL + #define NVM_WRITE_CMD_ERR_CODE_REQD_ERASE_FAILED 0x4UL + #define NVM_WRITE_CMD_ERR_CODE_VERIFY_FAILED 0x5UL + #define NVM_WRITE_CMD_ERR_CODE_INVALID_HEADER 0x6UL + #define NVM_WRITE_CMD_ERR_CODE_UPDATE_DIGEST_FAILED 0x7UL + #define NVM_WRITE_CMD_ERR_CODE_LAST NVM_WRITE_CMD_ERR_CODE_UPDATE_DIGEST_FAILED u8 unused_0[7]; }; @@ -10438,7 +10608,11 @@ struct hwrm_nvm_get_dev_info_output { __le16 srt2_fw_minor; __le16 srt2_fw_build; __le16 srt2_fw_patch; - u8 unused_0[7]; + u8 security_soc_fw_major; + u8 security_soc_fw_minor; + u8 security_soc_fw_build; + u8 security_soc_fw_patch; + u8 unused_0[3]; u8 valid; }; @@ -10568,7 +10742,9 @@ struct hwrm_nvm_install_update_cmd_err { #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_SPACE 0x2UL #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_ANTI_ROLLBACK 0x3UL #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_VOLTREG_SUPPORT 0x4UL - #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_LAST NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_VOLTREG_SUPPORT + #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_DEFRAG_FAILED 0x5UL + #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_UNKNOWN_DIR_ERR 0x6UL + #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_LAST NVM_INSTALL_UPDATE_CMD_ERR_CODE_UNKNOWN_DIR_ERR u8 unused_0[7]; }; @@ -10591,7 +10767,8 @@ struct hwrm_nvm_get_variable_input { __le16 index_2; __le16 index_3; u8 flags; - #define NVM_GET_VARIABLE_REQ_FLAGS_FACTORY_DFLT 0x1UL + #define NVM_GET_VARIABLE_REQ_FLAGS_FACTORY_DFLT 0x1UL + #define NVM_GET_VARIABLE_REQ_FLAGS_VALIDATE_OPT_VALUE 0x2UL u8 unused_0; }; @@ -10606,18 +10783,25 @@ struct hwrm_nvm_get_variable_output { #define NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_0 0x0UL #define NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_FFFF 0xffffUL #define NVM_GET_VARIABLE_RESP_OPTION_NUM_LAST NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_FFFF - u8 unused_0[3]; + u8 flags; + #define NVM_GET_VARIABLE_RESP_FLAGS_VALIDATE_OPT_VALUE 0x1UL + u8 unused_0[2]; u8 valid; }; /* hwrm_nvm_get_variable_cmd_err (size:64b/8B) */ struct hwrm_nvm_get_variable_cmd_err { u8 code; - #define NVM_GET_VARIABLE_CMD_ERR_CODE_UNKNOWN 0x0UL - #define NVM_GET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL - #define NVM_GET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR 0x2UL - #define NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT 0x3UL - #define NVM_GET_VARIABLE_CMD_ERR_CODE_LAST NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT + #define NVM_GET_VARIABLE_CMD_ERR_CODE_UNKNOWN 0x0UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR 0x2UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT 0x3UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_INDEX_INVALID 0x4UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_ACCESS_DENIED 0x5UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_CB_FAILED 0x6UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_INVALID_DATA_LEN 0x7UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_NO_MEM 0x8UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_LAST NVM_GET_VARIABLE_CMD_ERR_CODE_NO_MEM u8 unused_0[7]; }; @@ -10667,10 +10851,17 @@ struct hwrm_nvm_set_variable_output { /* hwrm_nvm_set_variable_cmd_err (size:64b/8B) */ struct hwrm_nvm_set_variable_cmd_err { u8 code; - #define NVM_SET_VARIABLE_CMD_ERR_CODE_UNKNOWN 0x0UL - #define NVM_SET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL - #define NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR 0x2UL - #define NVM_SET_VARIABLE_CMD_ERR_CODE_LAST NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR + #define NVM_SET_VARIABLE_CMD_ERR_CODE_UNKNOWN 0x0UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR 0x2UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT 0x3UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_ACTION_NOT_SUPPORTED 0x4UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_INDEX_INVALID 0x5UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_ACCESS_DENIED 0x6UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_CB_FAILED 0x7UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_INVALID_DATA_LEN 0x8UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_NO_MEM 0x9UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_LAST NVM_SET_VARIABLE_CMD_ERR_CODE_NO_MEM u8 unused_0[7]; }; -- cgit v1.2.3 From a6d4f25888b83b8300aef28d9ee22765c1cc9b34 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Aug 2025 17:40:30 +0000 Subject: net: set net.core.rmem_max and net.core.wmem_max to 4 MB SO_RCVBUF and SO_SNDBUF have limited range today, unless distros or system admins change rmem_max and wmem_max. Even iproute2 uses 1 MB SO_RCVBUF which is capped by the kernel. Decouple [rw]mem_max and [rw]mem_default and increase [rw]mem_max to 4 MB. Before: $ sysctl net.core.rmem_default net.core.rmem_max net.core.wmem_default net.core.wmem_max net.core.rmem_default = 212992 net.core.rmem_max = 212992 net.core.wmem_default = 212992 net.core.wmem_max = 212992 After: $ sysctl net.core.rmem_default net.core.rmem_max net.core.wmem_default net.core.wmem_max net.core.rmem_default = 212992 net.core.rmem_max = 4194304 net.core.wmem_default = 212992 net.core.wmem_max = 4194304 Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250819174030.1986278-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/admin-guide/sysctl/net.rst | 4 ++++ Documentation/networking/ip-sysctl.rst | 6 +++--- include/net/sock.h | 4 ++-- net/core/sock.c | 8 ++++---- net/ipv4/arp.c | 2 +- net/ipv6/ndisc.c | 2 +- 6 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 7b0c4291c686..2ef50828aff1 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -222,6 +222,8 @@ rmem_max The maximum receive socket buffer size in bytes. +Default: 4194304 + rps_default_mask ---------------- @@ -247,6 +249,8 @@ wmem_max The maximum send socket buffer size in bytes. +Default: 4194304 + message_burst and message_cost ------------------------------ diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 43badb338d22..9f5891c9b07b 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -209,7 +209,7 @@ neigh/default/unres_qlen_bytes - INTEGER Setting negative value is meaningless and will return error. - Default: SK_WMEM_MAX, (same as net.core.wmem_default). + Default: SK_WMEM_DEFAULT, (same as net.core.wmem_default). Exact value depends on architecture and kernel options, but should be enough to allow queuing 256 packets @@ -805,8 +805,8 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max This value results in initial window of 65535. max: maximal size of receive buffer allowed for automatically - selected receiver buffers for TCP socket. This value does not override - net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables + selected receiver buffers for TCP socket. + Calling setsockopt() with SO_RCVBUF disables automatic tuning of that socket's receive buffer size, in which case this value is ignored. Default: between 131072 and 32MB, depending on RAM size. diff --git a/include/net/sock.h b/include/net/sock.h index 1c49ea13af4a..63a6a48afb48 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2970,8 +2970,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo); */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_WMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; diff --git a/net/core/sock.c b/net/core/sock.c index ab6953d295df..8002ac6293dc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -281,12 +281,12 @@ static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ -__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +__u32 sysctl_wmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_wmem_max); -__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +__u32 sysctl_rmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_rmem_max); -__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; -__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 5cfc1c939673..833f2cf97178 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -170,7 +170,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7d5abb3158ec..57aaa7ae8ac3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -130,7 +130,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, -- cgit v1.2.3 From 5a774b64cd6a008f016119782a5d3f30ed0bf3b7 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 18 Aug 2025 09:51:21 +0200 Subject: net: phy: micrel: Add support for lan8842 The LAN8842 is a low-power, single port triple-speed (10BASE-T/ 100BASE-TX/ 1000BASE-T) ethernet physical layer transceiver (PHY) that supports transmission and reception of data on standard CAT-5, as well as CAT-5e and CAT-6, Unshielded Twisted Pair (UTP) cables. The LAN8842 supports industry-standard SGMII (Serial Gigabit Media Independent Interface) providing chip-to-chip connection to a Gigabit Ethernet MAC using a single serialized link (differential pair) in each direction. There are 2 variants of the lan8842. The one that supports timestamping (lan8842) and one that doesn't have timestamping (lan8832). Signed-off-by: Horatiu Vultur Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250818075121.1298170-5-horatiu.vultur@microchip.com Signed-off-by: Paolo Abeni --- drivers/net/phy/micrel.c | 209 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/micrel_phy.h | 1 + 2 files changed, 210 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index c621ed465d2e..04bd744920b0 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -448,6 +448,17 @@ struct kszphy_priv { struct kszphy_phy_stats phy_stats; }; +struct lan8842_phy_stats { + u64 rx_packets; + u64 rx_errors; + u64 tx_packets; + u64 tx_errors; +}; + +struct lan8842_priv { + struct lan8842_phy_stats phy_stats; +}; + static const struct kszphy_type lan8814_type = { .led_mode_reg = ~LAN8814_LED_CTRL_1, .cable_diag_reg = LAN8814_CABLE_DIAG, @@ -5768,6 +5779,188 @@ static int ksz9131_resume(struct phy_device *phydev) return kszphy_resume(phydev); } +#define LAN8842_SELF_TEST 14 /* 0x0e */ +#define LAN8842_SELF_TEST_RX_CNT_ENA BIT(8) +#define LAN8842_SELF_TEST_TX_CNT_ENA BIT(4) + +static int lan8842_probe(struct phy_device *phydev) +{ + struct lan8842_priv *priv; + int ret; + + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; + + /* Similar to lan8814 this PHY has a pin which needs to be pulled down + * to enable to pass any traffic through it. Therefore use the same + * function as lan8814 + */ + ret = lan8814_release_coma_mode(phydev); + if (ret) + return ret; + + /* Enable to count the RX and TX packets */ + ret = lanphy_write_page_reg(phydev, LAN8814_PAGE_PCS_DIGITAL, + LAN8842_SELF_TEST, + LAN8842_SELF_TEST_RX_CNT_ENA | + LAN8842_SELF_TEST_TX_CNT_ENA); + if (ret < 0) + return ret; + + return 0; +} + +static int lan8842_config_init(struct phy_device *phydev) +{ + int ret; + + /* Reset the PHY */ + ret = lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, + LAN8814_QSGMII_SOFT_RESET, + LAN8814_QSGMII_SOFT_RESET_BIT, + LAN8814_QSGMII_SOFT_RESET_BIT); + if (ret < 0) + return ret; + + /* To allow the PHY to control the LEDs the GPIOs of the PHY should have + * a function mode and not the GPIO. Apparently by default the value is + * GPIO and not function even though the datasheet it says that it is + * function. Therefore set this value. + */ + return lanphy_write_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, + LAN8814_GPIO_EN2, 0); +} + +#define LAN8842_INTR_CTRL_REG 52 /* 0x34 */ + +static int lan8842_config_intr(struct phy_device *phydev) +{ + int err; + + lanphy_write_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, + LAN8842_INTR_CTRL_REG, + LAN8814_INTR_CTRL_REG_INTR_ENABLE); + + /* enable / disable interrupts */ + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { + err = lan8814_ack_interrupt(phydev); + if (err) + return err; + + err = phy_write(phydev, LAN8814_INTC, LAN8814_INT_LINK); + } else { + err = phy_write(phydev, LAN8814_INTC, 0); + if (err) + return err; + + err = lan8814_ack_interrupt(phydev); + } + + return err; +} + +static unsigned int lan8842_inband_caps(struct phy_device *phydev, + phy_interface_t interface) +{ + /* Inband configuration can be enabled or disabled using the registers + * PCS1G_ANEG_CONFIG. + */ + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; +} + +static int lan8842_config_inband(struct phy_device *phydev, unsigned int modes) +{ + bool enable; + + if (modes == LINK_INBAND_DISABLE) + enable = false; + else + enable = true; + + /* Disable or enable in-band autoneg with PCS Host side + * It has the same address as lan8814 + */ + return lanphy_modify_page_reg(phydev, LAN8814_PAGE_PORT_REGS, + LAN8814_QSGMII_PCS1G_ANEG_CONFIG, + LAN8814_QSGMII_PCS1G_ANEG_CONFIG_ANEG_ENA, + enable ? LAN8814_QSGMII_PCS1G_ANEG_CONFIG_ANEG_ENA : 0); +} + +static irqreturn_t lan8842_handle_interrupt(struct phy_device *phydev) +{ + int ret = IRQ_NONE; + int irq_status; + + irq_status = phy_read(phydev, LAN8814_INTS); + if (irq_status < 0) { + phy_error(phydev); + return IRQ_NONE; + } + + if (irq_status & LAN8814_INT_LINK) { + phy_trigger_machine(phydev); + ret = IRQ_HANDLED; + } + + return ret; +} + +static u64 lan8842_get_stat(struct phy_device *phydev, int count, int *regs) +{ + u64 ret = 0; + int val; + + for (int j = 0; j < count; ++j) { + val = lanphy_read_page_reg(phydev, LAN8814_PAGE_PCS_DIGITAL, + regs[j]); + if (val < 0) + return U64_MAX; + + ret <<= 16; + ret += val; + } + return ret; +} + +static int lan8842_update_stats(struct phy_device *phydev) +{ + struct lan8842_priv *priv = phydev->priv; + int rx_packets_regs[] = {88, 61, 60}; + int rx_errors_regs[] = {63, 62}; + int tx_packets_regs[] = {89, 85, 84}; + int tx_errors_regs[] = {87, 86}; + + priv->phy_stats.rx_packets = lan8842_get_stat(phydev, + ARRAY_SIZE(rx_packets_regs), + rx_packets_regs); + priv->phy_stats.rx_errors = lan8842_get_stat(phydev, + ARRAY_SIZE(rx_errors_regs), + rx_errors_regs); + priv->phy_stats.tx_packets = lan8842_get_stat(phydev, + ARRAY_SIZE(tx_packets_regs), + tx_packets_regs); + priv->phy_stats.tx_errors = lan8842_get_stat(phydev, + ARRAY_SIZE(tx_errors_regs), + tx_errors_regs); + + return 0; +} + +static void lan8842_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *eth_stats, + struct ethtool_phy_stats *stats) +{ + struct lan8842_priv *priv = phydev->priv; + + stats->rx_packets = priv->phy_stats.rx_packets; + stats->rx_errors = priv->phy_stats.rx_errors; + stats->tx_packets = priv->phy_stats.tx_packets; + stats->tx_errors = priv->phy_stats.tx_errors; +} + static struct phy_driver ksphy_driver[] = { { PHY_ID_MATCH_MODEL(PHY_ID_KS8737), @@ -5987,6 +6180,21 @@ static struct phy_driver ksphy_driver[] = { .resume = lan8841_resume, .cable_test_start = lan8814_cable_test_start, .cable_test_get_status = ksz886x_cable_test_get_status, +}, { + PHY_ID_MATCH_MODEL(PHY_ID_LAN8842), + .name = "Microchip LAN8842 Gigabit PHY", + .flags = PHY_POLL_CABLE_TEST, + .driver_data = &lan8814_type, + .probe = lan8842_probe, + .config_init = lan8842_config_init, + .config_intr = lan8842_config_intr, + .inband_caps = lan8842_inband_caps, + .config_inband = lan8842_config_inband, + .handle_interrupt = lan8842_handle_interrupt, + .get_phy_stats = lan8842_get_phy_stats, + .update_stats = lan8842_update_stats, + .cable_test_start = lan8814_cable_test_start, + .cable_test_get_status = ksz886x_cable_test_get_status, }, { PHY_ID_MATCH_MODEL(PHY_ID_KSZ9131), .name = "Microchip KSZ9131 Gigabit PHY", @@ -6082,6 +6290,7 @@ static const struct mdio_device_id __maybe_unused micrel_tbl[] = { { PHY_ID_MATCH_MODEL(PHY_ID_LAN8814) }, { PHY_ID_MATCH_MODEL(PHY_ID_LAN8804) }, { PHY_ID_MATCH_MODEL(PHY_ID_LAN8841) }, + { PHY_ID_MATCH_MODEL(PHY_ID_LAN8842) }, { } }; diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 9af01bdd86d2..ca691641788b 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -32,6 +32,7 @@ #define PHY_ID_LAN8814 0x00221660 #define PHY_ID_LAN8804 0x00221670 #define PHY_ID_LAN8841 0x00221650 +#define PHY_ID_LAN8842 0x002216C0 #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 -- cgit v1.2.3 From 6c9468aad215a198742c8375b0415e42521c905c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:54 -0700 Subject: fscrypt: replace raw loads of info pointer with helper function Add and use a helper function fscrypt_get_inode_info_raw(). It loads an inode's fscrypt info pointer using a raw dereference, which is appropriate when the caller knows the key setup already happened. This eliminates most occurrences of inode::i_crypt_info in the source, in preparation for replacing that with a filesystem-specific field. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-2-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/crypto/bio.c | 2 +- fs/crypto/crypto.c | 14 ++++++++------ fs/crypto/fname.c | 11 ++++++----- fs/crypto/hooks.c | 2 +- fs/crypto/inline_crypt.c | 12 +++++++----- fs/crypto/policy.c | 7 ++++--- include/linux/fscrypt.h | 16 ++++++++++++++++ 7 files changed, 43 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 486fcb2ecf13..0d746de4cd10 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -113,7 +113,7 @@ out: int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index b6ccab524fde..07f9cbfe3ea4 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -173,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; @@ -232,8 +232,9 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_ENCRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); @@ -255,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + @@ -305,8 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_DECRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f9f6713e144f..fb77ad1ca74a 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -94,7 +94,7 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -138,7 +138,7 @@ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -274,8 +274,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { - return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, - orig_len, max_len, + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); + + return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len, encrypted_len_ret); } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); @@ -543,7 +544,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_inode_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir); WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index e0b32ac841f7..7a5d4c168c49 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -199,7 +199,7 @@ int fscrypt_prepare_setflags(struct inode *inode, err = fscrypt_require_key(inode); if (err) return err; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index caaff809765b..5dee7c498bc8 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -263,7 +263,7 @@ int fscrypt_derive_sw_secret(struct super_block *sb, bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { - return inode->i_crypt_info->ci_inlinecrypt; + return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt; } EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto); @@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, if (!fscrypt_inode_uses_inline_crypto(inode)) return; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); fscrypt_generate_dun(ci, first_lblk, dun); bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); @@ -385,22 +385,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { const struct bio_crypt_ctx *bc = bio->bi_crypt_context; + const struct fscrypt_inode_info *ci; u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; if (!!bc != fscrypt_inode_uses_inline_crypto(inode)) return false; if (!bc) return true; + ci = fscrypt_get_inode_info_raw(inode); /* * Comparing the key pointers is good enough, as all I/O for each key * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. */ - if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) + if (bc->bc_key != ci->ci_enc_key.blk_key) return false; - fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); + fscrypt_generate_dun(ci, next_lblk, next_dun); return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); @@ -502,7 +504,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) if (nr_blocks <= 1) return nr_blocks; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (!(fscrypt_policy_flags(&ci->ci_policy) & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) return nr_blocks; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6ad30ae07c06..9d51f3500de3 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -727,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) err = fscrypt_require_key(dir); if (err) return ERR_PTR(err); - return &dir->i_crypt_info->ci_policy; + return &fscrypt_get_inode_info_raw(dir)->ci_policy; } return fscrypt_get_dummy_policy(dir->i_sb); @@ -746,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) */ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); @@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); */ int fscrypt_set_context(struct inode *inode, void *fs_data) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci; union fscrypt_context ctx; int ctxsize; @@ -783,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) * This may be the first time the inode number is available, so do any * delayed key setup that requires the inode number. */ + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) fscrypt_hash_inode_number(ci, ci->ci_master_key); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 10dd161690a2..23c5198612d1 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -195,6 +195,22 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +/* + * Load the inode's fscrypt info pointer, using a raw dereference. Since this + * uses a raw dereference with no memory barrier, it is appropriate to use only + * when the caller knows the inode's key setup already happened, resulting in + * non-NULL fscrypt info. E.g., the file contents en/decryption functions use + * this, since fscrypt_file_open() set up the key. + */ +static inline struct fscrypt_inode_info * +fscrypt_get_inode_info_raw(const struct inode *inode) +{ + struct fscrypt_inode_info *ci = inode->i_crypt_info; + + VFS_WARN_ON_ONCE(ci == NULL); + return ci; +} + static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { -- cgit v1.2.3 From 93221de31a8df6710e02328f82dc68d7ab4ad9e6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:55 -0700 Subject: fscrypt: add support for info in fs-specific part of inode Add an inode_info_offs field to struct fscrypt_operations, and update fs/crypto/ to support it. When set to a nonzero value, it specifies the offset to the fscrypt_inode_info pointer within the filesystem-specific part of the inode structure, to be used instead of inode::i_crypt_info. Since this makes inode::i_crypt_info no longer necessarily used, update comments that mentioned it. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-3-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/crypto/fscrypt_private.h | 4 ++-- fs/crypto/keysetup.c | 43 ++++++++++++++++++++++++++----------------- include/linux/fscrypt.h | 22 ++++++++++++++++++---- 3 files changed, 46 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d8b485b9881c..245e6b84aa17 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -249,8 +249,8 @@ struct fscrypt_prepared_key { * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is - * allocated and stored in ->i_crypt_info. Once created, it remains until the - * inode is evicted. + * allocated and a pointer to it is stored in the file's in-memory inode. Once + * created, it remains until the inode is evicted. */ struct fscrypt_inode_info { diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4f3b9ecbfe4e..c1f85715c276 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -642,15 +642,16 @@ fscrypt_setup_encryption_info(struct inode *inode, goto out; /* - * For existing inodes, multiple tasks may race to set ->i_crypt_info. - * So use cmpxchg_release(). This pairs with the smp_load_acquire() in - * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with - * a RELEASE barrier so that other tasks can ACQUIRE it. + * For existing inodes, multiple tasks may race to set the inode's + * fscrypt info pointer. So use cmpxchg_release(). This pairs with the + * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the + * pointer with a RELEASE barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { + if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) == + NULL) { /* - * We won the race and set ->i_crypt_info to our crypt_info. - * Now link it into the master key's inode list. + * We won the race and set the inode's fscrypt info to our + * crypt_info. Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; @@ -681,13 +682,13 @@ out: * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * - * Set up ->i_crypt_info, if it hasn't already been done. + * Set up the inode's encryption key, if it hasn't already been done. * - * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So + * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * - * Return: 0 if ->i_crypt_info was set or was already set, *or* if the - * encryption key is unavailable. (Use fscrypt_has_encryption_key() to + * Return: 0 if the key is now set up, *or* if it couldn't be set up because the + * needed master key is absent. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) @@ -741,9 +742,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * - * If the directory is encrypted, set up its ->i_crypt_info in preparation for + * If the directory is encrypted, set up its encryption key in preparation for * encrypting the name of the new file. Also, if the new inode will be - * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. + * encrypted, set up its encryption key too and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino @@ -752,8 +753,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * - * Return: 0 on success, -ENOKEY if the encryption key is missing, or another - * -errno code + * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode + * but the needed master key is absent, or another -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) @@ -800,8 +801,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); */ void fscrypt_put_encryption_info(struct inode *inode) { - put_crypt_info(inode->i_crypt_info); - inode->i_crypt_info = NULL; + /* + * Ideally we'd start with a lightweight IS_ENCRYPTED() check here + * before proceeding to retrieve and check the pointer. However, during + * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If + * an error occurs, it needs to be cleaned up regardless. + */ + struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode); + + put_crypt_info(*ci_addr); + *ci_addr = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 23c5198612d1..d7ff53accbfe 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -61,6 +61,12 @@ struct fscrypt_name { /* Crypto operations for filesystems */ struct fscrypt_operations { + /* + * The offset of the pointer to struct fscrypt_inode_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). + */ + ptrdiff_t inode_info_offs; /* * If set, then fs/crypto/ will allocate a global bounce page pool the @@ -195,6 +201,14 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +static inline struct fscrypt_inode_info ** +fscrypt_inode_info_addr(const struct inode *inode) +{ + if (inode->i_sb->s_cop->inode_info_offs == 0) + return (struct fscrypt_inode_info **)&inode->i_crypt_info; + return (void *)inode + inode->i_sb->s_cop->inode_info_offs; +} + /* * Load the inode's fscrypt info pointer, using a raw dereference. Since this * uses a raw dereference with no memory barrier, it is appropriate to use only @@ -205,7 +219,7 @@ int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, static inline struct fscrypt_inode_info * fscrypt_get_inode_info_raw(const struct inode *inode) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = *fscrypt_inode_info_addr(inode); VFS_WARN_ON_ONCE(ci == NULL); return ci; @@ -216,11 +230,11 @@ fscrypt_get_inode_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). - * I.e., another task may publish ->i_crypt_info concurrently, executing - * a RELEASE barrier. We need to use smp_load_acquire() here to safely + * I.e., another task may publish the fscrypt info concurrently, + * executing a RELEASE barrier. Use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ - return smp_load_acquire(&inode->i_crypt_info); + return smp_load_acquire(fscrypt_inode_info_addr(inode)); } /** -- cgit v1.2.3 From ab90c2d2476c4dd6deddd089c7e83b858d135783 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:00 -0700 Subject: fs: remove inode::i_crypt_info Now that all fscrypt-capable filesystems store the pointer to fscrypt_inode_info in the filesystem-specific part of the inode structure, inode::i_crypt_info is no longer needed. Update fscrypt_inode_info_addr() to no longer support the fallback to inode::i_crypt_info. Finally, remove inode::i_crypt_info itself along with the now-unnecessary forward declaration of fscrypt_inode_info. The end result of the migration to the filesystem-specific pointer is memory savings on CONFIG_FS_ENCRYPTION=y kernels for all filesystems that don't support fscrypt. Specifically, their in-memory inodes are now smaller by the size of a pointer: either 4 or 8 bytes. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-8-ebiggers@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 ----- include/linux/fscrypt.h | 8 ++++++-- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..1dafa18169be 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -72,7 +72,6 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; -struct fscrypt_inode_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; @@ -780,10 +779,6 @@ struct inode { struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif -#ifdef CONFIG_FS_ENCRYPTION - struct fscrypt_inode_info *i_crypt_info; -#endif - #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; #endif diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index d7ff53accbfe..516aba5b858b 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -201,11 +201,15 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +/* + * Returns the address of the fscrypt info pointer within the + * filesystem-specific part of the inode. (To save memory on filesystems that + * don't support fscrypt, a field in 'struct inode' itself is no longer used.) + */ static inline struct fscrypt_inode_info ** fscrypt_inode_info_addr(const struct inode *inode) { - if (inode->i_sb->s_cop->inode_info_offs == 0) - return (struct fscrypt_inode_info **)&inode->i_crypt_info; + VFS_WARN_ON_ONCE(inode->i_sb->s_cop->inode_info_offs == 0); return (void *)inode + inode->i_sb->s_cop->inode_info_offs; } -- cgit v1.2.3 From 2a7349add18e5915cd87251af5f98db1772b6131 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:01 -0700 Subject: fsverity: add support for info in fs-specific part of inode Add an inode_info_offs field to struct fsverity_operations, and update fs/verity/ to support it. When set to a nonzero value, it specifies the offset to the fsverity_info pointer within the filesystem-specific part of the inode structure, to be used instead of inode::i_verity_info. Since this makes inode::i_verity_info no longer necessarily used, update comments that mentioned it. This is a prerequisite for a later commit that removes inode::i_verity_info, saving memory and improving cache efficiency on filesystems that don't support fsverity. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-9-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/verity/enable.c | 6 +++--- fs/verity/fsverity_private.h | 9 +++++---- fs/verity/open.c | 23 ++++++++++++----------- fs/verity/verify.c | 2 +- include/linux/fsverity.h | 44 ++++++++++++++++++++++++++++++++++---------- 5 files changed, 55 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 503268cf4296..89eccc4becf9 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -284,9 +284,9 @@ static int enable_verity(struct file *filp, /* Successfully enabled verity */ /* - * Readers can start using ->i_verity_info immediately, so it - * can't be rolled back once set. So don't set it until just - * after the filesystem has successfully enabled verity. + * Readers can start using the inode's verity info immediately, + * so it can't be rolled back once set. So don't set it until + * just after the filesystem has successfully enabled verity. */ fsverity_set_info(inode, vi); } diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 5fe854a5b9ad..bc1d887c532e 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -63,10 +63,11 @@ struct merkle_tree_params { * fsverity_info - cached verity metadata for an inode * * When a verity file is first opened, an instance of this struct is allocated - * and stored in ->i_verity_info; it remains until the inode is evicted. It - * caches information about the Merkle tree that's needed to efficiently verify - * data read from the file. It also caches the file digest. The Merkle tree - * pages themselves are not cached here, but the filesystem may cache them. + * and a pointer to it is stored in the file's in-memory inode. It remains + * until the inode is evicted. It caches information about the Merkle tree + * that's needed to efficiently verify data read from the file. It also caches + * the file digest. The Merkle tree pages themselves are not cached here, but + * the filesystem may cache them. */ struct fsverity_info { struct merkle_tree_params tree_params; diff --git a/fs/verity/open.c b/fs/verity/open.c index c561e130cd0c..77b1c977af02 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -244,17 +244,17 @@ fail: void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* - * Multiple tasks may race to set ->i_verity_info, so use - * cmpxchg_release(). This pairs with the smp_load_acquire() in - * fsverity_get_info(). I.e., here we publish ->i_verity_info with a - * RELEASE barrier so that other tasks can ACQUIRE it. + * Multiple tasks may race to set the inode's verity info pointer, so + * use cmpxchg_release(). This pairs with the smp_load_acquire() in + * fsverity_get_info(). I.e., publish the pointer with a RELEASE + * barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { - /* Lost the race, so free the fsverity_info we allocated. */ + if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) { + /* Lost the race, so free the verity info we allocated. */ fsverity_free_info(vi); /* - * Afterwards, the caller may access ->i_verity_info directly, - * so make sure to ACQUIRE the winning fsverity_info. + * Afterwards, the caller may access the inode's verity info + * directly, so make sure to ACQUIRE the winning verity info. */ (void)fsverity_get_info(inode); } @@ -350,7 +350,6 @@ int fsverity_get_descriptor(struct inode *inode, return 0; } -/* Ensure the inode has an ->i_verity_info */ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); @@ -395,8 +394,10 @@ EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); void __fsverity_cleanup_inode(struct inode *inode) { - fsverity_free_info(inode->i_verity_info); - inode->i_verity_info = NULL; + struct fsverity_info **vi_addr = fsverity_info_addr(inode); + + fsverity_free_info(*vi_addr); + *vi_addr = NULL; } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); diff --git a/fs/verity/verify.c b/fs/verity/verify.c index a1f00c3fd3b2..affc307eb6a6 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -245,7 +245,7 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, unsigned long max_ra_pages) { struct inode *inode = data_folio->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; + struct fsverity_info *vi = *fsverity_info_addr(inode); const unsigned int block_size = vi->tree_params.block_size; u64 pos = (u64)data_folio->index << PAGE_SHIFT; diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 1eb7eae580be..e0f132cb7839 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -28,6 +28,12 @@ /* Verity operations for filesystems */ struct fsverity_operations { + /** + * The offset of the pointer to struct fsverity_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). + */ + ptrdiff_t inode_info_offs; /** * Begin enabling verity on the given file. @@ -124,15 +130,33 @@ struct fsverity_operations { #ifdef CONFIG_FS_VERITY +static inline struct fsverity_info ** +fsverity_info_addr(const struct inode *inode) +{ + if (inode->i_sb->s_vop->inode_info_offs == 0) + return (struct fsverity_info **)&inode->i_verity_info; + return (void *)inode + inode->i_sb->s_vop->inode_info_offs; +} + static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { /* - * Pairs with the cmpxchg_release() in fsverity_set_info(). - * I.e., another task may publish ->i_verity_info concurrently, - * executing a RELEASE barrier. We need to use smp_load_acquire() here - * to safely ACQUIRE the memory the other task published. + * Since this function can be called on inodes belonging to filesystems + * that don't support fsverity at all, and fsverity_info_addr() doesn't + * work on such filesystems, we have to start with an IS_VERITY() check. + * Checking IS_VERITY() here is also useful to minimize the overhead of + * fsverity_active() on non-verity files. + */ + if (!IS_VERITY(inode)) + return NULL; + + /* + * Pairs with the cmpxchg_release() in fsverity_set_info(). I.e., + * another task may publish the inode's verity info concurrently, + * executing a RELEASE barrier. Use smp_load_acquire() here to safely + * ACQUIRE the memory the other task published. */ - return smp_load_acquire(&inode->i_verity_info); + return smp_load_acquire(fsverity_info_addr(inode)); } /* enable.c */ @@ -156,11 +180,11 @@ void __fsverity_cleanup_inode(struct inode *inode); * fsverity_cleanup_inode() - free the inode's verity info, if present * @inode: an inode being evicted * - * Filesystems must call this on inode eviction to free ->i_verity_info. + * Filesystems must call this on inode eviction to free the inode's verity info. */ static inline void fsverity_cleanup_inode(struct inode *inode) { - if (inode->i_verity_info) + if (*fsverity_info_addr(inode)) __fsverity_cleanup_inode(inode); } @@ -267,12 +291,12 @@ static inline bool fsverity_verify_page(struct page *page) * fsverity_active() - do reads from the inode need to go through fs-verity? * @inode: inode to check * - * This checks whether ->i_verity_info has been set. + * This checks whether the inode's verity info has been set. * * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with - * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) + * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before the verity info.) * * Return: true if reads need to go through fs-verity, otherwise false */ @@ -287,7 +311,7 @@ static inline bool fsverity_active(const struct inode *inode) * @filp: the struct file being set up * * When opening a verity file, deny the open if it is for writing. Otherwise, - * set up the inode's ->i_verity_info if not already done. + * set up the inode's verity info if not already done. * * When combined with fscrypt, this must be called after fscrypt_file_open(). * Otherwise, we won't have the key set up to decrypt the verity metadata. -- cgit v1.2.3 From 818c659ac164e4e4639ceaedaccbdfebb1ef63b5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:05 -0700 Subject: fs: remove inode::i_verity_info Now that all fsverity-capable filesystems store the pointer to fsverity_info in the filesystem-specific part of the inode structure, inode::i_verity_info is no longer needed. Update fsverity_info_addr() to no longer support the fallback to inode::i_verity_info. Finally, remove inode::i_verity_info itself, and move the forward declaration of struct fsverity_info from fs.h (which no longer needs it) to fsverity.h. The end result of the migration to the filesystem-specific pointer is memory savings on CONFIG_FS_VERITY=y kernels for all filesystems that don't support fsverity. Specifically, their in-memory inodes are now smaller by the size of a pointer: either 4 or 8 bytes. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-13-ebiggers@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 ----- include/linux/fsverity.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1dafa18169be..12ecc6b0e6f9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -73,7 +73,6 @@ struct seq_file; struct workqueue_struct; struct iov_iter; struct fscrypt_operations; -struct fsverity_info; struct fsverity_operations; struct fsnotify_mark_connector; struct fsnotify_sb_info; @@ -779,10 +778,6 @@ struct inode { struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif -#ifdef CONFIG_FS_VERITY - struct fsverity_info *i_verity_info; -#endif - void *i_private; /* fs or device private pointer */ } __randomize_layout; diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index e0f132cb7839..844f7b8b56bb 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -26,6 +26,8 @@ /* Arbitrary limit to bound the kmalloc() size. Can be changed. */ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 +struct fsverity_info; + /* Verity operations for filesystems */ struct fsverity_operations { /** @@ -130,11 +132,15 @@ struct fsverity_operations { #ifdef CONFIG_FS_VERITY +/* + * Returns the address of the verity info pointer within the filesystem-specific + * part of the inode. (To save memory on filesystems that don't support + * fsverity, a field in 'struct inode' itself is no longer used.) + */ static inline struct fsverity_info ** fsverity_info_addr(const struct inode *inode) { - if (inode->i_sb->s_vop->inode_info_offs == 0) - return (struct fsverity_info **)&inode->i_verity_info; + VFS_WARN_ON_ONCE(inode->i_sb->s_vop->inode_info_offs == 0); return (void *)inode + inode->i_sb->s_vop->inode_info_offs; } -- cgit v1.2.3 From 8a3d00dde63a339d31d1fdeead24ddfd4d459c70 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:06 -0700 Subject: fsverity: check IS_VERITY() in fsverity_cleanup_inode() Since getting the address of the fsverity_info has gotten a bit more expensive, make fsverity_cleanup_inode() check for IS_VERITY() instead. This avoids adding more overhead to non-verity files. This assumes that verity info is never set when !IS_VERITY(), which is currently true, but add a VFS_WARN_ON_ONCE() that asserts that. (This of course defeats the optimization, but only when CONFIG_VFS_DEBUG=y.) Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-14-ebiggers@kernel.org Signed-off-by: Christian Brauner --- include/linux/fsverity.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 844f7b8b56bb..5bc7280425a7 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -190,8 +190,15 @@ void __fsverity_cleanup_inode(struct inode *inode); */ static inline void fsverity_cleanup_inode(struct inode *inode) { - if (*fsverity_info_addr(inode)) + /* + * Only IS_VERITY() inodes can have verity info, so start by checking + * for IS_VERITY() (which is faster than retrieving the pointer to the + * verity info). This minimizes overhead for non-verity inodes. + */ + if (IS_VERITY(inode)) __fsverity_cleanup_inode(inode); + else + VFS_WARN_ON_ONCE(*fsverity_info_addr(inode) != NULL); } /* read_metadata.c */ -- cgit v1.2.3 From 07cf71bf25cd4e5735ff13468e7b86f02c3665cb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Aug 2025 19:56:50 -0700 Subject: net: page_pool: add page_pool_get() There is a page_pool_put() function but no get equivalent. Having multiple references to a page pool is quite useful. It avoids branching in create / destroy paths in drivers which support memory providers. Use the new helper in bnxt. Acked-by: Jesper Dangaard Brouer Reviewed-by: Dragos Tatulea Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250820025704.166248-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 11 +++++------ include/net/page_pool/helpers.h | 5 +++++ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index ba99de403138..14fa5b9e0f6c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3800,8 +3800,7 @@ static void bnxt_free_rx_rings(struct bnxt *bp) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - if (bnxt_separate_head_pool(rxr)) - page_pool_destroy(rxr->head_pool); + page_pool_destroy(rxr->head_pool); rxr->page_pool = rxr->head_pool = NULL; kfree(rxr->rx_agg_bmap); @@ -3848,6 +3847,8 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, pool = page_pool_create(&pp); if (IS_ERR(pool)) goto err_destroy_pp; + } else { + page_pool_get(pool); } rxr->head_pool = pool; @@ -15903,8 +15904,7 @@ err_rxq_info_unreg: xdp_rxq_info_unreg(&clone->xdp_rxq); err_page_pool_destroy: page_pool_destroy(clone->page_pool); - if (bnxt_separate_head_pool(clone)) - page_pool_destroy(clone->head_pool); + page_pool_destroy(clone->head_pool); clone->page_pool = NULL; clone->head_pool = NULL; return rc; @@ -15922,8 +15922,7 @@ static void bnxt_queue_mem_free(struct net_device *dev, void *qmem) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - if (bnxt_separate_head_pool(rxr)) - page_pool_destroy(rxr->head_pool); + page_pool_destroy(rxr->head_pool); rxr->page_pool = NULL; rxr->head_pool = NULL; diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index db180626be06..aa3719f28216 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -489,6 +489,11 @@ page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, offset, dma_sync_size); } +static inline void page_pool_get(struct page_pool *pool) +{ + refcount_inc(&pool->user_cnt); +} + static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); -- cgit v1.2.3 From 0f07b7919d679050d354d3279faa74bdc7ce17a0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:12 +0200 Subject: uprobes: Rename arch_uretprobe_trampoline function We are about to add uprobe trampoline, so cleaning up the namespace. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-3-jolsa@kernel.org --- arch/x86/kernel/uprobes.c | 2 +- include/linux/uprobes.h | 2 +- kernel/events/uprobes.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6d383839e839..77050e5a4680 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -338,7 +338,7 @@ extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; -void *arch_uprobe_trampoline(unsigned long *psize) +void *arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 516217c39094..01112f27cd21 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -224,7 +224,7 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); extern void uprobe_handle_trampoline(struct pt_regs *regs); -extern void *arch_uprobe_trampoline(unsigned long *psize); +extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); #else /* !CONFIG_UPROBES */ struct uprobes_state { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1cbfe3cfe573..dd4dd156c956 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1726,7 +1726,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -void * __weak arch_uprobe_trampoline(unsigned long *psize) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; @@ -1758,7 +1758,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - insns = arch_uprobe_trampoline(&insns_size); + insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) -- cgit v1.2.3 From 82afdd05a16a424409682e06a53d6afcda038d30 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:13 +0200 Subject: uprobes: Make copy_from_page global Making copy_from_page global and adding uprobe prefix. Adding the uprobe prefix to copy_to_page as well for symmetry. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-4-jolsa@kernel.org --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 01112f27cd21..7447e15559b8 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -226,6 +226,7 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, extern void uprobe_handle_trampoline(struct pt_regs *regs); extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); +extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index dd4dd156c956..f993a3422083 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); @@ -205,7 +205,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { @@ -1051,7 +1051,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); + uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; @@ -1397,7 +1397,7 @@ struct uprobe *uprobe_register(struct inode *inode, return ERR_PTR(-EINVAL); /* - * This ensures that copy_from_page(), copy_to_page() and + * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) @@ -2393,7 +2393,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ -- cgit v1.2.3 From 33d7b2beaf34a3c0f6406bc76f6e1b1755150ad9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:14 +0200 Subject: uprobes: Add uprobe_write function Adding uprobe_write function that does what uprobe_write_opcode did so far, but allows to pass verify callback function that checks the memory location before writing the opcode. It will be used in following changes to implement specific checking logic for instruction update. The uprobe_write_opcode now calls uprobe_write with verify_opcode as the verify callback. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-5-jolsa@kernel.org --- include/linux/uprobes.h | 5 +++++ kernel/events/uprobes.c | 14 ++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 7447e15559b8..e13382054435 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -187,6 +187,9 @@ struct uprobes_state { struct xol_area *xol_area; }; +typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, + uprobe_opcode_t *opcode); + extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -195,6 +198,8 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, + uprobe_opcode_t opcode, uprobe_write_verify_t verify); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f993a3422083..838ac40e91e6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -399,7 +399,7 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, return identical; } -static int __uprobe_write_opcode(struct vm_area_struct *vma, +static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, unsigned long opcode_vaddr, uprobe_opcode_t opcode) { @@ -487,6 +487,12 @@ remap: */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, uprobe_opcode_t opcode) +{ + return uprobe_write(auprobe, vma, opcode_vaddr, opcode, verify_opcode); +} + +int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, uprobe_write_verify_t verify) { const unsigned long vaddr = opcode_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; @@ -509,7 +515,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really - * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() * cannot deal with PMDs yet. */ if (is_register) @@ -521,7 +527,7 @@ retry: goto out; folio = page_folio(page); - ret = verify_opcode(page, opcode_vaddr, &opcode); + ret = verify(page, opcode_vaddr, &opcode); if (ret <= 0) { folio_put(folio); goto out; @@ -560,7 +566,7 @@ retry: /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + ret = __uprobe_write(vma, &fw, folio, opcode_vaddr, opcode); folio_walk_end(&fw, vma); } -- cgit v1.2.3 From f8b7c528b4fb7018d12b6bb63bb52576cfc73697 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:15 +0200 Subject: uprobes: Add nbytes argument to uprobe_write Adding nbytes argument to uprobe_write and related functions as preparation for writing whole instructions in following changes. Also renaming opcode arguments to insn, which seems to fit better. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-6-jolsa@kernel.org --- include/linux/uprobes.h | 4 ++-- kernel/events/uprobes.c | 26 ++++++++++++++------------ 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index e13382054435..147c4a0a1af9 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -188,7 +188,7 @@ struct uprobes_state { }; typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, - uprobe_opcode_t *opcode); + uprobe_opcode_t *insn, int nbytes); extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -199,7 +199,7 @@ extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t opcode, uprobe_write_verify_t verify); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 838ac40e91e6..c133fd4b492d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -208,7 +209,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); - if (is_swbp_insn(new_opcode)) { + if (is_swbp_insn(insn)) { if (is_swbp) /* register: already installed? */ return 0; } else { @@ -401,10 +402,10 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, - unsigned long opcode_vaddr, uprobe_opcode_t opcode) + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; - const bool is_register = !!is_swbp_insn(&opcode); + const unsigned long vaddr = insn_vaddr & PAGE_MASK; + const bool is_register = !!is_swbp_insn(insn); bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ @@ -429,7 +430,7 @@ static int __uprobe_write(struct vm_area_struct *vma, */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); - copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + copy_to_page(fw->page, insn_vaddr, insn, nbytes); /* * When unregistering, we may only zap a PTE if uffd is disabled and @@ -488,13 +489,14 @@ remap: int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, uprobe_opcode_t opcode) { - return uprobe_write(auprobe, vma, opcode_vaddr, opcode, verify_opcode); + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - const unsigned long opcode_vaddr, uprobe_opcode_t opcode, uprobe_write_verify_t verify) + const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + uprobe_write_verify_t verify) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; int ret, is_register, ref_ctr_updated = 0; @@ -504,7 +506,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, struct folio *folio; struct page *page; - is_register = is_swbp_insn(&opcode); + is_register = is_swbp_insn(insn); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) @@ -527,7 +529,7 @@ retry: goto out; folio = page_folio(page); - ret = verify(page, opcode_vaddr, &opcode); + ret = verify(page, insn_vaddr, insn, nbytes); if (ret <= 0) { folio_put(folio); goto out; @@ -566,7 +568,7 @@ retry: /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write(vma, &fw, folio, opcode_vaddr, opcode); + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes); folio_walk_end(&fw, vma); } -- cgit v1.2.3 From ec46350fe1e2338f42ee84974c36b25afe8ba53a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:16 +0200 Subject: uprobes: Add is_register argument to uprobe_write and uprobe_write_opcode The uprobe_write has special path to restore the original page when we write original instruction back. This happens when uprobe_write detects that we want to write anything else but breakpoint instruction. Moving the detection away and passing it to uprobe_write as argument, so it's possible to write different instructions (other than just breakpoint and rest). Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-7-jolsa@kernel.org --- arch/arm/probes/uprobes/core.c | 2 +- include/linux/uprobes.h | 5 +++-- kernel/events/uprobes.c | 21 +++++++++++---------- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index 885e0c5e8c20..3d96fb41d624 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - __opcode_to_mem_arm(auprobe->bpinsn)); + __opcode_to_mem_arm(auprobe->bpinsn), true); } bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 147c4a0a1af9..518b26756469 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -197,9 +197,10 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, + bool is_register); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c133fd4b492d..955e5ed3e383 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -402,10 +402,10 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, - unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes) + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + bool is_register) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; - const bool is_register = !!is_swbp_insn(insn); bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ @@ -487,26 +487,27 @@ remap: * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - const unsigned long opcode_vaddr, uprobe_opcode_t opcode) + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, + bool is_register) { - return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode); + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, + verify_opcode, is_register); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, - uprobe_write_verify_t verify) + uprobe_write_verify_t verify, bool is_register) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - int ret, is_register, ref_ctr_updated = 0; + int ret, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; - is_register = is_swbp_insn(insn); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) @@ -568,7 +569,7 @@ retry: /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes); + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); folio_walk_end(&fw, vma); } @@ -610,7 +611,7 @@ out: int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** @@ -626,7 +627,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - *(uprobe_opcode_t *)&auprobe->insn); + *(uprobe_opcode_t *)&auprobe->insn, false); } /* uprobe should have guaranteed positive refcount */ -- cgit v1.2.3 From 18a111256a0b4fedfe47101f084441a84d7e357a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:17 +0200 Subject: uprobes: Add do_ref_ctr argument to uprobe_write function Making update_ref_ctr call in uprobe_write conditional based on do_ref_ctr argument. This way we can use uprobe_write for instruction update without doing ref_ctr_offset update. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-8-jolsa@kernel.org --- include/linux/uprobes.h | 2 +- kernel/events/uprobes.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 518b26756469..5080619560d4 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -200,7 +200,7 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, bool is_register); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 955e5ed3e383..da2b3d0deab6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -491,12 +491,12 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, bool is_register) { return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, - verify_opcode, is_register); + verify_opcode, is_register, true /* do_update_ref_ctr */); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, - uprobe_write_verify_t verify, bool is_register) + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; @@ -537,7 +537,7 @@ retry: } /* We are going to replace instruction, update ref_ctr. */ - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); @@ -589,7 +589,7 @@ retry: out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && ref_ctr_updated) + if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ -- cgit v1.2.3 From 91440ff4cafad4c86322a612e523f7f021a493e7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:18 +0200 Subject: uprobes/x86: Add mapping for optimized uprobe trampolines Adding support to add special mapping for user space trampoline with following functions: uprobe_trampoline_get - find or add uprobe_trampoline uprobe_trampoline_put - remove or destroy uprobe_trampoline The user space trampoline is exported as arch specific user space special mapping through tramp_mapping, which is initialized in following changes with new uprobe syscall. The uprobe trampoline needs to be callable/reachable from the probed address, so while searching for available address we use is_reachable_by_call function to decide if the uprobe trampoline is callable from the probe address. All uprobe_trampoline objects are stored in uprobes_state object and are cleaned up when the process mm_struct goes down. Adding new arch hooks for that, because this change is x86_64 specific. Locking is provided by callers in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250720112133.244369-9-jolsa@kernel.org --- arch/x86/kernel/uprobes.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/uprobes.h | 6 ++ kernel/events/uprobes.c | 10 ++++ kernel/fork.c | 1 + 4 files changed, 161 insertions(+) (limited to 'include') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 77050e5a4680..6c4dcbdd0c3c 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -608,6 +608,150 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) *sr = utask->autask.saved_scratch_register; } } + +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static struct page *tramp_mapping_pages[2] __ro_after_init; + +static struct vm_special_mapping tramp_mapping = { + .name = "[uprobes-trampoline]", + .mremap = tramp_mremap, + .pages = tramp_mapping_pages, +}; + +struct uprobe_trampoline { + struct hlist_node node; + unsigned long vaddr; +}; + +static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr) +{ + long delta = (long)(vaddr + 5 - vtramp); + + return delta >= INT_MIN && delta <= INT_MAX; +} + +static unsigned long find_nearest_trampoline(unsigned long vaddr) +{ + struct vm_unmapped_area_info info = { + .length = PAGE_SIZE, + .align_mask = ~PAGE_MASK, + }; + unsigned long low_limit, high_limit; + unsigned long low_tramp, high_tramp; + unsigned long call_end = vaddr + 5; + + if (check_add_overflow(call_end, INT_MIN, &low_limit)) + low_limit = PAGE_SIZE; + + high_limit = call_end + INT_MAX; + + /* Search up from the caller address. */ + info.low_limit = call_end; + info.high_limit = min(high_limit, TASK_SIZE); + high_tramp = vm_unmapped_area(&info); + + /* Search down from the caller address. */ + info.low_limit = max(low_limit, PAGE_SIZE); + info.high_limit = call_end; + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + low_tramp = vm_unmapped_area(&info); + + if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp)) + return -ENOMEM; + if (IS_ERR_VALUE(high_tramp)) + return low_tramp; + if (IS_ERR_VALUE(low_tramp)) + return high_tramp; + + /* Return address that's closest to the caller address. */ + if (call_end - low_tramp < high_tramp - call_end) + return low_tramp; + return high_tramp; +} + +static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) +{ + struct pt_regs *regs = task_pt_regs(current); + struct mm_struct *mm = current->mm; + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + + if (!user_64bit_mode(regs)) + return NULL; + + vaddr = find_nearest_trampoline(vaddr); + if (IS_ERR_VALUE(vaddr)) + return NULL; + + tramp = kzalloc(sizeof(*tramp), GFP_KERNEL); + if (unlikely(!tramp)) + return NULL; + + tramp->vaddr = vaddr; + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE, + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO, + &tramp_mapping); + if (IS_ERR(vma)) { + kfree(tramp); + return NULL; + } + return tramp; +} + +__maybe_unused +static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) +{ + struct uprobes_state *state = ¤t->mm->uprobes_state; + struct uprobe_trampoline *tramp = NULL; + + if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE) + return NULL; + + hlist_for_each_entry(tramp, &state->head_tramps, node) { + if (is_reachable_by_call(tramp->vaddr, vaddr)) { + *new = false; + return tramp; + } + } + + tramp = create_uprobe_trampoline(vaddr); + if (!tramp) + return NULL; + + *new = true; + hlist_add_head(&tramp->node, &state->head_tramps); + return tramp; +} + +static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp) +{ + /* + * We do not unmap and release uprobe trampoline page itself, + * because there's no easy way to make sure none of the threads + * is still inside the trampoline. + */ + hlist_del(&tramp->node); + kfree(tramp); +} + +void arch_uprobe_init_state(struct mm_struct *mm) +{ + INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps); +} + +void arch_uprobe_clear_state(struct mm_struct *mm) +{ + struct uprobes_state *state = &mm->uprobes_state; + struct uprobe_trampoline *tramp; + struct hlist_node *n; + + hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) + destroy_uprobe_trampoline(tramp); +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 5080619560d4..b40d33aae016 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -17,6 +17,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -185,6 +186,9 @@ struct xol_area; struct uprobes_state { struct xol_area *xol_area; +#ifdef CONFIG_X86_64 + struct hlist_head head_tramps; +#endif }; typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, @@ -233,6 +237,8 @@ extern void uprobe_handle_trampoline(struct pt_regs *regs); extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); +extern void arch_uprobe_clear_state(struct mm_struct *mm); +extern void arch_uprobe_init_state(struct mm_struct *mm); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index da2b3d0deab6..2cd7a4c6f303 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1801,6 +1801,14 @@ static struct xol_area *get_xol_area(void) return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. */ @@ -1812,6 +1820,8 @@ void uprobe_clear_state(struct mm_struct *mm) delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); + arch_uprobe_clear_state(mm); + if (!area) return; diff --git a/kernel/fork.c b/kernel/fork.c index af673856499d..d827cc6c5362 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1015,6 +1015,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } -- cgit v1.2.3 From 56101b69c9190667f473b9f93f8b6d8209aaa816 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:19 +0200 Subject: uprobes/x86: Add uprobe syscall to speed up uprobe Adding new uprobe syscall that calls uprobe handlers for given 'breakpoint' address. The idea is that the 'breakpoint' address calls the user space trampoline which executes the uprobe syscall. The syscall handler reads the return address of the initial call to retrieve the original 'breakpoint' address. With this address we find the related uprobe object and call its consumers. Adding the arch_uprobe_trampoline_mapping function that provides uprobe trampoline mapping. This mapping is backed with one global page initialized at __init time and shared by the all the mapping instances. We do not allow to execute uprobe syscall if the caller is not from uprobe trampoline mapping. The uprobe syscall ensures the consumer (bpf program) sees registers values in the state before the trampoline was called. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250720112133.244369-10-jolsa@kernel.org --- arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/x86/kernel/uprobes.c | 139 +++++++++++++++++++++++++++++++++ include/linux/syscalls.h | 2 + include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 17 ++++ kernel/sys_ni.c | 1 + 6 files changed, 161 insertions(+) (limited to 'include') diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 92cf0fe2291e..ced2a1deecd7 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq 335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c4dcbdd0c3c..d18e1ae59901 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -752,6 +752,145 @@ void arch_uprobe_clear_state(struct mm_struct *mm) hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) destroy_uprobe_trampoline(tramp); } + +static bool __in_uprobe_trampoline(unsigned long ip) +{ + struct vm_area_struct *vma = vma_lookup(current->mm, ip); + + return vma && vma_is_special_mapping(vma, &tramp_mapping); +} + +static bool in_uprobe_trampoline(unsigned long ip) +{ + struct mm_struct *mm = current->mm; + bool found, retry = true; + unsigned int seq; + + rcu_read_lock(); + if (mmap_lock_speculate_try_begin(mm, &seq)) { + found = __in_uprobe_trampoline(ip); + retry = mmap_lock_speculate_retry(mm, seq); + } + rcu_read_unlock(); + + if (retry) { + mmap_read_lock(mm); + found = __in_uprobe_trampoline(ip); + mmap_read_unlock(mm); + } + return found; +} + +/* + * See uprobe syscall trampoline; the call to the trampoline will push + * the return address on the stack, the trampoline itself then pushes + * cx, r11 and ax. + */ +struct uprobe_syscall_args { + unsigned long ax; + unsigned long r11; + unsigned long cx; + unsigned long retaddr; +}; + +SYSCALL_DEFINE0(uprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + struct uprobe_syscall_args args; + unsigned long ip, sp; + int err; + + /* Allow execution only from uprobe trampolines. */ + if (!in_uprobe_trampoline(regs->ip)) + goto sigill; + + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); + if (err) + goto sigill; + + ip = regs->ip; + + /* + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: + * - adjust ip to the probe address, call saved next instruction address + * - adjust sp to the probe's stack frame (check trampoline code) + */ + regs->ax = args.ax; + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ip = args.retaddr - 5; + regs->sp += sizeof(args); + regs->orig_ax = -1; + + sp = regs->sp; + + handle_syscall_uprobe(regs, regs->ip); + + /* + * Some of the uprobe consumers has changed sp, we can do nothing, + * just return via iret. + */ + if (regs->sp != sp) { + /* skip the trampoline call */ + if (args.retaddr - 5 == regs->ip) + regs->ip += 5; + return regs->ax; + } + + regs->sp -= sizeof(args); + + /* for the case uprobe_consumer has changed ax/r11/cx */ + args.ax = regs->ax; + args.r11 = regs->r11; + args.cx = regs->cx; + + /* keep return address unless we are instructed otherwise */ + if (args.retaddr - 5 != regs->ip) + args.retaddr = regs->ip; + + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + return 0; + +sigill: + force_sig(SIGILL); + return -1; +} + +asm ( + ".pushsection .rodata\n" + ".balign " __stringify(PAGE_SIZE) "\n" + "uprobe_trampoline_entry:\n" + "push %rcx\n" + "push %r11\n" + "push %rax\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "pop %rax\n" + "pop %r11\n" + "pop %rcx\n" + "ret\n" + ".balign " __stringify(PAGE_SIZE) "\n" + ".popsection\n" +); + +extern u8 uprobe_trampoline_entry[]; + +static int __init arch_uprobes_init(void) +{ + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); + return 0; +} + +late_initcall(arch_uprobes_init); + #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 77f45e5d4413..66c06fcdfe19 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); asmlinkage long sys_uretprobe(void); +asmlinkage long sys_uprobe(void); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b40d33aae016..b6b077cc7d0f 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -239,6 +239,7 @@ extern unsigned long uprobe_get_trampoline_vaddr(void); extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); extern void arch_uprobe_clear_state(struct mm_struct *mm); extern void arch_uprobe_init_state(struct mm_struct *mm); +extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2cd7a4c6f303..eb07e602b6c9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2771,6 +2771,23 @@ out: rcu_read_unlock_trace(); } +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) +{ + struct uprobe *uprobe; + int is_swbp; + + guard(rcu_tasks_trace)(); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); + if (!uprobe) + return; + if (!get_utask()) + return; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + return; + handler_chain(uprobe, regs); +} + /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c..bf5d05c635ff 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16); COND_SYSCALL(rseq); COND_SYSCALL(uretprobe); +COND_SYSCALL(uprobe); -- cgit v1.2.3 From ba2bfc97b4629b10bd8d02b36e04f3932a04cac4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:20 +0200 Subject: uprobes/x86: Add support to optimize uprobes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Putting together all the previously added pieces to support optimized uprobes on top of 5-byte nop instruction. The current uprobe execution goes through following: - installs breakpoint instruction over original instruction - exception handler hit and calls related uprobe consumers - and either simulates original instruction or does out of line single step execution of it - returns to user space The optimized uprobe path does following: - checks the original instruction is 5-byte nop (plus other checks) - adds (or uses existing) user space trampoline with uprobe syscall - overwrites original instruction (5-byte nop) with call to user space trampoline - the user space trampoline executes uprobe syscall that calls related uprobe consumers - trampoline returns back to next instruction This approach won't speed up all uprobes as it's limited to using nop5 as original instruction, but we plan to use nop5 as USDT probe instruction (which currently uses single byte nop) and speed up the USDT probes. The arch_uprobe_optimize triggers the uprobe optimization and is called after first uprobe hit. I originally had it called on uprobe installation but then it clashed with elf loader, because the user space trampoline was added in a place where loader might need to put elf segments, so I decided to do it after first uprobe hit when loading is done. The uprobe is un-optimized in arch specific set_orig_insn call. The instruction overwrite is x86 arch specific and needs to go through 3 updates: (on top of nop5 instruction) - write int3 into 1st byte - write last 4 bytes of the call instruction - update the call instruction opcode And cleanup goes though similar reverse stages: - overwrite call opcode with breakpoint (int3) - write last 4 bytes of the nop5 instruction - write the nop5 first instruction byte We do not unmap and release uprobe trampoline when it's no longer needed, because there's no easy way to make sure none of the threads is still inside the trampoline. But we do not waste memory, because there's just single page for all the uprobe trampoline mappings. We do waste frame on page mapping for every 4GB by keeping the uprobe trampoline page mapped, but that seems ok. We take the benefit from the fact that set_swbp and set_orig_insn are called under mmap_write_lock(mm), so we can use the current instruction as the state the uprobe is in - nop5/breakpoint/call trampoline - and decide the needed action (optimize/un-optimize) based on that. Attaching the speed up from benchs/run_bench_uprobes.sh script: current: usermode-count : 152.604 ± 0.044M/s syscall-count : 13.359 ± 0.042M/s --> uprobe-nop : 3.229 ± 0.002M/s uprobe-push : 3.086 ± 0.004M/s uprobe-ret : 1.114 ± 0.004M/s uprobe-nop5 : 1.121 ± 0.005M/s uretprobe-nop : 2.145 ± 0.002M/s uretprobe-push : 2.070 ± 0.001M/s uretprobe-ret : 0.931 ± 0.001M/s uretprobe-nop5 : 0.957 ± 0.001M/s after the change: usermode-count : 152.448 ± 0.244M/s syscall-count : 14.321 ± 0.059M/s uprobe-nop : 3.148 ± 0.007M/s uprobe-push : 2.976 ± 0.004M/s uprobe-ret : 1.068 ± 0.003M/s --> uprobe-nop5 : 7.038 ± 0.007M/s uretprobe-nop : 2.109 ± 0.004M/s uretprobe-push : 2.035 ± 0.001M/s uretprobe-ret : 0.908 ± 0.001M/s uretprobe-nop5 : 3.377 ± 0.009M/s I see bit more speed up on Intel (above) compared to AMD. The big nop5 speed up is partly due to emulating nop5 and partly due to optimization. The key speed up we do this for is the USDT switch from nop to nop5: uprobe-nop : 3.148 ± 0.007M/s uprobe-nop5 : 7.038 ± 0.007M/s Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250720112133.244369-11-jolsa@kernel.org --- arch/x86/include/asm/uprobes.h | 7 + arch/x86/kernel/uprobes.c | 283 ++++++++++++++++++++++++++++++++++++++++- include/linux/uprobes.h | 6 +- kernel/events/uprobes.c | 16 ++- 4 files changed, 305 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 678fb546f0a7..1ee2e5115955 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +enum { + ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0, + ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1, +}; + struct uprobe_xol_ops; struct arch_uprobe { @@ -45,6 +50,8 @@ struct arch_uprobe { u8 ilen; } push; }; + + unsigned long flags; }; struct arch_uprobe_task { diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index d18e1ae59901..209ce74ab93f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Post-execution fixups. */ @@ -702,7 +703,6 @@ static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) return tramp; } -__maybe_unused static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) { struct uprobes_state *state = ¤t->mm->uprobes_state; @@ -891,6 +891,280 @@ static int __init arch_uprobes_init(void) late_initcall(arch_uprobes_init); +enum { + EXPECT_SWBP, + EXPECT_CALL, +}; + +struct write_opcode_ctx { + unsigned long base; + int expect; +}; + +static int is_call_insn(uprobe_opcode_t *insn) +{ + return *insn == CALL_INSN_OPCODE; +} + +/* + * Verification callback used by int3_update uprobe_write calls to make sure + * the underlying instruction is as expected - either int3 or call. + */ +static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, + int nbytes, void *data) +{ + struct write_opcode_ctx *ctx = data; + uprobe_opcode_t old_opcode[5]; + + uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5); + + switch (ctx->expect) { + case EXPECT_SWBP: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + case EXPECT_CALL: + if (is_call_insn(&old_opcode[0])) + return 1; + break; + } + + return -1; +} + +/* + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. + * (borrowed comment from smp_text_poke_batch_finish) + * + * The way it is done: + * - Add an INT3 trap to the address that will be patched + * - SMP sync all CPUs + * - Update all but the first byte of the patched range + * - SMP sync all CPUs + * - Replace the first byte (INT3) by the first byte of the replacing opcode + * - SMP sync all CPUs + */ +static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, char *insn, bool optimize) +{ + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + int err; + + /* + * Write int3 trap. + * + * The swbp_optimize path comes with breakpoint already installed, + * so we can skip this step for optimize == true. + */ + if (!optimize) { + ctx.expect = EXPECT_CALL; + err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + } + + smp_text_poke_sync_each_cpu(); + + /* Write all but the first byte of the patched range. */ + ctx.expect = EXPECT_SWBP; + err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + + /* + * Write first byte. + * + * The swbp_unoptimize needs to finish uprobe removal together + * with ref_ctr update, using uprobe_write with proper flags. + */ + err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn, + optimize /* is_register */, !optimize /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + return 0; +} + +static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, unsigned long tramp) +{ + u8 call[5]; + + __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr, + (const void *) tramp, CALL_INSN_SIZE); + return int3_update(auprobe, vma, vaddr, call, true /* optimize */); +} + +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */); +} + +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) +{ + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; + struct vm_area_struct *vma; + struct page *page; + + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(page)) + return PTR_ERR(page); + uprobe_copy_from_page(page, vaddr, dst, len); + put_page(page); + return 0; +} + +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *call = (struct __arch_relative_insn *) insn; + + if (!is_call_insn(insn)) + return false; + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); +} + +static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized) +{ + uprobe_opcode_t insn[5]; + int err; + + err = copy_from_vaddr(mm, vaddr, &insn, 5); + if (err) + return err; + *optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr); + return 0; +} + +static bool should_optimize(struct arch_uprobe *auprobe) +{ + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); +} + +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (should_optimize(auprobe)) { + bool optimized = false; + int err; + + /* + * We could race with another thread that already optimized the probe, + * so let's not overwrite it with int3 again in this case. + */ + err = is_optimized(vma->vm_mm, vaddr, &optimized); + if (err) + return err; + if (optimized) + return 0; + } + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, + true /* is_register */); +} + +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { + struct mm_struct *mm = vma->vm_mm; + bool optimized = false; + int err; + + err = is_optimized(mm, vaddr, &optimized); + if (err) + return err; + if (optimized) { + err = swbp_unoptimize(auprobe, vma, vaddr); + WARN_ON_ONCE(err); + return err; + } + } + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, + false /* is_register */); +} + +static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr) +{ + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + bool new = false; + int err = 0; + + vma = find_vma(mm, vaddr); + if (!vma) + return -EINVAL; + tramp = get_uprobe_trampoline(vaddr, &new); + if (!tramp) + return -EINVAL; + err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr); + if (WARN_ON_ONCE(err) && new) + destroy_uprobe_trampoline(tramp); + return err; +} + +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + struct mm_struct *mm = current->mm; + uprobe_opcode_t insn[5]; + + /* + * Do not optimize if shadow stack is enabled, the return address hijack + * code in arch_uretprobe_hijack_return_addr updates wrong frame when + * the entry uprobe is optimized and the shadow stack crashes the app. + */ + if (shstk_is_enabled()) + return; + + if (!should_optimize(auprobe)) + return; + + mmap_write_lock(mm); + + /* + * Check if some other thread already optimized the uprobe for us, + * if it's the case just go away silently. + */ + if (copy_from_vaddr(mm, vaddr, &insn, 5)) + goto unlock; + if (!is_swbp_insn((uprobe_opcode_t*) &insn)) + goto unlock; + + /* + * If we fail to optimize the uprobe we set the fail bit so the + * above should_optimize will fail from now on. + */ + if (__arch_uprobe_optimize(auprobe, mm, vaddr)) + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); + +unlock: + mmap_write_unlock(mm); +} + +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + if (memcmp(&auprobe->insn, x86_nops[5], 5)) + return false; + /* We can't do cross page atomic writes yet. */ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -904,6 +1178,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -1270,6 +1548,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret) return ret; + if (can_optimize(auprobe, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b6b077cc7d0f..08ef78439d0d 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -192,7 +192,7 @@ struct uprobes_state { }; typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, - uprobe_opcode_t *insn, int nbytes); + uprobe_opcode_t *insn, int nbytes, void *data); extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -204,7 +204,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, bool is_register); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); @@ -240,6 +241,7 @@ extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void * extern void arch_uprobe_clear_state(struct mm_struct *mm); extern void arch_uprobe_init_state(struct mm_struct *mm); extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eb07e602b6c9..4a194d7c838b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, - int nbytes) + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -491,12 +491,13 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, bool is_register) { return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, - verify_opcode, is_register, true /* do_update_ref_ctr */); + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, - uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr) + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; @@ -530,7 +531,7 @@ retry: goto out; folio = page_folio(page); - ret = verify(page, insn_vaddr, insn, nbytes); + ret = verify(page, insn_vaddr, insn, nbytes, data); if (ret <= 0) { folio_put(folio); goto out; @@ -2696,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2760,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; -- cgit v1.2.3 From 11d5674fc2e5e75ccaa13685a909c14e033544b7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 11 Aug 2025 12:44:00 +0800 Subject: crypto: hash - Make HASH_MAX_DESCSIZE a bit more obvious Move S390_SHA_CTX_SIZE into crypto/hash.h so that the derivation of HASH_MAX_DESCSIZE is less cryptic. Signed-off-by: Herbert Xu --- arch/s390/crypto/sha.h | 8 +++++++- include/crypto/hash.h | 16 ++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h index cadb4b13622a..b9cd9572dd35 100644 --- a/arch/s390/crypto/sha.h +++ b/arch/s390/crypto/sha.h @@ -10,14 +10,15 @@ #ifndef _CRYPTO_ARCH_S390_SHA_H #define _CRYPTO_ARCH_S390_SHA_H +#include #include #include +#include #include /* must be big enough for the largest SHA variant */ #define CPACF_MAX_PARMBLOCK_SIZE SHA3_STATE_SIZE #define SHA_MAX_BLOCK_SIZE SHA3_224_BLOCK_SIZE -#define S390_SHA_CTX_SIZE sizeof(struct s390_sha_ctx) struct s390_sha_ctx { u64 count; /* message length in bytes */ @@ -42,4 +43,9 @@ int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data, int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len, u8 *out); +static inline void __check_s390_sha_ctx_size(void) +{ + BUILD_BUG_ON(S390_SHA_CTX_SIZE != sizeof(struct s390_sha_ctx)); +} + #endif diff --git a/include/crypto/hash.h b/include/crypto/hash.h index bbaeae705ef0..586700332c73 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -177,14 +177,26 @@ struct shash_desc { #define HASH_MAX_DIGESTSIZE 64 +/* + * The size of a core hash state and a partial block. The final byte + * is the length of the partial block. + */ +#define HASH_STATE_AND_BLOCK(state, block) ((state) + (block) + 1) + + /* Worst case is sha3-224. */ -#define HASH_MAX_STATESIZE 200 + 144 + 1 +#define HASH_MAX_STATESIZE HASH_STATE_AND_BLOCK(200, 144) + +/* This needs to match arch/s390/crypto/sha.h. */ +#define S390_SHA_CTX_SIZE 216 /* * Worst case is hmac(sha3-224-s390). Its context is a nested 'shash_desc' * containing a 'struct s390_sha_ctx'. */ -#define HASH_MAX_DESCSIZE (sizeof(struct shash_desc) + 361) +#define SHA3_224_S390_DESCSIZE HASH_STATE_AND_BLOCK(S390_SHA_CTX_SIZE, 144) +#define HASH_MAX_DESCSIZE (sizeof(struct shash_desc) + \ + SHA3_224_S390_DESCSIZE) #define MAX_SYNC_HASH_REQSIZE (sizeof(struct ahash_request) + \ HASH_MAX_DESCSIZE) -- cgit v1.2.3 From 3202d6ed9368fc1e842fda73727553ae614633f8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 21 Aug 2025 15:07:50 +0200 Subject: mmc: core: Add infrastructure for undervoltage handling Implement the core infrastructure to allow MMC bus types to handle REGULATOR_EVENT_UNDER_VOLTAGE events from power regulators. This is primarily aimed at allowing devices like eMMC to perform an emergency shutdown to prevent data corruption when a power failure is imminent. This patch introduces: - A new 'handle_undervoltage' function pointer to 'struct mmc_bus_ops'. Bus drivers (e.g., for eMMC) can implement this to define their emergency procedures. - A workqueue ('uv_work') in 'struct mmc_supply' to handle the event asynchronously in a high-priority context. - A new function 'mmc_handle_undervoltage()' which is called from the workqueue. It stops the host queue to prevent races with card removal, checks for the bus op, and invokes the handler. - Functions to register and unregister the regulator notifier, intended to be called by bus drivers like 'mmc_attach_mmc' when a compatible card is detected. The notifier is only registered for the main vmmc supply, as undervoltage handling for vqmmc or vqmmc2 is not required at this time. Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20250821130751.2089587-2-o.rempel@pengutronix.de Signed-off-by: Ulf Hansson --- drivers/mmc/core/bus.c | 12 +++++++ drivers/mmc/core/core.c | 23 +++++++++++++ drivers/mmc/core/core.h | 5 +++ drivers/mmc/core/host.c | 2 ++ drivers/mmc/core/regulator.c | 77 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mmc/host.h | 11 +++++++ 6 files changed, 130 insertions(+) (limited to 'include') diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index 1cf64e0952fb..ec4f3462bf80 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -19,6 +19,7 @@ #include #include +#include #include "core.h" #include "card.h" @@ -383,6 +384,14 @@ int mmc_add_card(struct mmc_card *card) mmc_card_set_present(card); + /* + * Register for undervoltage notification if the card supports + * power-off notification, enabling emergency shutdowns. + */ + if (mmc_card_mmc(card) && + card->ext_csd.power_off_notification == EXT_CSD_POWER_ON) + mmc_regulator_register_undervoltage_notifier(card->host); + return 0; } @@ -394,6 +403,9 @@ void mmc_remove_card(struct mmc_card *card) { struct mmc_host *host = card->host; + if (mmc_card_present(card)) + mmc_regulator_unregister_undervoltage_notifier(host); + mmc_remove_card_debugfs(card); if (mmc_card_present(card)) { diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 88fd231fee1d..860378bea557 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1398,6 +1398,29 @@ void mmc_power_cycle(struct mmc_host *host, u32 ocr) mmc_power_up(host, ocr); } +/** + * mmc_handle_undervoltage - Handle an undervoltage event on the MMC bus + * @host: The MMC host that detected the undervoltage condition + * + * This function is called when an undervoltage event is detected on one of + * the MMC regulators. + * + * Returns: 0 on success or a negative error code on failure. + */ +int mmc_handle_undervoltage(struct mmc_host *host) +{ + /* Stop the host to prevent races with card removal */ + __mmc_stop_host(host); + + if (!host->bus_ops || !host->bus_ops->handle_undervoltage) + return 0; + + dev_warn(mmc_dev(host), "%s: Undervoltage detected, initiating emergency stop\n", + mmc_hostname(host)); + + return host->bus_ops->handle_undervoltage(host); +} + /* * Assign a mmc bus handler to a host. Only one bus handler may control a * host at any given time. diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 73f5d3d8c77d..a028b48be164 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -31,6 +31,7 @@ struct mmc_bus_ops { int (*sw_reset)(struct mmc_host *); bool (*cache_enabled)(struct mmc_host *); int (*flush_cache)(struct mmc_host *); + int (*handle_undervoltage)(struct mmc_host *host); }; void mmc_attach_bus(struct mmc_host *host, const struct mmc_bus_ops *ops); @@ -59,6 +60,10 @@ void mmc_power_off(struct mmc_host *host); void mmc_power_cycle(struct mmc_host *host, u32 ocr); void mmc_set_initial_state(struct mmc_host *host); u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max); +int mmc_handle_undervoltage(struct mmc_host *host); +void mmc_regulator_register_undervoltage_notifier(struct mmc_host *host); +void mmc_regulator_unregister_undervoltage_notifier(struct mmc_host *host); +void mmc_undervoltage_workfn(struct work_struct *work); static inline void mmc_delay(unsigned int ms) { diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index f14671ea5716..5f0ec23aeff5 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -564,6 +564,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) INIT_WORK(&host->sdio_irq_work, sdio_irq_work); timer_setup(&host->retune_timer, mmc_retune_timer, 0); + INIT_WORK(&host->supply.uv_work, mmc_undervoltage_workfn); + /* * By default, hosts do not support SGIO or large requests. * They have to set these according to their abilities. diff --git a/drivers/mmc/core/regulator.c b/drivers/mmc/core/regulator.c index 3dae2e9b7978..a85179f1a4de 100644 --- a/drivers/mmc/core/regulator.c +++ b/drivers/mmc/core/regulator.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -262,6 +263,82 @@ static inline int mmc_regulator_get_ocrmask(struct regulator *supply) #endif /* CONFIG_REGULATOR */ +/* To be called from a high-priority workqueue */ +void mmc_undervoltage_workfn(struct work_struct *work) +{ + struct mmc_supply *supply; + struct mmc_host *host; + + supply = container_of(work, struct mmc_supply, uv_work); + host = container_of(supply, struct mmc_host, supply); + + mmc_handle_undervoltage(host); +} + +static int mmc_handle_regulator_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct mmc_supply *supply = container_of(nb, struct mmc_supply, + vmmc_nb); + struct mmc_host *host = container_of(supply, struct mmc_host, supply); + unsigned long flags; + + switch (event) { + case REGULATOR_EVENT_UNDER_VOLTAGE: + spin_lock_irqsave(&host->lock, flags); + if (host->undervoltage) { + spin_unlock_irqrestore(&host->lock, flags); + return NOTIFY_OK; + } + + host->undervoltage = true; + spin_unlock_irqrestore(&host->lock, flags); + + queue_work(system_highpri_wq, &host->supply.uv_work); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +/** + * mmc_regulator_register_undervoltage_notifier - Register for undervoltage + * events + * @host: MMC host + * + * To be called by a bus driver when a card supporting graceful shutdown + * is attached. + */ +void mmc_regulator_register_undervoltage_notifier(struct mmc_host *host) +{ + int ret; + + if (IS_ERR_OR_NULL(host->supply.vmmc)) + return; + + host->supply.vmmc_nb.notifier_call = mmc_handle_regulator_event; + ret = regulator_register_notifier(host->supply.vmmc, + &host->supply.vmmc_nb); + if (ret) + dev_warn(mmc_dev(host), "Failed to register vmmc notifier: %d\n", ret); +} + +/** + * mmc_regulator_unregister_undervoltage_notifier - Unregister undervoltage + * notifier + * @host: MMC host + */ +void mmc_regulator_unregister_undervoltage_notifier(struct mmc_host *host) +{ + if (IS_ERR_OR_NULL(host->supply.vmmc)) + return; + + regulator_unregister_notifier(host->supply.vmmc, &host->supply.vmmc_nb); + cancel_work_sync(&host->supply.uv_work); +} + /** * mmc_regulator_get_supply - try to get VMMC and VQMMC regulators for a host * @mmc: the host to regulate diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 5ed5d203de23..e0d935a4ac1d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -337,11 +337,15 @@ struct mmc_slot { struct regulator; struct mmc_pwrseq; +struct notifier_block; struct mmc_supply { struct regulator *vmmc; /* Card power supply */ struct regulator *vqmmc; /* Optional Vccq supply */ struct regulator *vqmmc2; /* Optional supply for phy */ + + struct notifier_block vmmc_nb; /* Notifier for vmmc */ + struct work_struct uv_work; /* Undervoltage work */ }; struct mmc_ctx { @@ -494,6 +498,13 @@ struct mmc_host { unsigned int can_dma_map_merge:1; /* merging can be used */ unsigned int vqmmc_enabled:1; /* vqmmc regulator is enabled */ + /* + * Indicates if an undervoltage event has already been handled. + * This prevents repeated regulator notifiers from triggering + * multiple REGULATOR_EVENT_UNDER_VOLTAGE events. + */ + unsigned int undervoltage:1; /* Undervoltage state */ + int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ -- cgit v1.2.3 From f41345f47fb267a9c95ca710c33448f8d0d81d83 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 20 Aug 2025 15:18:06 +0200 Subject: bpf: Use tnums for JEQ/JNE is_branch_taken logic In the following toy program (reg states minimized for readability), R0 and R1 always have different values at instruction 6. This is obvious when reading the program but cannot be guessed from ranges alone as they overlap (R0 in [0; 0xc0000000], R1 in [1024; 0xc0000400]). 0: call bpf_get_prandom_u32#7 ; R0_w=scalar() 1: w0 = w0 ; R0_w=scalar(var_off=(0x0; 0xffffffff)) 2: r0 >>= 30 ; R0_w=scalar(var_off=(0x0; 0x3)) 3: r0 <<= 30 ; R0_w=scalar(var_off=(0x0; 0xc0000000)) 4: r1 = r0 ; R1_w=scalar(var_off=(0x0; 0xc0000000)) 5: r1 += 1024 ; R1_w=scalar(var_off=(0x400; 0xc0000000)) 6: if r1 != r0 goto pc+1 Looking at tnums however, we can deduce that R1 is always different from R0 because their tnums don't agree on known bits. This patch uses this logic to improve is_scalar_branch_taken in case of BPF_JEQ and BPF_JNE. This change has a tiny impact on complexity, which was measured with the Cilium complexity CI test. That test covers 72 programs with various build and load time configurations for a total of 970 test cases. For 80% of test cases, the patch has no impact. On the other test cases, the patch decreases complexity by only 0.08% on average. In the best case, the verifier needs to walk 3% less instructions and, in the worst case, 1.5% more. Overall, the patch has a small positive impact, especially for our largest programs. Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/be3ee70b6e489c49881cb1646114b1d861b5c334.1755694147.git.paul.chaignon@gmail.com --- include/linux/tnum.h | 3 +++ kernel/bpf/tnum.c | 8 ++++++++ kernel/bpf/verifier.c | 4 ++++ 3 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 57ed3035cc30..0ffb77ffe0e8 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -51,6 +51,9 @@ struct tnum tnum_xor(struct tnum a, struct tnum b); /* Multiply two tnums, return @a * @b */ struct tnum tnum_mul(struct tnum a, struct tnum b); +/* Return true if the known bits of both tnums have the same value */ +bool tnum_overlap(struct tnum a, struct tnum b); + /* Return a tnum representing numbers satisfying both @a and @b */ struct tnum tnum_intersect(struct tnum a, struct tnum b); diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index fa353c5d550f..d9328bbb3680 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -143,6 +143,14 @@ struct tnum tnum_mul(struct tnum a, struct tnum b) return tnum_add(TNUM(acc_v, 0), acc_m); } +bool tnum_overlap(struct tnum a, struct tnum b) +{ + u64 mu; + + mu = ~a.mask & ~b.mask; + return (a.value & mu) == (b.value & mu); +} + /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has * a 'known 0' - this will return a 'known 1' for that bit. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4e47992361ea..5c9dd16b2c56 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15897,6 +15897,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value == t2.value; + if (!tnum_overlap(t1, t2)) + return 0; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 0; @@ -15921,6 +15923,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value != t2.value; + if (!tnum_overlap(t1, t2)) + return 1; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 1; -- cgit v1.2.3 From afa3701c0e45ecb9e4d160048ca4e353c7489948 Mon Sep 17 00:00:00 2001 From: Tiffany Yang Date: Thu, 21 Aug 2025 18:37:52 -0700 Subject: cgroup: cgroup.stat.local time accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There isn't yet a clear way to identify a set of "lost" time that everyone (or at least a wider group of users) cares about. However, users can perform some delay accounting by iterating over components of interest. This patch allows cgroup v2 freezing time to be one of those components. Track the cumulative time that each v2 cgroup spends freezing and expose it to userland via a new local stat file in cgroupfs. Thank you to Michal, who provided the ASCII art in the updated documentation. To access this value: $ mkdir /sys/fs/cgroup/test $ cat /sys/fs/cgroup/test/cgroup.stat.local freeze_time_total 0 Ensure consistent freeze time reads with freeze_seq, a per-cgroup sequence counter. Writes are serialized using the css_set_lock. Signed-off-by: Tiffany Yang Cc: Tejun Heo Cc: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 18 ++++++++++++++++++ include/linux/cgroup-defs.h | 17 +++++++++++++++++ kernel/cgroup/cgroup.c | 28 ++++++++++++++++++++++++++++ kernel/cgroup/freezer.c | 16 ++++++++++++---- 4 files changed, 75 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index d9d3cc7df348..9a3a909ee40b 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1001,6 +1001,24 @@ All cgroup core files are prefixed with "cgroup." Total number of dying cgroup subsystems (e.g. memory cgroup) at and beneath the current cgroup. + cgroup.stat.local + A read-only flat-keyed file which exists in non-root cgroups. + The following entry is defined: + + frozen_usec + Cumulative time that this cgroup has spent between freezing and + thawing, regardless of whether by self or ancestor groups. + NB: (not) reaching "frozen" state is not accounted here. + + Using the following ASCII representation of a cgroup's freezer + state, :: + + 1 _____ + frozen 0 __/ \__ + ab cd + + the duration being measured is the span between a and c. + cgroup.freeze A read-write single value file which exists on non-root cgroups. Allowed values are "0" and "1". The default is "0". diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6b93a64115fe..539c64eeef38 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -433,6 +433,23 @@ struct cgroup_freezer_state { * frozen, SIGSTOPped, and PTRACEd. */ int nr_frozen_tasks; + + /* Freeze time data consistency protection */ + seqcount_t freeze_seq; + + /* + * Most recent time the cgroup was requested to freeze. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 freeze_start_nsec; + + /* + * Total duration the cgroup has spent freezing. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 frozen_nsec; }; struct cgroup { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..ab096b884bbc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3763,6 +3763,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + unsigned int sequence; + u64 freeze_time; + + do { + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); + freeze_time = cgrp->freezer.frozen_nsec; + /* Add in current freezer interval if the cgroup is freezing. */ + if (test_bit(CGRP_FREEZE, &cgrp->flags)) + freeze_time += (ktime_get_ns() - + cgrp->freezer.freeze_start_nsec); + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); + + seq_printf(seq, "frozen_usec %llu\n", + (unsigned long long) freeze_time / NSEC_PER_USEC); + + return 0; +} + #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem @@ -5354,6 +5375,11 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cgroup.stat.local", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_core_local_stat_show, + }, { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, @@ -5763,6 +5789,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; + seqcount_init(&cgrp->freezer.freeze_seq); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be @@ -5771,6 +5798,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index bf1690a167dd..6c18854bff34 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze) /* * Freeze or unfreeze all tasks in the given cgroup. */ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) { struct css_task_iter it; struct task_struct *task; @@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - if (freeze) + write_seqcount_begin(&cgrp->freezer.freeze_seq); + if (freeze) { set_bit(CGRP_FREEZE, &cgrp->flags); - else + cgrp->freezer.freeze_start_nsec = ts_nsec; + } else { clear_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.frozen_nsec += (ts_nsec - + cgrp->freezer.freeze_start_nsec); + } + write_seqcount_end(&cgrp->freezer.freeze_seq); spin_unlock_irq(&css_set_lock); if (freeze) @@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) struct cgroup *parent; struct cgroup *dsct; bool applied = false; + u64 ts_nsec; bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) return; cgrp->freezer.freeze = freeze; + ts_nsec = ktime_get_ns(); /* * Propagate changes downwards the cgroup tree. @@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) /* * Do change actual state: freeze or unfreeze. */ - cgroup_do_freeze(dsct, freeze); + cgroup_do_freeze(dsct, freeze, ts_nsec); applied = true; } -- cgit v1.2.3 From d47cc4dea17391c99b943fa8d70a279e906b2843 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 Aug 2025 13:16:15 -0700 Subject: bpf: Use sha1() instead of sha1_transform() in bpf_prog_calc_tag() Now that there's a proper SHA-1 library API, just use that instead of the low-level SHA-1 compression function. This eliminates the need for bpf_prog_calc_tag() to implement the SHA-1 padding itself. No functional change; the computed tags remain the same. Signed-off-by: Eric Biggers Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20250811201615.564461-1-ebiggers@kernel.org --- include/linux/filter.h | 6 ------ kernel/bpf/core.c | 50 +++++++++----------------------------------------- 2 files changed, 9 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index c0a74fb9fcb1..9092d8ea95c8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -997,12 +997,6 @@ static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) return prog->len * sizeof(struct bpf_insn); } -static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) -{ - return round_up(bpf_prog_insn_size(prog) + - sizeof(__be64) + 1, SHA1_BLOCK_SIZE); -} - static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5d1650af899d..ef01cc644a96 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -293,28 +294,19 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); - u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA1_DIGEST_WORDS]; - u32 ws[SHA1_WORKSPACE_WORDS]; - u32 i, bsize, psize, blocks; + size_t size = bpf_prog_insn_size(fp); + u8 digest[SHA1_DIGEST_SIZE]; struct bpf_insn *dst; bool was_ld_map; - u8 *raw, *todo; - __be32 *result; - __be64 *bits; + u32 i; - raw = vmalloc(raw_size); - if (!raw) + dst = vmalloc(size); + if (!dst) return -ENOMEM; - sha1_init_raw(digest); - memset(ws, 0, sizeof(ws)); - /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ - dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && @@ -334,33 +326,9 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) was_ld_map = false; } } - - psize = bpf_prog_insn_size(fp); - memset(&raw[psize], 0, raw_size - psize); - raw[psize++] = 0x80; - - bsize = round_up(psize, SHA1_BLOCK_SIZE); - blocks = bsize / SHA1_BLOCK_SIZE; - todo = raw; - if (bsize - psize >= sizeof(__be64)) { - bits = (__be64 *)(todo + bsize - sizeof(__be64)); - } else { - bits = (__be64 *)(todo + bsize + bits_offset); - blocks++; - } - *bits = cpu_to_be64((psize - 1) << 3); - - while (blocks--) { - sha1_transform(digest, todo, ws); - todo += SHA1_BLOCK_SIZE; - } - - result = (__force __be32 *)digest; - for (i = 0; i < SHA1_DIGEST_WORDS; i++) - result[i] = cpu_to_be32(digest[i]); - memcpy(fp->tag, result, sizeof(fp->tag)); - - vfree(raw); + sha1((const u8 *)dst, size, digest); + memcpy(fp->tag, digest, sizeof(fp->tag)); + vfree(dst); return 0; } -- cgit v1.2.3 From 3c716487936aa54083c130d46ad5747769695e09 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 14 Aug 2025 18:59:49 +0200 Subject: genirq: Remove GENERIC_IRQ_LEGACY IA64 is gone and with it the last GENERIC_IRQ_LEGACY user. Remove GENERIC_IRQ_LEGACY. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250814165949.hvtP03r4@linutronix.de --- include/linux/irq.h | 4 ---- kernel/irq/Kconfig | 4 ---- kernel/irq/irqdesc.c | 7 ------- 3 files changed, 15 deletions(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 1d6b606a81ef..c9bcdbf6bc63 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -976,10 +976,6 @@ static inline void irq_free_desc(unsigned int irq) irq_free_descs(irq, 1); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq); -#endif - /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1da5e9d9da71..36673640c4fc 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -6,10 +6,6 @@ menu "IRQ subsystem" config MAY_HAVE_SPARSE_IRQ bool -# Legacy support, required for itanic -config GENERIC_IRQ_LEGACY - bool - # Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index b64c57b44c20..db714d3014b5 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -653,13 +653,6 @@ void irq_mark_irq(unsigned int irq) irq_insert_desc(irq, irq_desc + irq); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq) -{ - free_desc(irq); -} -#endif - #endif /* !CONFIG_SPARSE_IRQ */ int handle_irq_desc(struct irq_desc *desc) -- cgit v1.2.3 From 7a721a2fee2bce01af26699a87739db8ca8ea3c8 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 14 Aug 2025 07:28:31 +0800 Subject: genirq: Add irq_chip_(startup/shutdown)_parent() As the MSI controller on SG2044 uses PLIC as the underlying interrupt controller, it needs to call irq_enable() and irq_disable() to startup/shutdown interrupts. Otherwise, the MSI interrupt can not be startup correctly and will not respond any incoming interrupt. Introduce irq_chip_startup_parent() and irq_chip_shutdown_parent() to allow the interrupt controller to call the irq_startup()/irq_shutdown() callbacks of the parent interrupt chip. In case the irq_startup()/irq_shutdown() callbacks are not implemented for the parent interrupt chip, this will fallback to irq_chip_enable_parent() or irq_chip_disable_parent(). Suggested-by: Thomas Gleixner Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Tested-by: Chen Wang # Pioneerbox Reviewed-by: Chen Wang Link: https://lore.kernel.org/all/20250813232835.43458-2-inochiama@gmail.com Link: https://lore.kernel.org/lkml/20250722224513.22125-1-inochiama@gmail.com/ --- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index c9bcdbf6bc63..c67e76fbcc07 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -669,6 +669,8 @@ extern int irq_chip_set_parent_state(struct irq_data *data, extern int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); +extern void irq_chip_shutdown_parent(struct irq_data *data); +extern unsigned int irq_chip_startup_parent(struct irq_data *data); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0d0276378c70..3ffa0d80ddd1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1259,6 +1259,43 @@ int irq_chip_get_parent_state(struct irq_data *data, } EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); +/** + * irq_chip_shutdown_parent - Shutdown the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_shutdown() callback of the parent if available or falls + * back to irq_chip_disable_parent(). + */ +void irq_chip_shutdown_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_shutdown) + parent->chip->irq_shutdown(parent); + else + irq_chip_disable_parent(data); +} +EXPORT_SYMBOL_GPL(irq_chip_shutdown_parent); + +/** + * irq_chip_startup_parent - Startup the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_startup() callback of the parent if available or falls + * back to irq_chip_enable_parent(). + */ +unsigned int irq_chip_startup_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_startup) + return parent->chip->irq_startup(parent); + + irq_chip_enable_parent(data); + return 0; +} +EXPORT_SYMBOL_GPL(irq_chip_startup_parent); + /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) -- cgit v1.2.3 From 54f45a30c0d0153d2be091ba2d683ab6db6d1d5b Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 14 Aug 2025 07:28:32 +0800 Subject: PCI/MSI: Add startup/shutdown for per device domains As the RISC-V PLIC cannot apply affinity settings without invoking irq_enable(), it will make the interrupt unavailble when used as an underlying interrupt chip for the MSI controller. Implement the irq_startup() and irq_shutdown() callbacks for the PCI MSI and MSI-X templates. For chips that specify MSI_FLAG_PCI_MSI_STARTUP_PARENT, the parent startup and shutdown functions are invoked. That allows the interrupt on the parent chip to be enabled if the interrupt has not been enabled during allocation. This is necessary for MSI controllers which use PLIC as underlying parent interrupt chip. Suggested-by: Thomas Gleixner Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Tested-by: Chen Wang # Pioneerbox Reviewed-by: Chen Wang Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/all/20250813232835.43458-3-inochiama@gmail.com --- drivers/pci/msi/irqdomain.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/msi.h | 2 ++ 2 files changed, 54 insertions(+) (limited to 'include') diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 0938ef7ebabf..e0a800f918e8 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -148,6 +148,23 @@ static void pci_device_domain_set_desc(msi_alloc_info_t *arg, struct msi_desc *d arg->hwirq = desc->msi_index; } +static void cond_shutdown_parent(struct irq_data *data) +{ + struct msi_domain_info *info = data->domain->host_data; + + if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) + irq_chip_shutdown_parent(data); +} + +static unsigned int cond_startup_parent(struct irq_data *data) +{ + struct msi_domain_info *info = data->domain->host_data; + + if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) + return irq_chip_startup_parent(data); + return 0; +} + static __always_inline void cond_mask_parent(struct irq_data *data) { struct msi_domain_info *info = data->domain->host_data; @@ -164,6 +181,23 @@ static __always_inline void cond_unmask_parent(struct irq_data *data) irq_chip_unmask_parent(data); } +static void pci_irq_shutdown_msi(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + + pci_msi_mask(desc, BIT(data->irq - desc->irq)); + cond_shutdown_parent(data); +} + +static unsigned int pci_irq_startup_msi(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + unsigned int ret = cond_startup_parent(data); + + pci_msi_unmask(desc, BIT(data->irq - desc->irq)); + return ret; +} + static void pci_irq_mask_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); @@ -194,6 +228,8 @@ static void pci_irq_unmask_msi(struct irq_data *data) static const struct msi_domain_template pci_msi_template = { .chip = { .name = "PCI-MSI", + .irq_startup = pci_irq_startup_msi, + .irq_shutdown = pci_irq_shutdown_msi, .irq_mask = pci_irq_mask_msi, .irq_unmask = pci_irq_unmask_msi, .irq_write_msi_msg = pci_msi_domain_write_msg, @@ -210,6 +246,20 @@ static const struct msi_domain_template pci_msi_template = { }, }; +static void pci_irq_shutdown_msix(struct irq_data *data) +{ + pci_msix_mask(irq_data_get_msi_desc(data)); + cond_shutdown_parent(data); +} + +static unsigned int pci_irq_startup_msix(struct irq_data *data) +{ + unsigned int ret = cond_startup_parent(data); + + pci_msix_unmask(irq_data_get_msi_desc(data)); + return ret; +} + static void pci_irq_mask_msix(struct irq_data *data) { pci_msix_mask(irq_data_get_msi_desc(data)); @@ -234,6 +284,8 @@ EXPORT_SYMBOL_GPL(pci_msix_prepare_desc); static const struct msi_domain_template pci_msix_template = { .chip = { .name = "PCI-MSIX", + .irq_startup = pci_irq_startup_msix, + .irq_shutdown = pci_irq_shutdown_msix, .irq_mask = pci_irq_mask_msix, .irq_unmask = pci_irq_unmask_msix, .irq_write_msi_msg = pci_msi_domain_write_msg, diff --git a/include/linux/msi.h b/include/linux/msi.h index e5e86a8529fb..3111ba95fbde 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -568,6 +568,8 @@ enum { MSI_FLAG_PARENT_PM_DEV = (1 << 8), /* Support for parent mask/unmask */ MSI_FLAG_PCI_MSI_MASK_PARENT = (1 << 9), + /* Support for parent startup/shutdown */ + MSI_FLAG_PCI_MSI_STARTUP_PARENT = (1 << 10), /* Mask for the generic functionality */ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), -- cgit v1.2.3 From ab6d91d141a801dadf9eed7860b2ea09c9268149 Mon Sep 17 00:00:00 2001 From: Nickolay Goppen Date: Fri, 15 Aug 2025 19:56:51 +0300 Subject: dt-bindings: clock: gcc-sdm660: Add LPASS/CDSP vote clocks/GDSCs Add defines for the missing clocks, which are required to power up the related remote processors. Co-developed-by: Konrad Dybcio Signed-off-by: Konrad Dybcio Signed-off-by: Nickolay Goppen Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250815-gcc-sdm660-vote-clocks-and-gdscs-v1-1-c5a8af040093@yandex.ru Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,gcc-sdm660.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,gcc-sdm660.h b/include/dt-bindings/clock/qcom,gcc-sdm660.h index 74c22f67da21..f19018b742f5 100644 --- a/include/dt-bindings/clock/qcom,gcc-sdm660.h +++ b/include/dt-bindings/clock/qcom,gcc-sdm660.h @@ -138,10 +138,16 @@ #define GCC_UFS_UNIPRO_CORE_HW_CTL_CLK 128 #define GCC_RX0_USB2_CLKREF_CLK 129 #define GCC_RX1_USB2_CLKREF_CLK 130 +#define GCC_HLOS1_VOTE_LPASS_ADSP_SMMU_CLK 131 +#define GCC_HLOS1_VOTE_TURING_ADSP_SMMU_CLK 132 +#define GCC_HLOS2_VOTE_TURING_ADSP_SMMU_CLK 133 #define PCIE_0_GDSC 0 #define UFS_GDSC 1 #define USB_30_GDSC 2 +#define HLOS1_VOTE_TURING_ADSP_GDSC 3 +#define HLOS2_VOTE_TURING_ADSP_GDSC 4 +#define HLOS1_VOTE_LPASS_ADSP_GDSC 5 #define GCC_QUSB2PHY_PRIM_BCR 0 #define GCC_QUSB2PHY_SEC_BCR 1 -- cgit v1.2.3 From 7d50d9bf1cd00d6bab0abf3b01d5d261aa6a2b04 Mon Sep 17 00:00:00 2001 From: Troy Mitchell Date: Mon, 11 Aug 2025 21:40:33 +0800 Subject: dt-bindings: clock: spacemit: CLK_SSPA_I2S_BCLK for SSPA In order to use the virtual clock SSPAx_I2S_BCLK in the device tree and register it in the driver, this patch introduces the macro definition. Fixes: 1b72c59db0add ("clk: spacemit: Add clock support for SpacemiT K1 SoC") Acked-by: Rob Herring (Arm) Signed-off-by: Troy Mitchell Link: https://lore.kernel.org/r/20250811-k1-clk-i2s-v5-1-ebadd06e1e91@linux.spacemit.com Signed-off-by: Yixun Lan --- include/dt-bindings/clock/spacemit,k1-syscon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/spacemit,k1-syscon.h b/include/dt-bindings/clock/spacemit,k1-syscon.h index 2714c3fe66cd..505205453d7f 100644 --- a/include/dt-bindings/clock/spacemit,k1-syscon.h +++ b/include/dt-bindings/clock/spacemit,k1-syscon.h @@ -182,6 +182,8 @@ #define CLK_SSPA1_BUS 97 #define CLK_TSEN_BUS 98 #define CLK_IPC_AP2AUD_BUS 99 +#define CLK_SSPA0_I2S_BCLK 100 +#define CLK_SSPA1_I2S_BCLK 101 /* APBC resets */ #define RESET_UART0 0 -- cgit v1.2.3 From 92a96b0a227e91dc42475265a1ce766b6cd044fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 17 Aug 2025 23:09:18 +0100 Subject: io_uring: add request poisoning Poison various request fields on free. __io_req_caches_free() is a slow path, so can be done unconditionally, but gate it on kasan for io_req_add_to_cache(). Note that some fields are logically retained between cache allocations and can't be poisoned in io_req_add_to_cache(). Ideally, it'd be replaced with KASAN'ed caches, but that can't be enabled because of some synchronisation nuances. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7a78e8a7f5be434313c400650b862e36c211b312.1755459452.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/poison.h | 3 +++ io_uring/io_uring.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/poison.h b/include/linux/poison.h index 8ca2235f78d5..299e2dd7da6d 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -90,4 +90,7 @@ /********** lib/stackdepot.c **********/ #define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA)) +/********** io_uring/ **********/ +#define IO_URING_PTR_POISON ((void *)(0x1091UL + POISON_POINTER_DELTA)) + #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 93633613a165..e511949086dd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -179,6 +179,26 @@ static const struct ctl_table kernel_io_uring_disabled_table[] = { }; #endif +static void io_poison_cached_req(struct io_kiocb *req) +{ + req->ctx = IO_URING_PTR_POISON; + req->tctx = IO_URING_PTR_POISON; + req->file = IO_URING_PTR_POISON; + req->creds = IO_URING_PTR_POISON; + req->io_task_work.func = IO_URING_PTR_POISON; + req->apoll = IO_URING_PTR_POISON; +} + +static void io_poison_req(struct io_kiocb *req) +{ + io_poison_cached_req(req); + req->async_data = IO_URING_PTR_POISON; + req->kbuf = IO_URING_PTR_POISON; + req->comp_list.next = IO_URING_PTR_POISON; + req->file_node = IO_URING_PTR_POISON; + req->link = IO_URING_PTR_POISON; +} + static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); @@ -235,6 +255,8 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { + if (IS_ENABLED(CONFIG_KASAN)) + io_poison_cached_req(req); wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } @@ -2767,6 +2789,7 @@ static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); + io_poison_req(req); kmem_cache_free(req_cachep, req); nr++; } -- cgit v1.2.3 From 5fda51255439addd1c9059098e30847a375a1008 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Aug 2025 20:03:39 -0600 Subject: io_uring/kbuf: switch to storing struct io_buffer_list locally Currently the buffer list is stored in struct io_kiocb. The buffer list can be of two types: 1) Classic/legacy buffer list. These don't need to get referenced after a buffer pick, and hence storing them in struct io_kiocb is perfectly fine. 2) Ring provided buffer lists. These DO need to be referenced after the initial buffer pick, as they need to get consumed later on. This can be either just incrementing the head of the ring, or it can be consuming parts of a buffer if incremental buffer consumptions has been configured. For case 2, io_uring needs to be careful not to access the buffer list after the initial pick-and-execute context. The core does recycling of these, but it's easy to make a mistake, because it's stored in the io_kiocb which does persist across multiple execution contexts. Either because it's a multishot request, or simply because it needed some kind of async trigger (eg poll) for retry purposes. Add a struct io_buffer_list to struct io_br_sel, which is always on stack for the various users of it. This prevents the buffer list from leaking outside of that execution context, and additionally it enables kbuf to not even pass back the struct io_buffer_list if the given context isn't appropriately locked already. This doesn't fix any bugs, it's simply a defensive measure to prevent any issues with reuse of a buffer list. Link: https://lore.kernel.org/r/20250821020750.598432-12-axboe@kernel.dk Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ------ io_uring/io_uring.c | 6 +++--- io_uring/kbuf.c | 27 ++++++++++++++----------- io_uring/kbuf.h | 16 ++++++--------- io_uring/net.c | 46 ++++++++++++++++-------------------------- io_uring/poll.c | 6 +++--- io_uring/rw.c | 22 ++++++++++---------- 7 files changed, 55 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 80a178f3d896..1d33984611bc 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -674,12 +674,6 @@ struct io_kiocb { /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; - /* - * stores buffer ID for ring provided buffers, valid IFF - * REQ_F_BUFFER_RING is set. - */ - struct io_buffer_list *buf_list; - struct io_rsrc_node *buf_node; }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 27bc1486f07b..985b4681e513 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1007,7 +1007,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); - io_req_set_res(req, res, io_put_kbuf(req, res, req->buf_list)); + io_req_set_res(req, res, io_put_kbuf(req, res, NULL)); if (def->fail) def->fail(req); io_req_complete_defer(req); @@ -2025,11 +2025,11 @@ fail: switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: - io_kbuf_recycle(req, req->buf_list, 0); + io_kbuf_recycle(req, NULL, 0); io_req_task_queue(req); break; case IO_APOLL_ABORTED: - io_kbuf_recycle(req, req->buf_list, 0); + io_kbuf_recycle(req, NULL, 0); io_queue_iowq(req); break; case IO_APOLL_OK: diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 21c12c437ab9..3e9aab21af9d 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -171,8 +171,8 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, if (*len == 0 || *len > buf->len) *len = buf->len; req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; - req->buf_list = bl; req->buf_index = buf->bid; + sel.buf_list = bl; sel.addr = u64_to_user_ptr(buf->addr); if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { @@ -186,8 +186,8 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry). */ - io_kbuf_commit(req, bl, *len, 1); - req->buf_list = NULL; + io_kbuf_commit(req, sel.buf_list, *len, 1); + sel.buf_list = NULL; } return sel; } @@ -294,7 +294,6 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, req->flags |= REQ_F_BL_EMPTY; req->flags |= REQ_F_BUFFER_RING; - req->buf_list = bl; return iov - arg->iovs; } @@ -302,16 +301,15 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, struct io_br_sel *sel, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; int ret = -ENOENT; io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, arg->buf_group); - if (unlikely(!bl)) + sel->buf_list = io_buffer_get_list(ctx, arg->buf_group); + if (unlikely(!sel->buf_list)) goto out_unlock; - if (bl->flags & IOBL_BUF_RING) { - ret = io_ring_buffers_peek(req, arg, bl); + if (sel->buf_list->flags & IOBL_BUF_RING) { + ret = io_ring_buffers_peek(req, arg, sel->buf_list); /* * Don't recycle these buffers if we need to go through poll. * Nobody else can use them anyway, and holding on to provided @@ -321,13 +319,16 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, */ if (ret > 0) { req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; - io_kbuf_commit(req, bl, arg->out_len, ret); + io_kbuf_commit(req, sel->buf_list, arg->out_len, ret); } } else { - ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); + ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs); } out_unlock: - io_ring_submit_unlock(ctx, issue_flags); + if (issue_flags & IO_URING_F_UNLOCKED) { + sel->buf_list = NULL; + mutex_unlock(&ctx->uring_lock); + } return ret; } @@ -348,10 +349,12 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, ret = io_ring_buffers_peek(req, arg, bl); if (ret > 0) req->flags |= REQ_F_BUFFERS_COMMIT; + sel->buf_list = bl; return ret; } /* don't support multiple buffer selections for legacy */ + sel->buf_list = NULL; return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index b1723c2620da..1a539969fc9c 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -63,11 +63,14 @@ struct buf_sel_arg { }; /* - * Return value from io_buffer_list selection. Just returns the error or - * user address for now, will be extended to return the buffer list in the - * future. + * Return value from io_buffer_list selection, to avoid stashing it in + * struct io_kiocb. For legacy/classic provided buffers, keeping a reference + * across execution contexts are fine. But for ring provided buffers, the + * list may go away as soon as ->uring_lock is dropped. As the io_kiocb + * persists, it's better to just keep the buffer local for those cases. */ struct io_br_sel { + struct io_buffer_list *buf_list; /* * Some selection parts return the user address, others return an error. */ @@ -107,13 +110,6 @@ struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, static inline bool io_kbuf_recycle_ring(struct io_kiocb *req, struct io_buffer_list *bl) { - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. - * The exception is partial io, that case we should increment bl->head - * to monopolize the buffer. - */ if (bl) { req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; diff --git a/io_uring/net.c b/io_uring/net.c index 4eb208d24169..b00cd2f59091 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -433,7 +433,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->opcode == IORING_OP_SENDMSG) return -EINVAL; sr->msg_flags |= MSG_WAITALL; - req->buf_list = NULL; req->flags |= REQ_F_MULTISHOT; } @@ -512,11 +511,11 @@ static inline bool io_send_finish(struct io_kiocb *req, unsigned int cflags; if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { - cflags = io_put_kbuf(req, sel->val, req->buf_list); + cflags = io_put_kbuf(req, sel->val, sel->buf_list); goto finish; } - cflags = io_put_kbufs(req, sel->val, req->buf_list, io_bundle_nbufs(kmsg, sel->val)); + cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val)); if (bundle_finished || req->flags & REQ_F_BL_EMPTY) goto finish; @@ -657,6 +656,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_bundle: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { ret = io_send_select_buffer(req, issue_flags, &sel, kmsg); if (ret) @@ -682,7 +682,7 @@ retry_bundle: sr->len -= ret; sr->buf += ret; sr->done_io += ret; - return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret); + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -795,18 +795,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; - if (req->flags & REQ_F_BUFFER_SELECT) { - /* - * Store the buffer group for this multishot receive separately, - * as if we end up doing an io-wq based issue that selects a - * buffer, it has to be committed immediately and that will - * clear ->buf_list. This means we lose the link to the buffer - * list, and the eventual buffer put on completion then cannot - * restore it. - */ + if (req->flags & REQ_F_BUFFER_SELECT) sr->buf_group = req->buf_index; - req->buf_list = NULL; - } sr->mshot_total_len = sr->mshot_len = 0; if (sr->flags & IORING_RECV_MULTISHOT) { if (!(req->flags & REQ_F_BUFFER_SELECT)) @@ -874,7 +864,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, if (sr->flags & IORING_RECVSEND_BUNDLE) { size_t this_ret = sel->val - sr->done_io; - cflags |= io_put_kbufs(req, this_ret, req->buf_list, io_bundle_nbufs(kmsg, this_ret)); + cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret)); if (sr->flags & IORING_RECV_RETRY) cflags = req->cqe.flags | (cflags & CQE_F_MASK); if (sr->mshot_len && sel->val >= sr->mshot_len) @@ -896,7 +886,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, return false; } } else { - cflags |= io_put_kbuf(req, sel->val, req->buf_list); + cflags |= io_put_kbuf(req, sel->val, sel->buf_list); } /* @@ -1038,6 +1028,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { size_t len = sr->len; @@ -1048,7 +1039,7 @@ retry_multishot: if (req->flags & REQ_F_APOLL_MULTISHOT) { ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len); if (ret) { - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); return ret; } } @@ -1072,14 +1063,12 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) - io_kbuf_recycle(req, req->buf_list, issue_flags); - + io_kbuf_recycle(req, sel.buf_list, issue_flags); return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret); + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1093,7 +1082,7 @@ retry_multishot: else if (sr->done_io) ret = sr->done_io; else - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); sel.val = ret; if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) @@ -1178,7 +1167,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; - struct io_br_sel sel = { }; + struct io_br_sel sel; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -1198,6 +1187,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { sel.val = sr->len; ret = io_recv_buf_select(req, kmsg, &sel, issue_flags); @@ -1217,16 +1207,14 @@ retry_multishot: ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) - io_kbuf_recycle(req, req->buf_list, issue_flags); - + io_kbuf_recycle(req, sel.buf_list, issue_flags); return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; - return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret); + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1242,7 +1230,7 @@ out_free: else if (sr->done_io) ret = sr->done_io; else - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); sel.val = ret; if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) diff --git a/io_uring/poll.c b/io_uring/poll.c index 07ab22380c78..f3852bf7627b 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -316,10 +316,10 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) ret = io_poll_check_events(req, tw); if (ret == IOU_POLL_NO_ACTION) { - io_kbuf_recycle(req, req->buf_list, 0); + io_kbuf_recycle(req, NULL, 0); return; } else if (ret == IOU_POLL_REQUEUE) { - io_kbuf_recycle(req, req->buf_list, 0); + io_kbuf_recycle(req, NULL, 0); __io_poll_execute(req, 0); return; } @@ -686,7 +686,7 @@ int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask) req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, NULL, issue_flags); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); if (ret) diff --git a/io_uring/rw.c b/io_uring/rw.c index 2b106f644383..906e869d330a 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -579,7 +579,7 @@ void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) - req->cqe.flags |= io_put_kbuf(req, req->cqe.res, req->buf_list); + req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); io_req_rw_cleanup(req, 0); io_req_task_complete(req, tw); @@ -648,7 +648,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) } static int kiocb_done(struct io_kiocb *req, ssize_t ret, - unsigned int issue_flags) + struct io_br_sel *sel, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned final_ret = io_fixup_rw_res(req, ret); @@ -662,7 +662,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, * from the submission path. */ io_req_io_end(req); - io_req_set_res(req, final_ret, io_put_kbuf(req, ret, req->buf_list)); + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, sel->buf_list)); io_req_rw_cleanup(req, issue_flags); return IOU_COMPLETE; } else { @@ -1024,10 +1024,10 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = __io_read(req, &sel, issue_flags); if (ret >= 0) - return kiocb_done(req, ret, issue_flags); + return kiocb_done(req, ret, &sel, issue_flags); if (req->flags & REQ_F_BUFFERS_COMMIT) - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); return ret; } @@ -1057,15 +1057,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * Reset rw->len to 0 again to avoid clamping future mshot * reads, in case the buffer size varies. */ - if (io_kbuf_recycle(req, req->buf_list, issue_flags)) + if (io_kbuf_recycle(req, sel.buf_list, issue_flags)) rw->len = 0; return IOU_RETRY; } else if (ret <= 0) { - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); if (ret < 0) req_set_fail(req); } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - cflags = io_put_kbuf(req, ret, req->buf_list); + cflags = io_put_kbuf(req, ret, sel.buf_list); } else { /* * Any successful return value will keep the multishot read @@ -1073,7 +1073,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * we fail to post a CQE, or multishot is no longer set, then * jump to the termination path. This request is then done. */ - cflags = io_put_kbuf(req, ret, req->buf_list); + cflags = io_put_kbuf(req, ret, sel.buf_list); rw->len = 0; /* similarly to above, reset len to 0 */ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { @@ -1202,7 +1202,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } done: - return kiocb_done(req, ret2, issue_flags); + return kiocb_done(req, ret2, NULL, issue_flags); } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); @@ -1370,7 +1370,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (!smp_load_acquire(&req->iopoll_completed)) break; nr_events++; - req->cqe.flags = io_put_kbuf(req, req->cqe.res, req->buf_list); + req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); if (req->opcode != IORING_OP_URING_CMD) io_req_rw_cleanup(req, 0); } -- cgit v1.2.3 From d589bcddaa3f8b1668499c3f0466863df3abe37a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 21 Aug 2025 12:02:06 +0800 Subject: io-uring: move `struct io_br_sel` into io_uring_types.h Move `struct io_br_sel` into io_uring_types.h and prepare for supporting provided buffer on uring_cmd. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250821040210.1152145-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 19 +++++++++++++++++++ io_uring/kbuf.h | 18 ------------------ 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1d33984611bc..9c6c548f43f5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -85,6 +85,25 @@ struct io_mapped_region { unsigned flags; }; +/* + * Return value from io_buffer_list selection, to avoid stashing it in + * struct io_kiocb. For legacy/classic provided buffers, keeping a reference + * across execution contexts are fine. But for ring provided buffers, the + * list may go away as soon as ->uring_lock is dropped. As the io_kiocb + * persists, it's better to just keep the buffer local for those cases. + */ +struct io_br_sel { + struct io_buffer_list *buf_list; + /* + * Some selection parts return the user address, others return an error. + */ + union { + void __user *addr; + ssize_t val; + }; +}; + + /* * Arbitrary limit, can be raised if need be */ diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 32f73adbe1e9..ada382ff38d7 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -62,24 +62,6 @@ struct buf_sel_arg { unsigned short partial_map; }; -/* - * Return value from io_buffer_list selection, to avoid stashing it in - * struct io_kiocb. For legacy/classic provided buffers, keeping a reference - * across execution contexts are fine. But for ring provided buffers, the - * list may go away as soon as ->uring_lock is dropped. As the io_kiocb - * persists, it's better to just keep the buffer local for those cases. - */ -struct io_br_sel { - struct io_buffer_list *buf_list; - /* - * Some selection parts return the user address, others return an error. - */ - union { - void __user *addr; - ssize_t val; - }; -}; - struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, unsigned buf_group, unsigned int issue_flags); int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, -- cgit v1.2.3 From 620a50c927004f5c9420a7ca9b1a55673dbf3941 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 21 Aug 2025 12:02:07 +0800 Subject: io_uring: uring_cmd: add multishot support Add UAPI flag IORING_URING_CMD_MULTISHOT for supporting multishot uring_cmd operations with provided buffer. This enables drivers to post multiple completion events from a single uring_cmd submission, which is useful for: - Notifying userspace of device events (e.g., interrupt handling) - Supporting devices with multiple event sources (e.g., multi-queue devices) - Avoiding the need for device poll() support when events originate from multiple sources device-wide The implementation adds two new APIs: - io_uring_cmd_select_buffer(): selects a buffer from the provided buffer group for multishot uring_cmd - io_uring_mshot_cmd_post_cqe(): posts a CQE after event data is pushed to the provided buffer Multishot uring_cmd must be used with buffer select (IOSQE_BUFFER_SELECT) and is mutually exclusive with IORING_URING_CMD_FIXED for now. The ublk driver will be the first user of this functionality: https://github.com/ming1/linux/commits/ublk-devel/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250821040210.1152145-3-ming.lei@redhat.com [axboe: fold in fix for !CONFIG_IO_URING] Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 26 ++++++++++++++++ include/uapi/linux/io_uring.h | 6 +++- io_uring/opdef.c | 1 + io_uring/uring_cmd.c | 71 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 102 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index cfa6d0c0c322..4bd3a7339243 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -70,6 +70,21 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, /* Execute the request from a blocking context */ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); +/* + * Select a buffer from the provided buffer group for multishot uring_cmd. + * Returns the selected buffer address and size. + */ +struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, + unsigned buf_group, size_t *len, + unsigned int issue_flags); + +/* + * Complete a multishot uring_cmd event. This will post a CQE to the completion + * queue and update the provided buffer. + */ +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags); + #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -102,6 +117,17 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) { } +static inline struct io_br_sel +io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group, + size_t *len, unsigned int issue_flags) +{ + return (struct io_br_sel) { .val = -EOPNOTSUPP }; +} +static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + ssize_t ret, unsigned int issue_flags) +{ + return true; +} #endif /* diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6957dc539d83..1e935f8901c5 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -298,9 +298,13 @@ enum io_uring_op { * sqe->uring_cmd_flags top 8bits aren't available for userspace * IORING_URING_CMD_FIXED use registered buffer; pass this flag * along with setting sqe->buf_index. + * IORING_URING_CMD_MULTISHOT must be used with buffer select, like other + * multishot commands. Not compatible with + * IORING_URING_CMD_FIXED, for now. */ #define IORING_URING_CMD_FIXED (1U << 0) -#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED +#define IORING_URING_CMD_MULTISHOT (1U << 1) +#define IORING_URING_CMD_MASK (IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT) /* diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 9568785810d9..932319633eac 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -413,6 +413,7 @@ const struct io_issue_def io_issue_defs[] = { #endif }, [IORING_OP_URING_CMD] = { + .buffer_select = 1, .needs_file = 1, .plug = 1, .iopoll = 1, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 053bac89b6c0..3cfb5d51b88a 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -11,6 +11,7 @@ #include "io_uring.h" #include "alloc_cache.h" #include "rsrc.h" +#include "kbuf.h" #include "uring_cmd.h" #include "poll.h" @@ -194,8 +195,21 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; - if (ioucmd->flags & IORING_URING_CMD_FIXED) + if (ioucmd->flags & IORING_URING_CMD_FIXED) { + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) + return -EINVAL; req->buf_index = READ_ONCE(sqe->buf_index); + } + + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { + if (ioucmd->flags & IORING_URING_CMD_FIXED) + return -EINVAL; + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + } else { + if (req->flags & REQ_F_BUFFER_SELECT) + return -EINVAL; + } ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); @@ -251,6 +265,10 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { + if (ret >= 0) + return IOU_ISSUE_SKIP_COMPLETE; + } if (ret == -EAGAIN) { ioucmd->flags |= IORING_URING_CMD_REISSUE; return ret; @@ -333,3 +351,54 @@ bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, return false; return io_req_post_cqe32(req, cqe); } + +/* + * Work with io_uring_mshot_cmd_post_cqe() together for committing the + * provided buffer upfront + */ +struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, + unsigned buf_group, size_t *len, + unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return (struct io_br_sel) { .val = -EINVAL }; + + if (WARN_ON_ONCE(!io_do_buffer_select(req))) + return (struct io_br_sel) { .val = -EINVAL }; + + return io_buffer_select(req, len, buf_group, issue_flags); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_buffer_select); + +/* + * Return true if this multishot uring_cmd needs to be completed, otherwise + * the event CQE is posted successfully. + * + * This function must use `struct io_br_sel` returned from + * io_uring_cmd_buffer_select() for committing the buffer in the same + * uring_cmd submission context. + */ +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + unsigned int cflags = 0; + + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return true; + + if (sel->val > 0) { + cflags = io_put_kbuf(req, sel->val, sel->buf_list); + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) + return false; + } + + io_kbuf_recycle(req, sel->buf_list, issue_flags); + if (sel->val < 0) + req_set_fail(req); + io_req_set_res(req, sel->val, cflags); + return true; +} +EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe); -- cgit v1.2.3 From d0201c4436c53412146d526855c585fa9d54ca13 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:01:46 -0600 Subject: io_uring: remove io_ctx_cqe32() helper It's pretty pointless and only used for the tracing helper, get rid of it. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ------ include/trace/events/io_uring.h | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9c6c548f43f5..d1e25f3fe0b3 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -740,10 +740,4 @@ struct io_overflow_cqe { struct list_head list; struct io_uring_cqe cqe; }; - -static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx) -{ - return ctx->flags & IORING_SETUP_CQE32; -} - #endif diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 178ab6f611be..6a970625a3ea 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -340,8 +340,8 @@ TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe), __entry->user_data = cqe->user_data; __entry->res = cqe->res; __entry->cflags = cqe->flags; - __entry->extra1 = io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0; - __entry->extra2 = io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0; + __entry->extra1 = ctx->flags & IORING_SETUP_CQE32 ? cqe->big_cqe[0] : 0; + __entry->extra2 = ctx->flags & IORING_SETUP_CQE32 ? cqe->big_cqe[1] : 0; ), TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x " -- cgit v1.2.3 From b69458735d826f0676585623d028a0fd474f3e4f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:08:14 -0600 Subject: io_uring: add UAPI definitions for mixed CQE postings This adds the CQE flags related to supporting a mixed CQ ring mode, where both normal (16b) and big (32b) CQEs may be posted. No functional changes in this patch. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1e935f8901c5..7af8d10b3aba 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -491,12 +491,22 @@ struct io_uring_cqe { * other provided buffer type, all completions with a * buffer passed back is automatically returned to the * application. + * IORING_CQE_F_SKIP If set, then the application/liburing must ignore this + * CQE. It's only purpose is to fill a gap in the ring, + * if a large CQE is attempted posted when the ring has + * just a single small CQE worth of space left before + * wrapping. + * IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings + * setup in a mixed CQE mode, where both 16b and 32b + * CQEs may be posted to the CQ ring. */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_NOTIF (1U << 3) #define IORING_CQE_F_BUF_MORE (1U << 4) +#define IORING_CQE_F_SKIP (1U << 5) +#define IORING_CQE_F_32 (1U << 15) #define IORING_CQE_BUFFER_SHIFT 16 -- cgit v1.2.3 From 89a885972140ea68d3f55457d23d0da2350c96ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:13:36 -0600 Subject: io_uring/trace: support completion tracing of mixed 32b CQEs Check for IORING_CQE_F_32 as well, not just if the ring was setup with IORING_SETUP_CQE32 to only support big CQEs. Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 6a970625a3ea..45d15460b495 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -340,8 +340,8 @@ TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe), __entry->user_data = cqe->user_data; __entry->res = cqe->res; __entry->cflags = cqe->flags; - __entry->extra1 = ctx->flags & IORING_SETUP_CQE32 ? cqe->big_cqe[0] : 0; - __entry->extra2 = ctx->flags & IORING_SETUP_CQE32 ? cqe->big_cqe[1] : 0; + __entry->extra1 = ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[0] : 0; + __entry->extra2 = ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[1] : 0; ), TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x " -- cgit v1.2.3 From 6e376f245f19feeadddafb2c3fa5fbd6469ecdfe Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:42 +0200 Subject: gpio: generic: provide to_gpio_generic_chip() Provide a helper allowing to convert a struct gpio_chip address to the struct gpio_generic_chip that wraps it. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-1-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/generic.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index f3a8db4598bb..5a85ecbef8d2 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -55,6 +55,12 @@ struct gpio_generic_chip { struct gpio_chip gc; }; +static inline struct gpio_generic_chip * +to_gpio_generic_chip(struct gpio_chip *gc) +{ + return container_of(gc, struct gpio_generic_chip, gc); +} + /** * gpio_generic_chip_init() - Initialize a generic GPIO chip. * @chip: Generic GPIO chip to set up. -- cgit v1.2.3 From 16397871b6e35fa46a2bec27b3558f93b050c6fc Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:43 +0200 Subject: gpio: generic: provide helpers for reading and writing registers Provide helpers wrapping the read_reg() and write_reg() callbacks of the generic GPIO API that are called directly by many users. This is done to hide their implementation ahead of moving them into the separate generic GPIO struct. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-2-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/generic.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index 5a85ecbef8d2..4c0626b53ec9 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -100,6 +100,37 @@ gpio_generic_chip_set(struct gpio_generic_chip *chip, unsigned int offset, return chip->gc.set(&chip->gc, offset, value); } +/** + * gpio_generic_read_reg() - Read a register using the underlying callback. + * @chip: Generic GPIO chip to use. + * @reg: Register to read. + * + * Returns: value read from register. + */ +static inline unsigned long +gpio_generic_read_reg(struct gpio_generic_chip *chip, void __iomem *reg) +{ + if (WARN_ON(!chip->gc.read_reg)) + return 0; + + return chip->gc.read_reg(reg); +} + +/** + * gpio_generic_write_reg() - Write a register using the underlying callback. + * @chip: Generic GPIO chip to use. + * @reg: Register to write to. + * @val: New value to write. + */ +static inline void gpio_generic_write_reg(struct gpio_generic_chip *chip, + void __iomem *reg, unsigned long val) +{ + if (WARN_ON(!chip->gc.write_reg)) + return; + + chip->gc.write_reg(reg, val); +} + #define gpio_generic_chip_lock(gen_gc) \ raw_spin_lock(&(gen_gc)->gc.bgpio_lock) -- cgit v1.2.3 From 60ad9a07319283e6e1094cef3e972e754315c024 Mon Sep 17 00:00:00 2001 From: Junjie Cao Date: Wed, 20 Aug 2025 08:47:55 +0800 Subject: iio: core: switch info_mask fields to unsigned long to match find_bit helpers for_each_set_bit()/find_*_bit() expect arrays of unsigned long (see include/linux/find.h), but industrialio-core passed const long * into iio_device_add_info_mask_type{,_avail}(). These masks are used purely as bit arrays and are populated via BIT() (1UL << n). Switch the info_mask_* fields and the corresponding function parameters to unsigned long so the types match the helpers. This removes sparse warnings about signedness mismatches (seen with 'make C=1' CF='-Wsparse-all') without changing behavior or struct layout. No functional change intended. Suggested-by: Jonathan Cameron Signed-off-by: Junjie Cao Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20250820004755.69627-1-junjie.cao@intel.com Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 4 ++-- include/linux/iio/iio.h | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index eb6a54f8115d..38848d753a33 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1244,7 +1244,7 @@ static int iio_device_add_channel_label(struct iio_dev *indio_dev, static int iio_device_add_info_mask_type(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, enum iio_shared_by shared_by, - const long *infomask) + const unsigned long *infomask) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int i, ret, attrcount = 0; @@ -1274,7 +1274,7 @@ static int iio_device_add_info_mask_type(struct iio_dev *indio_dev, static int iio_device_add_info_mask_type_avail(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, enum iio_shared_by shared_by, - const long *infomask) + const unsigned long *infomask) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int i, ret, attrcount = 0; diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 2f5560646ee4..872ebdf0dd77 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -271,14 +271,14 @@ struct iio_chan_spec { unsigned int num_ext_scan_type; }; }; - long info_mask_separate; - long info_mask_separate_available; - long info_mask_shared_by_type; - long info_mask_shared_by_type_available; - long info_mask_shared_by_dir; - long info_mask_shared_by_dir_available; - long info_mask_shared_by_all; - long info_mask_shared_by_all_available; + unsigned long info_mask_separate; + unsigned long info_mask_separate_available; + unsigned long info_mask_shared_by_type; + unsigned long info_mask_shared_by_type_available; + unsigned long info_mask_shared_by_dir; + unsigned long info_mask_shared_by_dir_available; + unsigned long info_mask_shared_by_all; + unsigned long info_mask_shared_by_all_available; const struct iio_event_spec *event_spec; unsigned int num_event_specs; const struct iio_chan_spec_ext_info *ext_info; -- cgit v1.2.3 From 5195b777552d2e2fa735c6cad75797efa132bd60 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Tue, 29 Jul 2025 02:50:10 +0300 Subject: media: v4l2-subdev: Make struct v4l2_subdev_stream_config private The v4l2_subdev_stream_config structure holds configuration data for a stream. It was meant to be used internally only, but already found its way into the ds90ub913 driver. Now that the driver has been fixed, make the structure private to v4l2-subdev.c to avoid using it by accident. Signed-off-by: Laurent Pinchart Reviewed-by: Jacopo Mondi Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/v4l2-core/v4l2-subdev.c | 24 ++++++++++++++++++++++++ include/media/v4l2-subdev.h | 25 +------------------------ 2 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c index 41e4aca77b7f..1da953629010 100644 --- a/drivers/media/v4l2-core/v4l2-subdev.c +++ b/drivers/media/v4l2-core/v4l2-subdev.c @@ -26,6 +26,30 @@ #include #include +/** + * struct v4l2_subdev_stream_config - Used for storing stream configuration. + * + * @pad: pad number + * @stream: stream number + * @enabled: has the stream been enabled with v4l2_subdev_enable_streams() + * @fmt: &struct v4l2_mbus_framefmt + * @crop: &struct v4l2_rect to be used for crop + * @compose: &struct v4l2_rect to be used for compose + * @interval: frame interval + * + * This structure stores configuration for a stream. + */ +struct v4l2_subdev_stream_config { + u32 pad; + u32 stream; + bool enabled; + + struct v4l2_mbus_framefmt fmt; + struct v4l2_rect crop; + struct v4l2_rect compose; + struct v4l2_fract interval; +}; + #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) /* * The Streams API is an experimental feature. To use the Streams API, set diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index 5dcf4065708f..8f54fd0d90ad 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -36,6 +36,7 @@ struct v4l2_event_subscription; struct v4l2_fh; struct v4l2_subdev; struct v4l2_subdev_fh; +struct v4l2_subdev_stream_config; struct tuner_setup; struct v4l2_mbus_frame_desc; struct led_classdev; @@ -683,30 +684,6 @@ struct v4l2_subdev_pad_config { struct v4l2_fract interval; }; -/** - * struct v4l2_subdev_stream_config - Used for storing stream configuration. - * - * @pad: pad number - * @stream: stream number - * @enabled: has the stream been enabled with v4l2_subdev_enable_streams() - * @fmt: &struct v4l2_mbus_framefmt - * @crop: &struct v4l2_rect to be used for crop - * @compose: &struct v4l2_rect to be used for compose - * @interval: frame interval - * - * This structure stores configuration for a stream. - */ -struct v4l2_subdev_stream_config { - u32 pad; - u32 stream; - bool enabled; - - struct v4l2_mbus_framefmt fmt; - struct v4l2_rect crop; - struct v4l2_rect compose; - struct v4l2_fract interval; -}; - /** * struct v4l2_subdev_stream_configs - A collection of stream configs. * -- cgit v1.2.3 From f37df9a0eb5e43fcfe02cbaef076123dc0d79c7e Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Fri, 8 Aug 2025 11:59:15 +0300 Subject: media: v4l2-subdev: Fix alloc failure check in v4l2_subdev_call_state_try() v4l2_subdev_call_state_try() macro allocates a subdev state with __v4l2_subdev_state_alloc(), but does not check the returned value. If __v4l2_subdev_state_alloc fails, it returns an ERR_PTR, and that would cause v4l2_subdev_call_state_try() to crash. Add proper error handling to v4l2_subdev_call_state_try(). Signed-off-by: Tomi Valkeinen Fixes: 982c0487185b ("media: subdev: Add v4l2_subdev_call_state_try() macro") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/aJTNtpDUbTz7eyJc%40stanley.mountain/ Cc: stable@vger.kernel.org Reviewed-by: Dan Carpenter Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- include/media/v4l2-subdev.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index 8f54fd0d90ad..4b28086808c9 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -1939,19 +1939,23 @@ extern const struct v4l2_subdev_ops v4l2_subdev_call_wrappers; * * Note: only legacy non-MC drivers may need this macro. */ -#define v4l2_subdev_call_state_try(sd, o, f, args...) \ - ({ \ - int __result; \ - static struct lock_class_key __key; \ - const char *name = KBUILD_BASENAME \ - ":" __stringify(__LINE__) ":state->lock"; \ - struct v4l2_subdev_state *state = \ - __v4l2_subdev_state_alloc(sd, name, &__key); \ - v4l2_subdev_lock_state(state); \ - __result = v4l2_subdev_call(sd, o, f, state, ##args); \ - v4l2_subdev_unlock_state(state); \ - __v4l2_subdev_state_free(state); \ - __result; \ +#define v4l2_subdev_call_state_try(sd, o, f, args...) \ + ({ \ + int __result; \ + static struct lock_class_key __key; \ + const char *name = KBUILD_BASENAME \ + ":" __stringify(__LINE__) ":state->lock"; \ + struct v4l2_subdev_state *state = \ + __v4l2_subdev_state_alloc(sd, name, &__key); \ + if (IS_ERR(state)) { \ + __result = PTR_ERR(state); \ + } else { \ + v4l2_subdev_lock_state(state); \ + __result = v4l2_subdev_call(sd, o, f, state, ##args); \ + v4l2_subdev_unlock_state(state); \ + __v4l2_subdev_state_free(state); \ + } \ + __result; \ }) /** -- cgit v1.2.3 From 683342ce3c0dae068bf0ee157ee12c13088193f7 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 18 Aug 2025 16:49:39 +0300 Subject: media: v4l2-common: Drop the workaround from v4l2_get_link_freq() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the workaround that allowed calling v4l2_get_link_freq() on the control handler. Signed-off-by: Sakari Ailus Reviewed-by: Niklas Söderlund Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- drivers/media/v4l2-core/v4l2-common.c | 17 ++++++++--------- include/media/v4l2-common.h | 19 ++++--------------- 2 files changed, 12 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index a5334aa35992..0574f5d685f8 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -512,8 +512,9 @@ int v4l2_fill_pixfmt(struct v4l2_pix_format *pixfmt, u32 pixelformat, } EXPORT_SYMBOL_GPL(v4l2_fill_pixfmt); -s64 __v4l2_get_link_freq_ctrl(struct v4l2_ctrl_handler *handler, - unsigned int mul, unsigned int div) +#ifdef CONFIG_MEDIA_CONTROLLER +static s64 v4l2_get_link_freq_ctrl(struct v4l2_ctrl_handler *handler, + unsigned int mul, unsigned int div) { struct v4l2_ctrl *ctrl; s64 freq; @@ -548,11 +549,9 @@ s64 __v4l2_get_link_freq_ctrl(struct v4l2_ctrl_handler *handler, return freq > 0 ? freq : -EINVAL; } -EXPORT_SYMBOL_GPL(__v4l2_get_link_freq_ctrl); -#ifdef CONFIG_MEDIA_CONTROLLER -s64 __v4l2_get_link_freq_pad(struct media_pad *pad, unsigned int mul, - unsigned int div) +s64 v4l2_get_link_freq(struct media_pad *pad, unsigned int mul, + unsigned int div) { struct v4l2_mbus_config mbus_config = {}; struct v4l2_subdev *sd; @@ -571,10 +570,10 @@ s64 __v4l2_get_link_freq_pad(struct media_pad *pad, unsigned int mul, * Fall back to using the link frequency control if the media bus config * doesn't provide a link frequency. */ - return __v4l2_get_link_freq_ctrl(sd->ctrl_handler, mul, div); + return v4l2_get_link_freq_ctrl(sd->ctrl_handler, mul, div); } -EXPORT_SYMBOL_GPL(__v4l2_get_link_freq_pad); -#endif /* CONFIG_MEDIA_CONTROLLER */ +EXPORT_SYMBOL_GPL(v4l2_get_link_freq); +#endif /* * Simplify a fraction using a simple continued fraction decomposition. The diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index 39dd0c78d70f..ab0ce8e605c3 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -560,15 +560,14 @@ int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt, u32 pixelformat, /** * v4l2_get_link_freq - Get link rate from transmitter * - * @pad: The transmitter's media pad (or control handler for non-MC users or - * compatibility reasons, don't use in new code) + * @pad: The transmitter's media pad * @mul: The multiplier between pixel rate and link frequency. Bits per pixel on * D-PHY, samples per clock on parallel. 0 otherwise. * @div: The divisor between pixel rate and link frequency. Number of data lanes * times two on D-PHY, 1 on parallel. 0 otherwise. * * This function is intended for obtaining the link frequency from the - * transmitter sub-devices. It returns the link rate, either from the + * transmitter sub-device's pad. It returns the link rate, either from the * V4L2_CID_LINK_FREQ control implemented by the transmitter, or value * calculated based on the V4L2_CID_PIXEL_RATE implemented by the transmitter. * @@ -578,19 +577,9 @@ int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt, u32 pixelformat, * * %-EINVAL: Invalid link frequency value */ #ifdef CONFIG_MEDIA_CONTROLLER -#define v4l2_get_link_freq(pad, mul, div) \ - _Generic(pad, \ - struct media_pad *: __v4l2_get_link_freq_pad, \ - struct v4l2_ctrl_handler *: __v4l2_get_link_freq_ctrl) \ - (pad, mul, div) -s64 __v4l2_get_link_freq_pad(struct media_pad *pad, unsigned int mul, - unsigned int div); -#else -#define v4l2_get_link_freq(handler, mul, div) \ - __v4l2_get_link_freq_ctrl(handler, mul, div) +s64 v4l2_get_link_freq(struct media_pad *pad, unsigned int mul, + unsigned int div); #endif -s64 __v4l2_get_link_freq_ctrl(struct v4l2_ctrl_handler *handler, - unsigned int mul, unsigned int div); void v4l2_simplify_fraction(u32 *numerator, u32 *denominator, unsigned int n_terms, unsigned int threshold); -- cgit v1.2.3 From bdc9776dac860cae7f61e2b48929f87597306644 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 21 Aug 2025 14:56:49 +0300 Subject: media: v4l2-common: Update v4l2_get_link_freq() documentation Document that v4l2_get_link_freq() obtains the link frequency primarily by calling the get_mbus_config sub-device pad operation. Signed-off-by: Sakari Ailus Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- include/media/v4l2-common.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index ab0ce8e605c3..e31b4434ea5d 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -566,10 +566,12 @@ int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt, u32 pixelformat, * @div: The divisor between pixel rate and link frequency. Number of data lanes * times two on D-PHY, 1 on parallel. 0 otherwise. * - * This function is intended for obtaining the link frequency from the - * transmitter sub-device's pad. It returns the link rate, either from the - * V4L2_CID_LINK_FREQ control implemented by the transmitter, or value - * calculated based on the V4L2_CID_PIXEL_RATE implemented by the transmitter. + * This function obtains and returns the link frequency from the transmitter + * sub-device's pad. The link frequency is retrieved using the get_mbus_config + * sub-device pad operation. If this fails, the function falls back to obtaining + * the frequency either directly from the V4L2_CID_LINK_FREQ control if + * implemented by the transmitter, or by calculating it from the pixel rate + * obtained from the V4L2_CID_PIXEL_RATE control. * * Return: * * >0: Link frequency -- cgit v1.2.3 From 7b78fa862296f8931e42ecaec3703e307e4044d2 Mon Sep 17 00:00:00 2001 From: Jai Luthra Date: Mon, 11 Aug 2025 13:50:17 +0530 Subject: media: cadence: cdns-csi2rx: Support multiple pixels per clock cycle The output pixel interface is a parallel bus (32 bits), which supports sending multiple pixels (1, 2 or 4) per clock cycle for smaller pixel widths like RAW8-RAW16. Dual-pixel and Quad-pixel modes can be a requirement if the export rate of the Cadence IP in Single-pixel mode maxes out before the maximum supported DPHY-RX frequency, which is the case with TI's integration of this IP [1]. So, we export a function that lets the downstream hardware block request a higher pixel-per-clock on a particular output pad. We check if we can support the requested pixels per clock given the known maximum for the currently configured format. If not, we set it to the highest feasible value and return this value to the caller. [1] Section 12.6.1.4.8.14 CSI_RX_IF Programming Restrictions of AM62 TRM Link: https://www.ti.com/lit/pdf/spruj16 Tested-by: Yemike Abhilash Chandra (on SK-AM68) Signed-off-by: Jai Luthra Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- MAINTAINERS | 1 + drivers/media/platform/cadence/cdns-csi2rx.c | 74 +++++++++++++++++++++------- include/media/cadence/cdns-csi2rx.h | 19 +++++++ 3 files changed, 76 insertions(+), 18 deletions(-) create mode 100644 include/media/cadence/cdns-csi2rx.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 2d5f73d632f1..41e9014db574 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5354,6 +5354,7 @@ S: Maintained F: Documentation/devicetree/bindings/media/cdns,*.txt F: Documentation/devicetree/bindings/media/cdns,csi2rx.yaml F: drivers/media/platform/cadence/cdns-csi2* +F: include/media/cadence/cdns-csi2* CADENCE NAND DRIVER L: linux-mtd@lists.infradead.org diff --git a/drivers/media/platform/cadence/cdns-csi2rx.c b/drivers/media/platform/cadence/cdns-csi2rx.c index c23204748785..828b4ba4301d 100644 --- a/drivers/media/platform/cadence/cdns-csi2rx.c +++ b/drivers/media/platform/cadence/cdns-csi2rx.c @@ -5,8 +5,10 @@ * Copyright (C) 2017 Cadence Design Systems Inc. */ +#include #include #include +#include #include #include #include @@ -17,6 +19,7 @@ #include #include +#include #include #include #include @@ -52,7 +55,9 @@ #define CSI2RX_STREAM_DATA_CFG_VC_SELECT(n) BIT((n) + 16) #define CSI2RX_STREAM_CFG_REG(n) (CSI2RX_STREAM_BASE(n) + 0x00c) -#define CSI2RX_STREAM_CFG_FIFO_MODE_LARGE_BUF (1 << 8) +#define CSI2RX_STREAM_CFG_FIFO_MODE_LARGE_BUF BIT(8) +#define CSI2RX_STREAM_CFG_NUM_PIXELS_MASK GENMASK(5, 4) +#define CSI2RX_STREAM_CFG_NUM_PIXELS(n) ((n) >> 1U) #define CSI2RX_LANES_MAX 4 #define CSI2RX_STREAMS_MAX 4 @@ -87,7 +92,10 @@ enum csi2rx_pads { struct csi2rx_fmt { u32 code; + /* width of a single pixel on CSI-2 bus */ u8 bpp; + /* max pixels per clock supported on output bus */ + u8 max_pixels; }; struct csi2rx_event { @@ -132,6 +140,7 @@ struct csi2rx_priv { struct reset_control *pixel_rst[CSI2RX_STREAMS_MAX]; struct phy *dphy; + u8 num_pixels[CSI2RX_STREAMS_MAX]; u8 lanes[CSI2RX_LANES_MAX]; u8 num_lanes; u8 max_lanes; @@ -149,22 +158,22 @@ struct csi2rx_priv { }; static const struct csi2rx_fmt formats[] = { - { .code = MEDIA_BUS_FMT_YUYV8_1X16, .bpp = 16, }, - { .code = MEDIA_BUS_FMT_UYVY8_1X16, .bpp = 16, }, - { .code = MEDIA_BUS_FMT_YVYU8_1X16, .bpp = 16, }, - { .code = MEDIA_BUS_FMT_VYUY8_1X16, .bpp = 16, }, - { .code = MEDIA_BUS_FMT_SBGGR8_1X8, .bpp = 8, }, - { .code = MEDIA_BUS_FMT_SGBRG8_1X8, .bpp = 8, }, - { .code = MEDIA_BUS_FMT_SGRBG8_1X8, .bpp = 8, }, - { .code = MEDIA_BUS_FMT_SRGGB8_1X8, .bpp = 8, }, - { .code = MEDIA_BUS_FMT_Y8_1X8, .bpp = 8, }, - { .code = MEDIA_BUS_FMT_SBGGR10_1X10, .bpp = 10, }, - { .code = MEDIA_BUS_FMT_SGBRG10_1X10, .bpp = 10, }, - { .code = MEDIA_BUS_FMT_SGRBG10_1X10, .bpp = 10, }, - { .code = MEDIA_BUS_FMT_SRGGB10_1X10, .bpp = 10, }, - { .code = MEDIA_BUS_FMT_RGB565_1X16, .bpp = 16, }, - { .code = MEDIA_BUS_FMT_RGB888_1X24, .bpp = 24, }, - { .code = MEDIA_BUS_FMT_BGR888_1X24, .bpp = 24, }, + { .code = MEDIA_BUS_FMT_YUYV8_1X16, .bpp = 16, .max_pixels = 2, }, + { .code = MEDIA_BUS_FMT_UYVY8_1X16, .bpp = 16, .max_pixels = 2, }, + { .code = MEDIA_BUS_FMT_YVYU8_1X16, .bpp = 16, .max_pixels = 2, }, + { .code = MEDIA_BUS_FMT_VYUY8_1X16, .bpp = 16, .max_pixels = 2, }, + { .code = MEDIA_BUS_FMT_SBGGR8_1X8, .bpp = 8, .max_pixels = 4, }, + { .code = MEDIA_BUS_FMT_SGBRG8_1X8, .bpp = 8, .max_pixels = 4, }, + { .code = MEDIA_BUS_FMT_SGRBG8_1X8, .bpp = 8, .max_pixels = 4, }, + { .code = MEDIA_BUS_FMT_SRGGB8_1X8, .bpp = 8, .max_pixels = 4, }, + { .code = MEDIA_BUS_FMT_Y8_1X8, .bpp = 8, .max_pixels = 4, }, + { .code = MEDIA_BUS_FMT_SBGGR10_1X10, .bpp = 10, .max_pixels = 2, }, + { .code = MEDIA_BUS_FMT_SGBRG10_1X10, .bpp = 10, .max_pixels = 2, }, + { .code = MEDIA_BUS_FMT_SGRBG10_1X10, .bpp = 10, .max_pixels = 2, }, + { .code = MEDIA_BUS_FMT_SRGGB10_1X10, .bpp = 10, .max_pixels = 2, }, + { .code = MEDIA_BUS_FMT_RGB565_1X16, .bpp = 16, .max_pixels = 1, }, + { .code = MEDIA_BUS_FMT_RGB888_1X24, .bpp = 24, .max_pixels = 1, }, + { .code = MEDIA_BUS_FMT_BGR888_1X24, .bpp = 24, .max_pixels = 1, }, }; static void csi2rx_configure_error_irq_mask(void __iomem *base, @@ -370,7 +379,9 @@ static int csi2rx_start(struct csi2rx_priv *csi2rx) reset_control_deassert(csi2rx->pixel_rst[i]); - writel(CSI2RX_STREAM_CFG_FIFO_MODE_LARGE_BUF, + writel(CSI2RX_STREAM_CFG_FIFO_MODE_LARGE_BUF | + FIELD_PREP(CSI2RX_STREAM_CFG_NUM_PIXELS_MASK, + csi2rx->num_pixels[i]), csi2rx->base + CSI2RX_STREAM_CFG_REG(i)); /* @@ -569,6 +580,33 @@ static int csi2rx_init_state(struct v4l2_subdev *subdev, return csi2rx_set_fmt(subdev, state, &format); } +int cdns_csi2rx_negotiate_ppc(struct v4l2_subdev *subdev, unsigned int pad, + u8 *ppc) +{ + struct csi2rx_priv *csi2rx = v4l2_subdev_to_csi2rx(subdev); + const struct csi2rx_fmt *csi_fmt; + struct v4l2_subdev_state *state; + struct v4l2_mbus_framefmt *fmt; + + if (!ppc || pad < CSI2RX_PAD_SOURCE_STREAM0 || pad >= CSI2RX_PAD_MAX) + return -EINVAL; + + state = v4l2_subdev_lock_and_get_active_state(subdev); + fmt = v4l2_subdev_state_get_format(state, pad); + csi_fmt = csi2rx_get_fmt_by_code(fmt->code); + + /* Reduce requested PPC if it is too high */ + *ppc = min(*ppc, csi_fmt->max_pixels); + + v4l2_subdev_unlock_state(state); + + csi2rx->num_pixels[pad - CSI2RX_PAD_SOURCE_STREAM0] = + CSI2RX_STREAM_CFG_NUM_PIXELS(*ppc); + + return 0; +} +EXPORT_SYMBOL_GPL_FOR_MODULES(cdns_csi2rx_negotiate_ppc, "j721e-csi2rx"); + static const struct v4l2_subdev_pad_ops csi2rx_pad_ops = { .enum_mbus_code = csi2rx_enum_mbus_code, .get_fmt = v4l2_subdev_get_fmt, diff --git a/include/media/cadence/cdns-csi2rx.h b/include/media/cadence/cdns-csi2rx.h new file mode 100644 index 000000000000..782d03fc36d1 --- /dev/null +++ b/include/media/cadence/cdns-csi2rx.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _CDNS_CSI2RX_H +#define _CDNS_CSI2RX_H + +#include + +/** + * cdns_csi2rx_negotiate_ppc - Negotiate pixel-per-clock on output interface + * + * @subdev: point to &struct v4l2_subdev + * @pad: pad number of the source pad + * @ppc: pointer to requested pixel-per-clock value + * + * Returns 0 on success, negative error code otherwise. + */ +int cdns_csi2rx_negotiate_ppc(struct v4l2_subdev *subdev, unsigned int pad, + u8 *ppc); + +#endif -- cgit v1.2.3 From 7a6fc1634cea6f220228a69b1c0210e6b8b1aaf0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:45 -0700 Subject: blk-mq-dma: create blk_map_iter type The req_iterator happens to have a similar fields to what the dma iterator needs, but we're not necessarily iterating a request's bi_io_vec. Create a new type that can be amended for additional future use. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-2-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 4 ++-- include/linux/blk-mq-dma.h | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index ad283017caef..51e7a0ff045f 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -10,7 +10,7 @@ struct phys_vec { u32 len; }; -static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, +static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct phys_vec *vec) { unsigned int max_size; @@ -246,7 +246,7 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { - struct req_iterator iter = { + struct blk_map_iter iter = { .bio = rq->bio, }; struct phys_vec vec; diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index c26a01aeae00..6a7e3828673d 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -5,6 +5,11 @@ #include #include +struct blk_map_iter { + struct bvec_iter iter; + struct bio *bio; +}; + struct blk_dma_iter { /* Output address range for this iteration */ dma_addr_t addr; @@ -14,7 +19,7 @@ struct blk_dma_iter { blk_status_t status; /* Internal to blk_rq_dma_map_iter_* */ - struct req_iterator iter; + struct blk_map_iter iter; struct pci_p2pdma_map_state p2pdma; }; -- cgit v1.2.3 From dae75dead2359edd7c55e1964e0edf7d03535b31 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:46 -0700 Subject: blk-mq-dma: provide the bio_vec array being iterated This will make it easier to add different sources of the bvec array, like for upcoming integrity support, rather than assume to use the bio's bi_io_vec. It also makes iterating "special" payloads more in common with iterating normal payloads. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-3-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 56 +++++++++++++++++++++++++++------------------- include/linux/blk-mq-dma.h | 1 + 2 files changed, 34 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 51e7a0ff045f..8f41fe740b46 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -16,23 +16,14 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, unsigned int max_size; struct bio_vec bv; - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - if (!iter->bio) - return false; - vec->paddr = bvec_phys(&req->special_vec); - vec->len = req->special_vec.bv_len; - iter->bio = NULL; - return true; - } - if (!iter->iter.bi_size) return false; - bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter); vec->paddr = bvec_phys(&bv); max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); bv.bv_len = min(bv.bv_len, max_size); - bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); + bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len); /* * If we are entirely done with this bi_io_vec entry, check if the next @@ -43,19 +34,20 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct bio_vec next; if (!iter->iter.bi_size) { - if (!iter->bio->bi_next) + if (!iter->bio || !iter->bio->bi_next) break; iter->bio = iter->bio->bi_next; iter->iter = iter->bio->bi_iter; + iter->bvecs = iter->bio->bi_io_vec; } - next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + next = mp_bvec_iter_bvec(iter->bvecs, iter->iter); if (bv.bv_len + next.bv_len > max_size || !biovec_phys_mergeable(req->q, &bv, &next)) break; bv.bv_len += next.bv_len; - bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len); } vec->len = bv.bv_len; @@ -125,6 +117,30 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, return true; } +static inline void blk_rq_map_iter_init(struct request *rq, + struct blk_map_iter *iter) +{ + struct bio *bio = rq->bio; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { + *iter = (struct blk_map_iter) { + .bvecs = &rq->special_vec, + .iter = { + .bi_size = rq->special_vec.bv_len, + } + }; + } else if (bio) { + *iter = (struct blk_map_iter) { + .bio = bio, + .bvecs = bio->bi_io_vec, + .iter = bio->bi_iter, + }; + } else { + /* the internal flush request may not have bio attached */ + *iter = (struct blk_map_iter) {}; + } +} + /** * blk_rq_dma_map_iter_start - map the first DMA segment for a request * @req: request to map @@ -153,8 +169,7 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, unsigned int total_len = blk_rq_payload_bytes(req); struct phys_vec vec; - iter->iter.bio = req->bio; - iter->iter.iter = req->bio->bi_iter; + blk_rq_map_iter_init(req, &iter->iter); memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; @@ -246,16 +261,11 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { - struct blk_map_iter iter = { - .bio = rq->bio, - }; + struct blk_map_iter iter; struct phys_vec vec; int nsegs = 0; - /* the internal flush request may not have bio attached */ - if (iter.bio) - iter.iter = iter.bio->bi_iter; - + blk_rq_map_iter_init(rq, &iter); while (blk_map_iter_next(rq, &iter, &vec)) { *last_sg = blk_next_sg(last_sg, sglist); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 6a7e3828673d..e5cb5e46fc92 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -8,6 +8,7 @@ struct blk_map_iter { struct bvec_iter iter; struct bio *bio; + struct bio_vec *bvecs; }; struct blk_dma_iter { -- cgit v1.2.3 From 92fb75fd14b041038e30bc725ab4c1e625243573 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:47 -0700 Subject: blk-mq-dma: require unmap caller provide p2p map type In preparing for integrity dma mappings, we can't rely on the request flag because data and metadata may have different mapping types. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-4-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 9 ++++++++- include/linux/blk-mq-dma.h | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2c6d9506b172..111b6bc6c93e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -261,6 +261,9 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, + + /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ + IOD_P2P_BUS_ADDR = 1U << 3, }; struct nvme_dma_vec { @@ -725,7 +728,8 @@ static void nvme_unmap_data(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, + iod->flags & IOD_P2P_BUS_ADDR)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) nvme_free_sgls(req); else @@ -1000,6 +1004,9 @@ static blk_status_t nvme_map_data(struct request *req) if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) return iter.status; + if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + iod->flags |= IOD_P2P_BUS_ADDR; + if (use_sgl == SGL_FORCED || (use_sgl == SGL_SUPPORTED && (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index e5cb5e46fc92..881880095e0d 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -47,14 +47,15 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) * @dma_dev: device to unmap from * @state: DMA IOVA state * @mapped_len: number of bytes to unmap + * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR * * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. */ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len) + struct dma_iova_state *state, size_t mapped_len, bool is_p2p) { - if (req->cmd_flags & REQ_P2PDMA) + if (is_p2p) return true; if (dma_use_iova(state)) { -- cgit v1.2.3 From 7092639031a1bd5320ab827e8f665350f332b7ce Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:48 -0700 Subject: blk-mq: remove REQ_P2PDMA flag It's not serving any particular purpose. pci_p2pdma_state() already has all the appropriate checks, so the config and flag checks are not guarding anything. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-5-kbusch@meta.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-mq-dma.c | 30 ++++++++++++++---------------- include/linux/blk_types.h | 2 -- 3 files changed, 15 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index 3b371a5da159..44c43b970387 100644 --- a/block/bio.c +++ b/block/bio.c @@ -981,7 +981,7 @@ void __bio_add_page(struct bio *bio, struct page *page, WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) - bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; + bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 8f41fe740b46..58defab21882 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -180,22 +180,20 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, if (!blk_map_iter_next(req, &iter->iter, &vec)) return false; - if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { - switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, - phys_to_page(vec.paddr))) { - case PCI_P2PDMA_MAP_BUS_ADDR: - return blk_dma_map_bus(iter, &vec); - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * P2P transfers through the host bridge are treated the - * same as non-P2P transfers below and during unmap. - */ - req->cmd_flags &= ~REQ_P2PDMA; - break; - default: - iter->status = BLK_STS_INVAL; - return false; - } + switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, + phys_to_page(vec.paddr))) { + case PCI_P2PDMA_MAP_BUS_ADDR: + return blk_dma_map_bus(iter, &vec); + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * P2P transfers through the host bridge are treated the + * same as non-P2P transfers below and during unmap. + */ + case PCI_P2PDMA_MAP_NONE: + break; + default: + iter->status = BLK_STS_INVAL; + return false; } if (blk_can_dma_map_iova(req, dma_dev) && diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 09b99d52fd36..930daff207df 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -386,7 +386,6 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ - __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -419,7 +418,6 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) -#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) -- cgit v1.2.3 From fec9b16dc5550191fd85af118271ea00e8dcc5f8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:50 -0700 Subject: blk-mq-dma: add scatter-less integrity data DMA mapping Similar to regular data, introduce more efficient integrity mapping helpers that does away with the scatterlist structure. This uses the block mapping iterator to add IOVA segments if IOMMU is enabled, or maps directly if not. This also supports P2P segements if integrity data ever wants to allocate that type of memory. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-7-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 104 +++++++++++++++++++++++++++++++++++++++--- include/linux/blk-integrity.h | 17 +++++++ include/linux/blk-mq-dma.h | 1 + 3 files changed, 115 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 31dd8f58f081..60a244a129c3 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2025 Christoph Hellwig */ +#include #include #include "blk.h" @@ -10,6 +11,24 @@ struct phys_vec { u32 len; }; +static bool __blk_map_iter_next(struct blk_map_iter *iter) +{ + if (iter->iter.bi_size) + return true; + if (!iter->bio || !iter->bio->bi_next) + return false; + + iter->bio = iter->bio->bi_next; + if (iter->is_integrity) { + iter->iter = bio_integrity(iter->bio)->bip_iter; + iter->bvecs = bio_integrity(iter->bio)->bip_vec; + } else { + iter->iter = iter->bio->bi_iter; + iter->bvecs = iter->bio->bi_io_vec; + } + return true; +} + static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct phys_vec *vec) { @@ -33,13 +52,8 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { struct bio_vec next; - if (!iter->iter.bi_size) { - if (!iter->bio || !iter->bio->bi_next) - break; - iter->bio = iter->bio->bi_next; - iter->iter = iter->bio->bi_iter; - iter->bvecs = iter->bio->bi_io_vec; - } + if (!__blk_map_iter_next(iter)) + break; next = mp_bvec_iter_bvec(iter->bvecs, iter->iter); if (bv.bv_len + next.bv_len > max_size || @@ -290,3 +304,79 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, return nsegs; } EXPORT_SYMBOL(__blk_rq_map_sg); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +/** + * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment + * for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Start DMA mapping @req integrity data to @dma_dev. @state and @iter are + * provided by the caller and don't need to be initialized. @state needs to be + * stored for use at unmap time, @iter is only needed at map time. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true if it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr + * and the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + * + * The caller can call blk_rq_dma_map_coalesce() to check if further segments + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() + * to try to map the following segments. + */ +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + unsigned len = bio_integrity_bytes(&req->q->limits.integrity, + blk_rq_sectors(req)); + struct bio *bio = req->bio; + + iter->iter = (struct blk_map_iter) { + .bio = bio, + .iter = bio_integrity(bio)->bip_iter, + .bvecs = bio_integrity(bio)->bip_vec, + .is_integrity = true, + }; + return blk_dma_map_iter_start(req, dma_dev, state, iter, len); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start); + +/** + * blk_rq_integrity_dma_map_iter_start - map the next integrity DMA segment for + * a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Iterate to the next integrity mapping after a previous call to + * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description + * of the arguments. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true if it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + */ +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + struct phys_vec vec; + + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + return blk_dma_map_bus(iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next); +#endif diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index e67a2b6e8f11..78fe2459e661 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -4,6 +4,7 @@ #include #include +#include struct request; @@ -31,6 +32,11 @@ int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp); +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter); +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) @@ -115,6 +121,17 @@ static inline int blk_rq_integrity_map_user(struct request *rq, { return -EINVAL; } +static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + return false; +} +static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + return false; +} static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) { return NULL; diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 881880095e0d..0f45ea110ca1 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -9,6 +9,7 @@ struct blk_map_iter { struct bvec_iter iter; struct bio *bio; struct bio_vec *bvecs; + bool is_integrity; }; struct blk_dma_iter { -- cgit v1.2.3 From ac46f5b6c6614668727732e117842c9fa7a42c19 Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Mon, 11 Aug 2025 14:45:05 +0100 Subject: ACPICA: Add SoundWire File Table (SWFT) signature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The File Download (FDL) process of SoundWire Class Audio (SDCA) driver, which provides code/data which may be required by an SDCA device, utilizes SWFT to obtain that code/data. There is a single SWFT for the system, and SWFT can contain multiple files (information about the file as well as its binary contents). The SWFT has a standard ACPI Descriptor Table Header, followed by SoundWire File definitions as described in Discovery and Configuration (DisCo) Specification for SoundWire® Link: https://github.com/acpica/acpica/commit/18c96022 Signed-off-by: Maciej Strozek Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20250811134505.1162661-1-mstrozek@opensource.cirrus.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/tables.c | 2 +- include/acpi/actbl2.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index fa9bb8c8ce95..57fc8bc56166 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -408,7 +408,7 @@ static const char table_sigs[][ACPI_NAMESEG_SIZE] __nonstring_array __initconst ACPI_SIG_PSDT, ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT, ACPI_SIG_IORT, ACPI_SIG_NFIT, ACPI_SIG_HMAT, ACPI_SIG_PPTT, ACPI_SIG_NHLT, ACPI_SIG_AEST, ACPI_SIG_CEDT, ACPI_SIG_AGDI, - ACPI_SIG_NBFT }; + ACPI_SIG_NBFT, ACPI_SIG_SWFT}; #define ACPI_HEADER_SIZE sizeof(struct acpi_table_header) diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index 048f5f47f8b8..f726bce3eb84 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -57,6 +57,7 @@ #define ACPI_SIG_SDEI "SDEI" /* Software Delegated Exception Interface Table */ #define ACPI_SIG_SDEV "SDEV" /* Secure Devices table */ #define ACPI_SIG_SVKL "SVKL" /* Storage Volume Key Location Table */ +#define ACPI_SIG_SWFT "SWFT" /* SoundWire File Table */ #define ACPI_SIG_TDEL "TDEL" /* TD Event Log Table */ /* @@ -3478,6 +3479,26 @@ enum acpi_svkl_format { ACPI_SVKL_FORMAT_RESERVED = 1 /* 1 and greater are reserved */ }; +/******************************************************************************* + * SWFT - SoundWire File Table + * + * Conforms to "Discovery and Configuration (DisCo) Specification for SoundWire" + * Version 2.1, 2 October 2023 + * + ******************************************************************************/ +struct acpi_sw_file { + u16 vendor_id; + u32 file_id; + u16 file_version; + u32 file_length; + u8 data[]; +}; + +struct acpi_table_swft { + struct acpi_table_header header; + struct acpi_sw_file files[]; +}; + /******************************************************************************* * * TDEL - TD-Event Log -- cgit v1.2.3 From a214365140cc3009f07d4e14a8b481fd3dc41d31 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 10 Jul 2025 15:15:28 +0300 Subject: rculist: move list_for_each_rcu() to where it belongs The list_for_each_rcu() relies on the rcu_dereference() API which is not provided by the list.h. At the same time list.h is a low-level basic header that must not have dependencies like RCU, besides the fact of the potential circular dependencies in some cases. With all that said, move RCU related API to the rculist.h where it belongs. Signed-off-by: Andy Shevchenko Reviewed-by: Simona Vetter Reviewed-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay (AMD) Signed-off-by: "Paul E. McKenney" --- include/linux/list.h | 10 ---------- include/linux/rculist.h | 10 ++++++++++ kernel/cgroup/dmem.c | 1 + 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/list.h b/include/linux/list.h index e7e28afd28f8..e7bdad9b8618 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -686,16 +686,6 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) -/** - * list_for_each_rcu - Iterate over a list in an RCU-safe fashion - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. - */ -#define list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference((head)->next); \ - !list_is_head(pos, (head)); \ - pos = rcu_dereference(pos->next)) - /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 1b11926ddd47..2abba7552605 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -42,6 +42,16 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) */ #define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev))) +/** + * list_for_each_rcu - Iterate over a list in an RCU-safe fashion + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each_rcu(pos, head) \ + for (pos = rcu_dereference((head)->next); \ + !list_is_head(pos, (head)); \ + pos = rcu_dereference(pos->next)) + /** * list_tail_rcu - returns the prev pointer of the head of the list * @head: the head of the list diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 10b63433f057..e12b946278b6 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -14,6 +14,7 @@ #include #include #include +#include #include struct dmem_cgroup_region { -- cgit v1.2.3 From f45fc18b6de04483643e8aa2ab97737abfe03d59 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 23 Aug 2025 09:56:03 +0200 Subject: net: airoha: Add airoha_ppe_dev struct definition Introduce airoha_ppe_dev struct as container for PPE offload callbacks consumed by the MT76 driver during flowtable offload for traffic received by the wlan NIC and forwarded to the wired one. Add airoha_ppe_setup_tc_block_cb routine to PPE offload ops for MT76 driver. Rely on airoha_ppe_dev pointer in airoha_ppe_setup_tc_block_cb signature. Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250823-airoha-en7581-wlan-rx-offload-v3-2-f78600ec3ed8@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 4 +- drivers/net/ethernet/airoha/airoha_eth.h | 4 +- drivers/net/ethernet/airoha/airoha_npu.c | 1 - drivers/net/ethernet/airoha/airoha_ppe.c | 67 +++++++++++++++++++++++++++++-- include/linux/soc/airoha/airoha_offload.h | 35 ++++++++++++++++ 5 files changed, 104 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index e6b802e3d844..5a04f90dd3de 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2599,13 +2599,15 @@ static int airoha_dev_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { struct net_device *dev = cb_priv; + struct airoha_gdm_port *port = netdev_priv(dev); + struct airoha_eth *eth = port->qdma->eth; if (!tc_can_offload(dev)) return -EOPNOTSUPP; switch (type) { case TC_SETUP_CLSFLOWER: - return airoha_ppe_setup_tc_block_cb(dev, type_data); + return airoha_ppe_setup_tc_block_cb(ð->ppe->dev, type_data); case TC_SETUP_CLSMATCHALL: return airoha_dev_tc_matchall(dev, type_data); default: diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index 9f721e2b972f..9060b1d2814e 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #define AIROHA_MAX_NUM_GDM_PORTS 4 @@ -546,6 +547,7 @@ struct airoha_gdm_port { #define AIROHA_RXD4_FOE_ENTRY GENMASK(15, 0) struct airoha_ppe { + struct airoha_ppe_dev dev; struct airoha_eth *eth; void *foe; @@ -622,7 +624,7 @@ bool airoha_is_valid_gdm_port(struct airoha_eth *eth, void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, u16 hash); -int airoha_ppe_setup_tc_block_cb(struct net_device *dev, void *type_data); +int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, void *type_data); int airoha_ppe_init(struct airoha_eth *eth); void airoha_ppe_deinit(struct airoha_eth *eth); void airoha_ppe_init_upd_mem(struct airoha_gdm_port *port); diff --git a/drivers/net/ethernet/airoha/airoha_npu.c b/drivers/net/ethernet/airoha/airoha_npu.c index 1a6b191ae0b0..e1d131d6115c 100644 --- a/drivers/net/ethernet/airoha/airoha_npu.c +++ b/drivers/net/ethernet/airoha/airoha_npu.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "airoha_eth.h" diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 36b45e98279a..03d9b1f24bb3 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -6,8 +6,9 @@ #include #include +#include +#include #include -#include #include #include @@ -1282,10 +1283,10 @@ error_npu_put: return err; } -int airoha_ppe_setup_tc_block_cb(struct net_device *dev, void *type_data) +int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, void *type_data) { - struct airoha_gdm_port *port = netdev_priv(dev); - struct airoha_eth *eth = port->qdma->eth; + struct airoha_ppe *ppe = dev->priv; + struct airoha_eth *eth = ppe->eth; int err = 0; mutex_lock(&flow_offload_mutex); @@ -1338,6 +1339,61 @@ void airoha_ppe_init_upd_mem(struct airoha_gdm_port *port) PPE_UPDMEM_WR_MASK | PPE_UPDMEM_REQ_MASK); } +struct airoha_ppe_dev *airoha_ppe_get_dev(struct device *dev) +{ + struct platform_device *pdev; + struct device_node *np; + struct airoha_eth *eth; + + np = of_parse_phandle(dev->of_node, "airoha,eth", 0); + if (!np) + return ERR_PTR(-ENODEV); + + pdev = of_find_device_by_node(np); + if (!pdev) { + dev_err(dev, "cannot find device node %s\n", np->name); + of_node_put(np); + return ERR_PTR(-ENODEV); + } + of_node_put(np); + + if (!try_module_get(THIS_MODULE)) { + dev_err(dev, "failed to get the device driver module\n"); + goto error_pdev_put; + } + + eth = platform_get_drvdata(pdev); + if (!eth) + goto error_module_put; + + if (!device_link_add(dev, &pdev->dev, DL_FLAG_AUTOREMOVE_SUPPLIER)) { + dev_err(&pdev->dev, + "failed to create device link to consumer %s\n", + dev_name(dev)); + goto error_module_put; + } + + return ð->ppe->dev; + +error_module_put: + module_put(THIS_MODULE); +error_pdev_put: + platform_device_put(pdev); + + return ERR_PTR(-ENODEV); +} +EXPORT_SYMBOL_GPL(airoha_ppe_get_dev); + +void airoha_ppe_put_dev(struct airoha_ppe_dev *dev) +{ + struct airoha_ppe *ppe = dev->priv; + struct airoha_eth *eth = ppe->eth; + + module_put(THIS_MODULE); + put_device(eth->dev); +} +EXPORT_SYMBOL_GPL(airoha_ppe_put_dev); + int airoha_ppe_init(struct airoha_eth *eth) { struct airoha_ppe *ppe; @@ -1347,6 +1403,9 @@ int airoha_ppe_init(struct airoha_eth *eth) if (!ppe) return -ENOMEM; + ppe->dev.ops.setup_tc_block_cb = airoha_ppe_setup_tc_block_cb; + ppe->dev.priv = ppe; + foe_size = PPE_NUM_ENTRIES * sizeof(struct airoha_foe_entry); ppe->foe = dmam_alloc_coherent(eth->dev, foe_size, &ppe->foe_dma, GFP_KERNEL); diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index 117c63c2448d..4b4b8b9e426d 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -9,6 +9,41 @@ #include #include +struct airoha_ppe_dev { + struct { + int (*setup_tc_block_cb)(struct airoha_ppe_dev *dev, + void *type_data); + } ops; + + void *priv; +}; + +#if (IS_BUILTIN(CONFIG_NET_AIROHA) || IS_MODULE(CONFIG_NET_AIROHA)) +struct airoha_ppe_dev *airoha_ppe_get_dev(struct device *dev); +void airoha_ppe_put_dev(struct airoha_ppe_dev *dev); + +static inline int airoha_ppe_dev_setup_tc_block_cb(struct airoha_ppe_dev *dev, + void *type_data) +{ + return dev->ops.setup_tc_block_cb(dev, type_data); +} +#else +static inline struct airoha_ppe_dev *airoha_ppe_get_dev(struct device *dev) +{ + return NULL; +} + +static inline void airoha_ppe_put_dev(struct airoha_ppe_dev *dev) +{ +} + +static inline int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, + void *type_data) +{ + return -EOPNOTSUPP; +} +#endif + #define NPU_NUM_CORES 8 #define NPU_NUM_IRQ 6 #define NPU_RX0_DESC_NUM 512 -- cgit v1.2.3 From a7cc1aa151e3a9c0314b995f06102f7763d3bd71 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 23 Aug 2025 09:56:04 +0200 Subject: net: airoha: Introduce check_skb callback in ppe_dev ops Export airoha_ppe_check_skb routine in ppe_dev ops. check_skb callback will be used by the MT76 driver in order to offload the traffic received by the wlan NIC and forwarded to the ethernet one. Add rx_wlan parameter to airoha_ppe_check_skb routine signature. Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250823-airoha-en7581-wlan-rx-offload-v3-3-f78600ec3ed8@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 3 ++- drivers/net/ethernet/airoha/airoha_eth.h | 8 ++------ drivers/net/ethernet/airoha/airoha_ppe.c | 25 ++++++++++++++----------- include/linux/soc/airoha/airoha_offload.h | 20 ++++++++++++++++++++ 4 files changed, 38 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 5a04f90dd3de..81ea01a652b9 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -698,7 +698,8 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget) reason = FIELD_GET(AIROHA_RXD4_PPE_CPU_REASON, msg1); if (reason == PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED) - airoha_ppe_check_skb(eth->ppe, q->skb, hash); + airoha_ppe_check_skb(ð->ppe->dev, q->skb, hash, + false); done++; napi_gro_receive(&q->napi, q->skb); diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index 9060b1d2814e..77fd13d466dc 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -229,10 +229,6 @@ struct airoha_hw_stats { u64 rx_len[7]; }; -enum { - PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED = 0x0f, -}; - enum { AIROHA_FOE_STATE_INVALID, AIROHA_FOE_STATE_UNBIND, @@ -622,8 +618,8 @@ static inline bool airhoa_is_lan_gdm_port(struct airoha_gdm_port *port) bool airoha_is_valid_gdm_port(struct airoha_eth *eth, struct airoha_gdm_port *port); -void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, - u16 hash); +void airoha_ppe_check_skb(struct airoha_ppe_dev *dev, struct sk_buff *skb, + u16 hash, bool rx_wlan); int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, void *type_data); int airoha_ppe_init(struct airoha_eth *eth); void airoha_ppe_deinit(struct airoha_eth *eth); diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 03d9b1f24bb3..78473527ff50 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -616,7 +616,7 @@ static bool airoha_ppe_foe_compare_entry(struct airoha_flow_table_entry *e, static int airoha_ppe_foe_commit_entry(struct airoha_ppe *ppe, struct airoha_foe_entry *e, - u32 hash) + u32 hash, bool rx_wlan) { struct airoha_foe_entry *hwe = ppe->foe + hash * sizeof(*hwe); u32 ts = airoha_ppe_get_timestamp(ppe); @@ -639,7 +639,8 @@ static int airoha_ppe_foe_commit_entry(struct airoha_ppe *ppe, goto unlock; } - airoha_ppe_foe_flow_stats_update(ppe, npu, hwe, hash); + if (!rx_wlan) + airoha_ppe_foe_flow_stats_update(ppe, npu, hwe, hash); if (hash < PPE_SRAM_NUM_ENTRIES) { dma_addr_t addr = ppe->foe_dma + hash * sizeof(*hwe); @@ -665,7 +666,7 @@ static void airoha_ppe_foe_remove_flow(struct airoha_ppe *ppe, e->data.ib1 &= ~AIROHA_FOE_IB1_BIND_STATE; e->data.ib1 |= FIELD_PREP(AIROHA_FOE_IB1_BIND_STATE, AIROHA_FOE_STATE_INVALID); - airoha_ppe_foe_commit_entry(ppe, &e->data, e->hash); + airoha_ppe_foe_commit_entry(ppe, &e->data, e->hash, false); e->hash = 0xffff; } if (e->type == FLOW_TYPE_L2_SUBFLOW) { @@ -704,7 +705,7 @@ static void airoha_ppe_foe_flow_remove_entry(struct airoha_ppe *ppe, static int airoha_ppe_foe_commit_subflow_entry(struct airoha_ppe *ppe, struct airoha_flow_table_entry *e, - u32 hash) + u32 hash, bool rx_wlan) { u32 mask = AIROHA_FOE_IB1_BIND_PACKET_TYPE | AIROHA_FOE_IB1_BIND_UDP; struct airoha_foe_entry *hwe_p, hwe; @@ -745,14 +746,14 @@ airoha_ppe_foe_commit_subflow_entry(struct airoha_ppe *ppe, } hwe.bridge.data = e->data.bridge.data; - airoha_ppe_foe_commit_entry(ppe, &hwe, hash); + airoha_ppe_foe_commit_entry(ppe, &hwe, hash, rx_wlan); return 0; } static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, struct sk_buff *skb, - u32 hash) + u32 hash, bool rx_wlan) { struct airoha_flow_table_entry *e; struct airoha_foe_bridge br = {}; @@ -785,7 +786,7 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, if (!airoha_ppe_foe_compare_entry(e, hwe)) continue; - airoha_ppe_foe_commit_entry(ppe, &e->data, hash); + airoha_ppe_foe_commit_entry(ppe, &e->data, hash, rx_wlan); commit_done = true; e->hash = hash; } @@ -797,7 +798,7 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, e = rhashtable_lookup_fast(&ppe->l2_flows, &br, airoha_l2_flow_table_params); if (e) - airoha_ppe_foe_commit_subflow_entry(ppe, e, hash); + airoha_ppe_foe_commit_subflow_entry(ppe, e, hash, rx_wlan); unlock: spin_unlock_bh(&ppe_lock); } @@ -1301,9 +1302,10 @@ int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, void *type_data) return err; } -void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, - u16 hash) +void airoha_ppe_check_skb(struct airoha_ppe_dev *dev, struct sk_buff *skb, + u16 hash, bool rx_wlan) { + struct airoha_ppe *ppe = dev->priv; u16 now, diff; if (hash > PPE_HASH_MASK) @@ -1315,7 +1317,7 @@ void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, return; ppe->foe_check_time[hash] = now; - airoha_ppe_foe_insert_entry(ppe, skb, hash); + airoha_ppe_foe_insert_entry(ppe, skb, hash, rx_wlan); } void airoha_ppe_init_upd_mem(struct airoha_gdm_port *port) @@ -1404,6 +1406,7 @@ int airoha_ppe_init(struct airoha_eth *eth) return -ENOMEM; ppe->dev.ops.setup_tc_block_cb = airoha_ppe_setup_tc_block_cb; + ppe->dev.ops.check_skb = airoha_ppe_check_skb; ppe->dev.priv = ppe; foe_size = PPE_NUM_ENTRIES * sizeof(struct airoha_foe_entry); diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index 4b4b8b9e426d..1dc5b4e35ef9 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -9,10 +9,17 @@ #include #include +enum { + PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED = 0x0f, +}; + struct airoha_ppe_dev { struct { int (*setup_tc_block_cb)(struct airoha_ppe_dev *dev, void *type_data); + void (*check_skb)(struct airoha_ppe_dev *dev, + struct sk_buff *skb, u16 hash, + bool rx_wlan); } ops; void *priv; @@ -27,6 +34,13 @@ static inline int airoha_ppe_dev_setup_tc_block_cb(struct airoha_ppe_dev *dev, { return dev->ops.setup_tc_block_cb(dev, type_data); } + +static inline void airoha_ppe_dev_check_skb(struct airoha_ppe_dev *dev, + struct sk_buff *skb, + u16 hash, bool rx_wlan) +{ + dev->ops.check_skb(dev, skb, hash, rx_wlan); +} #else static inline struct airoha_ppe_dev *airoha_ppe_get_dev(struct device *dev) { @@ -42,6 +56,12 @@ static inline int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, { return -EOPNOTSUPP; } + +static inline void airoha_ppe_dev_check_skb(struct airoha_ppe_dev *dev, + struct sk_buff *skb, u16 hash, + bool rx_wlan) +{ +} #endif #define NPU_NUM_CORES 8 -- cgit v1.2.3 From 2d842b6c670b9bffee7c16cda284eb49644d8169 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:06:57 +0000 Subject: tcp: Remove timewait_sock_ops.twsk_destructor(). Since DCCP has been removed, sk->sk_prot->twsk_prot->twsk_destructor is always tcp_twsk_destructor(). Let's call tcp_twsk_destructor() directly in inet_twsk_free() and remove ->twsk_destructor(). While at it, tcp_twsk_destructor() is un-exported. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/timewait_sock.h | 7 ------- net/ipv4/inet_timewait_sock.c | 5 +++-- net/ipv4/tcp_ipv4.c | 1 - net/ipv4/tcp_minisocks.c | 1 - net/ipv6/tcp_ipv6.c | 1 - 5 files changed, 3 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 62b3e9f2aed4..0a85ac64a66d 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -15,13 +15,6 @@ struct timewait_sock_ops { struct kmem_cache *twsk_slab; char *twsk_slab_name; unsigned int twsk_obj_size; - void (*twsk_destructor)(struct sock *sk); }; -static inline void twsk_destructor(struct sock *sk) -{ - if (sk->sk_prot->twsk_prot->twsk_destructor != NULL) - sk->sk_prot->twsk_prot->twsk_destructor(sk); -} - #endif /* _TIMEWAIT_SOCK_H */ diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 875ff923a8ed..5b5426b8ee92 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -15,7 +15,7 @@ #include #include #include - +#include /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash @@ -74,7 +74,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; - twsk_destructor((struct sock *)tw); + + tcp_twsk_destructor((struct sock *)tw); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9543f1538359..a48b98f67b6a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2459,7 +2459,6 @@ do_time_wait: static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), - .twsk_destructor= tcp_twsk_destructor, }; void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2994c9222c9c..d1c9e4088646 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -401,7 +401,6 @@ void tcp_twsk_destructor(struct sock *sk) #endif tcp_ao_destroy_sock(sk, true); } -EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5620d9e50e19..d99717376bff 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2050,7 +2050,6 @@ void tcp_v6_early_demux(struct sk_buff *skb) static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) -- cgit v1.2.3 From cb16f4b6c73df4be16b74099f826fea30ef72426 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:06:59 +0000 Subject: tcp: Don't pass hashinfo to socket lookup helpers. These socket lookup functions required struct inet_hashinfo because they are shared by TCP and DCCP. * __inet_lookup_established() * __inet_lookup_listener() * __inet6_lookup_established() * inet6_lookup_listener() DCCP has gone, and we don't need to pass hashinfo down to them. Let's fetch net->ipv4.tcp_death_row.hashinfo directly in the above 4 functions. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c | 9 ++--- drivers/net/ethernet/netronome/nfp/crypto/tls.c | 9 ++--- include/net/inet6_hashtables.h | 18 +++------ include/net/inet_hashtables.h | 37 ++++++++---------- net/core/filter.c | 5 +-- net/ipv4/esp4.c | 4 +- net/ipv4/inet_diag.c | 6 +-- net/ipv4/inet_hashtables.c | 28 +++++++------- net/ipv4/netfilter/nf_socket_ipv4.c | 3 +- net/ipv4/netfilter/nf_tproxy_ipv4.c | 5 +-- net/ipv4/tcp_ipv4.c | 16 +++----- net/ipv4/tcp_offload.c | 3 +- net/ipv6/esp6.c | 4 +- net/ipv6/inet6_hashtables.c | 45 +++++++++++----------- net/ipv6/netfilter/nf_socket_ipv6.c | 3 +- net/ipv6/netfilter/nf_tproxy_ipv6.c | 5 +-- net/ipv6/tcp_ipv6.c | 14 +++---- net/ipv6/tcpv6_offload.c | 3 +- 18 files changed, 94 insertions(+), 123 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index 65ccb33edafb..d7a11ff9bbdb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -498,9 +498,9 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb) depth += sizeof(struct iphdr); th = (void *)iph + sizeof(struct iphdr); - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, iph->daddr, - th->dest, netdev->ifindex); + sk = inet_lookup_established(net, iph->saddr, th->source, + iph->daddr, th->dest, + netdev->ifindex); #if IS_ENABLED(CONFIG_IPV6) } else { struct ipv6hdr *ipv6h = (struct ipv6hdr *)iph; @@ -508,8 +508,7 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb) depth += sizeof(struct ipv6hdr); th = (void *)ipv6h + sizeof(struct ipv6hdr); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &ipv6h->saddr, th->source, + sk = __inet6_lookup_established(net, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->dest), netdev->ifindex, 0); #endif diff --git a/drivers/net/ethernet/netronome/nfp/crypto/tls.c b/drivers/net/ethernet/netronome/nfp/crypto/tls.c index f80f1a6953fa..f252ecdcd2cd 100644 --- a/drivers/net/ethernet/netronome/nfp/crypto/tls.c +++ b/drivers/net/ethernet/netronome/nfp/crypto/tls.c @@ -495,14 +495,13 @@ int nfp_net_tls_rx_resync_req(struct net_device *netdev, switch (ipv6h->version) { case 4: - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, iph->daddr, - th->dest, netdev->ifindex); + sk = inet_lookup_established(net, iph->saddr, th->source, + iph->daddr, th->dest, + netdev->ifindex); break; #if IS_ENABLED(CONFIG_IPV6) case 6: - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &ipv6h->saddr, th->source, + sk = __inet6_lookup_established(net, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->dest), netdev->ifindex, 0); break; diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index ab3929a2a956..1f985d2012ce 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -41,7 +41,6 @@ static inline unsigned int __inet6_ehashfn(const u32 lhash, * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, @@ -65,7 +64,6 @@ struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, inet6_ehashfn_t *ehashfn); struct sock *inet6_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, @@ -83,7 +81,6 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net, inet6_ehashfn_t *ehashfn); static inline struct sock *__inet6_lookup(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, @@ -92,14 +89,14 @@ static inline struct sock *__inet6_lookup(const struct net *net, const int dif, const int sdif, bool *refcounted) { - struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr, - sport, daddr, hnum, + struct sock *sk = __inet6_lookup_established(net, saddr, sport, + daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; - return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + return inet6_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } @@ -143,8 +140,7 @@ struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, return reuse_sk; } -static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, +static inline struct sock *__inet6_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, int iif, int sdif, @@ -161,14 +157,12 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet6_lookup(net, hashinfo, skb, - doff, &ip6h->saddr, sport, + return __inet6_lookup(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } -struct sock *inet6_lookup(const struct net *net, struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, +struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 19dbd9081d5a..a3b32241c2f2 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -294,7 +294,6 @@ int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, @@ -302,12 +301,12 @@ struct sock *__inet_lookup_listener(const struct net *net, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, __be16 dport, int dif, int sdif) + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, __be16 dport, + int dif, int sdif) { - return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } @@ -358,7 +357,6 @@ static inline bool inet_match(const struct net *net, const struct sock *sk, * not check it for lookups anymore, thanks Alexey. -DaveM */ struct sock *__inet_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); @@ -384,18 +382,16 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); -static inline struct sock * - inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, - const __be32 saddr, const __be16 sport, - const __be32 daddr, const __be16 dport, - const int dif) +static inline struct sock *inet_lookup_established(struct net *net, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + const int dif) { - return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, + return __inet_lookup_established(net, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, @@ -405,18 +401,17 @@ static inline struct sock *__inet_lookup(struct net *net, u16 hnum = ntohs(dport); struct sock *sk; - sk = __inet_lookup_established(net, hashinfo, saddr, sport, + sk = __inet_lookup_established(net, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; - return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, + return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, @@ -425,7 +420,7 @@ static inline struct sock *inet_lookup(struct net *net, struct sock *sk; bool refcounted; - sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet_lookup(net, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) @@ -473,8 +468,7 @@ struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, return reuse_sk; } -static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, +static inline struct sock *__inet_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, @@ -492,8 +486,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet_lookup(net, hashinfo, skb, - doff, iph->saddr, sport, + return __inet_lookup(net, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } diff --git a/net/core/filter.c b/net/core/filter.c index 63f3baee2daf..5da1cad66be2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6767,7 +6767,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; @@ -6776,7 +6775,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, hinfo, NULL, 0, + sk = __inet_lookup(net, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); @@ -6790,7 +6789,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, hinfo, NULL, 0, + sk = __inet6_lookup(net, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f14a41ee4aa1..2c922afadb8f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -132,8 +132,8 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) dport = encap->encap_dport; spin_unlock_bh(&x->lock); - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, - dport, x->props.saddr.a4, sport, 0); + sk = inet_lookup_established(net, x->id.daddr.a4, dport, + x->props.saddr.a4, sport, 0); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 549f1f521f4f..462406948c84 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -526,18 +526,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net, rcu_read_lock(); if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if); else - sk = inet6_lookup(net, hashinfo, NULL, 0, + sk = inet6_lookup(net, NULL, 0, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 374adb8a2640..4bc2b1921d2b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -425,13 +425,13 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net, } struct sock *__inet_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; @@ -444,6 +444,7 @@ struct sock *__inet_lookup_listener(const struct net *net, goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -489,21 +490,22 @@ void sock_edemux(struct sk_buff *skb) EXPORT_SYMBOL(sock_edemux); struct sock *__inet_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, - const __be32 saddr, const __be16 sport, - const __be32 daddr, const u16 hnum, - const int dif, const int sdif) + const __be32 saddr, const __be16 sport, + const __be32 daddr, const u16 hnum, + const int dif, const int sdif) { - INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - struct sock *sk; + INET_ADDR_COOKIE(acookie, saddr, daddr); const struct hlist_nulls_node *node; - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index a1350fc25838..5080fa5fbf6a 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -71,8 +71,7 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, { switch (protocol) { case IPPROTO_TCP: - return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 73e66a088e25..041c3f37f237 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -81,7 +81,6 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -95,7 +94,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, hinfo, skb, + sk = inet_lookup_listener(net, skb, ip_hdrlen(skb) + __tcp_hdrlen(hp), saddr, sport, daddr, dport, in->ifindex, 0); @@ -109,7 +108,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, hinfo, saddr, sport, + sk = inet_lookup_established(net, saddr, sport, daddr, dport, in->ifindex); break; default: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a48b98f67b6a..a0c93b24c6e0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -506,8 +506,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) struct sock *sk; int err; - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->daddr, th->dest, iph->saddr, + sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, ntohs(th->source), inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); @@ -823,8 +822,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ @@ -1992,8 +1990,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) if (th->doff < sizeof(struct tcphdr) / 4) return 0; - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, + sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), skb->skb_iif, inet_sdif(skb)); if (sk) { @@ -2236,8 +2233,7 @@ int tcp_v4_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); lookup: - sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), th->source, + sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -2426,9 +2422,7 @@ do_time_wait: &drop_reason); switch (tw_status) { case TCP_TW_SYN: { - struct sock *sk2 = inet_lookup_listener(net, - net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb), diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index be5c2294610e..e6612bd84d09 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -434,8 +434,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet_get_iif_sdif(skb, &iif, &sdif); iph = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, + sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 72adfc107b55..e75da98f5283 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -149,8 +149,8 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) dport = encap->encap_dport; spin_unlock_bh(&x->lock); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, - dport, &x->props.saddr.in6, ntohs(sport), 0, 0); + sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport, + &x->props.saddr.in6, ntohs(sport), 0, 0); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index d6c3db31dcab..a3a9ea49fee2 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -47,24 +47,23 @@ EXPORT_SYMBOL_GPL(inet6_ehashfn); * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, - const struct in6_addr *saddr, - const __be16 sport, - const struct in6_addr *daddr, - const u16 hnum, - const int dif, const int sdif) + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif, const int sdif) { - struct sock *sk; - const struct hlist_nulls_node *node; const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - + const struct hlist_nulls_node *node; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -200,13 +199,15 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net, EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif, const int sdif) + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const unsigned short hnum, + const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; @@ -219,6 +220,7 @@ struct sock *inet6_lookup_listener(const struct net *net, goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -243,7 +245,6 @@ done: EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, @@ -252,7 +253,7 @@ struct sock *inet6_lookup(const struct net *net, struct sock *sk; bool refcounted; - sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 9ea5ef56cb27..ced8bd44828e 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -83,8 +83,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet6_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp6_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 52f828bb5a83..b2f59ed9d7cc 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -80,7 +80,6 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -94,7 +93,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, hinfo, skb, + sk = inet6_lookup_listener(net, skb, thoff + __tcp_hdrlen(hp), saddr, sport, daddr, ntohs(dport), @@ -109,7 +108,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr, + sk = __inet6_lookup_established(net, saddr, sport, daddr, ntohs(dport), in->ifindex, 0); break; default: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d99717376bff..8b2e7b7afbd8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -388,8 +388,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bool fatal; int err; - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->daddr, th->dest, + sk = __inet6_lookup_established(net, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); @@ -1073,8 +1072,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, &ipv6h->saddr, th->source, + sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) @@ -1789,7 +1787,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), + sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) @@ -1976,8 +1974,7 @@ do_time_wait: { struct sock *sk2; - sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), @@ -2029,8 +2026,7 @@ void tcp_v6_early_demux(struct sk_buff *skb) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index a8a04f441e78..effeba58630b 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -36,8 +36,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet6_get_iif_sdif(skb, &iif, &sdif); hdr = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; -- cgit v1.2.3 From f1241200cd66b3e25fd2a44dd961d9720e965aa1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:07:00 +0000 Subject: tcp: Don't pass hashinfo to inet_diag helpers. These inet_diag functions required struct inet_hashinfo because they are shared by TCP and DCCP: * inet_diag_dump_icsk() * inet_diag_dump_one_icsk() * inet_diag_find_one_icsk() DCCP has gone, and we don't need to pass hashinfo down to them. Let's fetch net->ipv4.tcp_death_row.hashinfo directly in the first 2 functions. Note that inet_diag_find_one_icsk() don't need hashinfo since the previous patch. We will move TCP-specific functions to tcp_diag.c in the next patch. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/inet_diag.h | 6 ++---- net/ipv4/inet_diag.c | 10 +++++----- net/ipv4/tcp_diag.c | 17 +++-------------- 3 files changed, 10 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index a9033696b0aa..34de992b5bd9 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -48,15 +48,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin); -void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, +void inet_diag_dump_icsk(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r); -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct netlink_callback *cb, +int inet_diag_dump_one_icsk(struct netlink_callback *cb, const struct inet_diag_req_v2 *req); struct sock *inet_diag_find_one_icsk(struct net *net, - struct inet_hashinfo *hashinfo, const struct inet_diag_req_v2 *req); int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 462406948c84..5cbbb0695aff 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -519,7 +519,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, } struct sock *inet_diag_find_one_icsk(struct net *net, - struct inet_hashinfo *hashinfo, const struct inet_diag_req_v2 *req) { struct sock *sk; @@ -562,8 +561,7 @@ struct sock *inet_diag_find_one_icsk(struct net *net, } EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct netlink_callback *cb, +int inet_diag_dump_one_icsk(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; @@ -573,7 +571,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sock *sk; int err; - sk = inet_diag_find_one_icsk(net, hashinfo, req); + sk = inet_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -1018,7 +1016,7 @@ static void twsk_build_assert(void) #endif } -void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, +void inet_diag_dump_icsk(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { @@ -1026,10 +1024,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct inet_diag_dump_data *cb_data = cb->data; struct net *net = sock_net(skb->sk); u32 idiag_states = r->idiag_states; + struct inet_hashinfo *hashinfo; int i, num, s_i, s_num; struct nlattr *bc; struct sock *sk; + hashinfo = net->ipv4.tcp_death_row.hashinfo; bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 45e174b8cd22..7cd9d032efdd 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -180,21 +180,13 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - struct inet_hashinfo *hinfo; - - hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; - - inet_diag_dump_icsk(hinfo, skb, cb, r); + inet_diag_dump_icsk(skb, cb, r); } static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - struct inet_hashinfo *hinfo; - - hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; - - return inet_diag_dump_one_icsk(hinfo, cb, req); + return inet_diag_dump_one_icsk(cb, req); } #ifdef CONFIG_INET_DIAG_DESTROY @@ -202,13 +194,10 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { struct net *net = sock_net(in_skb->sk); - struct inet_hashinfo *hinfo; struct sock *sk; int err; - hinfo = net->ipv4.tcp_death_row.hashinfo; - sk = inet_diag_find_one_icsk(net, hinfo, req); - + sk = inet_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); -- cgit v1.2.3 From 382a4d9cb6dc07643345e15c49738088a727d29b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:07:01 +0000 Subject: tcp: Move TCP-specific diag functions to tcp_diag.c. tcp_diag_dump() / tcp_diag_dump_one() is just a wrapper of inet_diag_dump_icsk() / inet_diag_dump_one_icsk(), respectively. Let's inline them in tcp_diag.c and move static callees as well. Note that inet_sk_attr_size() is merged into tcp_diag_get_aux_size(), and we remove inet_diag_handler.idiag_get_aux_size() accordingly. While at it, BUG_ON() is replaced with DEBUG_NET_WARN_ON_ONCE(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250822190803.540788-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/inet_diag.h | 11 -- net/ipv4/inet_diag.c | 479 ---------------------------------------------- net/ipv4/tcp_diag.c | 460 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 455 insertions(+), 495 deletions(-) (limited to 'include') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 34de992b5bd9..30bf8f7ea62b 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -24,9 +24,6 @@ struct inet_diag_handler { bool net_admin, struct sk_buff *skb); - size_t (*idiag_get_aux_size)(struct sock *sk, - bool net_admin); - int (*destroy)(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req); @@ -48,14 +45,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin); -void inet_diag_dump_icsk(struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r); -int inet_diag_dump_one_icsk(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req); - -struct sock *inet_diag_find_one_icsk(struct net *net, - const struct inet_diag_req_v2 *req); int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5cbbb0695aff..9d4dcd17728c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -20,9 +20,6 @@ #include #include #include -#include -#include -#include #include #include @@ -97,31 +94,6 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) } EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); -static size_t inet_sk_attr_size(struct sock *sk, - const struct inet_diag_req_v2 *req, - bool net_admin) -{ - const struct inet_diag_handler *handler; - size_t aux = 0; - - rcu_read_lock(); - handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]); - DEBUG_NET_WARN_ON_ONCE(!handler); - if (handler && handler->idiag_get_aux_size) - aux = handler->idiag_get_aux_size(sk, net_admin); - rcu_read_unlock(); - - return nla_total_size(sizeof(struct tcp_info)) - + nla_total_size(sizeof(struct inet_diag_msg)) - + inet_diag_msg_attrs_size() - + nla_total_size(sizeof(struct inet_diag_meminfo)) - + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) - + nla_total_size(TCP_CA_NAME_MAX) - + nla_total_size(sizeof(struct tcpvegas_info)) - + aux - + 64; -} - int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, struct user_namespace *user_ns, @@ -422,181 +394,6 @@ errout: } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); -static int inet_twsk_diag_fill(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - u16 nlmsg_flags, bool net_admin) -{ - struct inet_timewait_sock *tw = inet_twsk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, - sizeof(*r), nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - BUG_ON(tw->tw_state != TCP_TIME_WAIT); - - inet_diag_msg_common_fill(r, sk); - r->idiag_retrans = 0; - - r->idiag_state = READ_ONCE(tw->tw_substate); - r->idiag_timer = 3; - tmo = tw->tw_timer.expires - jiffies; - r->idiag_expires = jiffies_delta_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; - - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - tw->tw_mark)) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } - - nlmsg_end(skb, nlh); - return 0; -} - -static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, - u16 nlmsg_flags, bool net_admin) -{ - struct request_sock *reqsk = inet_reqsk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - inet_diag_msg_common_fill(r, sk); - r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; - r->idiag_retrans = reqsk->num_retrans; - - BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != - offsetof(struct sock, sk_cookie)); - - tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; - r->idiag_expires = jiffies_delta_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; - - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - inet_rsk(reqsk)->ir_mark)) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } - - nlmsg_end(skb, nlh); - return 0; -} - -static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - u16 nlmsg_flags, bool net_admin) -{ - if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - - if (sk->sk_state == TCP_NEW_SYN_RECV) - return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - - return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, - net_admin); -} - -struct sock *inet_diag_find_one_icsk(struct net *net, - const struct inet_diag_req_v2 *req) -{ - struct sock *sk; - - rcu_read_lock(); - if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0], - req->id.idiag_dport, req->id.idiag_src[0], - req->id.idiag_sport, req->id.idiag_if); -#if IS_ENABLED(CONFIG_IPV6) - else if (req->sdiag_family == AF_INET6) { - if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && - ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], - req->id.idiag_dport, req->id.idiag_src[3], - req->id.idiag_sport, req->id.idiag_if); - else - sk = inet6_lookup(net, NULL, 0, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if); - } -#endif - else { - rcu_read_unlock(); - return ERR_PTR(-EINVAL); - } - rcu_read_unlock(); - if (!sk) - return ERR_PTR(-ENOENT); - - if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { - sock_gen_put(sk); - return ERR_PTR(-ENOENT); - } - - return sk; -} -EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); - -int inet_diag_dump_one_icsk(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - struct sk_buff *in_skb = cb->skb; - bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); - struct net *net = sock_net(in_skb->sk); - struct sk_buff *rep; - struct sock *sk; - int err; - - sk = inet_diag_find_one_icsk(net, req); - if (IS_ERR(sk)) - return PTR_ERR(sk); - - rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL); - if (!rep) { - err = -ENOMEM; - goto out; - } - - err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); - if (err < 0) { - WARN_ON(err == -EMSGSIZE); - nlmsg_free(rep); - goto out; - } - err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); - -out: - if (sk) - sock_gen_put(sk); - - return err; -} -EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); - static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, int hdrlen, @@ -990,282 +787,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return len == 0 ? 0 : -EINVAL; } -static void twsk_build_assert(void) -{ - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != - offsetof(struct sock, sk_family)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != - offsetof(struct inet_sock, inet_num)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != - offsetof(struct inet_sock, inet_dport)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != - offsetof(struct inet_sock, inet_rcv_saddr)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != - offsetof(struct inet_sock, inet_daddr)); - -#if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != - offsetof(struct sock, sk_v6_rcv_saddr)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != - offsetof(struct sock, sk_v6_daddr)); -#endif -} - -void inet_diag_dump_icsk(struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); - struct inet_diag_dump_data *cb_data = cb->data; - struct net *net = sock_net(skb->sk); - u32 idiag_states = r->idiag_states; - struct inet_hashinfo *hashinfo; - int i, num, s_i, s_num; - struct nlattr *bc; - struct sock *sk; - - hashinfo = net->ipv4.tcp_death_row.hashinfo; - bc = cb_data->inet_diag_nla_bc; - if (idiag_states & TCPF_SYN_RECV) - idiag_states |= TCPF_NEW_SYN_RECV; - s_i = cb->args[1]; - s_num = num = cb->args[2]; - - if (cb->args[0] == 0) { - if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) - goto skip_listen_ht; - - for (i = s_i; i <= hashinfo->lhash2_mask; i++) { - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; - - num = 0; - ilb = &hashinfo->lhash2[i]; - - if (hlist_nulls_empty(&ilb->nulls_head)) { - s_num = 0; - continue; - } - spin_lock(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->nulls_head) { - struct inet_sock *inet = inet_sk(sk); - - if (!net_eq(sock_net(sk), net)) - continue; - - if (num < s_num) { - num++; - continue; - } - - if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) - goto next_listen; - - if (r->id.idiag_sport != inet->inet_sport && - r->id.idiag_sport) - goto next_listen; - - if (!inet_diag_bc_sk(bc, sk)) - goto next_listen; - - if (inet_sk_diag_fill(sk, inet_csk(sk), skb, - cb, r, NLM_F_MULTI, - net_admin) < 0) { - spin_unlock(&ilb->lock); - goto done; - } - -next_listen: - ++num; - } - spin_unlock(&ilb->lock); - - s_num = 0; - } -skip_listen_ht: - cb->args[0] = 1; - s_i = num = s_num = 0; - } - -/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets - * with bh disabled. - */ -#define SKARR_SZ 16 - - /* Dump bound but inactive (not listening, connecting, etc.) sockets */ - if (cb->args[0] == 1) { - if (!(idiag_states & TCPF_BOUND_INACTIVE)) - goto skip_bind_ht; - - for (i = s_i; i < hashinfo->bhash_size; i++) { - struct inet_bind_hashbucket *ibb; - struct inet_bind2_bucket *tb2; - struct sock *sk_arr[SKARR_SZ]; - int num_arr[SKARR_SZ]; - int idx, accum, res; - -resume_bind_walk: - num = 0; - accum = 0; - ibb = &hashinfo->bhash2[i]; - - if (hlist_empty(&ibb->chain)) { - s_num = 0; - continue; - } - spin_lock_bh(&ibb->lock); - inet_bind_bucket_for_each(tb2, &ibb->chain) { - if (!net_eq(ib2_net(tb2), net)) - continue; - - sk_for_each_bound(sk, &tb2->owners) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_bind; - - if (sk->sk_state != TCP_CLOSE || - !inet->inet_num) - goto next_bind; - - if (r->sdiag_family != AF_UNSPEC && - r->sdiag_family != sk->sk_family) - goto next_bind; - - if (!inet_diag_bc_sk(bc, sk)) - goto next_bind; - - sock_hold(sk); - num_arr[accum] = num; - sk_arr[accum] = sk; - if (++accum == SKARR_SZ) - goto pause_bind_walk; -next_bind: - num++; - } - } -pause_bind_walk: - spin_unlock_bh(&ibb->lock); - - res = 0; - for (idx = 0; idx < accum; idx++) { - if (res >= 0) { - res = inet_sk_diag_fill(sk_arr[idx], - NULL, skb, cb, - r, NLM_F_MULTI, - net_admin); - if (res < 0) - num = num_arr[idx]; - } - sock_put(sk_arr[idx]); - } - if (res < 0) - goto done; - - cond_resched(); - - if (accum == SKARR_SZ) { - s_num = num + 1; - goto resume_bind_walk; - } - - s_num = 0; - } -skip_bind_ht: - cb->args[0] = 2; - s_i = num = s_num = 0; - } - - if (!(idiag_states & ~TCPF_LISTEN)) - goto out; - - for (i = s_i; i <= hashinfo->ehash_mask; i++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[i]; - spinlock_t *lock = inet_ehash_lockp(hashinfo, i); - struct hlist_nulls_node *node; - struct sock *sk_arr[SKARR_SZ]; - int num_arr[SKARR_SZ]; - int idx, accum, res; - - if (hlist_nulls_empty(&head->chain)) - continue; - - if (i > s_i) - s_num = 0; - -next_chunk: - num = 0; - accum = 0; - spin_lock_bh(lock); - sk_nulls_for_each(sk, node, &head->chain) { - int state; - - if (!net_eq(sock_net(sk), net)) - continue; - if (num < s_num) - goto next_normal; - state = (sk->sk_state == TCP_TIME_WAIT) ? - READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; - if (!(idiag_states & (1 << state))) - goto next_normal; - if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) - goto next_normal; - if (r->id.idiag_sport != htons(sk->sk_num) && - r->id.idiag_sport) - goto next_normal; - if (r->id.idiag_dport != sk->sk_dport && - r->id.idiag_dport) - goto next_normal; - twsk_build_assert(); - - if (!inet_diag_bc_sk(bc, sk)) - goto next_normal; - - if (!refcount_inc_not_zero(&sk->sk_refcnt)) - goto next_normal; - - num_arr[accum] = num; - sk_arr[accum] = sk; - if (++accum == SKARR_SZ) - break; -next_normal: - ++num; - } - spin_unlock_bh(lock); - res = 0; - for (idx = 0; idx < accum; idx++) { - if (res >= 0) { - res = sk_diag_fill(sk_arr[idx], skb, cb, r, - NLM_F_MULTI, net_admin); - if (res < 0) - num = num_arr[idx]; - } - sock_gen_put(sk_arr[idx]); - } - if (res < 0) - break; - cond_resched(); - if (accum == SKARR_SZ) { - s_num = num + 1; - goto next_chunk; - } - } - -done: - cb->args[1] = i; - cb->args[2] = num; -out: - ; -} -EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); - static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 7cd9d032efdd..2f3a779ce7a2 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -12,6 +12,9 @@ #include +#include +#include +#include #include #include @@ -174,19 +177,467 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) size += ulp_ops->get_info_size(sk, net_admin); } } - return size; + + return size + + nla_total_size(sizeof(struct tcp_info)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + + nla_total_size(TCP_CA_NAME_MAX) + + nla_total_size(sizeof(struct tcpvegas_info)) + + 64; +} + +static int tcp_twsk_diag_fill(struct sock *sk, + struct sk_buff *skb, + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) +{ + struct inet_timewait_sock *tw = inet_twsk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, + sizeof(*r), nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + DEBUG_NET_WARN_ON_ONCE(tw->tw_state != TCP_TIME_WAIT); + + inet_diag_msg_common_fill(r, sk); + r->idiag_retrans = 0; + + r->idiag_state = READ_ONCE(tw->tw_substate); + r->idiag_timer = 3; + tmo = tw->tw_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + tw->tw_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) +{ + struct request_sock *reqsk = inet_reqsk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + inet_diag_msg_common_fill(r, sk); + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = reqsk->num_retrans; + + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != + offsetof(struct sock, sk_cookie)); + + tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, + u16 nlmsg_flags, bool net_admin) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return tcp_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); + + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, + net_admin); +} + +static void twsk_build_assert(void) +{ + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != + offsetof(struct sock, sk_family)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != + offsetof(struct inet_sock, inet_num)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != + offsetof(struct inet_sock, inet_dport)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != + offsetof(struct inet_sock, inet_rcv_saddr)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != + offsetof(struct inet_sock, inet_daddr)); + +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != + offsetof(struct sock, sk_v6_rcv_saddr)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != + offsetof(struct sock, sk_v6_daddr)); +#endif } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(skb, cb, r); + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct inet_diag_dump_data *cb_data = cb->data; + struct net *net = sock_net(skb->sk); + u32 idiag_states = r->idiag_states; + struct inet_hashinfo *hashinfo; + int i, num, s_i, s_num; + struct nlattr *bc; + struct sock *sk; + + hashinfo = net->ipv4.tcp_death_row.hashinfo; + bc = cb_data->inet_diag_nla_bc; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) + goto skip_listen_ht; + + for (i = s_i; i <= hashinfo->lhash2_mask; i++) { + struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + + num = 0; + ilb = &hashinfo->lhash2[i]; + + if (hlist_nulls_empty(&ilb->nulls_head)) { + s_num = 0; + continue; + } + spin_lock(&ilb->lock); + sk_nulls_for_each(sk, node, &ilb->nulls_head) { + struct inet_sock *inet = inet_sk(sk); + + if (!net_eq(sock_net(sk), net)) + continue; + + if (num < s_num) { + num++; + continue; + } + + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_listen; + + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next_listen; + + if (!inet_diag_bc_sk(bc, sk)) + goto next_listen; + + if (inet_sk_diag_fill(sk, inet_csk(sk), skb, + cb, r, NLM_F_MULTI, + net_admin) < 0) { + spin_unlock(&ilb->lock); + goto done; + } + +next_listen: + ++num; + } + spin_unlock(&ilb->lock); + + s_num = 0; + } +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + +/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets + * with bh disabled. + */ +#define SKARR_SZ 16 + + /* Dump bound but inactive (not listening, connecting, etc.) sockets */ + if (cb->args[0] == 1) { + if (!(idiag_states & TCPF_BOUND_INACTIVE)) + goto skip_bind_ht; + + for (i = s_i; i < hashinfo->bhash_size; i++) { + struct inet_bind_hashbucket *ibb; + struct inet_bind2_bucket *tb2; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + +resume_bind_walk: + num = 0; + accum = 0; + ibb = &hashinfo->bhash2[i]; + + if (hlist_empty(&ibb->chain)) { + s_num = 0; + continue; + } + spin_lock_bh(&ibb->lock); + inet_bind_bucket_for_each(tb2, &ibb->chain) { + if (!net_eq(ib2_net(tb2), net)) + continue; + + sk_for_each_bound(sk, &tb2->owners) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_bind; + + if (sk->sk_state != TCP_CLOSE || + !inet->inet_num) + goto next_bind; + + if (r->sdiag_family != AF_UNSPEC && + r->sdiag_family != sk->sk_family) + goto next_bind; + + if (!inet_diag_bc_sk(bc, sk)) + goto next_bind; + + sock_hold(sk); + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + goto pause_bind_walk; +next_bind: + num++; + } + } +pause_bind_walk: + spin_unlock_bh(&ibb->lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = inet_sk_diag_fill(sk_arr[idx], + NULL, skb, cb, + r, NLM_F_MULTI, + net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_put(sk_arr[idx]); + } + if (res < 0) + goto done; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto resume_bind_walk; + } + + s_num = 0; + } +skip_bind_ht: + cb->args[0] = 2; + s_i = num = s_num = 0; + } + + if (!(idiag_states & ~TCPF_LISTEN)) + goto out; + + for (i = s_i; i <= hashinfo->ehash_mask; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + spinlock_t *lock = inet_ehash_lockp(hashinfo, i); + struct hlist_nulls_node *node; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + + if (hlist_nulls_empty(&head->chain)) + continue; + + if (i > s_i) + s_num = 0; + +next_chunk: + num = 0; + accum = 0; + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &head->chain) { + int state; + + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) + goto next_normal; + state = (sk->sk_state == TCP_TIME_WAIT) ? + READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; + if (!(idiag_states & (1 << state))) + goto next_normal; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_normal; + if (r->id.idiag_sport != htons(sk->sk_num) && + r->id.idiag_sport) + goto next_normal; + if (r->id.idiag_dport != sk->sk_dport && + r->id.idiag_dport) + goto next_normal; + twsk_build_assert(); + + if (!inet_diag_bc_sk(bc, sk)) + goto next_normal; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto next_normal; + + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + break; +next_normal: + ++num; + } + spin_unlock_bh(lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = sk_diag_fill(sk_arr[idx], skb, cb, r, + NLM_F_MULTI, net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_gen_put(sk_arr[idx]); + } + if (res < 0) + break; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto next_chunk; + } + } + +done: + cb->args[1] = i; + cb->args[2] = num; +out: + ; +} + +static struct sock *tcp_diag_find_one_icsk(struct net *net, + const struct inet_diag_req_v2 *req) +{ + struct sock *sk; + + rcu_read_lock(); + if (req->sdiag_family == AF_INET) { + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); +#if IS_ENABLED(CONFIG_IPV6) + } else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], + req->id.idiag_dport, req->id.idiag_src[3], + req->id.idiag_sport, req->id.idiag_if); + else + sk = inet6_lookup(net, NULL, 0, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); +#endif + } else { + rcu_read_unlock(); + return ERR_PTR(-EINVAL); + } + rcu_read_unlock(); + if (!sk) + return ERR_PTR(-ENOENT); + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_gen_put(sk); + return ERR_PTR(-ENOENT); + } + + return sk; } static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return inet_diag_dump_one_icsk(cb, req); + struct sk_buff *in_skb = cb->skb; + struct sk_buff *rep; + struct sock *sk; + struct net *net; + bool net_admin; + int err; + + net = sock_net(in_skb->sk); + sk = tcp_diag_find_one_icsk(net, req); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); + rep = nlmsg_new(tcp_diag_get_aux_size(sk, net_admin), GFP_KERNEL); + if (!rep) { + err = -ENOMEM; + goto out; + } + + err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + nlmsg_free(rep); + goto out; + } + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); + +out: + if (sk) + sock_gen_put(sk); + + return err; } #ifdef CONFIG_INET_DIAG_DESTROY @@ -197,7 +648,7 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, struct sock *sk; int err; - sk = inet_diag_find_one_icsk(net, req); + sk = tcp_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -215,7 +666,6 @@ static const struct inet_diag_handler tcp_diag_handler = { .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_get_aux = tcp_diag_get_aux, - .idiag_get_aux_size = tcp_diag_get_aux_size, .idiag_type = IPPROTO_TCP, .idiag_info_size = sizeof(struct tcp_info), #ifdef CONFIG_INET_DIAG_DESTROY -- cgit v1.2.3 From df534e757321ae6efe848a6a787098c22a390ac6 Mon Sep 17 00:00:00 2001 From: David Yang Date: Sun, 24 Aug 2025 09:30:03 +0800 Subject: net: phylink: remove stale an_enabled from doc state->an_enabled was removed by commit 4ee9b0dcf09f ("net: phylink: remove an_enabled") but is left in mac_config() doc, so clean it. Signed-off-by: David Yang Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250824013009.2443580-1-mmyangfl@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 30659b615fca..9af0411761d7 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -320,9 +320,8 @@ int mac_prepare(struct phylink_config *config, unsigned int mode, * If in 802.3z mode, the link speed is fixed, dependent on the * @state->interface. Duplex and pause modes are negotiated via * the in-band configuration word. Advertised pause modes are set - * according to the @state->an_enabled and @state->advertising - * flags. Beware of MACs which only support full duplex at gigabit - * and higher speeds. + * according to @state->advertising. Beware of MACs which only + * support full duplex at gigabit and higher speeds. * * If in Cisco SGMII mode, the link speed and duplex mode are passed * in the serial bitstream 16-bit configuration word, and the MAC @@ -331,7 +330,7 @@ int mac_prepare(struct phylink_config *config, unsigned int mode, * responsible for reading the configuration word and configuring * itself accordingly. * - * Valid state members: interface, an_enabled, pause, advertising. + * Valid state members: interface, pause, advertising. * * Implementations are expected to update the MAC to reflect the * requested settings - i.o.w., if nothing has changed between two -- cgit v1.2.3 From 1b93c03fb319d72a1f5f4723abd5df15ce40f4e2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 21 Aug 2025 17:06:03 +0800 Subject: rcu: add rcu_read_lock_dont_migrate() migrate_disable() is called to disable migration in the kernel, and it is often used together with rcu_read_lock(). However, with PREEMPT_RCU disabled, it's unnecessary, as rcu_read_lock() will always disable preemption, which will also disable migration. Introduce rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate(), which will do the migration enable and disable only when PREEMPT_RCU. Signed-off-by: Menglong Dong Reviewed-by: Paul E. McKenney Link: https://lore.kernel.org/r/20250821090609.42508-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/rcupdate.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb..9691ca380a4f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -962,6 +962,20 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) preempt_enable_notrace(); } +static __always_inline void rcu_read_lock_dont_migrate(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + migrate_disable(); + rcu_read_lock(); +} + +static inline void rcu_read_unlock_migrate(void) +{ + rcu_read_unlock(); + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + migrate_enable(); +} + /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. -- cgit v1.2.3 From 05db35963eef7a55f1782190185cb8ddb9d923b7 Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Chundru Date: Wed, 20 Aug 2025 13:58:47 +0530 Subject: OPP: Add support to find OPP for a set of keys Some clients, such as PCIe, may operate at the same clock frequency across different data rates by varying link width. In such cases, frequency alone is not sufficient to uniquely identify an OPP. To support these scenarios, introduce a new API dev_pm_opp_find_key_exact() that allows OPP lookup with different set of keys like freq, level & bandwidth. Signed-off-by: Krishna Chaitanya Chundru [ Viresh: Minor cleanups ] Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pm_opp.h | 30 +++++++++++++++ 2 files changed, 129 insertions(+) (limited to 'include') diff --git a/drivers/opp/core.c b/drivers/opp/core.c index edbd60501cf0..bba4f7daff8c 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -476,6 +476,16 @@ static unsigned long _read_bw(struct dev_pm_opp *opp, int index) return opp->bandwidth[index].peak; } +static unsigned long _read_opp_key(struct dev_pm_opp *opp, int index, + struct dev_pm_opp_key *key) +{ + key->bw = opp->bandwidth ? opp->bandwidth[index].peak : 0; + key->freq = opp->rates[index]; + key->level = opp->level; + + return true; +} + /* Generic comparison helpers */ static bool _compare_exact(struct dev_pm_opp **opp, struct dev_pm_opp *temp_opp, unsigned long opp_key, unsigned long key) @@ -509,6 +519,22 @@ static bool _compare_floor(struct dev_pm_opp **opp, struct dev_pm_opp *temp_opp, return false; } +static bool _compare_opp_key_exact(struct dev_pm_opp **opp, + struct dev_pm_opp *temp_opp, struct dev_pm_opp_key *opp_key, + struct dev_pm_opp_key *key) +{ + bool level_match = (key->level == OPP_LEVEL_UNSET || opp_key->level == key->level); + bool freq_match = (key->freq == 0 || opp_key->freq == key->freq); + bool bw_match = (key->bw == 0 || opp_key->bw == key->bw); + + if (freq_match && level_match && bw_match) { + *opp = temp_opp; + return true; + } + + return false; +} + /* Generic key finding helpers */ static struct dev_pm_opp *_opp_table_find_key(struct opp_table *opp_table, unsigned long *key, int index, bool available, @@ -541,6 +567,37 @@ static struct dev_pm_opp *_opp_table_find_key(struct opp_table *opp_table, return opp; } +static struct dev_pm_opp *_opp_table_find_opp_key(struct opp_table *opp_table, + struct dev_pm_opp_key *key, bool available, + unsigned long (*read)(struct dev_pm_opp *opp, int index, + struct dev_pm_opp_key *key), + bool (*compare)(struct dev_pm_opp **opp, struct dev_pm_opp *temp_opp, + struct dev_pm_opp_key *opp_key, struct dev_pm_opp_key *key), + bool (*assert)(struct opp_table *opp_table, unsigned int index)) +{ + struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE); + struct dev_pm_opp_key temp_key; + + /* Assert that the requirement is met */ + if (!assert(opp_table, 0)) + return ERR_PTR(-EINVAL); + + guard(mutex)(&opp_table->lock); + + list_for_each_entry(temp_opp, &opp_table->opp_list, node) { + if (temp_opp->available == available) { + read(temp_opp, 0, &temp_key); + if (compare(&opp, temp_opp, &temp_key, key)) { + /* Increment the reference count of OPP */ + dev_pm_opp_get(opp); + break; + } + } + } + + return opp; +} + static struct dev_pm_opp * _find_key(struct device *dev, unsigned long *key, int index, bool available, unsigned long (*read)(struct dev_pm_opp *opp, int index), @@ -632,6 +689,48 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, } EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact); +/** + * dev_pm_opp_find_key_exact() - Search for an OPP with exact key set + * @dev: Device for which the OPP is being searched + * @key: OPP key set to match + * @available: true/false - match for available OPP + * + * Search for an exact match of the key set in the OPP table. + * + * Return: A matching opp on success, else ERR_PTR in case of error. + * Possible error values: + * EINVAL: for bad pointers + * ERANGE: no match found for search + * ENODEV: if device not found in list of registered devices + * + * Note: 'available' is a modifier for the search. If 'available' == true, + * then the match is for exact matching key and is available in the stored + * OPP table. If false, the match is for exact key which is not available. + * + * This provides a mechanism to enable an OPP which is not available currently + * or the opposite as well. + * + * The callers are required to call dev_pm_opp_put() for the returned OPP after + * use. + */ +struct dev_pm_opp *dev_pm_opp_find_key_exact(struct device *dev, + struct dev_pm_opp_key *key, + bool available) +{ + struct opp_table *opp_table __free(put_opp_table) = _find_opp_table(dev); + + if (IS_ERR(opp_table)) { + dev_err(dev, "%s: OPP table not found (%ld)\n", __func__, + PTR_ERR(opp_table)); + return ERR_CAST(opp_table); + } + + return _opp_table_find_opp_key(opp_table, key, available, + _read_opp_key, _compare_opp_key_exact, + assert_single_clk); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_find_key_exact); + /** * dev_pm_opp_find_freq_exact_indexed() - Search for an exact freq for the * clock corresponding to the index diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index cf477beae4bb..789406d95e69 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -98,6 +98,25 @@ struct dev_pm_opp_data { unsigned long u_volt; }; +/** + * struct dev_pm_opp_key - Key used to identify OPP entries + * @freq: Frequency in Hz. Use 0 if frequency is not to be matched. + * @level: Performance level associated with the OPP entry. + * Use OPP_LEVEL_UNSET if level is not to be matched. + * @bw: Bandwidth associated with the OPP entry. + * Use 0 if bandwidth is not to be matched. + * + * This structure is used to uniquely identify an OPP entry based on + * frequency, performance level, and bandwidth. Each field can be + * selectively ignored during matching by setting it to its respective + * NOP value. + */ +struct dev_pm_opp_key { + unsigned long freq; + unsigned int level; + u32 bw; +}; + #if defined(CONFIG_PM_OPP) struct opp_table *dev_pm_opp_get_opp_table(struct device *dev); @@ -131,6 +150,10 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, unsigned long freq, bool available); +struct dev_pm_opp *dev_pm_opp_find_key_exact(struct device *dev, + struct dev_pm_opp_key *key, + bool available); + struct dev_pm_opp * dev_pm_opp_find_freq_exact_indexed(struct device *dev, unsigned long freq, u32 index, bool available); @@ -289,6 +312,13 @@ static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, return ERR_PTR(-EOPNOTSUPP); } +static inline struct dev_pm_opp *dev_pm_opp_find_key_exact(struct device *dev, + struct dev_pm_opp_key *key, + bool available) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline struct dev_pm_opp * dev_pm_opp_find_freq_exact_indexed(struct device *dev, unsigned long freq, u32 index, bool available) -- cgit v1.2.3 From e649bcda25b5ae1a30a182cc450f928a0b282c93 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Aug 2025 14:03:39 -0400 Subject: perf: Remove get_perf_callchain() init_nr argument The 'init_nr' argument has double duty: it's used to initialize both the number of contexts and the number of stack entries. That's confusing and the callers always pass zero anyway. Hard code the zero. Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250820180428.259565081@kernel.org --- include/linux/perf_event.h | 2 +- kernel/bpf/stackmap.c | 4 ++-- kernel/events/callchain.c | 12 ++++++------ kernel/events/core.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bfbf9ea53f25..fd1d91017b99 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1719,7 +1719,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa..ec3a57a5fba1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, false, false); if (unlikely(!trace)) @@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, crosstask, false); if (unlikely(!trace) || trace->nr < skip) { diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6c83ad674d01..b0f5bd228cd8 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,7 +217,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr } struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; @@ -228,11 +228,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (!entry) return NULL; - ctx.entry = entry; - ctx.max_stack = max_stack; - ctx.nr = entry->nr = init_nr; - ctx.contexts = 0; - ctx.contexts_maxed = false; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = 0; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) diff --git a/kernel/events/core.c b/kernel/events/core.c index ea357044d780..bade8e0fced7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8210,7 +8210,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, 0, kernel, user, + callchain = get_perf_callchain(regs, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } -- cgit v1.2.3 From e164461349444ad27873e4ab2f492eb4465dbbb0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 Aug 2025 15:28:49 -0700 Subject: lib/crypto: md5: Add MD5 and HMAC-MD5 library functions Add library functions for MD5, including HMAC support. The MD5 implementation is derived from crypto/md5.c. This closely mirrors the corresponding SHA-1 and SHA-2 changes. Like SHA-1 and SHA-2, support for architecture-optimized MD5 implementations is included. I originally proposed dropping those, but unfortunately there is an AF_ALG user of the PowerPC MD5 code (https://lore.kernel.org/r/c4191597-341d-4fd7-bc3d-13daf7666c41@csgroup.eu/), and dropping that code would be viewed as a performance regression. We don't add new software algorithm implementations purely for AF_ALG, as escalating to kernel mode merely to do calculations that could be done in userspace is inefficient and is completely the wrong design. But since this one already existed, it gets grandfathered in for now. An objection was also raised to dropping the SPARC64 MD5 code because it utilizes the CPU's direct support for MD5, although it remains unclear that anyone is using that. Regardless, we'll keep these around for now. Note that while MD5 is a legacy algorithm that is vulnerable to practical collision attacks, it still has various in-kernel users that implement legacy protocols. Switching to a simple library API, which is the way the code should have been organized originally, will greatly simplify their code. For example: MD5: drivers/md/dm-crypt.c (for lmk IV generation) fs/nfsd/nfs4recover.c fs/ecryptfs/ fs/smb/client/ net/{ipv4,ipv6}/ (for TCP-MD5 signatures) HMAC-MD5: fs/smb/client/ fs/smb/server/ (Also net/sctp/ if it continues using HMAC-MD5 for cookie generation. However, that use case has the flexibility to upgrade to a more modern algorithm, which I'll be proposing instead.) As usual, the "md5" and "hmac(md5)" crypto_shash algorithms will also be reimplemented on top of these library functions. For "hmac(md5)" this will provide a faster, more streamlined implementation. Link: https://lore.kernel.org/r/20250805222855.10362-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/md5.h | 181 ++++++++++++++++++++++++++++- lib/crypto/Kconfig | 10 ++ lib/crypto/Makefile | 10 ++ lib/crypto/md5.c | 322 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 522 insertions(+), 1 deletion(-) create mode 100644 lib/crypto/md5.c (limited to 'include') diff --git a/include/crypto/md5.h b/include/crypto/md5.h index 28ee533a0507..c9aa5c3abc53 100644 --- a/include/crypto/md5.h +++ b/include/crypto/md5.h @@ -7,6 +7,7 @@ #define MD5_DIGEST_SIZE 16 #define MD5_HMAC_BLOCK_SIZE 64 +#define MD5_BLOCK_SIZE 64 #define MD5_BLOCK_WORDS 16 #define MD5_HASH_WORDS 4 #define MD5_STATE_SIZE 24 @@ -27,4 +28,182 @@ struct md5_state { u32 block[MD5_BLOCK_WORDS]; }; -#endif +/* State for the MD5 compression function */ +struct md5_block_state { + u32 h[MD5_HASH_WORDS]; +}; + +/** + * struct md5_ctx - Context for hashing a message with MD5 + * @state: the compression function state + * @bytecount: number of bytes processed so far + * @buf: partial block buffer; bytecount % MD5_BLOCK_SIZE bytes are valid + */ +struct md5_ctx { + struct md5_block_state state; + u64 bytecount; + u8 buf[MD5_BLOCK_SIZE] __aligned(__alignof__(__le64)); +}; + +/** + * md5_init() - Initialize an MD5 context for a new message + * @ctx: the context to initialize + * + * If you don't need incremental computation, consider md5() instead. + * + * Context: Any context. + */ +void md5_init(struct md5_ctx *ctx); + +/** + * md5_update() - Update an MD5 context with message data + * @ctx: the context to update; must have been initialized + * @data: the message data + * @len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len); + +/** + * md5_final() - Finish computing an MD5 message digest + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting MD5 message digest + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); + +/** + * md5() - Compute MD5 message digest in one shot + * @data: the message data + * @len: the data length in bytes + * @out: (output) the resulting MD5 message digest + * + * Context: Any context. + */ +void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]); + +/** + * struct hmac_md5_key - Prepared key for HMAC-MD5 + * @istate: private + * @ostate: private + */ +struct hmac_md5_key { + struct md5_block_state istate; + struct md5_block_state ostate; +}; + +/** + * struct hmac_md5_ctx - Context for computing HMAC-MD5 of a message + * @hash_ctx: private + * @ostate: private + */ +struct hmac_md5_ctx { + struct md5_ctx hash_ctx; + struct md5_block_state ostate; +}; + +/** + * hmac_md5_preparekey() - Prepare a key for HMAC-MD5 + * @key: (output) the key structure to initialize + * @raw_key: the raw HMAC-MD5 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * Note: the caller is responsible for zeroizing both the struct hmac_md5_key + * and the raw key once they are no longer needed. + * + * Context: Any context. + */ +void hmac_md5_preparekey(struct hmac_md5_key *key, + const u8 *raw_key, size_t raw_key_len); + +/** + * hmac_md5_init() - Initialize an HMAC-MD5 context for a new message + * @ctx: (output) the HMAC context to initialize + * @key: the prepared HMAC key + * + * If you don't need incremental computation, consider hmac_md5() instead. + * + * Context: Any context. + */ +void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key); + +/** + * hmac_md5_init_usingrawkey() - Initialize an HMAC-MD5 context for a new + * message, using a raw key + * @ctx: (output) the HMAC context to initialize + * @raw_key: the raw HMAC-MD5 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * If you don't need incremental computation, consider hmac_md5_usingrawkey() + * instead. + * + * Context: Any context. + */ +void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx, + const u8 *raw_key, size_t raw_key_len); + +/** + * hmac_md5_update() - Update an HMAC-MD5 context with message data + * @ctx: the HMAC context to update; must have been initialized + * @data: the message data + * @data_len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +static inline void hmac_md5_update(struct hmac_md5_ctx *ctx, + const u8 *data, size_t data_len) +{ + md5_update(&ctx->hash_ctx, data, data_len); +} + +/** + * hmac_md5_final() - Finish computing an HMAC-MD5 value + * @ctx: the HMAC context to finalize; must have been initialized + * @out: (output) the resulting HMAC-MD5 value + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); + +/** + * hmac_md5() - Compute HMAC-MD5 in one shot, using a prepared key + * @key: the prepared HMAC key + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-MD5 value + * + * If you're using the key only once, consider using hmac_md5_usingrawkey(). + * + * Context: Any context. + */ +void hmac_md5(const struct hmac_md5_key *key, + const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]); + +/** + * hmac_md5_usingrawkey() - Compute HMAC-MD5 in one shot, using a raw key + * @raw_key: the raw HMAC-MD5 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-MD5 value + * + * If you're using the key multiple times, prefer to use hmac_md5_preparekey() + * followed by multiple calls to hmac_md5() instead. + * + * Context: Any context. + */ +void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[MD5_DIGEST_SIZE]); + +#endif /* _CRYPTO_MD5_H */ diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 1e6b008f8fca..e38e8ed779d8 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -101,6 +101,16 @@ config CRYPTO_LIB_CURVE25519 config CRYPTO_LIB_DES tristate +config CRYPTO_LIB_MD5 + tristate + help + The MD5 and HMAC-MD5 library functions. Select this if your module + uses any of the functions from . + +config CRYPTO_LIB_MD5_ARCH + bool + depends on CRYPTO_LIB_MD5 && !UML + config CRYPTO_LIB_POLY1305_RSIZE int default 2 if MIPS diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 539d5d59a50e..429573e8f8b3 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -59,6 +59,16 @@ libcurve25519-$(CONFIG_CRYPTO_SELFTESTS) += curve25519-selftest.o obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o libdes-y := des.o +################################################################################ + +obj-$(CONFIG_CRYPTO_LIB_MD5) += libmd5.o +libmd5-y := md5.o +ifeq ($(CONFIG_CRYPTO_LIB_MD5_ARCH),y) +CFLAGS_md5.o += -I$(src)/$(SRCARCH) +endif # CONFIG_CRYPTO_LIB_MD5_ARCH + +################################################################################ + obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o libpoly1305-y += poly1305.o diff --git a/lib/crypto/md5.c b/lib/crypto/md5.c new file mode 100644 index 000000000000..c0610ea1370e --- /dev/null +++ b/lib/crypto/md5.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * MD5 and HMAC-MD5 library functions + * + * md5_block_generic() is derived from cryptoapi implementation, originally + * based on the public domain implementation written by Colin Plumb in 1993. + * + * Copyright (c) Cryptoapi developers. + * Copyright (c) 2002 James Morris + * Copyright 2025 Google LLC + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct md5_block_state md5_iv = { + .h = { MD5_H0, MD5_H1, MD5_H2, MD5_H3 }, +}; + +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) + +static void md5_block_generic(struct md5_block_state *state, + const u8 data[MD5_BLOCK_SIZE]) +{ + u32 in[MD5_BLOCK_WORDS]; + u32 a, b, c, d; + + memcpy(in, data, MD5_BLOCK_SIZE); + le32_to_cpu_array(in, ARRAY_SIZE(in)); + + a = state->h[0]; + b = state->h[1]; + c = state->h[2]; + d = state->h[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + state->h[0] += a; + state->h[1] += b; + state->h[2] += c; + state->h[3] += d; +} + +static void __maybe_unused md5_blocks_generic(struct md5_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + md5_block_generic(state, data); + data += MD5_BLOCK_SIZE; + } while (--nblocks); +} + +#ifdef CONFIG_CRYPTO_LIB_MD5_ARCH +#include "md5.h" /* $(SRCARCH)/md5.h */ +#else +#define md5_blocks md5_blocks_generic +#endif + +void md5_init(struct md5_ctx *ctx) +{ + ctx->state = md5_iv; + ctx->bytecount = 0; +} +EXPORT_SYMBOL_GPL(md5_init); + +void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len) +{ + size_t partial = ctx->bytecount % MD5_BLOCK_SIZE; + + ctx->bytecount += len; + + if (partial + len >= MD5_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = MD5_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + md5_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / MD5_BLOCK_SIZE; + len %= MD5_BLOCK_SIZE; + + if (nblocks) { + md5_blocks(&ctx->state, data, nblocks); + data += nblocks * MD5_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); +} +EXPORT_SYMBOL_GPL(md5_update); + +static void __md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]) +{ + u64 bitcount = ctx->bytecount << 3; + size_t partial = ctx->bytecount % MD5_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > MD5_BLOCK_SIZE - 8) { + memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - partial); + md5_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - 8 - partial); + *(__le64 *)&ctx->buf[MD5_BLOCK_SIZE - 8] = cpu_to_le64(bitcount); + md5_blocks(&ctx->state, ctx->buf, 1); + + cpu_to_le32_array(ctx->state.h, ARRAY_SIZE(ctx->state.h)); + memcpy(out, ctx->state.h, MD5_DIGEST_SIZE); +} + +void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]) +{ + __md5_final(ctx, out); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(md5_final); + +void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]) +{ + struct md5_ctx ctx; + + md5_init(&ctx); + md5_update(&ctx, data, len); + md5_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(md5); + +static void __hmac_md5_preparekey(struct md5_block_state *istate, + struct md5_block_state *ostate, + const u8 *raw_key, size_t raw_key_len) +{ + union { + u8 b[MD5_BLOCK_SIZE]; + unsigned long w[MD5_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > MD5_BLOCK_SIZE)) + md5(raw_key, raw_key_len, derived_key.b); + else + memcpy(derived_key.b, raw_key, raw_key_len); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + *istate = md5_iv; + md5_blocks(istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + *ostate = md5_iv; + md5_blocks(ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_md5_preparekey(struct hmac_md5_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_md5_preparekey(&key->istate, &key->ostate, raw_key, raw_key_len); +} +EXPORT_SYMBOL_GPL(hmac_md5_preparekey); + +void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key) +{ + ctx->hash_ctx.state = key->istate; + ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE; + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(hmac_md5_init); + +void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_md5_preparekey(&ctx->hash_ctx.state, &ctx->ostate, + raw_key, raw_key_len); + ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_md5_init_usingrawkey); + +void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]) +{ + /* Generate the padded input for the outer hash in ctx->hash_ctx.buf. */ + __md5_final(&ctx->hash_ctx, ctx->hash_ctx.buf); + memset(&ctx->hash_ctx.buf[MD5_DIGEST_SIZE], 0, + MD5_BLOCK_SIZE - MD5_DIGEST_SIZE); + ctx->hash_ctx.buf[MD5_DIGEST_SIZE] = 0x80; + *(__le64 *)&ctx->hash_ctx.buf[MD5_BLOCK_SIZE - 8] = + cpu_to_le64(8 * (MD5_BLOCK_SIZE + MD5_DIGEST_SIZE)); + + /* Compute the outer hash, which gives the HMAC value. */ + md5_blocks(&ctx->ostate, ctx->hash_ctx.buf, 1); + cpu_to_le32_array(ctx->ostate.h, ARRAY_SIZE(ctx->ostate.h)); + memcpy(out, ctx->ostate.h, MD5_DIGEST_SIZE); + + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(hmac_md5_final); + +void hmac_md5(const struct hmac_md5_key *key, + const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]) +{ + struct hmac_md5_ctx ctx; + + hmac_md5_init(&ctx, key); + hmac_md5_update(&ctx, data, data_len); + hmac_md5_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_md5); + +void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[MD5_DIGEST_SIZE]) +{ + struct hmac_md5_ctx ctx; + + hmac_md5_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_md5_update(&ctx, data, data_len); + hmac_md5_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_md5_usingrawkey); + +#ifdef md5_mod_init_arch +static int __init md5_mod_init(void) +{ + md5_mod_init_arch(); + return 0; +} +subsys_initcall(md5_mod_init); + +static void __exit md5_mod_exit(void) +{ +} +module_exit(md5_mod_exit); +#endif + +MODULE_DESCRIPTION("MD5 and HMAC-MD5 library functions"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 1abe21ef1adf0c5b6dbb5878c2fa4573df8d29fc Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 23 Aug 2025 15:44:28 +0200 Subject: net: phy: introduce phy_id_compare_vendor() PHY ID helper Introduce phy_id_compare_vendor() PHY ID helper to compare a PHY ID with the PHY ID Vendor using the generic PHY ID Vendor mask. While at it also rework the PHY_ID_MATCH macro and move the mask to dedicated define so that PHY driver can make use of the mask if needed. Signed-off-by: Christian Marangi Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250823134431.4854-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 4c2b8b6e7187..b67079796402 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1268,9 +1268,13 @@ struct phy_driver { #define to_phy_driver(d) container_of_const(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) -#define PHY_ID_MATCH_EXACT(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 0) -#define PHY_ID_MATCH_MODEL(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 4) -#define PHY_ID_MATCH_VENDOR(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 10) +#define PHY_ID_MATCH_EXTACT_MASK GENMASK(31, 0) +#define PHY_ID_MATCH_MODEL_MASK GENMASK(31, 4) +#define PHY_ID_MATCH_VENDOR_MASK GENMASK(31, 10) + +#define PHY_ID_MATCH_EXACT(id) .phy_id = (id), .phy_id_mask = PHY_ID_MATCH_EXTACT_MASK +#define PHY_ID_MATCH_MODEL(id) .phy_id = (id), .phy_id_mask = PHY_ID_MATCH_MODEL_MASK +#define PHY_ID_MATCH_VENDOR(id) .phy_id = (id), .phy_id_mask = PHY_ID_MATCH_VENDOR_MASK /** * phy_id_compare - compare @id1 with @id2 taking account of @mask @@ -1286,6 +1290,19 @@ static inline bool phy_id_compare(u32 id1, u32 id2, u32 mask) return !((id1 ^ id2) & mask); } +/** + * phy_id_compare_vendor - compare @id with @vendor mask + * @id: PHY ID + * @vendor_mask: PHY Vendor mask + * + * Return: true if the bits from @id match @vendor using the + * generic PHY Vendor mask. + */ +static inline bool phy_id_compare_vendor(u32 id, u32 vendor_mask) +{ + return phy_id_compare(id, vendor_mask, PHY_ID_MATCH_VENDOR_MASK); +} + /** * phydev_id_compare - compare @id with the PHY's Clause 22 ID * @phydev: the PHY device -- cgit v1.2.3 From 39e94fdce45fa611abf48472873e4ba2f67228a3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 22 Aug 2025 22:36:11 +0200 Subject: net: phy: fixed: let fixed_phy_add always use addr 0 and remove return value We have only two users of fixed_phy_add(), both use address 0 and ignore the return value. So simplify fixed_phy_add() accordingly. Whilst at it, constify the fixed_phy_status configs. Note: fixed_phy_add() is a legacy function which shouldn't be used in new code, as it's use may be problematic: - No check whether a fixed phy exists already at the given address - If fixed_phy_register() is called afterwards by any other driver, then it will also use phy_addr 0, because fixed_phy_add() ignores the ida which manages address assignment Drivers using a fixed phy created by fixed_phy_add() in platform code, should dynamically create a fixed phy with fixed_phy_register() instead. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/762700e5-a0b1-41af-aa03-929822a39475@gmail.com Signed-off-by: Jakub Kicinski --- arch/m68k/coldfire/m5272.c | 4 ++-- arch/mips/bcm47xx/setup.c | 4 ++-- drivers/net/phy/fixed_phy.c | 4 ++-- include/linux/phy_fixed.h | 8 ++------ 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/arch/m68k/coldfire/m5272.c b/arch/m68k/coldfire/m5272.c index 5b70dfdab368..918e2a3236c5 100644 --- a/arch/m68k/coldfire/m5272.c +++ b/arch/m68k/coldfire/m5272.c @@ -108,7 +108,7 @@ void __init config_BSP(char *commandp, int size) * an ethernet switch. In this case we need to use the fixed phy type, * and we need to declare it early in boot. */ -static struct fixed_phy_status nettel_fixed_phy_status __initdata = { +static const struct fixed_phy_status nettel_fixed_phy_status __initconst = { .link = 1, .speed = 100, .duplex = 0, @@ -119,7 +119,7 @@ static struct fixed_phy_status nettel_fixed_phy_status __initdata = { static int __init init_BSP(void) { m5272_uarts_init(); - fixed_phy_add(0, &nettel_fixed_phy_status); + fixed_phy_add(&nettel_fixed_phy_status); clkdev_add_table(m5272_clk_lookup, ARRAY_SIZE(m5272_clk_lookup)); return 0; } diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index de426a474b5b..a93a4266dc1e 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -256,7 +256,7 @@ static int __init bcm47xx_cpu_fixes(void) } arch_initcall(bcm47xx_cpu_fixes); -static struct fixed_phy_status bcm47xx_fixed_phy_status __initdata = { +static const struct fixed_phy_status bcm47xx_fixed_phy_status __initconst = { .link = 1, .speed = SPEED_100, .duplex = DUPLEX_FULL, @@ -282,7 +282,7 @@ static int __init bcm47xx_register_bus_complete(void) bcm47xx_leds_register(); bcm47xx_workarounds(); - fixed_phy_add(0, &bcm47xx_fixed_phy_status); + fixed_phy_add(&bcm47xx_fixed_phy_status); return 0; } device_initcall(bcm47xx_register_bus_complete); diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index 8ad49dc1105d..5a1badb9bbab 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -158,9 +158,9 @@ static int fixed_phy_add_gpiod(unsigned int irq, int phy_addr, return 0; } -int fixed_phy_add(int phy_addr, const struct fixed_phy_status *status) +void fixed_phy_add(const struct fixed_phy_status *status) { - return fixed_phy_add_gpiod(PHY_POLL, phy_addr, status, NULL); + fixed_phy_add_gpiod(PHY_POLL, 0, status, NULL); } EXPORT_SYMBOL_GPL(fixed_phy_add); diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 5399b9e41e35..6227a1bdefec 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,7 +17,7 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); -int fixed_phy_add(int phy_id, const struct fixed_phy_status *status); +void fixed_phy_add(const struct fixed_phy_status *status); struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np); @@ -26,11 +26,7 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, struct fixed_phy_status *)); #else -static inline int fixed_phy_add(int phy_id, - const struct fixed_phy_status *status) -{ - return -ENODEV; -} +static inline void fixed_phy_add(const struct fixed_phy_status *status) {} static inline struct phy_device * fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np) -- cgit v1.2.3 From d2b007374551ac09db16badde575cdd698f6fc92 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:50 +0300 Subject: devlink: Move graceful period parameter to reporter ops Move the default graceful period from a parameter to devlink_health_reporter_create() to a field in the devlink_health_reporter_ops structure. This change improves consistency, as the graceful period is inherently tied to the reporter's behavior and recovery policy. It simplifies the signature of devlink_health_reporter_create() and its internal helper functions. It also centralizes the reporter configuration at the ops structure, preparing the groundwork for a downstream patch that will introduce a devlink health reporter burst period attribute whose default value will similarly be provided by the driver via the ops structure. Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/main.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 2 +- drivers/net/ethernet/huawei/hinic/hinic_devlink.c | 10 +++--- drivers/net/ethernet/intel/ice/devlink/health.c | 3 +- .../ethernet/marvell/octeontx2/af/rvu_devlink.c | 32 ++++++++++++----- .../mellanox/mlx5/core/diag/reporter_vnic.c | 2 +- .../ethernet/mellanox/mlx5/core/en/reporter_rx.c | 10 +++--- .../ethernet/mellanox/mlx5/core/en/reporter_tx.c | 10 +++--- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/health.c | 41 +++++++++++++--------- drivers/net/ethernet/mellanox/mlxsw/core.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_devlink.c | 9 ++--- drivers/net/netdevsim/health.c | 4 +-- include/net/devlink.h | 11 +++--- net/devlink/health.c | 28 ++++++--------- 15 files changed, 97 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c index 9b81e1c260c2..c7a2eff57632 100644 --- a/drivers/net/ethernet/amd/pds_core/main.c +++ b/drivers/net/ethernet/amd/pds_core/main.c @@ -280,7 +280,7 @@ static int pdsc_init_pf(struct pdsc *pdsc) goto err_out_del_dev; } - hr = devl_health_reporter_create(dl, &pdsc_fw_reporter_ops, 0, pdsc); + hr = devl_health_reporter_create(dl, &pdsc_fw_reporter_ops, pdsc); if (IS_ERR(hr)) { devl_unlock(dl); dev_warn(pdsc->dev, "Failed to create fw reporter: %pe\n", hr); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 4c4581b0342e..43fb75806cd6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -220,7 +220,7 @@ __bnxt_dl_reporter_create(struct bnxt *bp, { struct devlink_health_reporter *reporter; - reporter = devlink_health_reporter_create(bp->dl, ops, 0, bp); + reporter = devlink_health_reporter_create(bp->dl, ops, bp); if (IS_ERR(reporter)) { netdev_warn(bp->dev, "Failed to create %s health reporter, rc = %ld\n", ops->name, PTR_ERR(reporter)); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c index 03e42512a2d5..300bc267a259 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c @@ -443,8 +443,9 @@ int hinic_health_reporters_create(struct hinic_devlink_priv *priv) struct devlink *devlink = priv_to_devlink(priv); priv->hw_fault_reporter = - devlink_health_reporter_create(devlink, &hinic_hw_fault_reporter_ops, - 0, priv); + devlink_health_reporter_create(devlink, + &hinic_hw_fault_reporter_ops, + priv); if (IS_ERR(priv->hw_fault_reporter)) { dev_warn(&priv->hwdev->hwif->pdev->dev, "Failed to create hw fault reporter, err: %ld\n", PTR_ERR(priv->hw_fault_reporter)); @@ -452,8 +453,9 @@ int hinic_health_reporters_create(struct hinic_devlink_priv *priv) } priv->fw_fault_reporter = - devlink_health_reporter_create(devlink, &hinic_fw_fault_reporter_ops, - 0, priv); + devlink_health_reporter_create(devlink, + &hinic_fw_fault_reporter_ops, + priv); if (IS_ERR(priv->fw_fault_reporter)) { dev_warn(&priv->hwdev->hwif->pdev->dev, "Failed to create fw fault reporter, err: %ld\n", PTR_ERR(priv->fw_fault_reporter)); diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c b/drivers/net/ethernet/intel/ice/devlink/health.c index ab519c0f28bf..8e9a8a8178d4 100644 --- a/drivers/net/ethernet/intel/ice/devlink/health.c +++ b/drivers/net/ethernet/intel/ice/devlink/health.c @@ -450,9 +450,8 @@ ice_init_devlink_rep(struct ice_pf *pf, { struct devlink *devlink = priv_to_devlink(pf); struct devlink_health_reporter *rep; - const u64 graceful_period = 0; - rep = devl_health_reporter_create(devlink, ops, graceful_period, pf); + rep = devl_health_reporter_create(devlink, ops, pf); if (IS_ERR(rep)) { struct device *dev = ice_pf_to_dev(pf); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c index 27c3a2daaaa9..3735372539bd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c @@ -505,7 +505,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) rvu_reporters->nix_event_ctx = nix_event_context; rvu_reporters->rvu_hw_nix_intr_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_intr_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_intr_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_intr_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_intr reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_intr_reporter)); @@ -513,7 +515,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_nix_gen_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_gen_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_gen_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_gen_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_gen reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_gen_reporter)); @@ -521,7 +525,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_nix_err_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_err_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_err_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_err_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_err reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_err_reporter)); @@ -529,7 +535,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_nix_ras_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_ras_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_ras_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_ras_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_ras reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_ras_reporter)); @@ -1051,7 +1059,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) rvu_reporters->npa_event_ctx = npa_event_context; rvu_reporters->rvu_hw_npa_intr_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_intr_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_intr_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_intr_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_intr reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_intr_reporter)); @@ -1059,7 +1069,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_npa_gen_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_gen_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_gen_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_gen_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_gen reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_gen_reporter)); @@ -1067,7 +1079,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_npa_err_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_err_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_err_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_err_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_err reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_err_reporter)); @@ -1075,7 +1089,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_npa_ras_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_ras_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_ras_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_ras_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_ras reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_ras_reporter)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c index 32bb769f1829..73f5b62b8c7f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c @@ -135,7 +135,7 @@ void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev) health->vnic_reporter = devlink_health_reporter_create(devlink, &mlx5_reporter_vnic_ops, - 0, dev); + dev); if (IS_ERR(health->vnic_reporter)) mlx5_core_warn(dev, "Failed to create vnic reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index 16c44d628eda..1b9ea72abc5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -651,22 +651,24 @@ void mlx5e_reporter_icosq_resume_recovery(struct mlx5e_channel *c) mutex_unlock(&c->icosq_recovery_lock); } +#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 + static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { .name = "rx", .recover = mlx5e_rx_reporter_recover, .diagnose = mlx5e_rx_reporter_diagnose, .dump = mlx5e_rx_reporter_dump, + .default_graceful_period = MLX5E_REPORTER_RX_GRACEFUL_PERIOD, }; -#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 - void mlx5e_reporter_rx_create(struct mlx5e_priv *priv) { + struct devlink_port *port = priv->netdev->devlink_port; struct devlink_health_reporter *reporter; - reporter = devlink_port_health_reporter_create(priv->netdev->devlink_port, + reporter = devlink_port_health_reporter_create(port, &mlx5_rx_reporter_ops, - MLX5E_REPORTER_RX_GRACEFUL_PERIOD, priv); + priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", PTR_ERR(reporter)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 85d5cb39b107..7a4a77f6fe6a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -539,22 +539,24 @@ void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq) mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); } +#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 + static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { .name = "tx", .recover = mlx5e_tx_reporter_recover, .diagnose = mlx5e_tx_reporter_diagnose, .dump = mlx5e_tx_reporter_dump, + .default_graceful_period = MLX5_REPORTER_TX_GRACEFUL_PERIOD, }; -#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 - void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) { + struct devlink_port *port = priv->netdev->devlink_port; struct devlink_health_reporter *reporter; - reporter = devlink_port_health_reporter_create(priv->netdev->devlink_port, + reporter = devlink_port_health_reporter_create(port, &mlx5_tx_reporter_ops, - MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); + priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, "Failed to create tx reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 63a7a788fb0d..b231e7855bca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1447,7 +1447,7 @@ static void mlx5e_rep_vnic_reporter_create(struct mlx5e_priv *priv, reporter = devl_port_health_reporter_create(dl_port, &mlx5_rep_vnic_reporter_ops, - 0, rpriv); + rpriv); if (IS_ERR(reporter)) { mlx5_core_err(priv->mdev, "Failed to create representor vnic reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index cf7a1edd0530..b63c5a221eb9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -669,54 +669,61 @@ static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work) } } +#define MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD 180000 +#define MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD 60000 +#define MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD 30000 +#define MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD \ + MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD + +static +const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ecpf_ops = { + .name = "fw_fatal", + .recover = mlx5_fw_fatal_reporter_recover, + .dump = mlx5_fw_fatal_reporter_dump, + .default_graceful_period = + MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD, +}; + static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_pf_ops = { .name = "fw_fatal", .recover = mlx5_fw_fatal_reporter_recover, .dump = mlx5_fw_fatal_reporter_dump, + .default_graceful_period = MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD, }; static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = { .name = "fw_fatal", .recover = mlx5_fw_fatal_reporter_recover, + .default_graceful_period = + MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD, }; -#define MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD 180000 -#define MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD 60000 -#define MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD 30000 -#define MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD - void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) { const struct devlink_health_reporter_ops *fw_fatal_ops; struct mlx5_core_health *health = &dev->priv.health; const struct devlink_health_reporter_ops *fw_ops; struct devlink *devlink = priv_to_devlink(dev); - u64 grace_period; - fw_fatal_ops = &mlx5_fw_fatal_reporter_pf_ops; fw_ops = &mlx5_fw_reporter_pf_ops; if (mlx5_core_is_ecpf(dev)) { - grace_period = MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD; + fw_fatal_ops = &mlx5_fw_fatal_reporter_ecpf_ops; } else if (mlx5_core_is_pf(dev)) { - grace_period = MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD; + fw_fatal_ops = &mlx5_fw_fatal_reporter_pf_ops; } else { /* VF or SF */ - grace_period = MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD; fw_fatal_ops = &mlx5_fw_fatal_reporter_ops; fw_ops = &mlx5_fw_reporter_ops; } - health->fw_reporter = - devl_health_reporter_create(devlink, fw_ops, 0, dev); + health->fw_reporter = devl_health_reporter_create(devlink, fw_ops, dev); if (IS_ERR(health->fw_reporter)) mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", PTR_ERR(health->fw_reporter)); - health->fw_fatal_reporter = - devl_health_reporter_create(devlink, - fw_fatal_ops, - grace_period, - dev); + health->fw_fatal_reporter = devl_health_reporter_create(devlink, + fw_fatal_ops, + dev); if (IS_ERR(health->fw_fatal_reporter)) mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", PTR_ERR(health->fw_fatal_reporter)); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 2bb2b77351bd..980f3223f124 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -2043,7 +2043,7 @@ static int mlxsw_core_health_init(struct mlxsw_core *mlxsw_core) return 0; fw_fatal = devl_health_reporter_create(devlink, &mlxsw_core_health_fw_fatal_ops, - 0, mlxsw_core); + mlxsw_core); if (IS_ERR(fw_fatal)) { dev_err(mlxsw_core->bus_info->dev, "Failed to create fw fatal reporter"); return PTR_ERR(fw_fatal); diff --git a/drivers/net/ethernet/qlogic/qed/qed_devlink.c b/drivers/net/ethernet/qlogic/qed/qed_devlink.c index 1adc7fbb3f2f..94c5689b5abd 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_devlink.c +++ b/drivers/net/ethernet/qlogic/qed/qed_devlink.c @@ -87,20 +87,21 @@ qed_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, return 0; } +#define QED_REPORTER_FW_GRACEFUL_PERIOD 0 + static const struct devlink_health_reporter_ops qed_fw_fatal_reporter_ops = { .name = "fw_fatal", .recover = qed_fw_fatal_reporter_recover, .dump = qed_fw_fatal_reporter_dump, + .default_graceful_period = QED_REPORTER_FW_GRACEFUL_PERIOD, }; -#define QED_REPORTER_FW_GRACEFUL_PERIOD 0 - void qed_fw_reporters_create(struct devlink *devlink) { struct qed_devlink *dl = devlink_priv(devlink); - dl->fw_reporter = devlink_health_reporter_create(devlink, &qed_fw_fatal_reporter_ops, - QED_REPORTER_FW_GRACEFUL_PERIOD, dl); + dl->fw_reporter = devlink_health_reporter_create(devlink, + &qed_fw_fatal_reporter_ops, dl); if (IS_ERR(dl->fw_reporter)) { DP_NOTICE(dl->cdev, "Failed to create fw reporter, err = %ld\n", PTR_ERR(dl->fw_reporter)); diff --git a/drivers/net/netdevsim/health.c b/drivers/net/netdevsim/health.c index 688f05316b5e..3bd0e7a489c3 100644 --- a/drivers/net/netdevsim/health.c +++ b/drivers/net/netdevsim/health.c @@ -183,14 +183,14 @@ int nsim_dev_health_init(struct nsim_dev *nsim_dev, struct devlink *devlink) health->empty_reporter = devl_health_reporter_create(devlink, &nsim_dev_empty_reporter_ops, - 0, health); + health); if (IS_ERR(health->empty_reporter)) return PTR_ERR(health->empty_reporter); health->dummy_reporter = devl_health_reporter_create(devlink, &nsim_dev_dummy_reporter_ops, - 0, health); + health); if (IS_ERR(health->dummy_reporter)) { err = PTR_ERR(health->dummy_reporter); goto err_empty_reporter_destroy; diff --git a/include/net/devlink.h b/include/net/devlink.h index 3119d053bc4d..c7ad7a981b39 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -746,6 +746,8 @@ enum devlink_health_reporter_state { * if priv_ctx is NULL, run a full dump * @diagnose: callback to diagnose the current status * @test: callback to trigger a test event + * @default_graceful_period: default min time (in msec) + * between recovery attempts */ struct devlink_health_reporter_ops { @@ -760,6 +762,7 @@ struct devlink_health_reporter_ops { struct netlink_ext_ack *extack); int (*test)(struct devlink_health_reporter *reporter, struct netlink_ext_ack *extack); + u64 default_graceful_period; }; /** @@ -1928,22 +1931,22 @@ void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); void devl_health_reporter_destroy(struct devlink_health_reporter *reporter); diff --git a/net/devlink/health.c b/net/devlink/health.c index b3ce8ecbb7fb..ba144b7426fa 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -108,11 +108,11 @@ devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, static struct devlink_health_reporter * __devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; - if (WARN_ON(graceful_period && !ops->recover)) + if (WARN_ON(ops->default_graceful_period && !ops->recover)) return ERR_PTR(-EINVAL); reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); @@ -122,7 +122,7 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->priv = priv; reporter->ops = ops; reporter->devlink = devlink; - reporter->graceful_period = graceful_period; + reporter->graceful_period = ops->default_graceful_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; return reporter; @@ -134,13 +134,12 @@ __devlink_health_reporter_create(struct devlink *devlink, * * @port: devlink_port to which health reports will relate * @ops: devlink health reporter ops - * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; @@ -150,8 +149,7 @@ devl_port_health_reporter_create(struct devlink_port *port, ops->name)) return ERR_PTR(-EEXIST); - reporter = __devlink_health_reporter_create(port->devlink, ops, - graceful_period, priv); + reporter = __devlink_health_reporter_create(port->devlink, ops, priv); if (IS_ERR(reporter)) return reporter; @@ -164,14 +162,13 @@ EXPORT_SYMBOL_GPL(devl_port_health_reporter_create); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; struct devlink *devlink = port->devlink; devl_lock(devlink); - reporter = devl_port_health_reporter_create(port, ops, - graceful_period, priv); + reporter = devl_port_health_reporter_create(port, ops, priv); devl_unlock(devlink); return reporter; } @@ -182,13 +179,12 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); * * @devlink: devlink instance which the health reports will relate * @ops: devlink health reporter ops - * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; @@ -197,8 +193,7 @@ devl_health_reporter_create(struct devlink *devlink, if (devlink_health_reporter_find_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); - reporter = __devlink_health_reporter_create(devlink, ops, - graceful_period, priv); + reporter = __devlink_health_reporter_create(devlink, ops, priv); if (IS_ERR(reporter)) return reporter; @@ -210,13 +205,12 @@ EXPORT_SYMBOL_GPL(devl_health_reporter_create); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; devl_lock(devlink); - reporter = devl_health_reporter_create(devlink, ops, - graceful_period, priv); + reporter = devl_health_reporter_create(devlink, ops, priv); devl_unlock(devlink); return reporter; } -- cgit v1.2.3 From 6a06d8c40510ba1ecf27977f528b1eb74f290a60 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:52 +0300 Subject: devlink: Introduce burst period for health reporter Currently, the devlink health reporter starts the grace period immediately after handling an error, blocking any further recoveries until it finished. However, when a single root cause triggers multiple errors in a short time frame, it is desirable to treat them as a bulk of errors and to allow their recoveries, avoiding premature blocking of subsequent related errors, and reducing the risk of inconsistent or incomplete error handling. To address this, introduce a configurable burst period for devlink health reporter. Start this period when the first error is handled, and allow recovery attempts for reported errors during this window. Once burst period expires, begin the grace period to block further recoveries until it concludes. Timeline summary: ----|--------|------------------------------/----------------------/-- error is error is burst period grace period reported recovered (recoveries allowed) (recoveries blocked) For calculating the burst period duration, use the same last_recovery_ts as the grace period. Update it on recovery only when the burst period is inactive (either disabled or at the first error). This patch implements the framework for the burst period and effectively sets its value to 0 at reporter creation, so the current behavior remains unchanged, which ensures backward compatibility. A downstream patch will make the burst period configurable. Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 3 +++ net/devlink/health.c | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index c7ad7a981b39..5f44e702c25c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -748,6 +748,8 @@ enum devlink_health_reporter_state { * @test: callback to trigger a test event * @default_graceful_period: default min time (in msec) * between recovery attempts + * @default_burst_period: default time (in msec) for + * error recoveries before starting the grace period */ struct devlink_health_reporter_ops { @@ -763,6 +765,7 @@ struct devlink_health_reporter_ops { int (*test)(struct devlink_health_reporter *reporter, struct netlink_ext_ack *extack); u64 default_graceful_period; + u64 default_burst_period; }; /** diff --git a/net/devlink/health.c b/net/devlink/health.c index 9d0d4a9face7..94ab77f77add 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -60,6 +60,7 @@ struct devlink_health_reporter { struct devlink_port *devlink_port; struct devlink_fmsg *dump_fmsg; u64 graceful_period; + u64 burst_period; bool auto_recover; bool auto_dump; u8 health_state; @@ -123,6 +124,7 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->ops = ops; reporter->devlink = devlink; reporter->graceful_period = ops->default_graceful_period; + reporter->burst_period = ops->default_burst_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; return reporter; @@ -508,11 +510,25 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, devlink_nl_notify_send_desc(devlink, msg, &desc); } +static bool +devlink_health_reporter_in_burst(struct devlink_health_reporter *reporter) +{ + unsigned long burst_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->burst_period); + + return time_is_after_jiffies(burst_threshold); +} + void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) { reporter->recovery_count++; - reporter->last_recovery_ts = jiffies; + if (!devlink_health_reporter_in_burst(reporter)) + /* When burst period is set, last_recovery_ts marks the first + * recovery within the burst period, not necessarily the last + * one. + */ + reporter->last_recovery_ts = jiffies; } EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); @@ -599,7 +615,11 @@ devlink_health_recover_abort(struct devlink_health_reporter *reporter, if (prev_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) return true; + if (devlink_health_reporter_in_burst(reporter)) + return false; + recover_ts_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->burst_period) + msecs_to_jiffies(reporter->graceful_period); if (reporter->last_recovery_ts && reporter->recovery_count && time_is_after_jiffies(recover_ts_threshold)) -- cgit v1.2.3 From da0e2197645c8e01bb6080c7a2b86d9a56cc64a9 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:53 +0300 Subject: devlink: Make health reporter burst period configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable configuration of the burst period — a time window starting from the first error recovery, during which the reporter allows recovery attempts for each reported error. This feature is helpful when a single underlying issue causes multiple errors, as it delays the start of the grace period to allow sufficient time for recovering all related errors. For example, if multiple TX queues time out simultaneously, a sufficient burst period could allow all affected TX queues to be recovered within that window. Without this period, only the first TX queue that reports a timeout will undergo recovery, while the remaining TX queues will be blocked once the grace period begins. Configuration example: $ devlink health set pci/0000:00:09.0 reporter tx burst_period 500 Configuration example with ynl: ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/devlink.yaml \ --do health-reporter-set --json '{ "bus-name": "auxiliary", "dev-name": "mlx5_core.eth.0", "port-index": 65535, "health-reporter-name": "tx", "health-reporter-burst-period": 500 }' Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Reviewed-by: Dragos Tatulea Reviewed-by: Carolina Jubran Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 7 ++++++ .../networking/devlink/devlink-health.rst | 2 +- include/uapi/linux/devlink.h | 2 ++ net/devlink/health.c | 28 ++++++++++++++++++++-- net/devlink/netlink_gen.c | 5 ++-- 5 files changed, 39 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index bb87111d5e16..3db59c965869 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -853,6 +853,10 @@ attribute-sets: type: nest multi-attr: true nested-attributes: dl-rate-tc-bws + - + name: health-reporter-burst-period + type: u64 + doc: Time (in msec) for recoveries before starting the grace period. - name: dl-dev-stats subset-of: devlink @@ -1216,6 +1220,8 @@ attribute-sets: name: health-reporter-dump-ts-ns - name: health-reporter-auto-dump + - + name: health-reporter-burst-period - name: dl-attr-stats @@ -1961,6 +1967,7 @@ operations: - health-reporter-graceful-period - health-reporter-auto-recover - health-reporter-auto-dump + - health-reporter-burst-period - name: health-reporter-recover diff --git a/Documentation/networking/devlink/devlink-health.rst b/Documentation/networking/devlink/devlink-health.rst index e0b8cfed610a..4d10536377ab 100644 --- a/Documentation/networking/devlink/devlink-health.rst +++ b/Documentation/networking/devlink/devlink-health.rst @@ -50,7 +50,7 @@ Once an error is reported, devlink health will perform the following actions: * Auto recovery attempt is being done. Depends on: - Auto-recovery configuration - - Grace period vs. time passed since last recover + - Grace period (and burst period) vs. time passed since last recover Devlink formatted message ========================= diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 9fcb25a0f447..bcad11a787a5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -636,6 +636,8 @@ enum devlink_attr { DEVLINK_ATTR_RATE_TC_BWS, /* nested */ + DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, /* u64 */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. diff --git a/net/devlink/health.c b/net/devlink/health.c index 94ab77f77add..136a67c36a20 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -116,6 +116,9 @@ __devlink_health_reporter_create(struct devlink *devlink, if (WARN_ON(ops->default_graceful_period && !ops->recover)) return ERR_PTR(-EINVAL); + if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period)) + return ERR_PTR(-EINVAL); + reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); if (!reporter) return ERR_PTR(-ENOMEM); @@ -293,6 +296,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, reporter->graceful_period)) goto reporter_nest_cancel; + if (reporter->ops->recover && + devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, + reporter->burst_period)) + goto reporter_nest_cancel; if (reporter->ops->recover && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, reporter->auto_recover)) @@ -458,16 +465,33 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb, if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] || + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD])) return -EOPNOTSUPP; if (!reporter->ops->dump && info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) return -EOPNOTSUPP; - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) { reporter->graceful_period = nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); + if (!reporter->graceful_period) + reporter->burst_period = 0; + } + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) { + u64 burst_period = + nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]); + + if (!reporter->graceful_period && burst_period) { + NL_SET_ERR_MSG_MOD(info->extack, + "Cannot set burst period without a grace period."); + return -EINVAL; + } + + reporter->burst_period = burst_period; + } if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) reporter->auto_recover = diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index d97c326a9045..9fd00977d59e 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -389,7 +389,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN }; /* DEVLINK_CMD_HEALTH_REPORTER_SET - do */ -static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = { +static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, @@ -397,6 +397,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, }, + [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, }, }; /* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */ @@ -1032,7 +1033,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_health_reporter_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_health_reporter_set_nl_policy, - .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { -- cgit v1.2.3 From 1bec9d0c0046fe4e2bfb6a1c5aadcb5d56cdb0fb Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 25 Aug 2025 15:37:43 +0200 Subject: ipv4: Convert ->flowi4_tos to dscp_t. Convert the ->flowic_tos field of struct flowi_common from __u8 to dscp_t, rename it ->flowic_dscp and propagate these changes to struct flowi and struct flowi4. We've had several bugs in the past where ECN bits could interfere with IPv4 routing, because these bits were not properly cleared when setting ->flowi4_tos. These bugs should be fixed now and the dscp_t type has been introduced to ensure that variables carrying DSCP values don't accidentally have any ECN bits set. Several variables and structure fields have been converted to dscp_t already, but the main IPv4 routing structure, struct flowi4, is still using a __u8. To avoid any future regression, this patch converts it to dscp_t. There are many users to convert at once. Fortunately, around half of ->flowi4_tos users already have a dscp_t value at hand, which they currently convert to __u8 using inet_dscp_to_dsfield(). For all of these users, we just need to drop that conversion. But, although we try to do the __u8 <-> dscp_t conversions at the boundaries of the network or of user space, some places still store TOS/DSCP variables as __u8 in core networking code. Those can hardly be converted either because the data structure is part of UAPI or because the same variable or field is also used for handling ECN in other parts of the code. In all of these cases where we don't have a dscp_t variable at hand, we need to use inet_dsfield_to_dscp() when interacting with ->flowi4_dscp. Changes since v1: * Fix space alignment in __bpf_redirect_neigh_v4() (Ido). Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/29acecb45e911d17446b9a3dbdb1ab7b821ea371.1756128932.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 6 ++++-- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 7 ++++--- drivers/net/ethernet/sfc/tc_encap_actions.c | 4 +++- drivers/net/gtp.c | 7 ++++--- drivers/net/ipvlan/ipvlan_core.c | 4 ++-- drivers/net/vrf.c | 4 ++-- include/net/flow.h | 11 ++++++----- include/net/inet_dscp.h | 6 ++++++ include/net/ip_fib.h | 2 +- include/net/ip_tunnels.h | 4 +++- include/net/route.h | 2 +- include/trace/events/fib.h | 4 +++- net/core/filter.c | 4 ++-- net/core/lwt_bpf.c | 4 ++-- net/ipv4/fib_frontend.c | 7 ++++--- net/ipv4/fib_rules.c | 4 ++-- net/ipv4/icmp.c | 5 +++-- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_output.c | 3 ++- net/ipv4/ipmr.c | 3 ++- net/ipv4/netfilter.c | 4 ++-- net/ipv4/netfilter/ipt_rpfilter.c | 4 ++-- net/ipv4/netfilter/nf_dup_ipv4.c | 4 ++-- net/ipv4/netfilter/nft_fib_ipv4.c | 4 ++-- net/ipv4/route.c | 8 ++++---- net/ipv4/udp_tunnel_core.c | 3 ++- net/ipv4/xfrm4_policy.c | 4 ++-- net/netfilter/nft_flow_offload.c | 4 ++-- net/sctp/protocol.c | 3 ++- net/xfrm/xfrm_policy.c | 6 +++--- 30 files changed, 81 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/net/amt.c b/drivers/net/amt.c index ed86537b2f61..902c817a0dea 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include static struct workqueue_struct *amt_wq; @@ -1018,7 +1020,7 @@ static bool amt_send_membership_update(struct amt_dev *amt, fl4.flowi4_oif = amt->stream_dev->ifindex; fl4.daddr = amt->remote_ip; fl4.saddr = amt->local_ip; - fl4.flowi4_tos = AMT_TOS; + fl4.flowi4_dscp = inet_dsfield_to_dscp(AMT_TOS); fl4.flowi4_proto = IPPROTO_UDP; rt = ip_route_output_key(amt->net, &fl4); if (IS_ERR(rt)) { @@ -1133,7 +1135,7 @@ static bool amt_send_membership_query(struct amt_dev *amt, fl4.flowi4_oif = amt->stream_dev->ifindex; fl4.daddr = tunnel->ip4; fl4.saddr = amt->local_ip; - fl4.flowi4_tos = AMT_TOS; + fl4.flowi4_dscp = inet_dsfield_to_dscp(AMT_TOS); fl4.flowi4_proto = IPPROTO_UDP; rt = ip_route_output_key(amt->net, &fl4); if (IS_ERR(rt)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 2162d776fe35..a14f216048cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -1,7 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* Copyright (c) 2018 Mellanox Technologies. */ -#include +#include +#include #include #include #include @@ -233,7 +234,7 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, int err; /* add the IP fields */ - attr.fl.fl4.flowi4_tos = tun_key->tos & ~INET_ECN_MASK; + attr.fl.fl4.flowi4_dscp = inet_dsfield_to_dscp(tun_key->tos); attr.fl.fl4.daddr = tun_key->u.ipv4.dst; attr.fl.fl4.saddr = tun_key->u.ipv4.src; attr.ttl = tun_key->ttl; @@ -349,7 +350,7 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, int err; /* add the IP fields */ - attr.fl.fl4.flowi4_tos = tun_key->tos & ~INET_ECN_MASK; + attr.fl.fl4.flowi4_dscp = inet_dsfield_to_dscp(tun_key->tos); attr.fl.fl4.daddr = tun_key->u.ipv4.dst; attr.fl.fl4.saddr = tun_key->u.ipv4.src; attr.ttl = tun_key->ttl; diff --git a/drivers/net/ethernet/sfc/tc_encap_actions.c b/drivers/net/ethernet/sfc/tc_encap_actions.c index e872f926e438..eef06e48185d 100644 --- a/drivers/net/ethernet/sfc/tc_encap_actions.c +++ b/drivers/net/ethernet/sfc/tc_encap_actions.c @@ -11,6 +11,8 @@ #include "tc_encap_actions.h" #include "tc.h" #include "mae.h" +#include +#include #include #include #include @@ -99,7 +101,7 @@ static int efx_bind_neigh(struct efx_nic *efx, case EFX_ENCAP_TYPE_GENEVE: flow4.flowi4_proto = IPPROTO_UDP; flow4.fl4_dport = encap->key.tp_dst; - flow4.flowi4_tos = encap->key.tos; + flow4.flowi4_dscp = inet_dsfield_to_dscp(encap->key.tos); flow4.daddr = encap->key.u.ipv4.dst; flow4.saddr = encap->key.u.ipv4.src; break; diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 4b668ebaa0f7..5cb59d72bc82 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -21,9 +21,10 @@ #include #include +#include +#include #include #include -#include #include #include #include @@ -352,7 +353,7 @@ static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, fl4->flowi4_oif = sk->sk_bound_dev_if; fl4->daddr = daddr; fl4->saddr = saddr; - fl4->flowi4_tos = inet_dscp_to_dsfield(inet_sk_dscp(inet_sk(sk))); + fl4->flowi4_dscp = inet_sk_dscp(inet_sk(sk)); fl4->flowi4_scope = ip_sock_rt_scope(sk); fl4->flowi4_proto = sk->sk_protocol; @@ -2401,7 +2402,7 @@ static int gtp_genl_send_echo_req(struct sk_buff *skb, struct genl_info *info) udp_tunnel_xmit_skb(rt, sk, skb_to_send, fl4.saddr, fl4.daddr, - fl4.flowi4_tos, + inet_dscp_to_dsfield(fl4.flowi4_dscp), ip4_dst_hoplimit(&rt->dst), 0, port, port, diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index e3e65772c599..d7e3ddbcab6f 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -2,7 +2,7 @@ /* Copyright (c) 2014 Mahesh Bandewar */ -#include +#include #include #include "ipvlan.h" @@ -433,7 +433,7 @@ static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) ip4h = ip_hdr(skb); fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); + fl4.flowi4_dscp = ip4h_dscp(ip4h); rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 3ccd649913b5..571847a7f86d 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -38,7 +39,6 @@ #include #include #include -#include #define DRV_NAME "vrf" #define DRV_VERSION "1.1" @@ -505,7 +505,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); + fl4.flowi4_dscp = ip4h_dscp(ip4h); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; diff --git a/include/net/flow.h b/include/net/flow.h index a1839c278d87..ae9481c40063 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -12,6 +12,7 @@ #include #include #include +#include struct flow_keys; @@ -32,7 +33,7 @@ struct flowi_common { int flowic_iif; int flowic_l3mdev; __u32 flowic_mark; - __u8 flowic_tos; + dscp_t flowic_dscp; __u8 flowic_scope; __u8 flowic_proto; __u8 flowic_flags; @@ -70,7 +71,7 @@ struct flowi4 { #define flowi4_iif __fl_common.flowic_iif #define flowi4_l3mdev __fl_common.flowic_l3mdev #define flowi4_mark __fl_common.flowic_mark -#define flowi4_tos __fl_common.flowic_tos +#define flowi4_dscp __fl_common.flowic_dscp #define flowi4_scope __fl_common.flowic_scope #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags @@ -103,7 +104,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_l3mdev = 0; fl4->flowi4_mark = mark; - fl4->flowi4_tos = tos; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_scope = scope; fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; @@ -141,7 +142,7 @@ struct flowi6 { #define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; - /* Note: flowi6_tos is encoded in flowlabel, too. */ + /* Note: flowi6_dscp is encoded in flowlabel, too. */ __be32 flowlabel; union flowi_uli uli; #define fl6_sport uli.ports.sport @@ -163,7 +164,7 @@ struct flowi { #define flowi_iif u.__fl_common.flowic_iif #define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark -#define flowi_tos u.__fl_common.flowic_tos +#define flowi_dscp u.__fl_common.flowic_dscp #define flowi_scope u.__fl_common.flowic_scope #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h index 72f250dffada..1aa9f04ed1ab 100644 --- a/include/net/inet_dscp.h +++ b/include/net/inet_dscp.h @@ -39,6 +39,12 @@ typedef u8 __bitwise dscp_t; #define INET_DSCP_MASK 0xfc +/* A few places in the IPv4 code need to ignore the three high order bits of + * DSCP because of backward compatibility (as these bits used to represent the + * IPv4 Precedence in RFC 791's TOS field and were ignored). + */ +#define INET_DSCP_LEGACY_TOS_MASK ((__force dscp_t)0x1c) + static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield) { return (__force dscp_t)(dsfield & INET_DSCP_MASK); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 48bb3cf41469..b4495c38e0a0 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -440,7 +440,7 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, static inline bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4) { - return dscp == inet_dsfield_to_dscp(RT_TOS(fl4->flowi4_tos)); + return dscp == (fl4->flowi4_dscp & INET_DSCP_LEGACY_TOS_MASK); } /* Exported by fib_frontend.c */ diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 8cf1380f3656..4314a97702ea 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -11,7 +11,9 @@ #include #include +#include #include +#include #include #include #include @@ -362,7 +364,7 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4, fl4->daddr = daddr; fl4->saddr = saddr; - fl4->flowi4_tos = tos; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; diff --git a/include/net/route.h b/include/net/route.h index 7ea840daa775..c71998f464f8 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -189,7 +189,7 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, { struct flowi4 fl4 = { .flowi4_oif = oif, - .flowi4_tos = inet_dscp_to_dsfield(dscp), + .flowi4_dscp = dscp, .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h index 20b914250ce9..feb28b359eff 100644 --- a/include/trace/events/fib.h +++ b/include/trace/events/fib.h @@ -7,6 +7,8 @@ #include #include +#include +#include #include #include @@ -44,7 +46,7 @@ TRACE_EVENT(fib_table_lookup, __entry->err = err; __entry->oif = flp->flowi4_oif; __entry->iif = flp->flowi4_iif; - __entry->tos = flp->flowi4_tos; + __entry->tos = inet_dscp_to_dsfield(flp->flowi4_dscp); __entry->scope = flp->flowi4_scope; __entry->flags = flp->flowi4_flags; diff --git a/net/core/filter.c b/net/core/filter.c index 5da1cad66be2..b005363f482c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2373,7 +2373,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), + .flowi4_dscp = ip4h_dscp(ip4h), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, @@ -6020,7 +6020,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } - fl4.flowi4_tos = params->tos & INET_DSCP_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index ae74634310a3..9f40be0c3e71 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -8,12 +8,12 @@ #include #include #include +#include #include #include #include #include #include -#include struct bpf_lwt_prog { struct bpf_prog *prog; @@ -209,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6e1b94796f67..1dab44e13d3b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -293,7 +294,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))), + .flowi4_dscp = ip4h_dscp(ip_hdr(skb)), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; @@ -358,7 +359,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; @@ -1372,7 +1373,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, - .flowi4_tos = frn->fl_tos & INET_DSCP_MASK, + .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos), .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index fa58d6620ed6..51f0193092f0 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -193,8 +194,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, * to mask the upper three DSCP bits prior to matching to maintain * legacy behavior. */ - if (r->dscp_full && - (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask) + if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask) return 0; else if (!r->dscp_full && r->dscp && !fib_dscp_masked_match(r->dscp, fl4)) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 91765057aa1d..7248c15cbd75 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -444,7 +445,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.saddr = saddr; fl4.flowi4_mark = mark; fl4.flowi4_uid = sock_net_uid(net, NULL); - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))); + fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb)); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); @@ -495,7 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); - fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4->flowi4_dscp = dscp; fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f5b9004d6938..761a53c6a89a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -44,7 +45,6 @@ #include #include #include -#include /* Problems & solutions @@ -930,7 +930,7 @@ static int ipgre_open(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr)) { struct flowi4 fl4 = { .flowi4_oif = t->parms.link, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)), + .flowi4_dscp = ip4h_dscp(&t->parms.iph), .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_proto = IPPROTO_GRE, .saddr = t->parms.iph.saddr, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 84e7f8a2f50f..2b96651d719b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -63,6 +63,7 @@ #include #include +#include #include #include #include @@ -485,7 +486,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, inet_sk_init_flowi4(inet, fl4); /* sctp_v4_xmit() uses its own DSCP value */ - fl4->flowi4_tos = tos & INET_DSCP_MASK; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e86a8a862c41..345e5faac634 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -2120,7 +2121,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), + .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0), .flowi4_iif = (rt_is_output_route(rt) ? diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index e60e54e7945d..ce310eb779e0 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -11,10 +11,10 @@ #include #include #include +#include #include #include #include -#include #include /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ @@ -44,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un */ fl4.daddr = iph->daddr; fl4.saddr = saddr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index a27782d7653e..6d9bf5106868 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + flow.flowi4_dscp = ip4h_dscp(iph); flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index ed08fb78cfa8..9a773502f10a 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -12,10 +12,10 @@ #include #include #include +#include #include #include #include -#include #include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include @@ -33,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, fl4.flowi4_oif = oif; fl4.daddr = gw->s_addr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 7e7c49535e3f..82af6cd76d13 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include @@ -114,7 +114,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (priv->flags & NFTA_FIB_F_MARK) fl4.flowi4_mark = pkt->skb->mark; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); if (priv->flags & NFTA_FIB_F_DADDR) { fl4.daddr = iph->daddr; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1f212b2ce4c6..771f6986ed05 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -84,6 +84,7 @@ #include #include #include +#include #include #include #include @@ -1291,7 +1292,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), + .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, @@ -2331,7 +2332,7 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; @@ -2690,7 +2691,6 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -3333,7 +3333,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index fce945f23069..54386e06a813 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -253,7 +254,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, fl4.saddr = key->u.ipv4.src; fl4.fl4_dport = dport; fl4.fl4_sport = sport; - fl4.flowi4_tos = tos & INET_DSCP_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(tos); fl4.flowi4_flags = key->flow_flags; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7fb6205619e7..58faf1ddd2b1 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -25,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = params->daddr->a4; - fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp); + fl4->flowi4_dscp = params->dscp; fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl4->flowi4_mark = params->mark; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 225ff293cd50..14dd1c0698c3 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -236,7 +236,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb))); + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); fl.u.ip4.flowi4_mark = pkt->skb->mark; fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; break; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 3b2373b3bd5d..9dbc24af749b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -437,7 +438,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4->flowi4_dscp = dscp; fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7111184eef59..62486f866975 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2594,7 +2594,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, static dscp_t xfrm_get_dscp(const struct flowi *fl, int family) { if (family == AF_INET) - return inet_dsfield_to_dscp(fl->u.ip4.flowi4_tos); + return fl->u.ip4.flowi4_dscp; return 0; } @@ -3462,7 +3462,7 @@ decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reve } fl4->flowi4_proto = flkeys->basic.ip_proto; - fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK; + fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos); } #if IS_ENABLED(CONFIG_IPV6) @@ -3594,7 +3594,7 @@ static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family, fl1->flowi_oif = fl->flowi_oif; fl1->flowi_mark = fl->flowi_mark; - fl1->flowi_tos = fl->flowi_tos; + fl1->flowi_dscp = fl->flowi_dscp; nf_nat_decode_session(newskb, fl1, family); ret = false; -- cgit v1.2.3 From 095928e7d80186c524013a5b5d54889fa2ec1eaa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Aug 2025 21:36:43 -0400 Subject: ipv6: sr: Use HMAC-SHA1 and HMAC-SHA256 library functions Use the HMAC-SHA1 and HMAC-SHA256 library functions instead of crypto_shash. This is simpler and faster. Pre-allocating per-CPU hash transformation objects and descriptors is no longer needed, and a microbenchmark on x86_64 shows seg6_hmac_compute() (with HMAC-SHA256) dropping from ~2494 cycles to ~1978 cycles, a 20% improvement. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250824013644.71928-2-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/seg6_hmac.h | 12 --- net/ipv6/Kconfig | 7 +- net/ipv6/seg6.c | 7 -- net/ipv6/seg6_hmac.c | 207 +++++++----------------------------------------- 4 files changed, 30 insertions(+), 203 deletions(-) (limited to 'include') diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 24f733b3e3fe..3fe4123dbbf0 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -19,7 +19,6 @@ #include #include -#define SEG6_HMAC_MAX_DIGESTSIZE 160 #define SEG6_HMAC_RING_SIZE 256 struct seg6_hmac_info { @@ -32,13 +31,6 @@ struct seg6_hmac_info { u8 alg_id; }; -struct seg6_hmac_algo { - u8 alg_id; - char name[64]; - struct crypto_shash * __percpu *tfms; - struct shash_desc * __percpu *shashs; -}; - extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output); @@ -50,13 +42,9 @@ extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr, struct ipv6_sr_hdr *srh); extern bool seg6_hmac_validate_skb(struct sk_buff *skb); #ifdef CONFIG_IPV6_SEG6_HMAC -extern int seg6_hmac_init(void); -extern void seg6_hmac_exit(void); extern int seg6_hmac_net_init(struct net *net); extern void seg6_hmac_net_exit(struct net *net); #else -static inline int seg6_hmac_init(void) { return 0; } -static inline void seg6_hmac_exit(void) {} static inline int seg6_hmac_net_init(struct net *net) { return 0; } static inline void seg6_hmac_net_exit(struct net *net) {} #endif diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 1c9c686d9522..b8f9a8c0302e 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -304,10 +304,9 @@ config IPV6_SEG6_LWTUNNEL config IPV6_SEG6_HMAC bool "IPv6: Segment Routing HMAC support" depends on IPV6 - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS help Support for HMAC signature generation and verification of SR-enabled packets. diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 180da19c148c..a5c4c629b788 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -522,16 +522,10 @@ int __init seg6_init(void) if (err) goto out_unregister_iptun; - err = seg6_hmac_init(); - if (err) - goto out_unregister_seg6; - pr_info("Segment Routing with IPv6\n"); out: return err; -out_unregister_seg6: - seg6_local_exit(); out_unregister_iptun: seg6_iptunnel_exit(); out_unregister_genl: @@ -543,7 +537,6 @@ out_unregister_pernet: void seg6_exit(void) { - seg6_hmac_exit(); seg6_local_exit(); seg6_iptunnel_exit(); genl_unregister_family(&seg6_genl_family); diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index fd58426f222b..61f6019df55b 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -34,7 +33,8 @@ #include #include -#include +#include +#include #include #include #include @@ -78,17 +78,6 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = seg6_hmac_cmpfn, }; -static struct seg6_hmac_algo hmac_algos[] = { - { - .alg_id = SEG6_HMAC_ALGO_SHA1, - .name = "hmac(sha1)", - }, - { - .alg_id = SEG6_HMAC_ALGO_SHA256, - .name = "hmac(sha256)", - }, -}; - static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) { struct sr6_tlv_hmac *tlv; @@ -108,75 +97,13 @@ static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) return tlv; } -static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id) -{ - struct seg6_hmac_algo *algo; - int i, alg_count; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - if (algo->alg_id == alg_id) - return algo; - } - - return NULL; -} - -static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize, - u8 *output, int outlen) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int ret, dgsize; - - algo = __hmac_get_algo(hinfo->alg_id); - if (!algo) - return -ENOENT; - - tfm = *this_cpu_ptr(algo->tfms); - - dgsize = crypto_shash_digestsize(tfm); - if (dgsize > outlen) { - pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n", - dgsize, outlen); - return -ENOMEM; - } - - ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret); - goto failed; - } - - shash = *this_cpu_ptr(algo->shashs); - shash->tfm = tfm; - - ret = crypto_shash_digest(shash, text, psize, output); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret); - goto failed; - } - - return dgsize; - -failed: - return ret; -} - int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output) { __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid); - u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE]; - int plen, i, dgsize, wrsize; + int plen, i, ret = 0; char *ring, *off; - /* a 160-byte buffer for digest output allows to store highest known - * hash function (RadioGatun) with up to 1216 bits - */ - /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; @@ -219,22 +146,26 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, off += 16; } - dgsize = __do_hmac(hinfo, ring, plen, tmp_out, - SEG6_HMAC_MAX_DIGESTSIZE); + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + hmac_sha1_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, + output); + static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE); + memset(&output[SHA1_DIGEST_SIZE], 0, + SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); + break; + case SEG6_HMAC_ALGO_SHA256: + hmac_sha256_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, + output); + static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); + break; + default: + ret = -ENOENT; + break; + } local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); - - if (dgsize < 0) - return dgsize; - - wrsize = SEG6_HMAC_FIELD_LEN; - if (wrsize > dgsize) - wrsize = dgsize; - - memset(output, 0, SEG6_HMAC_FIELD_LEN); - memcpy(output, tmp_out, wrsize); - - return 0; + return ret; } EXPORT_SYMBOL(seg6_hmac_compute); @@ -305,8 +236,13 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) struct seg6_pernet_data *sdata = seg6_pernet(net); int err; - if (!__hmac_get_algo(hinfo->alg_id)) + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + case SEG6_HMAC_ALGO_SHA256: + break; + default: return -EINVAL; + } err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); @@ -363,65 +299,6 @@ out: } EXPORT_SYMBOL(seg6_push_hmac); -static int seg6_hmac_init_algo(void) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - int ret = -ENOMEM; - - alg_count = ARRAY_SIZE(hmac_algos); - - for (i = 0; i < alg_count; i++) { - struct crypto_shash **p_tfm; - int shsize; - - algo = &hmac_algos[i]; - algo->tfms = alloc_percpu(struct crypto_shash *); - if (!algo->tfms) - goto error_out; - - for_each_possible_cpu(cpu) { - tfm = crypto_alloc_shash(algo->name, 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto error_out; - } - p_tfm = per_cpu_ptr(algo->tfms, cpu); - *p_tfm = tfm; - } - - p_tfm = raw_cpu_ptr(algo->tfms); - tfm = *p_tfm; - - shsize = sizeof(*shash) + crypto_shash_descsize(tfm); - - algo->shashs = alloc_percpu(struct shash_desc *); - if (!algo->shashs) - goto error_out; - - for_each_possible_cpu(cpu) { - shash = kzalloc_node(shsize, GFP_KERNEL, - cpu_to_node(cpu)); - if (!shash) - goto error_out; - *per_cpu_ptr(algo->shashs, cpu) = shash; - } - } - - return 0; - -error_out: - seg6_hmac_exit(); - return ret; -} - -int __init seg6_hmac_init(void) -{ - return seg6_hmac_init_algo(); -} - int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); @@ -429,36 +306,6 @@ int __net_init seg6_hmac_net_init(struct net *net) return rhashtable_init(&sdata->hmac_infos, &rht_params); } -void seg6_hmac_exit(void) -{ - struct seg6_hmac_algo *algo = NULL; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - - if (algo->shashs) { - for_each_possible_cpu(cpu) { - shash = *per_cpu_ptr(algo->shashs, cpu); - kfree(shash); - } - free_percpu(algo->shashs); - } - - if (algo->tfms) { - for_each_possible_cpu(cpu) { - tfm = *per_cpu_ptr(algo->tfms, cpu); - crypto_free_shash(tfm); - } - free_percpu(algo->tfms); - } - } -} -EXPORT_SYMBOL(seg6_hmac_exit); - void __net_exit seg6_hmac_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); -- cgit v1.2.3 From fe60065689048edf4df99fffdb180a2166f9a54d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Aug 2025 21:36:44 -0400 Subject: ipv6: sr: Prepare HMAC key ahead of time Prepare the HMAC key when it is added to the kernel, instead of preparing it implicitly for every packet. This significantly improves the performance of seg6_hmac_compute(). A microbenchmark on x86_64 shows seg6_hmac_compute() (with HMAC-SHA256) dropping from ~1978 cycles to ~1419 cycles, a 28% improvement. The size of 'struct seg6_hmac_info' increases by 128 bytes, but that should be fine, since there should not be a massive number of keys. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250824013644.71928-3-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/seg6_hmac.h | 8 ++++++++ net/ipv6/seg6_hmac.c | 14 +++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 3fe4123dbbf0..e9f41725933e 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -9,6 +9,8 @@ #ifndef _NET_SEG6_HMAC_H #define _NET_SEG6_HMAC_H +#include +#include #include #include #include @@ -26,9 +28,15 @@ struct seg6_hmac_info { struct rcu_head rcu; u32 hmackeyid; + /* The raw key, kept only so it can be returned back to userspace */ char secret[SEG6_HMAC_SECRET_LEN]; u8 slen; u8 alg_id; + /* The prepared key, which the calculations actually use */ + union { + struct hmac_sha1_key sha1; + struct hmac_sha256_key sha256; + } key; }; extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo, diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 61f6019df55b..ee6bac0160ac 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -148,19 +148,18 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, switch (hinfo->alg_id) { case SEG6_HMAC_ALGO_SHA1: - hmac_sha1_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, - output); + hmac_sha1(&hinfo->key.sha1, ring, plen, output); static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE); memset(&output[SHA1_DIGEST_SIZE], 0, SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); break; case SEG6_HMAC_ALGO_SHA256: - hmac_sha256_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, - output); + hmac_sha256(&hinfo->key.sha256, ring, plen, output); static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); break; default: - ret = -ENOENT; + WARN_ON_ONCE(1); + ret = -EINVAL; break; } local_unlock_nested_bh(&hmac_storage.bh_lock); @@ -238,7 +237,12 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) switch (hinfo->alg_id) { case SEG6_HMAC_ALGO_SHA1: + hmac_sha1_preparekey(&hinfo->key.sha1, + hinfo->secret, hinfo->slen); + break; case SEG6_HMAC_ALGO_SHA256: + hmac_sha256_preparekey(&hinfo->key.sha256, + hinfo->secret, hinfo->slen); break; default: return -EINVAL; -- cgit v1.2.3 From 4b59300ba4d2362b02c2a9077047bbabceea67d7 Mon Sep 17 00:00:00 2001 From: Marie Zhussupova Date: Tue, 26 Aug 2025 17:13:31 +0800 Subject: kunit: Add parent kunit for parameterized test context Currently, KUnit parameterized tests lack a mechanism to share resources across parameter runs because the same `struct kunit` instance is cleaned up and reused for each run. This patch introduces parameterized test context, enabling test users to share resources between parameter runs. It also allows setting up resources that need to be available for all parameter runs only once, which is helpful in cases where setup is expensive. To establish a parameterized test context, this patch adds a parent pointer field to `struct kunit`. This allows resources added to the parent `struct kunit` to be shared and accessible across all parameter runs. In kunit_run_tests(), the default `struct kunit` created is now designated to act as the parameterized test context whenever a test is parameterized. Subsequently, a new `struct kunit` is made for each parameter run, and its parent pointer is set to the `struct kunit` that holds the parameterized test context. Link: https://lore.kernel.org/r/20250826091341.1427123-2-davidgow@google.com Reviewed-by: David Gow Reviewed-by: Rae Moar Signed-off-by: Marie Zhussupova Signed-off-by: David Gow Signed-off-by: Shuah Khan --- include/kunit/test.h | 8 ++++++-- lib/kunit/test.c | 34 ++++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index d958ee53050e..9766403afd56 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -268,14 +268,18 @@ struct kunit_suite_set { * * @priv: for user to store arbitrary data. Commonly used to pass data * created in the init function (see &struct kunit_suite). + * @parent: reference to the parent context of type struct kunit that can + * be used for storing shared resources. * * Used to store information about the current context under which the test * is running. Most of this data is private and should only be accessed - * indirectly via public functions; the one exception is @priv which can be - * used by the test writer to store arbitrary data. + * indirectly via public functions; the two exceptions are @priv and @parent + * which can be used by the test writer to store arbitrary data and access the + * parent context, respectively. */ struct kunit { void *priv; + struct kunit *parent; /* private: internal use only. */ const char *name; /* Read only after initialization! */ diff --git a/lib/kunit/test.c b/lib/kunit/test.c index d2bfa331a2b1..587b5c51db58 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -647,6 +647,7 @@ int kunit_run_tests(struct kunit_suite *suite) struct kunit_case *test_case; struct kunit_result_stats suite_stats = { 0 }; struct kunit_result_stats total_stats = { 0 }; + const void *curr_param; /* Taint the kernel so we know we've run tests. */ add_taint(TAINT_TEST, LOCKDEP_STILL_OK); @@ -679,37 +680,42 @@ int kunit_run_tests(struct kunit_suite *suite) } else { /* Get initial param. */ param_desc[0] = '\0'; - test.param_value = test_case->generate_params(NULL, param_desc); + /* TODO: Make generate_params try-catch */ + curr_param = test_case->generate_params(NULL, param_desc); test_case->status = KUNIT_SKIPPED; kunit_log(KERN_INFO, &test, KUNIT_SUBTEST_INDENT KUNIT_SUBTEST_INDENT "KTAP version 1\n"); kunit_log(KERN_INFO, &test, KUNIT_SUBTEST_INDENT KUNIT_SUBTEST_INDENT "# Subtest: %s", test_case->name); - while (test.param_value) { - kunit_run_case_catch_errors(suite, test_case, &test); + while (curr_param) { + struct kunit param_test = { + .param_value = curr_param, + .param_index = ++test.param_index, + .parent = &test, + }; + kunit_init_test(¶m_test, test_case->name, test_case->log); + kunit_run_case_catch_errors(suite, test_case, ¶m_test); if (param_desc[0] == '\0') { snprintf(param_desc, sizeof(param_desc), - "param-%d", test.param_index); + "param-%d", param_test.param_index); } - kunit_print_ok_not_ok(&test, KUNIT_LEVEL_CASE_PARAM, - test.status, - test.param_index + 1, + kunit_print_ok_not_ok(¶m_test, KUNIT_LEVEL_CASE_PARAM, + param_test.status, + param_test.param_index, param_desc, - test.status_comment); + param_test.status_comment); - kunit_update_stats(¶m_stats, test.status); + kunit_update_stats(¶m_stats, param_test.status); /* Get next param. */ param_desc[0] = '\0'; - test.param_value = test_case->generate_params(test.param_value, param_desc); - test.param_index++; - test.status = KUNIT_SUCCESS; - test.status_comment[0] = '\0'; - test.priv = NULL; + curr_param = test_case->generate_params(curr_param, param_desc); } + /* TODO: Put this kunit_cleanup into a try-catch. */ + kunit_cleanup(&test); } kunit_print_attr((void *)test_case, true, KUNIT_LEVEL_CASE); -- cgit v1.2.3 From 241423580e5e8d8b10b14b382379f4928b87be17 Mon Sep 17 00:00:00 2001 From: Marie Zhussupova Date: Tue, 26 Aug 2025 17:13:32 +0800 Subject: kunit: Introduce param_init/exit for parameterized test context management Add (*param_init) and (*param_exit) function pointers to `struct kunit_case`. Users will be able to set them via the new KUNIT_CASE_PARAM_WITH_INIT() macro. param_init/exit will be invoked by kunit_run_tests() once before and once after the parameterized test, respectively. They will receive the `struct kunit` that holds the parameterized test context; facilitating init and exit for shared state. This patch also sets param_init/exit to None in rust/kernel/kunit.rs. Link: https://lore.kernel.org/r/20250826091341.1427123-3-davidgow@google.com Reviewed-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Marie Zhussupova Signed-off-by: David Gow Signed-off-by: Shuah Khan --- include/kunit/test.h | 25 +++++++++++++++++++++++++ lib/kunit/test.c | 27 ++++++++++++++++++++++++++- rust/kernel/kunit.rs | 4 ++++ 3 files changed, 55 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index 9766403afd56..fc8fd55b2dfb 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -92,6 +92,8 @@ struct kunit_attributes { * @name: the name of the test case. * @generate_params: the generator function for parameterized tests. * @attr: the attributes associated with the test + * @param_init: The init function to run before a parameterized test. + * @param_exit: The exit function to run after a parameterized test. * * A test case is a function with the signature, * ``void (*)(struct kunit *)`` @@ -128,6 +130,8 @@ struct kunit_case { const char *name; const void* (*generate_params)(const void *prev, char *desc); struct kunit_attributes attr; + int (*param_init)(struct kunit *test); + void (*param_exit)(struct kunit *test); /* private: internal use only. */ enum kunit_status status; @@ -218,6 +222,27 @@ static inline char *kunit_status_to_ok_not_ok(enum kunit_status status) .generate_params = gen_params, \ .attr = attributes, .module_name = KBUILD_MODNAME} +/** + * KUNIT_CASE_PARAM_WITH_INIT - Define a parameterized KUnit test case with custom + * param_init() and param_exit() functions. + * @test_name: The function implementing the test case. + * @gen_params: The function to generate parameters for the test case. + * @init: A reference to the param_init() function to run before a parameterized test. + * @exit: A reference to the param_exit() function to run after a parameterized test. + * + * Provides the option to register param_init() and param_exit() functions. + * param_init/exit will be passed the parameterized test context and run once + * before and once after the parameterized test. The init function can be used + * to add resources to share between parameter runs, and any other setup logic. + * The exit function can be used to clean up resources that were not managed by + * the parameterized test, and any other teardown logic. + */ +#define KUNIT_CASE_PARAM_WITH_INIT(test_name, gen_params, init, exit) \ + { .run_case = test_name, .name = #test_name, \ + .generate_params = gen_params, \ + .param_init = init, .param_exit = exit, \ + .module_name = KBUILD_MODNAME} + /** * struct kunit_suite - describes a related collection of &struct kunit_case * diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 587b5c51db58..0fe61dec5a96 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -641,6 +641,20 @@ static void kunit_accumulate_stats(struct kunit_result_stats *total, total->total += add.total; } +static void kunit_init_parent_param_test(struct kunit_case *test_case, struct kunit *test) +{ + if (test_case->param_init) { + int err = test_case->param_init(test); + + if (err) { + kunit_err(test_case, KUNIT_SUBTEST_INDENT KUNIT_SUBTEST_INDENT + "# failed to initialize parent parameter test (%d)", err); + test->status = KUNIT_FAILURE; + test_case->status = KUNIT_FAILURE; + } + } +} + int kunit_run_tests(struct kunit_suite *suite) { char param_desc[KUNIT_PARAM_DESC_SIZE]; @@ -678,6 +692,11 @@ int kunit_run_tests(struct kunit_suite *suite) kunit_run_case_catch_errors(suite, test_case, &test); kunit_update_stats(¶m_stats, test.status); } else { + kunit_init_parent_param_test(test_case, &test); + if (test_case->status == KUNIT_FAILURE) { + kunit_update_stats(¶m_stats, test.status); + goto test_case_end; + } /* Get initial param. */ param_desc[0] = '\0'; /* TODO: Make generate_params try-catch */ @@ -714,10 +733,16 @@ int kunit_run_tests(struct kunit_suite *suite) param_desc[0] = '\0'; curr_param = test_case->generate_params(curr_param, param_desc); } + /* + * TODO: Put into a try catch. Since we don't need suite->exit + * for it we can't reuse kunit_try_run_cleanup for this yet. + */ + if (test_case->param_exit) + test_case->param_exit(&test); /* TODO: Put this kunit_cleanup into a try-catch. */ kunit_cleanup(&test); } - +test_case_end: kunit_print_attr((void *)test_case, true, KUNIT_LEVEL_CASE); kunit_print_test_stats(&test, param_stats); diff --git a/rust/kernel/kunit.rs b/rust/kernel/kunit.rs index 41efd87595d6..b1c97f8029c7 100644 --- a/rust/kernel/kunit.rs +++ b/rust/kernel/kunit.rs @@ -210,6 +210,8 @@ pub const fn kunit_case( status: kernel::bindings::kunit_status_KUNIT_SUCCESS, module_name: core::ptr::null_mut(), log: core::ptr::null_mut(), + param_init: None, + param_exit: None, } } @@ -229,6 +231,8 @@ pub const fn kunit_case_null() -> kernel::bindings::kunit_case { status: kernel::bindings::kunit_status_KUNIT_SUCCESS, module_name: core::ptr::null_mut(), log: core::ptr::null_mut(), + param_init: None, + param_exit: None, } } -- cgit v1.2.3 From b9a214b5f6aa55870b5678f31084f85c0c11ffdc Mon Sep 17 00:00:00 2001 From: Marie Zhussupova Date: Tue, 26 Aug 2025 17:13:33 +0800 Subject: kunit: Pass parameterized test context to generate_params() To enable more complex parameterized testing scenarios, the generate_params() function needs additional context beyond just the previously generated parameter. This patch modifies the generate_params() function signature to include an extra `struct kunit *test` argument, giving test users access to the parameterized test context when generating parameters. The `struct kunit *test` argument was added as the first parameter to the function signature as it aligns with the convention of other KUnit functions that accept `struct kunit *test` first. This also mirrors the "this" or "self" reference found in object-oriented programming languages. This patch also modifies xe_pci_live_device_gen_param() in xe_pci.c and nthreads_gen_params() in kcsan_test.c to reflect this signature change. Link: https://lore.kernel.org/r/20250826091341.1427123-4-davidgow@google.com Reviewed-by: David Gow Reviewed-by: Rae Moar Acked-by: Marco Elver Acked-by: Rodrigo Vivi Signed-off-by: Marie Zhussupova [Catch some additional gen_params signatures in drm/xe/tests --David] Signed-off-by: David Gow Signed-off-by: Shuah Khan --- drivers/gpu/drm/xe/tests/xe_pci.c | 14 +++++++------- drivers/gpu/drm/xe/tests/xe_pci_test.h | 9 +++++---- include/kunit/test.h | 9 ++++++--- kernel/kcsan/kcsan_test.c | 2 +- lib/kunit/test.c | 5 +++-- 5 files changed, 22 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/xe/tests/xe_pci.c b/drivers/gpu/drm/xe/tests/xe_pci.c index 9c715e59f030..f707e0a54295 100644 --- a/drivers/gpu/drm/xe/tests/xe_pci.c +++ b/drivers/gpu/drm/xe/tests/xe_pci.c @@ -44,9 +44,9 @@ KUNIT_ARRAY_PARAM(pci_id, pciidlist, xe_pci_id_kunit_desc); * * Return: pointer to the next parameter or NULL if no more parameters */ -const void *xe_pci_graphics_ip_gen_param(const void *prev, char *desc) +const void *xe_pci_graphics_ip_gen_param(struct kunit *test, const void *prev, char *desc) { - return graphics_ip_gen_params(prev, desc); + return graphics_ip_gen_params(test, prev, desc); } EXPORT_SYMBOL_IF_KUNIT(xe_pci_graphics_ip_gen_param); @@ -61,9 +61,9 @@ EXPORT_SYMBOL_IF_KUNIT(xe_pci_graphics_ip_gen_param); * * Return: pointer to the next parameter or NULL if no more parameters */ -const void *xe_pci_media_ip_gen_param(const void *prev, char *desc) +const void *xe_pci_media_ip_gen_param(struct kunit *test, const void *prev, char *desc) { - return media_ip_gen_params(prev, desc); + return media_ip_gen_params(test, prev, desc); } EXPORT_SYMBOL_IF_KUNIT(xe_pci_media_ip_gen_param); @@ -78,9 +78,9 @@ EXPORT_SYMBOL_IF_KUNIT(xe_pci_media_ip_gen_param); * * Return: pointer to the next parameter or NULL if no more parameters */ -const void *xe_pci_id_gen_param(const void *prev, char *desc) +const void *xe_pci_id_gen_param(struct kunit *test, const void *prev, char *desc) { - const struct pci_device_id *pci = pci_id_gen_params(prev, desc); + const struct pci_device_id *pci = pci_id_gen_params(test, prev, desc); return pci->driver_data ? pci : NULL; } @@ -159,7 +159,7 @@ EXPORT_SYMBOL_IF_KUNIT(xe_pci_fake_device_init); * Return: pointer to the next &struct xe_device ready to be used as a parameter * or NULL if there are no more Xe devices on the system. */ -const void *xe_pci_live_device_gen_param(const void *prev, char *desc) +const void *xe_pci_live_device_gen_param(struct kunit *test, const void *prev, char *desc) { const struct xe_device *xe = prev; struct device *dev = xe ? xe->drm.dev : NULL; diff --git a/drivers/gpu/drm/xe/tests/xe_pci_test.h b/drivers/gpu/drm/xe/tests/xe_pci_test.h index ce4d2b86b778..6d8bc56f7bde 100644 --- a/drivers/gpu/drm/xe/tests/xe_pci_test.h +++ b/drivers/gpu/drm/xe/tests/xe_pci_test.h @@ -7,6 +7,7 @@ #define _XE_PCI_TEST_H_ #include +#include #include "xe_platform_types.h" #include "xe_sriov_types.h" @@ -25,9 +26,9 @@ struct xe_pci_fake_data { int xe_pci_fake_device_init(struct xe_device *xe); -const void *xe_pci_graphics_ip_gen_param(const void *prev, char *desc); -const void *xe_pci_media_ip_gen_param(const void *prev, char *desc); -const void *xe_pci_id_gen_param(const void *prev, char *desc); -const void *xe_pci_live_device_gen_param(const void *prev, char *desc); +const void *xe_pci_graphics_ip_gen_param(struct kunit *test, const void *prev, char *desc); +const void *xe_pci_media_ip_gen_param(struct kunit *test, const void *prev, char *desc); +const void *xe_pci_id_gen_param(struct kunit *test, const void *prev, char *desc); +const void *xe_pci_live_device_gen_param(struct kunit *test, const void *prev, char *desc); #endif diff --git a/include/kunit/test.h b/include/kunit/test.h index fc8fd55b2dfb..8eba1b03c3e3 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -128,7 +128,8 @@ struct kunit_attributes { struct kunit_case { void (*run_case)(struct kunit *test); const char *name; - const void* (*generate_params)(const void *prev, char *desc); + const void* (*generate_params)(struct kunit *test, + const void *prev, char *desc); struct kunit_attributes attr; int (*param_init)(struct kunit *test); void (*param_exit)(struct kunit *test); @@ -1703,7 +1704,8 @@ do { \ * Define function @name_gen_params which uses @array to generate parameters. */ #define KUNIT_ARRAY_PARAM(name, array, get_desc) \ - static const void *name##_gen_params(const void *prev, char *desc) \ + static const void *name##_gen_params(struct kunit *test, \ + const void *prev, char *desc) \ { \ typeof((array)[0]) *__next = prev ? ((typeof(__next)) prev) + 1 : (array); \ if (__next - (array) < ARRAY_SIZE((array))) { \ @@ -1724,7 +1726,8 @@ do { \ * Define function @name_gen_params which uses @array to generate parameters. */ #define KUNIT_ARRAY_PARAM_DESC(name, array, desc_member) \ - static const void *name##_gen_params(const void *prev, char *desc) \ + static const void *name##_gen_params(struct kunit *test, \ + const void *prev, char *desc) \ { \ typeof((array)[0]) *__next = prev ? ((typeof(__next)) prev) + 1 : (array); \ if (__next - (array) < ARRAY_SIZE((array))) { \ diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 49ab81faaed9..a13a090bb2a7 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1383,7 +1383,7 @@ static void test_atomic_builtins_missing_barrier(struct kunit *test) * The thread counts are chosen to cover potentially interesting boundaries and * corner cases (2 to 5), and then stress the system with larger counts. */ -static const void *nthreads_gen_params(const void *prev, char *desc) +static const void *nthreads_gen_params(struct kunit *test, const void *prev, char *desc) { long nthreads = (long)prev; diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 0fe61dec5a96..50705248abad 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -700,7 +700,7 @@ int kunit_run_tests(struct kunit_suite *suite) /* Get initial param. */ param_desc[0] = '\0'; /* TODO: Make generate_params try-catch */ - curr_param = test_case->generate_params(NULL, param_desc); + curr_param = test_case->generate_params(&test, NULL, param_desc); test_case->status = KUNIT_SKIPPED; kunit_log(KERN_INFO, &test, KUNIT_SUBTEST_INDENT KUNIT_SUBTEST_INDENT "KTAP version 1\n"); @@ -731,7 +731,8 @@ int kunit_run_tests(struct kunit_suite *suite) /* Get next param. */ param_desc[0] = '\0'; - curr_param = test_case->generate_params(curr_param, param_desc); + curr_param = test_case->generate_params(&test, curr_param, + param_desc); } /* * TODO: Put into a try catch. Since we don't need suite->exit -- cgit v1.2.3 From b820b9077b7f4008cc44a40261aefa681c63c7d3 Mon Sep 17 00:00:00 2001 From: Marie Zhussupova Date: Tue, 26 Aug 2025 17:13:34 +0800 Subject: kunit: Enable direct registration of parameter arrays to a KUnit test KUnit parameterized tests currently support two primary methods f or getting parameters: 1. Defining custom logic within a generate_params() function. 2. Using the KUNIT_ARRAY_PARAM() and KUNIT_ARRAY_PARAM_DESC() macros with a pre-defined static array and passing the created *_gen_params() to KUNIT_CASE_PARAM(). These methods present limitations when dealing with dynamically generated parameter arrays, or in scenarios where populating parameters sequentially via generate_params() is inefficient or overly complex. This patch addresses these limitations by adding a new `params_array` field to `struct kunit`, of the type `kunit_params`. The `struct kunit_params` is designed to store the parameter array itself, along with essential metadata including the parameter count, parameter size, and a get_description() function for providing custom descriptions for individual parameters. The `params_array` field can be populated by calling the new kunit_register_params_array() macro from within a param_init() function. This will register the array as part of the parameterized test context. The user will then need to pass kunit_array_gen_params() to the KUNIT_CASE_PARAM_WITH_INIT() macro as the generator function, if not providing their own. kunit_array_gen_params() is a KUnit helper that will use the registered array to generate parameters. The arrays passed to KUNIT_ARRAY_PARAM(,DESC) will also be registered to the parameterized test context for consistency as well as for higher availability of the parameter count that will be used for outputting a KTAP test plan for a parameterized test. This modification provides greater flexibility to the KUnit framework, allowing testers to easily register and utilize both dynamic and static parameter arrays. Link: https://lore.kernel.org/r/20250826091341.1427123-5-davidgow@google.com Reviewed-by: David Gow Reviewed-by: Rae Moar Signed-off-by: Marie Zhussupova [Only output the test plan if using kunit_array_gen_params --David] Signed-off-by: David Gow Signed-off-by: Shuah Khan --- include/kunit/test.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++----- lib/kunit/test.c | 32 ++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index 8eba1b03c3e3..5ec5182b5e57 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -234,9 +234,13 @@ static inline char *kunit_status_to_ok_not_ok(enum kunit_status status) * Provides the option to register param_init() and param_exit() functions. * param_init/exit will be passed the parameterized test context and run once * before and once after the parameterized test. The init function can be used - * to add resources to share between parameter runs, and any other setup logic. - * The exit function can be used to clean up resources that were not managed by - * the parameterized test, and any other teardown logic. + * to add resources to share between parameter runs, pass parameter arrays, + * and any other setup logic. The exit function can be used to clean up resources + * that were not managed by the parameterized test, and any other teardown logic. + * + * Note: If you are registering a parameter array in param_init() with + * kunit_register_param_array() then you need to pass kunit_array_gen_params() + * to this as the generator function. */ #define KUNIT_CASE_PARAM_WITH_INIT(test_name, gen_params, init, exit) \ { .run_case = test_name, .name = #test_name, \ @@ -289,6 +293,20 @@ struct kunit_suite_set { struct kunit_suite * const *end; }; +/* Stores the pointer to the parameter array and its metadata. */ +struct kunit_params { + /* + * Reference to the parameter array for a parameterized test. This + * is NULL if a parameter array wasn't directly passed to the + * parameterized test context struct kunit via kunit_register_params_array(). + */ + const void *params; + /* Reference to a function that gets the description of a parameter. */ + void (*get_description)(struct kunit *test, const void *param, char *desc); + size_t num_params; + size_t elem_size; +}; + /** * struct kunit - represents a running instance of a test. * @@ -296,16 +314,18 @@ struct kunit_suite_set { * created in the init function (see &struct kunit_suite). * @parent: reference to the parent context of type struct kunit that can * be used for storing shared resources. + * @params_array: for storing the parameter array. * * Used to store information about the current context under which the test * is running. Most of this data is private and should only be accessed - * indirectly via public functions; the two exceptions are @priv and @parent - * which can be used by the test writer to store arbitrary data and access the - * parent context, respectively. + * indirectly via public functions; the exceptions are @priv, @parent and + * @params_array which can be used by the test writer to store arbitrary data, + * access the parent context, and to store the parameter array, respectively. */ struct kunit { void *priv; struct kunit *parent; + struct kunit_params params_array; /* private: internal use only. */ const char *name; /* Read only after initialization! */ @@ -376,6 +396,8 @@ void kunit_exec_list_tests(struct kunit_suite_set *suite_set, bool include_attr) struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_set, struct kunit_suite_set suite_set); +const void *kunit_array_gen_params(struct kunit *test, const void *prev, char *desc); + #if IS_BUILTIN(CONFIG_KUNIT) int kunit_run_all_tests(void); #else @@ -1708,6 +1730,8 @@ do { \ const void *prev, char *desc) \ { \ typeof((array)[0]) *__next = prev ? ((typeof(__next)) prev) + 1 : (array); \ + if (!prev) \ + kunit_register_params_array(test, array, ARRAY_SIZE(array), NULL); \ if (__next - (array) < ARRAY_SIZE((array))) { \ void (*__get_desc)(typeof(__next), char *) = get_desc; \ if (__get_desc) \ @@ -1730,6 +1754,8 @@ do { \ const void *prev, char *desc) \ { \ typeof((array)[0]) *__next = prev ? ((typeof(__next)) prev) + 1 : (array); \ + if (!prev) \ + kunit_register_params_array(test, array, ARRAY_SIZE(array), NULL); \ if (__next - (array) < ARRAY_SIZE((array))) { \ strscpy(desc, __next->desc_member, KUNIT_PARAM_DESC_SIZE); \ return __next; \ @@ -1737,6 +1763,33 @@ do { \ return NULL; \ } +/** + * kunit_register_params_array() - Register parameter array for a KUnit test. + * @test: The KUnit test structure to which parameters will be added. + * @array: An array of test parameters. + * @param_count: Number of parameters. + * @get_desc: Function that generates a string description for a given parameter + * element. + * + * This macro initializes the @test's parameter array data, storing information + * including the parameter array, its count, the element size, and the parameter + * description function within `test->params_array`. + * + * Note: If using this macro in param_init(), kunit_array_gen_params() + * will then need to be manually provided as the parameter generator function to + * KUNIT_CASE_PARAM_WITH_INIT(). kunit_array_gen_params() is a KUnit + * function that uses the registered array to generate parameters + */ +#define kunit_register_params_array(test, array, param_count, get_desc) \ + do { \ + struct kunit *_test = (test); \ + const typeof((array)[0]) * _params_ptr = &(array)[0]; \ + _test->params_array.params = _params_ptr; \ + _test->params_array.num_params = (param_count); \ + _test->params_array.elem_size = sizeof(*_params_ptr); \ + _test->params_array.get_description = (get_desc); \ + } while (0) + // TODO(dlatypov@google.com): consider eventually migrating users to explicitly // include resource.h themselves if they need it. #include diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 50705248abad..bb66ea1a3eac 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -337,6 +337,14 @@ void __kunit_do_failed_assertion(struct kunit *test, } EXPORT_SYMBOL_GPL(__kunit_do_failed_assertion); +static void kunit_init_params(struct kunit *test) +{ + test->params_array.params = NULL; + test->params_array.get_description = NULL; + test->params_array.num_params = 0; + test->params_array.elem_size = 0; +} + void kunit_init_test(struct kunit *test, const char *name, struct string_stream *log) { spin_lock_init(&test->lock); @@ -347,6 +355,7 @@ void kunit_init_test(struct kunit *test, const char *name, struct string_stream string_stream_clear(log); test->status = KUNIT_SUCCESS; test->status_comment[0] = '\0'; + kunit_init_params(test); } EXPORT_SYMBOL_GPL(kunit_init_test); @@ -641,6 +650,23 @@ static void kunit_accumulate_stats(struct kunit_result_stats *total, total->total += add.total; } +const void *kunit_array_gen_params(struct kunit *test, const void *prev, char *desc) +{ + struct kunit_params *params_arr = &test->params_array; + const void *param; + + if (test->param_index < params_arr->num_params) { + param = (char *)params_arr->params + + test->param_index * params_arr->elem_size; + + if (params_arr->get_description) + params_arr->get_description(test, param, desc); + return param; + } + return NULL; +} +EXPORT_SYMBOL_GPL(kunit_array_gen_params); + static void kunit_init_parent_param_test(struct kunit_case *test_case, struct kunit *test) { if (test_case->param_init) { @@ -706,6 +732,12 @@ int kunit_run_tests(struct kunit_suite *suite) "KTAP version 1\n"); kunit_log(KERN_INFO, &test, KUNIT_SUBTEST_INDENT KUNIT_SUBTEST_INDENT "# Subtest: %s", test_case->name); + if (test.params_array.params && + test_case->generate_params == kunit_array_gen_params) { + kunit_log(KERN_INFO, &test, KUNIT_SUBTEST_INDENT + KUNIT_SUBTEST_INDENT "1..%zd\n", + test.params_array.num_params); + } while (curr_param) { struct kunit param_test = { -- cgit v1.2.3 From 19a9a1ab5c3dce65fff4ac50700117039c23d525 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:32 -0700 Subject: KVM: Rename CONFIG_KVM_PRIVATE_MEM to CONFIG_KVM_GUEST_MEMFD Rename the Kconfig option CONFIG_KVM_PRIVATE_MEM to CONFIG_KVM_GUEST_MEMFD. The original name implied that the feature only supported "private" memory. However, CONFIG_KVM_PRIVATE_MEM enables guest_memfd in general, which is not exclusively for private memory. Subsequent patches in this series will add guest_memfd support for non-CoCo VMs, whose memory is not private. Renaming the Kconfig option to CONFIG_KVM_GUEST_MEMFD more accurately reflects its broader scope as the main Kconfig option for all guest_memfd-backed memory. This provides clearer semantics for the option and avoids confusion as new features are introduced. Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- include/linux/kvm_host.h | 14 +++++++------- virt/kvm/Kconfig | 8 ++++---- virt/kvm/Makefile.kvm | 2 +- virt/kvm/kvm_main.c | 4 ++-- virt/kvm/kvm_mm.h | 4 ++-- 6 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f19a76d3ca0e..7b0f2b3e492d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2276,7 +2276,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) #else #define kvm_arch_has_private_mem(kvm) false diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15656b7fba6c..8cdc0b3cc1b1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -602,7 +602,7 @@ struct kvm_memory_slot { short id; u16 as_id; -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD struct { /* * Writes protected by kvm->slots_lock. Acquiring a @@ -720,10 +720,10 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) #endif /* - * Arch code must define kvm_arch_has_private_mem if support for private memory - * is enabled. + * Arch code must define kvm_arch_has_private_mem if support for guest_memfd is + * enabled. */ -#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) static inline bool kvm_arch_has_private_mem(struct kvm *kvm) { return false; @@ -2505,7 +2505,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { - return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) && + return IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) && kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; } #else @@ -2515,7 +2515,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, struct page **page, int *max_order); @@ -2528,7 +2528,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, KVM_BUG_ON(1, kvm); return -EIO; } -#endif /* CONFIG_KVM_PRIVATE_MEM */ +#endif /* CONFIG_KVM_GUEST_MEMFD */ #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 727b542074e7..e4b400feff94 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -112,19 +112,19 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES depends on KVM_GENERIC_MMU_NOTIFIER bool -config KVM_PRIVATE_MEM +config KVM_GUEST_MEMFD select XARRAY_MULTI bool config KVM_GENERIC_PRIVATE_MEM select KVM_GENERIC_MEMORY_ATTRIBUTES - select KVM_PRIVATE_MEM + select KVM_GUEST_MEMFD bool config HAVE_KVM_ARCH_GMEM_PREPARE bool - depends on KVM_PRIVATE_MEM + depends on KVM_GUEST_MEMFD config HAVE_KVM_ARCH_GMEM_INVALIDATE bool - depends on KVM_PRIVATE_MEM + depends on KVM_GUEST_MEMFD diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm index 724c89af78af..d047d4cf58c9 100644 --- a/virt/kvm/Makefile.kvm +++ b/virt/kvm/Makefile.kvm @@ -12,4 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o -kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o +kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c07dd423458..25a94eed75fd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4915,7 +4915,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_MEMORY_ATTRIBUTES: return kvm_supported_mem_attributes(kvm); #endif -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: return !kvm || kvm_arch_has_private_mem(kvm); #endif @@ -5352,7 +5352,7 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_GET_STATS_FD: r = kvm_vm_ioctl_get_stats_fd(kvm); break; -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CREATE_GUEST_MEMFD: { struct kvm_create_guest_memfd guest_memfd; diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index acef3f5c582a..31defb08ccba 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -67,7 +67,7 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, } #endif /* HAVE_KVM_PFNCACHE */ -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD void kvm_gmem_init(struct module *module); int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -91,6 +91,6 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot) { WARN_ON_ONCE(1); } -#endif /* CONFIG_KVM_PRIVATE_MEM */ +#endif /* CONFIG_KVM_GUEST_MEMFD */ #endif /* __KVM_MM_H__ */ -- cgit v1.2.3 From 36cf63bb5df68836e55e2839f8174b404d47670b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:36 -0700 Subject: KVM: Rename CONFIG_KVM_GENERIC_PRIVATE_MEM to CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE The original name was vague regarding its functionality. This Kconfig option specifically enables and gates the kvm_gmem_populate() function, which is responsible for populating a GPA range with guest data. The new name, HAVE_KVM_ARCH_GMEM_POPULATE, describes the purpose of the option: to enable arch-specific guest_memfd population mechanisms. It also follows the same pattern as the other HAVE_KVM_ARCH_* configuration options. This improves clarity for developers and ensures the name accurately reflects the functionality it controls, especially as guest_memfd support expands beyond purely "private" memory scenarios. Temporarily keep KVM_GENERIC_PRIVATE_MEM as an x86-only config so as to minimize churn, and to hopefully make it easier to see what features require HAVE_KVM_ARCH_GMEM_POPULATE. On that note, omit GMEM_POPULATE for KVM_X86_SW_PROTECTED_VM, as regular ol' memset() suffices for software-protected VMs. As for KVM_GENERIC_PRIVATE_MEM, a future change will select KVM_GUEST_MEMFD for all 64-bit KVM builds, at which point the intermediate config will become obsolete and can/will be dropped. Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Fuad Tabba Reviewed-by: Xiaoyao Li Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 14 ++++++++++---- include/linux/kvm_host.h | 2 +- virt/kvm/Kconfig | 9 ++++----- virt/kvm/guest_memfd.c | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 13ab7265b505..c763446d9b9f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -79,11 +79,16 @@ config KVM_WERROR If in doubt, say "N". +config KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_GUEST_MEMFD + bool + config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT depends on KVM_X86 && X86_64 - select KVM_GENERIC_PRIVATE_MEM + select KVM_X86_PRIVATE_MEM help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for @@ -133,8 +138,8 @@ config KVM_INTEL_TDX bool "Intel Trust Domain Extensions (TDX) support" default y depends on INTEL_TDX_HOST - select KVM_GENERIC_PRIVATE_MEM - select KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_X86_PRIVATE_MEM + select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching Intel Trust Domain Extensions (TDX) confidential VMs on Intel processors. @@ -157,9 +162,10 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM - select KVM_GENERIC_PRIVATE_MEM + select KVM_X86_PRIVATE_MEM select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_INVALIDATE + select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching encrypted VMs which use Secure Encrypted Virtualization (SEV), Secure Encrypted Virtualization with diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8cdc0b3cc1b1..ddfb6cfe20a6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2534,7 +2534,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); #endif -#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE /** * kvm_gmem_populate() - Populate/prepare a GPA range with guest data * diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index e4b400feff94..1b7d5be0b6c4 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -116,11 +116,6 @@ config KVM_GUEST_MEMFD select XARRAY_MULTI bool -config KVM_GENERIC_PRIVATE_MEM - select KVM_GENERIC_MEMORY_ATTRIBUTES - select KVM_GUEST_MEMFD - bool - config HAVE_KVM_ARCH_GMEM_PREPARE bool depends on KVM_GUEST_MEMFD @@ -128,3 +123,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE config HAVE_KVM_ARCH_GMEM_INVALIDATE bool depends on KVM_GUEST_MEMFD + +config HAVE_KVM_ARCH_GMEM_POPULATE + bool + depends on KVM_GUEST_MEMFD diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 7d85cc33c0bb..b2b50560e80e 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -627,7 +627,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); -#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { -- cgit v1.2.3 From 923310be23b275f730e8869abc783db6296fc043 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:37 -0700 Subject: KVM: Rename kvm_slot_can_be_private() to kvm_slot_has_gmem() Rename kvm_slot_can_be_private() to kvm_slot_has_gmem() to improve clarity and accurately reflect its purpose. The function kvm_slot_can_be_private() was previously used to check if a given kvm_memory_slot is backed by guest_memfd. However, its name implied that the memory in such a slot was exclusively "private". As guest_memfd support expands to include non-private memory (e.g., shared host mappings), it's important to remove this association. The new name, kvm_slot_has_gmem(), states that the slot is backed by guest_memfd without making assumptions about the memory's privacy attributes. Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/svm/sev.c | 4 ++-- include/linux/kvm_host.h | 2 +- virt/kvm/guest_memfd.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6e838cb6c9e1..fdc2824755ee 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3312,7 +3312,7 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn) { - bool is_private = kvm_slot_can_be_private(slot) && + bool is_private = kvm_slot_has_gmem(slot) && kvm_mem_is_private(kvm, gfn); return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); @@ -4551,7 +4551,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, { int max_order, r; - if (!kvm_slot_can_be_private(fault->slot)) { + if (!kvm_slot_has_gmem(fault->slot)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0635bd71c10e..966a330dd294 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2361,7 +2361,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) mutex_lock(&kvm->slots_lock); memslot = gfn_to_memslot(kvm, params.gfn_start); - if (!kvm_slot_can_be_private(memslot)) { + if (!kvm_slot_has_gmem(memslot)) { ret = -EINVAL; goto out; } @@ -4715,7 +4715,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) } slot = gfn_to_memslot(kvm, gfn); - if (!kvm_slot_can_be_private(slot)) { + if (!kvm_slot_has_gmem(slot)) { pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", gpa); return; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ddfb6cfe20a6..4c5e0a898652 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -615,7 +615,7 @@ struct kvm_memory_slot { #endif }; -static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) +static inline bool kvm_slot_has_gmem(const struct kvm_memory_slot *slot) { return slot && (slot->flags & KVM_MEM_GUEST_MEMFD); } diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index b2b50560e80e..a99e11b8b77f 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -643,7 +643,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long return -EINVAL; slot = gfn_to_memslot(kvm, start_gfn); - if (!kvm_slot_can_be_private(slot)) + if (!kvm_slot_has_gmem(slot)) return -EINVAL; file = kvm_gmem_get_file(slot); -- cgit v1.2.3 From 69116e01f6fee030db45d269f28f9c300b8dc9d6 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:38 -0700 Subject: KVM: Fix comments that refer to slots_lock Fix comments so that they refer to slots_lock instead of slots_locks (remove trailing s). Reviewed-by: David Hildenbrand Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4c5e0a898652..5c25b03d3d50 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -860,7 +860,7 @@ struct kvm { struct notifier_block pm_notifier; #endif #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES - /* Protected by slots_locks (for writes) and RCU (for reads) */ + /* Protected by slots_lock (for writes) and RCU (for reads) */ struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 25a94eed75fd..aa86dfd757db 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -331,7 +331,7 @@ void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, * All current use cases for flushing the TLBs for a specific memslot * are related to dirty logging, and many do the TLB flush out of * mmu_lock. The interaction between the various operations on memslot - * must be serialized by slots_locks to ensure the TLB flush from one + * must be serialized by slots_lock to ensure the TLB flush from one * operation is observed by any other operation on the same memslot. */ lockdep_assert_held(&kvm->slots_lock); -- cgit v1.2.3 From 68d189938709a5918d7308eb922c30bcbf16ebb9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:39 -0700 Subject: KVM: Fix comment that refers to kvm uapi header path The comment that points to the path where the user-visible memslot flags are refers to an outdated path and has a typo. Update the comment to refer to the correct path. Reviewed-by: David Hildenbrand Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c25b03d3d50..56ea8c862cfd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -52,7 +52,7 @@ /* * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally * used in kvm, other bits are visible for userspace which are defined in - * include/linux/kvm_h. + * include/uapi/linux/kvm.h. */ #define KVM_MEMSLOT_INVALID (1UL << 16) -- cgit v1.2.3 From d1e54dd08f163a9021433020d16a8f8f70ddc41c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:40 -0700 Subject: KVM: x86: Enable KVM_GUEST_MEMFD for all 64-bit builds Enable KVM_GUEST_MEMFD for all KVM x86 64-bit builds, i.e. for "default" VM types when running on 64-bit KVM. This will allow using guest_memfd to back non-private memory for all VM shapes, by supporting mmap() on guest_memfd. Opportunistically clean up various conditionals that become tautologies once x86 selects KVM_GUEST_MEMFD more broadly. Specifically, because SW protected VMs, SEV, and TDX are all 64-bit only, private memory no longer needs to take explicit dependencies on KVM_GUEST_MEMFD, because it is effectively a prerequisite. Suggested-by: Sean Christopherson Signed-off-by: Fuad Tabba Reviewed-by: Xiaoyao Li Reviewed-by: David Hildenbrand Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 +--- arch/x86/kvm/Kconfig | 12 ++++-------- include/linux/kvm_host.h | 9 ++------- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 9 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7b0f2b3e492d..50366a1ca192 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2276,10 +2276,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); -#ifdef CONFIG_KVM_GUEST_MEMFD +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) -#else -#define kvm_arch_has_private_mem(kvm) false #endif #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c763446d9b9f..4e43923656d0 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -47,6 +47,7 @@ config KVM_X86 select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_PRE_FAULT_MEMORY select KVM_WERROR if WERROR + select KVM_GUEST_MEMFD if X86_64 config KVM tristate "Kernel-based Virtual Machine (KVM) support" @@ -79,16 +80,11 @@ config KVM_WERROR If in doubt, say "N". -config KVM_X86_PRIVATE_MEM - select KVM_GENERIC_MEMORY_ATTRIBUTES - select KVM_GUEST_MEMFD - bool - config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT depends on KVM_X86 && X86_64 - select KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for @@ -138,7 +134,7 @@ config KVM_INTEL_TDX bool "Intel Trust Domain Extensions (TDX) support" default y depends on INTEL_TDX_HOST - select KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching Intel Trust Domain Extensions (TDX) @@ -162,7 +158,7 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM - select KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_INVALIDATE select HAVE_KVM_ARCH_GMEM_POPULATE diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 56ea8c862cfd..4d1c44622056 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -719,11 +719,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) } #endif -/* - * Arch code must define kvm_arch_has_private_mem if support for guest_memfd is - * enabled. - */ -#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) +#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static inline bool kvm_arch_has_private_mem(struct kvm *kvm) { return false; @@ -2505,8 +2501,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { - return IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) && - kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; + return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; } #else static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aa86dfd757db..4f57cb92e109 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1588,7 +1588,7 @@ static int check_memory_region_flags(struct kvm *kvm, { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; - if (kvm_arch_has_private_mem(kvm)) + if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD)) valid_flags |= KVM_MEM_GUEST_MEMFD; /* Dirty logging private memory is not currently supported. */ @@ -4917,7 +4917,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif #ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: - return !kvm || kvm_arch_has_private_mem(kvm); + return 1; #endif default: break; -- cgit v1.2.3 From a12578e1477cbfb547256ed8dee6d5142a59cdcd Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:41 -0700 Subject: KVM: guest_memfd: Add plumbing to host to map guest_memfd pages Introduce the core infrastructure to enable host userspace to mmap() guest_memfd-backed memory. This is needed for several evolving KVM use cases: * Non-CoCo VM backing: Allows VMMs like Firecracker to run guests entirely backed by guest_memfd, even for non-CoCo VMs [1]. This provides a unified memory management model and simplifies guest memory handling. * Direct map removal for enhanced security: This is an important step for direct map removal of guest memory [2]. By allowing host userspace to fault in guest_memfd pages directly, we can avoid maintaining host kernel direct maps of guest memory. This provides additional hardening against Spectre-like transient execution attacks by removing a potential attack surface within the kernel. * Future guest_memfd features: This also lays the groundwork for future enhancements to guest_memfd, such as supporting huge pages and enabling in-place sharing of guest memory with the host for CoCo platforms that permit it [3]. Enable the basic mmap and fault handling logic within guest_memfd, but hold off on allow userspace to actually do mmap() until the architecture support is also in place. [1] https://github.com/firecracker-microvm/firecracker/tree/feature/secret-hiding [2] https://lore.kernel.org/linux-mm/cc1bb8e9bc3e1ab637700a4d3defeec95b55060a.camel@amazon.com [3] https://lore.kernel.org/all/c1c9591d-218a-495c-957b-ba356c8f8e09@redhat.com/T/#u Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Acked-by: David Hildenbrand Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Signed-off-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 ++++++++ include/linux/kvm_host.h | 4 +++ virt/kvm/guest_memfd.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) (limited to 'include') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 604490b1cb19..33fba801b205 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13521,6 +13521,16 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +#ifdef CONFIG_KVM_GUEST_MEMFD +/* + * KVM doesn't yet support mmap() on guest_memfd for VMs with private memory + * (the private vs. shared tracking needs to be moved into guest_memfd). + */ +bool kvm_arch_supports_gmem_mmap(struct kvm *kvm) +{ + return !kvm_arch_has_private_mem(kvm); +} + #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) { @@ -13534,6 +13544,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) kvm_x86_call(gmem_invalidate)(start, end); } #endif +#endif int kvm_spec_ctrl_test_value(u64 value) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4d1c44622056..26bad600f9fa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -726,6 +726,10 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm) } #endif +#ifdef CONFIG_KVM_GUEST_MEMFD +bool kvm_arch_supports_gmem_mmap(struct kvm *kvm); +#endif + #ifndef kvm_arch_has_readonly_mem static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) { diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index a99e11b8b77f..67e7cd7210ef 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -312,7 +312,72 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) return gfn - slot->base_gfn + slot->gmem.pgoff; } +static bool kvm_gmem_supports_mmap(struct inode *inode) +{ + return false; +} + +static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct folio *folio; + vm_fault_t ret = VM_FAULT_LOCKED; + + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) + return VM_FAULT_SIGBUS; + + folio = kvm_gmem_get_folio(inode, vmf->pgoff); + if (IS_ERR(folio)) { + int err = PTR_ERR(folio); + + if (err == -EAGAIN) + return VM_FAULT_RETRY; + + return vmf_error(err); + } + + if (WARN_ON_ONCE(folio_test_large(folio))) { + ret = VM_FAULT_SIGBUS; + goto out_folio; + } + + if (!folio_test_uptodate(folio)) { + clear_highpage(folio_page(folio, 0)); + kvm_gmem_mark_prepared(folio); + } + + vmf->page = folio_file_page(folio, vmf->pgoff); + +out_folio: + if (ret != VM_FAULT_LOCKED) { + folio_unlock(folio); + folio_put(folio); + } + + return ret; +} + +static const struct vm_operations_struct kvm_gmem_vm_ops = { + .fault = kvm_gmem_fault_user_mapping, +}; + +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!kvm_gmem_supports_mmap(file_inode(file))) + return -ENODEV; + + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != + (VM_SHARED | VM_MAYSHARE)) { + return -EINVAL; + } + + vma->vm_ops = &kvm_gmem_vm_ops; + + return 0; +} + static struct file_operations kvm_gmem_fops = { + .mmap = kvm_gmem_mmap, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate, @@ -391,6 +456,11 @@ static const struct inode_operations kvm_gmem_iops = { .setattr = kvm_gmem_setattr, }; +bool __weak kvm_arch_supports_gmem_mmap(struct kvm *kvm) +{ + return true; +} + static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { const char *anon_name = "[kvm-gmem]"; -- cgit v1.2.3 From 576d035e2aef52f8d8d3ce29af556d4c6bd2e0fe Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:42 -0700 Subject: KVM: guest_memfd: Track guest_memfd mmap support in memslot Add a new internal flag, KVM_MEMSLOT_GMEM_ONLY, to the top half of memslot->flags (which makes it strictly for KVM's internal use). This flag tracks when a guest_memfd-backed memory slot supports host userspace mmap operations, which implies that all memory, not just private memory for CoCo VMs, is consumed through guest_memfd: "gmem only". This optimization avoids repeatedly checking the underlying guest_memfd file for mmap support, which would otherwise require taking and releasing a reference on the file for each check. By caching this information directly in the memslot, we reduce overhead and simplify the logic involved in handling guest_memfd-backed pages for host mappings. Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Xiaoyao Li Acked-by: David Hildenbrand Suggested-by: David Hildenbrand Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 11 ++++++++++- virt/kvm/guest_memfd.c | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 26bad600f9fa..8b47891adca1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -54,7 +54,8 @@ * used in kvm, other bits are visible for userspace which are defined in * include/uapi/linux/kvm.h. */ -#define KVM_MEMSLOT_INVALID (1UL << 16) +#define KVM_MEMSLOT_INVALID (1UL << 16) +#define KVM_MEMSLOT_GMEM_ONLY (1UL << 17) /* * Bit 63 of the memslot generation number is an "update in-progress flag", @@ -2490,6 +2491,14 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } +static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot) +{ + if (!IS_ENABLED(CONFIG_KVM_GUEST_MEMFD)) + return false; + + return slot->flags & KVM_MEMSLOT_GMEM_ONLY; +} + #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn) { diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 67e7cd7210ef..d5b445548af4 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -578,6 +578,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, */ WRITE_ONCE(slot->gmem.file, file); slot->gmem.pgoff = start; + if (kvm_gmem_supports_mmap(inode)) + slot->flags |= KVM_MEMSLOT_GMEM_ONLY; xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); filemap_invalidate_unlock(inode->i_mapping); -- cgit v1.2.3 From 3d3a04fad25a6621828518a2abe536142d2c1a7d Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:52 -0700 Subject: KVM: Allow and advertise support for host mmap() on guest_memfd files Now that all the x86 and arm64 plumbing for mmap() on guest_memfd is in place, allow userspace to set GUEST_MEMFD_FLAG_MMAP and advertise support via a new capability, KVM_CAP_GUEST_MEMFD_MMAP. The availability of this capability is determined per architecture, and its enablement for a specific guest_memfd instance is controlled by the GUEST_MEMFD_FLAG_MMAP flag at creation time. Update the KVM API documentation to detail the KVM_CAP_GUEST_MEMFD_MMAP capability, the associated GUEST_MEMFD_FLAG_MMAP, and provide essential information regarding support for mmap in guest_memfd. Reviewed-by: David Hildenbrand Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Xiaoyao Li Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-22-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 9 +++++++++ include/uapi/linux/kvm.h | 2 ++ virt/kvm/guest_memfd.c | 7 ++++++- virt/kvm/kvm_main.c | 2 ++ 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6aa40ee05a4a..c17a87a0a5ac 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6414,6 +6414,15 @@ most one mapping per page, i.e. binding multiple memory regions to a single guest_memfd range is not allowed (any number of memory regions can be bound to a single guest_memfd file, but the bound ranges must not overlap). +When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field +supports GUEST_MEMFD_FLAG_MMAP. Setting this flag on guest_memfd creation +enables mmap() and faulting of guest_memfd memory to host userspace. + +When the KVM MMU performs a PFN lookup to service a guest fault and the backing +guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be +consumed from guest_memfd, regardless of whether it is a shared or a private +fault. + See KVM_SET_USER_MEMORY_REGION2 for additional details. 4.143 KVM_PRE_FAULT_MEMORY diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f0f0d49d2544..6efa98a57ec1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -962,6 +962,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 +#define KVM_CAP_GUEST_MEMFD_MMAP 244 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1598,6 +1599,7 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) +#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) struct kvm_create_guest_memfd { __u64 size; diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index d5b445548af4..08a6bc7d25b6 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -314,7 +314,9 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) static bool kvm_gmem_supports_mmap(struct inode *inode) { - return false; + const u64 flags = (u64)inode->i_private; + + return flags & GUEST_MEMFD_FLAG_MMAP; } static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) @@ -522,6 +524,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) u64 flags = args->flags; u64 valid_flags = 0; + if (kvm_arch_supports_gmem_mmap(kvm)) + valid_flags |= GUEST_MEMFD_FLAG_MMAP; + if (flags & ~valid_flags) return -EINVAL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4f57cb92e109..18f29ef93543 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4918,6 +4918,8 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: return 1; + case KVM_CAP_GUEST_MEMFD_MMAP: + return !kvm || kvm_arch_supports_gmem_mmap(kvm); #endif default: break; -- cgit v1.2.3 From 7a37f55af7af868119b4fb69285f5fa03ba8cf35 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 5 Aug 2025 17:10:56 +0200 Subject: fuse: add COPY_FILE_RANGE_64 that allows large copies The FUSE protocol uses struct fuse_write_out to convey the return value of copy_file_range, which is restricted to uint32_t. But the COPY_FILE_RANGE interface supports a 64-bit size copies and there's no reason why copies should be limited to 32-bit. Introduce a new op COPY_FILE_RANGE_64, which is identical, except the number of bytes copied is returned in a 64-bit value. If the fuse server does not support COPY_FILE_RANGE_64, fall back to COPY_FILE_RANGE. Reported-by: Florian Weimer Closes: https://lore.kernel.org/all/lhuh5ynl8z5.fsf@oldenburg.str.redhat.com/ Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 44 ++++++++++++++++++++++++++++++++------------ fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 12 +++++++++++- 3 files changed, 46 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4adcf09d4b01..867b5fde1237 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2960,10 +2960,12 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), + .len = len, .flags = flags }; struct fuse_write_out outarg; + struct fuse_copy_file_range_out outarg_64; + u64 bytes_copied; ssize_t err; /* mark unstable when write-back is not used, and file_out gets * extended */ @@ -3013,33 +3015,51 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); - args.opcode = FUSE_COPY_FILE_RANGE; + args.opcode = FUSE_COPY_FILE_RANGE_64; args.nodeid = ff_in->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg_64); + args.out_args[0].value = &outarg_64; + if (fc->no_copy_file_range_64) { +fallback: + /* Fall back to old op that can't handle large copy length */ + args.opcode = FUSE_COPY_FILE_RANGE; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK); + } err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_copy_file_range = 1; - err = -EOPNOTSUPP; + if (fc->no_copy_file_range_64) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } else { + fc->no_copy_file_range_64 = 1; + goto fallback; + } } - if (!err && outarg.size > len) - err = -EIO; - if (err) goto out; + bytes_copied = fc->no_copy_file_range_64 ? + outarg.size : outarg_64.bytes_copied; + + if (bytes_copied > len) { + err = -EIO; + goto out; + } + truncate_inode_pages_range(inode_out->i_mapping, ALIGN_DOWN(pos_out, PAGE_SIZE), - ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); + ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1); file_update_time(file_out); - fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size); + fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied); - err = outarg.size; + err = bytes_copied; out: if (is_unstable) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cc428d04be3e..d68eea4dd743 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -856,6 +856,9 @@ struct fuse_conn { /** Does the filesystem support copy_file_range? */ unsigned no_copy_file_range:1; + /** Does the filesystem support copy_file_range_64? */ + unsigned no_copy_file_range_64:1; + /* Send DESTROY request */ unsigned int destroy:1; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 122d6586e8d4..94621f68a5cc 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -235,6 +235,10 @@ * * 7.44 * - add FUSE_NOTIFY_INC_EPOCH + * + * 7.45 + * - add FUSE_COPY_FILE_RANGE_64 + * - add struct fuse_copy_file_range_out */ #ifndef _LINUX_FUSE_H @@ -270,7 +274,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 44 +#define FUSE_KERNEL_MINOR_VERSION 45 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -657,6 +661,7 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, + FUSE_COPY_FILE_RANGE_64 = 53, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1148,6 +1153,11 @@ struct fuse_copy_file_range_in { uint64_t flags; }; +/* For FUSE_COPY_FILE_RANGE_64 */ +struct fuse_copy_file_range_out { + uint64_t bytes_copied; +}; + #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) #define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) struct fuse_setupmapping_in { -- cgit v1.2.3 From dd6a5a71c811289eec234e78cb9ca34d055d2ad5 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 10 Jun 2025 13:52:28 +0900 Subject: sched/wait: Add wait_event_state_exclusive() Allows exclusive waits with a custom @state. Signed-off-by: Sergey Senozhatsky Acked-by: Peter Zijlstra (Intel) Signed-off-by: Miklos Szeredi --- include/linux/wait.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 09855d819418..f648044466d5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -965,6 +965,18 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); __ret; \ }) +#define __wait_event_state_exclusive(wq, condition, state) \ + ___wait_event(wq, condition, state, 1, 0, schedule()) + +#define wait_event_state_exclusive(wq, condition, state) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_state_exclusive(wq, condition, state); \ + __ret; \ +}) + #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ -- cgit v1.2.3 From 494d2f508883a6e5c4530e5c6b3c8b2bbfb7318d Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 7 Jul 2025 16:46:05 -0700 Subject: fuse: use default writeback accounting commit 0c58a97f919c ("fuse: remove tmp folio for writebacks and internal rb tree") removed temp folios for dirty page writeback. Consequently, fuse can now use the default writeback accounting. With switching fuse to use default writeback accounting, there are some added benefits. This updates wb->writeback_inodes tracking as well now and updates writeback throughput estimates after writeback completion. This commit also removes inc_wb_stat() and dec_wb_stat(). These have no callers anymore now that fuse does not call them. Signed-off-by: Joanne Koong Reviewed-by: David Hildenbrand Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 9 +-------- fs/fuse/inode.c | 2 -- include/linux/backing-dev.h | 10 ---------- 3 files changed, 1 insertion(+), 20 deletions(-) (limited to 'include') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ffda61c9bc0c..0698bcb2f992 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1822,19 +1822,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) { + for (i = 0; i < ap->num_folios; i++) /* * Benchmarks showed that ending writeback within the * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. */ iomap_finish_folio_write(inode, ap->folios[i], 1); - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - wb_writeout_inc(&bdi->wb); - } wake_up(&fi->page_waitq); } @@ -2009,14 +2005,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, uint32_t folio_index, loff_t offset, unsigned len) { - struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; ap->folios[folio_index] = folio; ap->descs[folio_index].offset = offset; ap->descs[folio_index].length = len; - - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7ddfd2b3cc9c..19fc58cb84dc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1561,8 +1561,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - /* fuse does it's own writeback accounting */ - sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e721148c95d0..9a1e895dd5df 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -66,16 +66,6 @@ static inline void wb_stat_mod(struct bdi_writeback *wb, percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) -{ - wb_stat_mod(wb, item, 1); -} - -static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) -{ - wb_stat_mod(wb, item, -1); -} - static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_read_positive(&wb->stat[item]); -- cgit v1.2.3 From 2841808f35eebfd07150333f3af3007cb2904a09 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 7 Jul 2025 16:46:06 -0700 Subject: mm: remove BDI_CAP_WRITEBACK_ACCT There are no users of BDI_CAP_WRITEBACK_ACCT now that fuse doesn't do its own writeback accounting. This commit removes BDI_CAP_WRITEBACK_ACCT. Signed-off-by: Joanne Koong Acked-by: David Hildenbrand Signed-off-by: Miklos Szeredi --- include/linux/backing-dev.h | 4 +--- mm/backing-dev.c | 2 +- mm/page-writeback.c | 43 ++++++++++++++++++------------------------- 3 files changed, 20 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 9a1e895dd5df..3e64f14739dd 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -108,12 +108,10 @@ int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit * * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages * should contribute to accounting - * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ #define BDI_CAP_WRITEBACK (1 << 0) -#define BDI_CAP_WRITEBACK_ACCT (1 << 1) -#define BDI_CAP_STRICTLIMIT (1 << 2) +#define BDI_CAP_STRICTLIMIT (1 << 1) extern struct backing_dev_info noop_backing_dev_info; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..35f11e75e30e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1026,7 +1026,7 @@ struct backing_dev_info *bdi_alloc(int node_id) kfree(bdi); return NULL; } - bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; + bdi->capabilities = BDI_CAP_WRITEBACK; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e248d1c3969..de669636120d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3014,26 +3014,22 @@ bool __folio_end_writeback(struct folio *folio) if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = inode_to_wb(inode); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { - struct bdi_writeback *wb = inode_to_wb(inode); - wb_stat_mod(wb, WB_WRITEBACK, -nr); - __wb_writeout_add(wb, nr); - if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - wb_inode_writeback_end(wb); + wb_stat_mod(wb, WB_WRITEBACK, -nr); + __wb_writeout_add(wb, nr); + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + wb_inode_writeback_end(wb); + if (mapping->host) + sb_clear_inode_writeback(mapping->host); } - if (mapping->host && !mapping_tagged(mapping, - PAGECACHE_TAG_WRITEBACK)) - sb_clear_inode_writeback(mapping->host); - xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); @@ -3058,7 +3054,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = inode_to_wb(inode); unsigned long flags; bool on_wblist; @@ -3069,21 +3065,18 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { - struct bdi_writeback *wb = inode_to_wb(inode); - - wb_stat_mod(wb, WB_WRITEBACK, nr); - if (!on_wblist) - wb_inode_writeback_start(wb); + wb_stat_mod(wb, WB_WRITEBACK, nr); + if (!on_wblist) { + wb_inode_writeback_start(wb); + /* + * We can come through here when swapping anonymous + * folios, so we don't necessarily have an inode to + * track for sync. + */ + if (mapping->host) + sb_mark_inode_writeback(mapping->host); } - /* - * We can come through here when swapping anonymous - * folios, so we don't necessarily have an inode to - * track for sync. - */ - if (mapping->host && !on_wblist) - sb_mark_inode_writeback(mapping->host); if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) -- cgit v1.2.3 From 08383cd479f8212fafee2f557b58cfd48818bee0 Mon Sep 17 00:00:00 2001 From: Christian Bruel Date: Wed, 20 Aug 2025 09:54:02 +0200 Subject: pinctrl: Add pinctrl_pm_select_init_state helper function If a platform requires an initial pinctrl state during probing, this helper function provides the client with access to the same initial state. eg: xxx_suspend_noirq ... pinctrl_pm_select_sleep_state xxx resume_noirq pinctrl_pm_select_init_state ... pinctrl_pm_select_default_state Signed-off-by: Christian Bruel Signed-off-by: Manivannan Sadhasivam Reviewed-by: Linus Walleij Link: https://patch.msgid.link/20250820075411.1178729-3-christian.bruel@foss.st.com --- drivers/pinctrl/core.c | 13 +++++++++++++ include/linux/pinctrl/consumer.h | 10 ++++++++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index 73b78d6eac67..c5dbf4e9db84 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -1655,6 +1655,19 @@ int pinctrl_pm_select_default_state(struct device *dev) } EXPORT_SYMBOL_GPL(pinctrl_pm_select_default_state); +/** + * pinctrl_pm_select_init_state() - select init pinctrl state for PM + * @dev: device to select init state for + */ +int pinctrl_pm_select_init_state(struct device *dev) +{ + if (!dev->pins) + return 0; + + return pinctrl_select_bound_state(dev, dev->pins->init_state); +} +EXPORT_SYMBOL_GPL(pinctrl_pm_select_init_state); + /** * pinctrl_pm_select_sleep_state() - select sleep pinctrl state for PM * @dev: device to select sleep state for diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 73de70362b98..63ce16191eb9 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -48,6 +48,7 @@ int pinctrl_select_default_state(struct device *dev); #ifdef CONFIG_PM int pinctrl_pm_select_default_state(struct device *dev); +int pinctrl_pm_select_init_state(struct device *dev); int pinctrl_pm_select_sleep_state(struct device *dev); int pinctrl_pm_select_idle_state(struct device *dev); #else @@ -55,6 +56,10 @@ static inline int pinctrl_pm_select_default_state(struct device *dev) { return 0; } +static inline int pinctrl_pm_select_init_state(struct device *dev) +{ + return 0; +} static inline int pinctrl_pm_select_sleep_state(struct device *dev) { return 0; @@ -143,6 +148,11 @@ static inline int pinctrl_pm_select_default_state(struct device *dev) return 0; } +static inline int pinctrl_pm_select_init_state(struct device *dev) +{ + return 0; +} + static inline int pinctrl_pm_select_sleep_state(struct device *dev) { return 0; -- cgit v1.2.3 From e26dca67fde194340582cfbb0c0bf661825e9e46 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:14:41 -0600 Subject: io_uring: add support for IORING_SETUP_CQE_MIXED Normal rings support 16b CQEs for posting completions, while certain features require the ring to be configured with IORING_SETUP_CQE32, as they need to convey more information per completion. This, in turn, makes ALL the CQEs be 32b in size. This is somewhat wasteful and inefficient, particularly when only certain CQEs need to be of the bigger variant. This adds support for setting up a ring with mixed CQE sizes, using IORING_SETUP_CQE_MIXED. When setup in this mode, CQEs posted to the ring may be either 16b or 32b in size. If a CQE is 32b in size, then IORING_CQE_F_32 is set in the CQE flags to indicate that this is the case. If this flag isn't set, the CQE is the normal 16b variant. CQEs on these types of mixed rings may also have IORING_CQE_F_SKIP set. This can happen if the ring is one (small) CQE entry away from wrapping, and an attempt is made to post a 32b CQE. As CQEs must be contigious in the CQ ring, a 32b CQE cannot wrap the ring. For this case, a single dummy CQE is posted with the SKIP flag set. The application should simply ignore those. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 ++++ io_uring/io_uring.c | 78 ++++++++++++++++++++++++++++++++++--------- io_uring/io_uring.h | 49 +++++++++++++++++++-------- io_uring/register.c | 3 +- 4 files changed, 105 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7af8d10b3aba..5135e1be0390 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -225,6 +225,12 @@ enum io_uring_sqe_flags_bit { /* Use hybrid poll in iopoll process */ #define IORING_SETUP_HYBRID_IOPOLL (1U << 17) +/* + * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have + * IORING_CQE_F_32 set in cqe->flags. + */ +#define IORING_SETUP_CQE_MIXED (1U << 18) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5166f11f07c7..6c07efac977c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -620,27 +620,29 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { - size_t cqe_size = sizeof(struct io_uring_cqe); - lockdep_assert_held(&ctx->uring_lock); /* don't abort if we're dying, entries must get freed */ if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; - if (ctx->flags & IORING_SETUP_CQE32) - cqe_size <<= 1; - io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { + size_t cqe_size = sizeof(struct io_uring_cqe); struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; + bool is_cqe32 = false; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); + if (ocqe->cqe.flags & IORING_CQE_F_32 || + ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + cqe_size <<= 1; + } if (!dying) { - if (!io_get_cqe_overflow(ctx, &cqe, true)) + if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) break; memcpy(cqe, &ocqe->cqe, cqe_size); } @@ -752,10 +754,12 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + bool is_cqe32 = false; - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + ocq_size <<= 1; + } ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); @@ -773,12 +777,30 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, return ocqe; } +/* + * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE + * because the ring is a single 16b entry away from wrapping. + */ +static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) +{ + if (__io_cqring_events(ctx) < ctx->cq_entries) { + struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; + + cqe->user_data = 0; + cqe->res = 0; + cqe->flags = IORING_CQE_F_SKIP; + ctx->cached_cq_tail++; + return true; + } + return false; +} + /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); @@ -792,12 +814,22 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) return false; + /* + * Post dummy CQE if a 32b CQE is needed and there's only room for a + * 16b CQE before the ring wraps. + */ + if (cqe32 && off + 1 == ctx->cq_entries) { + if (!io_fill_nop_cqe(ctx, off)) + return false; + off = 0; + } + /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); free = ctx->cq_entries - queued; /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); - if (!len) + if (len < (cqe32 + 1)) return false; if (ctx->flags & IORING_SETUP_CQE32) { @@ -815,9 +847,9 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, { struct io_uring_cqe *cqe; - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) + if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))) return false; - if (unlikely(!io_get_cqe(ctx, &cqe))) + if (unlikely(!io_get_cqe(ctx, &cqe, true))) return false; memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); @@ -828,14 +860,15 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + bool cqe32 = cflags & IORING_CQE_F_32; struct io_uring_cqe *cqe; - if (likely(io_get_cqe(ctx, &cqe))) { + if (likely(io_get_cqe(ctx, &cqe, cqe32))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - if (ctx->flags & IORING_SETUP_CQE32) { + if (cqe32) { WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } @@ -2756,6 +2789,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, if (check_shl_overflow(off, 1, &off)) return SIZE_MAX; } + if (flags & IORING_SETUP_CQE_MIXED) { + if (cq_entries < 2) + return SIZE_MAX; + } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); @@ -3680,6 +3717,14 @@ static int io_uring_sanitise_params(struct io_uring_params *p) !(flags & IORING_SETUP_SINGLE_ISSUER)) return -EINVAL; + /* + * Nonsensical to ask for CQE32 and mixed CQE support, it's not + * supported to post 16b CQEs on a ring setup with CQE32. + */ + if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == + (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) + return -EINVAL; + return 0; } @@ -3906,7 +3951,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) + IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL | + IORING_SETUP_CQE_MIXED)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index abc6de227f74..2bcb565d9de6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,7 +75,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) unsigned long rings_size(unsigned int flags, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset); int io_uring_fill_params(unsigned entries, struct io_uring_params *p); -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); @@ -169,25 +169,31 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, - bool overflow) + bool overflow, bool cqe32) { io_lockdep_assert_cq_locked(ctx); - if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { - if (unlikely(!io_cqe_cache_refill(ctx, overflow))) + if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) { + if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32))) return false; } *ret = ctx->cqe_cached; ctx->cached_cq_tail++; ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) + if (ctx->flags & IORING_SETUP_CQE32) { + ctx->cqe_cached++; + } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) { ctx->cqe_cached++; + ctx->cached_cq_tail++; + } + WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel); return true; } -static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) +static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, + bool cqe32) { - return io_get_cqe_overflow(ctx, ret, false); + return io_get_cqe_overflow(ctx, ret, false, cqe32); } static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, @@ -196,25 +202,24 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, io_lockdep_assert_cq_locked(ctx); ctx->submit_state.cq_flush = true; - return io_get_cqe(ctx, cqe_ret); + return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED); } static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { + bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32; struct io_uring_cqe *cqe; /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. + * If we can't get a cq entry, userspace overflowed the submission + * (by quite a lot). */ - if (unlikely(!io_get_cqe(ctx, &cqe))) + if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32))) return false; - memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (ctx->flags & IORING_SETUP_CQE32) { + if (is_cqe32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } @@ -239,6 +244,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx) +{ + if (ctx->flags & IORING_SETUP_CQE_MIXED) + return IORING_CQE_F_32; + return 0; +} + +static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags, + __u64 extra1, __u64 extra2) +{ + req->cqe.res = res; + req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx); + req->big_cqe.extra1 = extra1; + req->big_cqe.extra2 = extra2; +} + static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, struct io_kiocb *req) { diff --git a/io_uring/register.c b/io_uring/register.c index a59589249fce..a1a9b2884eae 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -396,7 +396,8 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ - IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ + IORING_SETUP_CQE_MIXED) static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { -- cgit v1.2.3 From 806ecb209aa86fcc1d92bc9f10323cf773f64d6d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:22:16 -0600 Subject: io_uring/nop: add support for IORING_SETUP_CQE_MIXED This adds support for setting IORING_NOP_CQE32 as a flag for a NOP command, in which case a 32b CQE will be posted rather than a regular one. This is the default if the ring has been setup with IORING_SETUP_CQE32. If the ring has been setup with IORING_SETUP_CQE_MIXED, then 16b CQEs will be posted without this flag set, and 32b CQEs if this flag is set. For the latter case, sqe->off is what will be posted as cqe->big_cqe[0] and sqe->addr is what will be posted as cqe->big_cqe[1]. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/nop.c | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5135e1be0390..04ebff33d0e6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -464,6 +464,7 @@ enum io_uring_msg_ring_flags { #define IORING_NOP_FIXED_FILE (1U << 2) #define IORING_NOP_FIXED_BUFFER (1U << 3) #define IORING_NOP_TW (1U << 4) +#define IORING_NOP_CQE32 (1U << 5) /* * IO completion data structure (Completion Queue Entry) diff --git a/io_uring/nop.c b/io_uring/nop.c index 20ed0f85b1c2..3caf07878f8a 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -17,11 +17,13 @@ struct io_nop { int result; int fd; unsigned int flags; + __u64 extra1; + __u64 extra2; }; #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ - IORING_NOP_TW) + IORING_NOP_TW | IORING_NOP_CQE32) int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -41,6 +43,14 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) req->buf_index = READ_ONCE(sqe->buf_index); + if (nop->flags & IORING_NOP_CQE32) { + struct io_ring_ctx *ctx = req->ctx; + + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) + return -EINVAL; + nop->extra1 = READ_ONCE(sqe->off); + nop->extra2 = READ_ONCE(sqe->addr); + } return 0; } @@ -68,7 +78,10 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) done: if (ret < 0) req_set_fail(req); - io_req_set_res(req, nop->result, 0); + if (nop->flags & IORING_NOP_CQE32) + io_req_set_res32(req, nop->result, 0, nop->extra1, nop->extra2); + else + io_req_set_res(req, nop->result, 0); if (nop->flags & IORING_NOP_TW) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); -- cgit v1.2.3 From 1df7dad4d5c49335b72e26d833def960b2de76e3 Mon Sep 17 00:00:00 2001 From: Nandakumar Edamana Date: Tue, 26 Aug 2025 09:15:23 +0530 Subject: bpf: Improve the general precision of tnum_mul Drop the value-mask decomposition technique and adopt straightforward long-multiplication with a twist: when LSB(a) is uncertain, find the two partial products (for LSB(a) = known 0 and LSB(a) = known 1) and take a union. Experiment shows that applying this technique in long multiplication improves the precision in a significant number of cases (at the cost of losing precision in a relatively lower number of cases). Signed-off-by: Nandakumar Edamana Signed-off-by: Andrii Nakryiko Tested-by: Harishankar Vishwanathan Reviewed-by: Harishankar Vishwanathan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20250826034524.2159515-1-nandakumar@nandakumar.co.in --- include/linux/tnum.h | 3 +++ kernel/bpf/tnum.c | 55 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 45 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 0ffb77ffe0e8..c52b862dad45 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -57,6 +57,9 @@ bool tnum_overlap(struct tnum a, struct tnum b); /* Return a tnum representing numbers satisfying both @a and @b */ struct tnum tnum_intersect(struct tnum a, struct tnum b); +/* Returns a tnum representing numbers satisfying either @a or @b */ +struct tnum tnum_union(struct tnum t1, struct tnum t2); + /* Return @a with all but the lowest @size bytes cleared */ struct tnum tnum_cast(struct tnum a, u8 size); diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index d9328bbb3680..f8e70e9c3998 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -116,31 +116,47 @@ struct tnum tnum_xor(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } -/* Generate partial products by multiplying each bit in the multiplier (tnum a) - * with the multiplicand (tnum b), and add the partial products after - * appropriately bit-shifting them. Instead of directly performing tnum addition - * on the generated partial products, equivalenty, decompose each partial - * product into two tnums, consisting of the value-sum (acc_v) and the - * mask-sum (acc_m) and then perform tnum addition on them. The following paper - * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398. +/* Perform long multiplication, iterating through the bits in a using rshift: + * - if LSB(a) is a known 0, keep current accumulator + * - if LSB(a) is a known 1, add b to current accumulator + * - if LSB(a) is unknown, take a union of the above cases. + * + * For example: + * + * acc_0: acc_1: + * + * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1 + * x1 01 11 + * ------ ------ ------ + * 11 11 11 + * xx 00 11 + * ------ ------ ------ + * ???? 0011 1001 */ struct tnum tnum_mul(struct tnum a, struct tnum b) { - u64 acc_v = a.value * b.value; - struct tnum acc_m = TNUM(0, 0); + struct tnum acc = TNUM(0, 0); while (a.value || a.mask) { /* LSB of tnum a is a certain 1 */ if (a.value & 1) - acc_m = tnum_add(acc_m, TNUM(0, b.mask)); + acc = tnum_add(acc, b); /* LSB of tnum a is uncertain */ - else if (a.mask & 1) - acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask)); + else if (a.mask & 1) { + /* acc = tnum_union(acc_0, acc_1), where acc_0 and + * acc_1 are partial accumulators for cases + * LSB(a) = certain 0 and LSB(a) = certain 1. + * acc_0 = acc + 0 * b = acc. + * acc_1 = acc + 1 * b = tnum_add(acc, b). + */ + + acc = tnum_union(acc, tnum_add(acc, b)); + } /* Note: no case for LSB is certain 0 */ a = tnum_rshift(a, 1); b = tnum_lshift(b, 1); } - return tnum_add(TNUM(acc_v, 0), acc_m); + return acc; } bool tnum_overlap(struct tnum a, struct tnum b) @@ -163,6 +179,19 @@ struct tnum tnum_intersect(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } +/* Returns a tnum with the uncertainty from both a and b, and in addition, new + * uncertainty at any position that a and b disagree. This represents a + * superset of the union of the concrete sets of both a and b. Despite the + * overapproximation, it is optimal. + */ +struct tnum tnum_union(struct tnum a, struct tnum b) +{ + u64 v = a.value & b.value; + u64 mu = (a.value ^ b.value) | a.mask | b.mask; + + return TNUM(v & ~mu, mu); +} + struct tnum tnum_cast(struct tnum a, u8 size) { a.value &= (1ULL << (size * 8)) - 1; -- cgit v1.2.3 From 97bcc5b6f45425ac56fb04b0893cdaa607ec7e45 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Mon, 25 Aug 2025 08:40:04 +0530 Subject: net: Prevent RPS table overwrite of active flows This patch fixes an issue where two different flows on the same RXq produce the same hash resulting in continuous flow overwrites. Flow #1: A packet for Flow #1 comes in, kernel calls the steering function. The driver gives back a filter id. The kernel saves this filter id in the selected slot. Later, the driver's service task checks if any filters have expired and then installs the rule for Flow #1. Flow #2: A packet for Flow #2 comes in. It goes through the same steps. But this time, the chosen slot is being used by Flow #1. The driver gives a new filter id and the kernel saves it in the same slot. When the driver's service task runs, it runs through all the flows, checks if Flow #1 should be expired, the kernel returns True as the slot has a different filter id, and then the driver installs the rule for Flow #2. Flow #1: Another packet for Flow #1 comes in. The same thing repeats. The slot is overwritten with a new filter id for Flow #1. This causes a repeated cycle of flow programming for missed packets, wasting CPU cycles while not improving performance. This problem happens at higher rates when the RPS table is small, but tests show it still happens even with 12,000 connections and an RPS size of 16K per queue (global table size = 144x16K = 64K). This patch prevents overwriting an rps_dev_flow entry if it is active. The intention is that it is better to do aRFS for the first flow instead of hurting all flows on the same hash. Without this, two (or more) flows on one RX queue with the same hash can keep overwriting each other. This causes the driver to reprogram the flow repeatedly. Changes: 1. Add a new 'hash' field to struct rps_dev_flow. 2. Add rps_flow_is_active(): a helper function to check if a flow is active or not, extracted from rps_may_expire_flow(). It is further simplified as per reviewer feedback. 3. In set_rps_cpu(): - Avoid overwriting by programming a new filter if: - The slot is not in use, or - The slot is in use but the flow is not active, or - The slot has an active flow with the same hash, but target CPU differs. - Save the hash in the rps_dev_flow entry. 4. rps_may_expire_flow(): Use earlier extracted rps_flow_is_active(). Testing & results: - Driver: ice (E810 NIC), Kernel: net-next - #CPUs = #RXq = 144 (1:1) - Number of flows: 12K - Eight RPS settings from 256 to 32768. Though RPS=256 is not ideal, it is still sufficient to cover 12K flows (256*144 rx-queues = 64K global table slots) - Global Table Size = 144 * RPS (effectively equal to 256 * RPS) - Each RPS test duration = 8 mins (org code) + 8 mins (new code). - Metrics captured on client Legend for following tables: Steer-C: #times ndo_rx_flow_steer() was Called by set_rps_cpu() Steer-L: #times ice_arfs_flow_steer() Looped over aRFS entries Add: #times driver actually programmed aRFS (ice_arfs_build_entry()) Del: #times driver deleted the flow (ice_arfs_del_flow_rules()) Units: K = 1,000 times, M = 1 million times |-------|---------|------| Org Code |---------|---------| | RPS | Latency | CPU | Add | Del | Steer-C | Steer-L | |-------|---------|------|--------|--------|---------|---------| | 256 | 227.0 | 93.2 | 1.6M | 1.6M | 121.7M | 267.6M | | 512 | 225.9 | 94.1 | 11.5M | 11.2M | 65.7M | 199.6M | | 1024 | 223.5 | 95.6 | 16.5M | 16.5M | 27.1M | 187.3M | | 2048 | 222.2 | 96.3 | 10.5M | 10.5M | 12.5M | 115.2M | | 4096 | 223.9 | 94.1 | 5.5M | 5.5M | 7.2M | 65.9M | | 8192 | 224.7 | 92.5 | 2.7M | 2.7M | 3.0M | 29.9M | | 16384 | 223.5 | 92.5 | 1.3M | 1.3M | 1.4M | 13.9M | | 32768 | 219.6 | 93.2 | 838.1K | 838.1K | 965.1K | 8.9M | |-------|---------|------| New Code |---------|---------| | 256 | 201.5 | 99.1 | 13.4K | 5.0K | 13.7K | 75.2K | | 512 | 202.5 | 98.2 | 11.2K | 5.9K | 11.2K | 55.5K | | 1024 | 207.3 | 93.9 | 11.5K | 9.7K | 11.5K | 59.6K | | 2048 | 207.5 | 96.7 | 11.8K | 11.1K | 15.5K | 79.3K | | 4096 | 206.9 | 96.6 | 11.8K | 11.7K | 11.8K | 63.2K | | 8192 | 205.8 | 96.7 | 11.9K | 11.8K | 11.9K | 63.9K | | 16384 | 200.9 | 98.2 | 11.9K | 11.9K | 11.9K | 64.2K | | 32768 | 202.5 | 98.0 | 11.9K | 11.9K | 11.9K | 64.2K | |-------|---------|------|--------|--------|---------|---------| Some observations: 1. Overall Latency improved: (1790.19-1634.94)/1790.19*100 = 8.67% 2. Overall CPU increased: (777.32-751.49)/751.45*100 = 3.44% 3. Flow Management (add/delete) remained almost constant at ~11K compared to values in millions. Signed-off-by: Krishna Kumar Link: https://patch.msgid.link/20250825031005.3674864-2-krikku@gmail.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 7 ++++-- net/core/dev.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++------ net/core/net-sysfs.c | 4 +++- 3 files changed, 65 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/rps.h b/include/net/rps.h index d8ab3a08bcc4..9917dce42ca4 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -25,13 +25,16 @@ struct rps_map { /* * The rps_dev_flow structure contains the mapping of a flow to a CPU, the - * tail pointer for that CPU's input queue at the time of last enqueue, and - * a hardware filter index. + * tail pointer for that CPU's input queue at the time of last enqueue, a + * hardware filter index, and the hash of the flow if aRFS is enabled. */ struct rps_dev_flow { u16 cpu; u16 filter; unsigned int last_qtail; +#ifdef CONFIG_RFS_ACCEL + u32 hash; +#endif }; #define RPS_NO_FILTER 0xffff diff --git a/net/core/dev.c b/net/core/dev.c index 93a25d87b86b..fbf0894c55d0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4849,6 +4849,36 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) return hash_32(hash, flow_table->log); } +#ifdef CONFIG_RFS_ACCEL +/** + * rps_flow_is_active - check whether the flow is recently active. + * @rflow: Specific flow to check activity. + * @flow_table: per-queue flowtable that @rflow belongs to. + * @cpu: CPU saved in @rflow. + * + * If the CPU has processed many packets since the flow's last activity + * (beyond 10 times the table size), the flow is considered stale. + * + * Return: true if flow was recently active. + */ +static bool rps_flow_is_active(struct rps_dev_flow *rflow, + struct rps_dev_flow_table *flow_table, + unsigned int cpu) +{ + unsigned int flow_last_active; + unsigned int sd_input_head; + + if (cpu >= nr_cpu_ids) + return false; + + sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head); + flow_last_active = READ_ONCE(rflow->last_qtail); + + return (int)(sd_input_head - flow_last_active) < + (int)(10 << flow_table->log); +} +#endif + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) @@ -4859,8 +4889,11 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; + struct rps_dev_flow *tmp_rflow; + unsigned int tmp_cpu; u16 rxq_index; u32 flow_id; + u32 hash; int rc; /* Should we steer this flow to a different hardware queue? */ @@ -4875,14 +4908,32 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = rfs_slot(skb_get_hash(skb), flow_table); + + hash = skb_get_hash(skb); + flow_id = rfs_slot(hash, flow_table); + + tmp_rflow = &flow_table->flows[flow_id]; + tmp_cpu = READ_ONCE(tmp_rflow->cpu); + + if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { + if (rps_flow_is_active(tmp_rflow, flow_table, + tmp_cpu)) { + if (hash != READ_ONCE(tmp_rflow->hash) || + next_cpu == tmp_cpu) + goto out; + } + } + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; + old_rflow = rflow; - rflow = &flow_table->flows[flow_id]; + rflow = tmp_rflow; WRITE_ONCE(rflow->filter, rc); + WRITE_ONCE(rflow->hash, hash); + if (old_rflow->filter == rc) WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: @@ -5017,17 +5068,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id < (1UL << flow_table->log)) { + unsigned int cpu; + rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); - if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && - ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - - READ_ONCE(rflow->last_qtail)) < - (int)(10 << flow_table->log))) + if (READ_ONCE(rflow->filter) == filter_id && + rps_flow_is_active(rflow, flow_table, cpu)) expire = false; } rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c28cd6665444..5ea9f64adce3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1120,8 +1120,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return -ENOMEM; table->log = ilog2(mask) + 1; - for (count = 0; count <= mask; count++) + for (count = 0; count <= mask; count++) { table->flows[count].cpu = RPS_NO_CPU; + table->flows[count].filter = RPS_NO_FILTER; + } } else { table = NULL; } -- cgit v1.2.3 From c2a756891bb428104fa8899998ba277042274cdb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 25 Aug 2025 13:18:28 -0700 Subject: uapi: wrap compiler_types.h in an ifdef instead of the implicit strip The uAPI stddef header includes compiler_types.h, a kernel-only header, to make sure that kernel definitions of annotations like __counted_by() take precedence. There is a hack in scripts/headers_install.sh which strips includes of compiler.h and compiler_types.h when installing uAPI headers. While explicit handling makes sense for compiler.h, which is included all over the uAPI, compiler_types.h is only included by stddef.h (within the uAPI, obviously it's included in kernel code a lot). Remove the stripping from scripts/headers_install.sh and wrap the include of compiler_types.h in #ifdef __KERNEL__ instead. This should be equivalent functionally, but is easier to understand to a casual reader of the code. It also makes it easier to work with kernel headers directly from under tools/ Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250825201828.2370083-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/uapi/linux/stddef.h | 2 ++ scripts/headers_install.sh | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index b87df1b485c2..9a28f7d9a334 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -2,7 +2,9 @@ #ifndef _UAPI_LINUX_STDDEF_H #define _UAPI_LINUX_STDDEF_H +#ifdef __KERNEL__ #include +#endif #ifndef __always_inline #define __always_inline inline diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index 6bbccb43f7e7..4c20c62c4faf 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -32,7 +32,7 @@ fi sed -E -e ' s/([[:space:](])(__user|__force|__iomem)[[:space:]]/\1/g s/__attribute_const__([[:space:]]|$)/\1/g - s@^#include @@ + s@^#include @@ s/(^|[^a-zA-Z0-9])__packed([^a-zA-Z0-9_]|$)/\1__attribute__((packed))\2/g s/(^|[[:space:](])(inline|asm|volatile)([[:space:](]|$)/\1__\2__\3/g s@#(ifndef|define|endif[[:space:]]*/[*])[[:space:]]*_UAPI@#\1 @ -- cgit v1.2.3 From f86f42ed2c471da5b061492bb8ab1d3d73c19c58 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:27 +0000 Subject: net: add sk_drops_read(), sk_drops_inc() and sk_drops_reset() helpers We want to split sk->sk_drops in the future to reduce potential contention on this field. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 17 ++++++++++++++++- include/net/tcp.h | 2 +- net/core/datagram.c | 2 +- net/core/sock.c | 14 +++++++------- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 6 +++--- net/ipv4/udp.c | 14 +++++++------- net/ipv6/datagram.c | 2 +- net/ipv6/raw.c | 8 ++++---- net/ipv6/udp.c | 6 +++--- net/iucv/af_iucv.c | 4 ++-- net/netlink/af_netlink.c | 4 ++-- net/packet/af_packet.c | 2 +- net/phonet/pep.c | 6 +++--- net/phonet/socket.c | 2 +- net/sctp/diag.c | 2 +- net/tipc/socket.c | 6 +++--- 17 files changed, 57 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 63a6a48afb48..34d7029eb622 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2682,11 +2682,26 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_inc(struct sock *sk) +{ + atomic_inc(&sk->sk_drops); +} + +static inline int sk_drops_read(const struct sock *sk) +{ + return atomic_read(&sk->sk_drops); +} + +static inline void sk_drops_reset(struct sock *sk) +{ + atomic_set(&sk->sk_drops, 0); +} + static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? - atomic_read(&sk->sk_drops) : 0; + sk_drops_read(sk) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) diff --git a/include/net/tcp.h b/include/net/tcp.h index 2936b8175950..16dc9cebb9d2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2612,7 +2612,7 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) */ static inline void tcp_listendrop(const struct sock *sk) { - atomic_inc(&((struct sock *)sk)->sk_drops); + sk_drops_inc((struct sock *)sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } diff --git a/net/core/datagram.c b/net/core/datagram.c index 94cc4705e91d..ba8253aa6e07 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -345,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, spin_unlock_bh(&sk_queue->lock); } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); diff --git a/net/core/sock.c b/net/core/sock.c index 8002ac6293dc..75368823969a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -491,13 +491,13 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return -ENOBUFS; } @@ -562,7 +562,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } @@ -585,7 +585,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, reason = SKB_DROP_REASON_PFMEMALLOC; if (err == -ENOBUFS) reason = SKB_DROP_REASON_SOCKET_BACKLOG; - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); goto discard_and_relse; } @@ -2505,7 +2505,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; - atomic_set(&newsk->sk_drops, 0); + sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); @@ -3713,7 +3713,7 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); - atomic_set(&sk->sk_drops, 0); + sk_drops_reset(sk); } EXPORT_SYMBOL(sock_init_data_uid); @@ -3973,7 +3973,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); } #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 031df4c19fcc..f119da68fc30 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1119,7 +1119,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 1d2c89d63cc7..0f9f02f6146e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -178,7 +178,7 @@ static int raw_v4_input(struct net *net, struct sk_buff *skb, if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -311,7 +311,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -1045,7 +1045,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), - refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); + refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int raw_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cc3ce0f762ec..732bdad43626 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1787,7 +1787,7 @@ uncharge_drop: atomic_sub(skb->truesize, &sk->sk_rmem_alloc); drop: - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); busylock_release(busy); return err; } @@ -1852,7 +1852,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2008,7 +2008,7 @@ try_again: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } @@ -2078,7 +2078,7 @@ try_again: if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2449,7 +2449,7 @@ csum_error: __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2534,7 +2534,7 @@ start_lookup: nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, @@ -3386,7 +3386,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } int udp4_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 972bf0426d59..33ebe93d80e3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -1068,5 +1068,5 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4c3f8245c40f..4026192143ec 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -163,7 +163,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -361,7 +361,7 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } @@ -389,7 +389,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -414,7 +414,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6a68f77da44b..a35ee6d693a8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -524,7 +524,7 @@ try_again: } if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -908,7 +908,7 @@ csum_error: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -1013,7 +1013,7 @@ start_lookup: } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cc2b3c44bc05..6c717a7ef292 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1187,7 +1187,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return; } @@ -2011,7 +2011,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb) skb_reset_network_header(skb); IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return NET_RX_SUCCESS; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e2f7080dd5d7..2b46c0cd752a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -356,7 +356,7 @@ static void netlink_overrun(struct sock *sk) sk_error_report(sk); } } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); } static void netlink_rcv_wake(struct sock *sk) @@ -2711,7 +2711,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), - atomic_read(&s->sk_drops), + sk_drops_read(s), sock_i_ino(s) ); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a7017d7f0927..9d42c4bd6e39 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2265,7 +2265,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: atomic_inc(&po->tp_drops); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 62527e1ebb88..4db564d9d522 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -376,7 +376,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) case PNS_PEP_CTRL_REQ: if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); break; } __skb_pull(skb, 4); @@ -397,7 +397,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = -ENOBUFS; break; } @@ -567,7 +567,7 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = NET_RX_DROP; break; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 2b61a40b568e..db2d552e9b32 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -587,7 +587,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, - atomic_read(&sk->sk_drops)); + sk_drops_read(sk)); } seq_pad(seq, '\n'); return 0; diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 23359e522273..996c2018f0e6 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -173,7 +173,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) goto errout; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e028bf658499..1574a83384f8 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2366,7 +2366,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload2!"); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = TIPC_ERR_OVERLOAD; } @@ -2458,7 +2458,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!"); /* Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) { trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL, "@sk_enqueue!"); @@ -3657,7 +3657,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, skb_queue_len(&sk->sk_write_queue)) || nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP, - atomic_read(&sk->sk_drops))) + sk_drops_read(sk))) goto stat_msg_cancel; if (tsk->cong_link_cnt && -- cgit v1.2.3 From cb4d5a6eb600a43c2e3ec7f54e06d07aa33d8062 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:28 +0000 Subject: net: add sk_drops_skbadd() helper Existing sk_drops_add() helper is renamed to sk_drops_skbadd(). Add sk_drops_add() and convert sk_drops_inc() to use it. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-3-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/skmsg.h | 2 +- include/net/sock.h | 11 ++++++++--- include/net/udp.h | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- net/mptcp/protocol.c | 2 +- 7 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 0b9095a281b8..49847888c287 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -315,7 +315,7 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock, static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); kfree_skb(skb); } diff --git a/include/net/sock.h b/include/net/sock.h index 34d7029eb622..9edb42ff0622 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2682,9 +2682,14 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_add(struct sock *sk, int segs) +{ + atomic_add(segs, &sk->sk_drops); +} + static inline void sk_drops_inc(struct sock *sk) { - atomic_inc(&sk->sk_drops); + sk_drops_add(sk, 1); } static inline int sk_drops_read(const struct sock *sk) @@ -2704,11 +2709,11 @@ sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) sk_drops_read(sk) : 0; } -static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) +static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - atomic_add(segs, &sk->sk_drops); + sk_drops_add(sk, segs); } static inline ktime_t sock_read_timestamp(struct sock *sk) diff --git a/include/net/udp.h b/include/net/udp.h index e2af3bda90c9..7b26d4c50f33 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -627,7 +627,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; drop: - atomic_add(drop_count, &sk->sk_drops); + sk_drops_add(sk, drop_count); SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count); kfree_skb(skb); return NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a52a747d8a55..f1be65af1a77 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4830,7 +4830,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, noinline_for_tracing static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); sk_skb_reason_drop(sk, skb, reason); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a0c93b24c6e0..7c1d612afca1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2254,7 +2254,7 @@ lookup: &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -2399,7 +2399,7 @@ discard_it: return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8b2e7b7afbd8..b4e56b877273 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1809,7 +1809,7 @@ lookup: &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -1948,7 +1948,7 @@ discard_it: return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f2e728239480..ad41c48126e4 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -137,7 +137,7 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); __kfree_skb(skb); } -- cgit v1.2.3 From c51613fa276f038bdd18656a57a90ccc5d4e5200 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:29 +0000 Subject: net: add sk->sk_drop_counters Some sockets suffer from heavy false sharing on sk->sk_drops, and fields in the same cache line. Add sk->sk_drop_counters to: - move the drop counter(s) to dedicated cache lines. - Add basic NUMA awareness to these drop counter(s). Following patches will use this infrastructure for UDP and RAW sockets. sk_clone_lock() is not yet ready, it would need to properly set newsk->sk_drop_counters if we plan to use this for TCP sockets. v2: used Paolo suggestion from https://lore.kernel.org/netdev/8f09830a-d83d-43c9-b36b-88ba0a23e9b2@redhat.com/ Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 32 +++++++++++++++++++++++++++++++- net/core/sock.c | 2 ++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 9edb42ff0622..73cd3316e288 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -102,6 +102,11 @@ struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; +struct socket_drop_counters { + atomic_t drops0 ____cacheline_aligned_in_smp; + atomic_t drops1 ____cacheline_aligned_in_smp; +}; + /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr @@ -282,6 +287,7 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter + * @sk_drop_counters: optional pointer to socket_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner @@ -449,6 +455,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif + struct socket_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -2684,7 +2691,18 @@ struct sock_skb_cb { static inline void sk_drops_add(struct sock *sk, int segs) { - atomic_add(segs, &sk->sk_drops); + struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + int n = numa_node_id() % 2; + + if (n) + atomic_add(segs, &sdc->drops1); + else + atomic_add(segs, &sdc->drops0); + } else { + atomic_add(segs, &sk->sk_drops); + } } static inline void sk_drops_inc(struct sock *sk) @@ -2694,11 +2712,23 @@ static inline void sk_drops_inc(struct sock *sk) static inline int sk_drops_read(const struct sock *sk) { + const struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); + return atomic_read(&sdc->drops0) + atomic_read(&sdc->drops1); + } return atomic_read(&sk->sk_drops); } static inline void sk_drops_reset(struct sock *sk) { + struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + atomic_set(&sdc->drops0, 0); + atomic_set(&sdc->drops1, 0); + } atomic_set(&sk->sk_drops, 0); } diff --git a/net/core/sock.c b/net/core/sock.c index 75368823969a..e66ad1ec3a2d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2505,6 +2505,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; + DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; @@ -4457,6 +4458,7 @@ static int __init sock_struct_check(void) #ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); #endif + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); -- cgit v1.2.3 From 51132b99f01ce05f8008f0fb189d83eed484bd53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:30 +0000 Subject: udp: add drop_counters to udp socket When a packet flood hits one or more UDP sockets, many cpus have to update sk->sk_drops. This slows down other cpus, because currently sk_drops is in sock_write_rx group. Add a socket_drop_counters structure to udp sockets. Using dedicated cache lines to hold drop counters makes sure that consumers no longer suffer from false sharing if/when producers only change sk->sk_drops. This adds 128 bytes per UDP socket. Tested with the following stress test, sending about 11 Mpps to a dual socket AMD EPYC 7B13 64-Core. super_netperf 20 -t UDP_STREAM -H DUT -l10 -- -n -P,1000 -m 120 Note: due to socket lookup, only one UDP socket is receiving packets on DUT. Then measure receiver (DUT) behavior. We can see both consumer and BH handlers can process more packets per second. Before: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 615091 0.0 Udp6InErrors 3904277 0.0 Udp6RcvbufErrors 3904277 0.0 After: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 816281 0.0 Udp6InErrors 7497093 0.0 Udp6RcvbufErrors 7497093 0.0 Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-5-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/udp.h | 1 + include/net/udp.h | 1 + tools/testing/selftests/bpf/progs/bpf_iter_udp4.c | 3 ++- tools/testing/selftests/bpf/progs/bpf_iter_udp6.c | 4 ++-- 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 4e1a672af4c5..981506be1e15 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,6 +108,7 @@ struct udp_sock { * the last UDP socket cacheline. */ struct hlist_node tunnel_list; + struct socket_drop_counters drop_counters; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index 7b26d4c50f33..93b159f30e88 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -288,6 +288,7 @@ static inline void udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); + sk->sk_drop_counters = &up->drop_counters; skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c index ffbd4b116d17..23b2aa2604de 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c @@ -64,7 +64,8 @@ int dump_udp4(struct bpf_iter__udp *ctx) 0, 0L, 0, ctx->uid, 0, sock_i_ino(&inet->sk), inet->sk.sk_refcnt.refs.counter, udp_sk, - inet->sk.sk_drops.counter); + udp_sk->drop_counters.drops0.counter + + udp_sk->drop_counters.drops1.counter); return 0; } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c index 47ff7754f4fd..c48b05aa2a4b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c @@ -72,7 +72,7 @@ int dump_udp6(struct bpf_iter__udp *ctx) 0, 0L, 0, ctx->uid, 0, sock_i_ino(&inet->sk), inet->sk.sk_refcnt.refs.counter, udp_sk, - inet->sk.sk_drops.counter); - + udp_sk->drop_counters.drops0.counter + + udp_sk->drop_counters.drops1.counter); return 0; } -- cgit v1.2.3 From b81aa23234d94d99951761d9864061d774633ba9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:31 +0000 Subject: inet: raw: add drop_counters to raw sockets When a packet flood hits one or more RAW sockets, many cpus have to update sk->sk_drops. This slows down other cpus, because currently sk_drops is in sock_write_rx group. Add a socket_drop_counters structure to raw sockets. Using dedicated cache lines to hold drop counters makes sure that consumers no longer suffer from false sharing if/when producers only change sk->sk_drops. This adds 128 bytes per RAW socket. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-6-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 2 +- include/net/raw.h | 1 + net/ipv4/raw.c | 1 + net/ipv6/raw.c | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index bc6ec2959173..261d02efb615 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -295,7 +295,7 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - + struct socket_drop_counters drop_counters; struct ipv6_pinfo inet6; }; diff --git a/include/net/raw.h b/include/net/raw.h index 32a61481a253..d52709139060 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,6 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; + struct socket_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0f9f02f6146e..d54ebb7df966 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -793,6 +793,7 @@ static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4026192143ec..4ae07a67b4d4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1175,6 +1175,7 @@ static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; -- cgit v1.2.3 From 3ea299d3dccdb8554057d0a87552e7673baea95d Mon Sep 17 00:00:00 2001 From: Gabor Juhos Date: Mon, 11 Aug 2025 09:40:40 +0200 Subject: mtd: nand: qpic-common: remove a bunch of unused defines A bunch of definitions in the 'nand-qpic-common.h' header became unused after the conversion of the 'qcom_nandc' and 'spi-qpic-snand' drivers to use the FIELD_PREP() macro, so remove those. No functional changes. Signed-off-by: Gabor Juhos Signed-off-by: Miquel Raynal --- include/linux/mtd/nand-qpic-common.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h index 4e694b1aabbd..e8201d1b7cf9 100644 --- a/include/linux/mtd/nand-qpic-common.h +++ b/include/linux/mtd/nand-qpic-common.h @@ -71,14 +71,10 @@ /* NAND_DEVn_CFG0 bits */ #define DISABLE_STATUS_AFTER_WRITE BIT(4) -#define CW_PER_PAGE 6 #define CW_PER_PAGE_MASK GENMASK(8, 6) -#define UD_SIZE_BYTES 9 #define UD_SIZE_BYTES_MASK GENMASK(18, 9) #define ECC_PARITY_SIZE_BYTES_RS GENMASK(22, 19) -#define SPARE_SIZE_BYTES 23 #define SPARE_SIZE_BYTES_MASK GENMASK(26, 23) -#define NUM_ADDR_CYCLES 27 #define NUM_ADDR_CYCLES_MASK GENMASK(29, 27) #define STATUS_BFR_READ BIT(30) #define SET_RD_MODE_AFTER_STATUS BIT(31) @@ -86,26 +82,20 @@ /* NAND_DEVn_CFG0 bits */ #define DEV0_CFG1_ECC_DISABLE BIT(0) #define WIDE_FLASH BIT(1) -#define NAND_RECOVERY_CYCLES 2 #define NAND_RECOVERY_CYCLES_MASK GENMASK(4, 2) #define CS_ACTIVE_BSY BIT(5) -#define BAD_BLOCK_BYTE_NUM 6 #define BAD_BLOCK_BYTE_NUM_MASK GENMASK(15, 6) #define BAD_BLOCK_IN_SPARE_AREA BIT(16) -#define WR_RD_BSY_GAP 17 #define WR_RD_BSY_GAP_MASK GENMASK(22, 17) #define ENABLE_BCH_ECC BIT(27) /* NAND_DEV0_ECC_CFG bits */ #define ECC_CFG_ECC_DISABLE BIT(0) #define ECC_SW_RESET BIT(1) -#define ECC_MODE 4 #define ECC_MODE_MASK GENMASK(5, 4) #define ECC_MODE_4BIT 0 #define ECC_MODE_8BIT 1 -#define ECC_PARITY_SIZE_BYTES_BCH 8 #define ECC_PARITY_SIZE_BYTES_BCH_MASK GENMASK(12, 8) -#define ECC_NUM_DATA_BYTES 16 #define ECC_NUM_DATA_BYTES_MASK GENMASK(25, 16) #define ECC_FORCE_CLK_OPEN BIT(30) @@ -120,7 +110,6 @@ #define SEQ_READ_START_VLD BIT(4) /* NAND_EBI2_ECC_BUF_CFG bits */ -#define NUM_STEPS 0 #define NUM_STEPS_MASK GENMASK(9, 0) /* NAND_ERASED_CW_DETECT_CFG bits */ @@ -141,11 +130,8 @@ #define ERASED_CW (CODEWORD_ALL_ERASED | CODEWORD_ERASED) /* NAND_READ_LOCATION_n bits */ -#define READ_LOCATION_OFFSET 0 #define READ_LOCATION_OFFSET_MASK GENMASK(9, 0) -#define READ_LOCATION_SIZE 16 #define READ_LOCATION_SIZE_MASK GENMASK(25, 16) -#define READ_LOCATION_LAST 31 #define READ_LOCATION_LAST_MASK BIT(31) /* Version Mask */ -- cgit v1.2.3 From 5f284dc15ca8695d0394414045ac64616a3b0e69 Mon Sep 17 00:00:00 2001 From: Tianling Shen Date: Mon, 25 Aug 2025 01:00:13 +0800 Subject: mtd: spinand: add support for FudanMicro FM25S01A Add support for FudanMicro FM25S01A SPI NAND. Datasheet: http://eng.fmsh.com/nvm/FM25S01A_ds_eng.pdf Signed-off-by: Tianling Shen Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/Makefile | 2 +- drivers/mtd/nand/spi/core.c | 1 + drivers/mtd/nand/spi/fmsh.c | 74 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/spinand.h | 1 + 4 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 drivers/mtd/nand/spi/fmsh.c (limited to 'include') diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile index 258da42451a4..6d3d203df048 100644 --- a/drivers/mtd/nand/spi/Makefile +++ b/drivers/mtd/nand/spi/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 spinand-objs := core.o otp.o -spinand-objs += alliancememory.o ato.o esmt.o foresee.o gigadevice.o macronix.o +spinand-objs += alliancememory.o ato.o esmt.o fmsh.o foresee.o gigadevice.o macronix.o spinand-objs += micron.o paragon.o skyhigh.o toshiba.o winbond.o xtx.o obj-$(CONFIG_MTD_SPI_NAND) += spinand.o diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c index b0898990b2a5..ea47028d021a 100644 --- a/drivers/mtd/nand/spi/core.c +++ b/drivers/mtd/nand/spi/core.c @@ -1184,6 +1184,7 @@ static const struct spinand_manufacturer *spinand_manufacturers[] = { &alliancememory_spinand_manufacturer, &ato_spinand_manufacturer, &esmt_c8_spinand_manufacturer, + &fmsh_spinand_manufacturer, &foresee_spinand_manufacturer, &gigadevice_spinand_manufacturer, ¯onix_spinand_manufacturer, diff --git a/drivers/mtd/nand/spi/fmsh.c b/drivers/mtd/nand/spi/fmsh.c new file mode 100644 index 000000000000..8b2097bfc771 --- /dev/null +++ b/drivers/mtd/nand/spi/fmsh.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020-2021 Rockchip Electronics Co., Ltd. + * + * Author: Dingqiang Lin + */ + +#include +#include +#include + +#define SPINAND_MFR_FMSH 0xA1 + +static SPINAND_OP_VARIANTS(read_cache_variants, + SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0), + SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0)); + +static SPINAND_OP_VARIANTS(write_cache_variants, + SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0), + SPINAND_PROG_LOAD_1S_1S_1S_OP(true, 0, NULL, 0)); + +static SPINAND_OP_VARIANTS(update_cache_variants, + SPINAND_PROG_LOAD_1S_1S_4S_OP(false, 0, NULL, 0), + SPINAND_PROG_LOAD_1S_1S_1S_OP(false, 0, NULL, 0)); + +static int fm25s01a_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *region) +{ + return -ERANGE; +} + +static int fm25s01a_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *region) +{ + if (section) + return -ERANGE; + + region->offset = 2; + region->length = 62; + + return 0; +} + +static const struct mtd_ooblayout_ops fm25s01a_ooblayout = { + .ecc = fm25s01a_ooblayout_ecc, + .free = fm25s01a_ooblayout_free, +}; + +static const struct spinand_info fmsh_spinand_table[] = { + SPINAND_INFO("FM25S01A", + SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xE4), + NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1), + NAND_ECCREQ(1, 512), + SPINAND_INFO_OP_VARIANTS(&read_cache_variants, + &write_cache_variants, + &update_cache_variants), + SPINAND_HAS_QE_BIT, + SPINAND_ECCINFO(&fm25s01a_ooblayout, NULL)), +}; + +static const struct spinand_manufacturer_ops fmsh_spinand_manuf_ops = { +}; + +const struct spinand_manufacturer fmsh_spinand_manufacturer = { + .id = SPINAND_MFR_FMSH, + .name = "Fudan Micro", + .chips = fmsh_spinand_table, + .nchips = ARRAY_SIZE(fmsh_spinand_table), + .ops = &fmsh_spinand_manuf_ops, +}; diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 27a45bdab7ec..927c10d78769 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -355,6 +355,7 @@ struct spinand_manufacturer { extern const struct spinand_manufacturer alliancememory_spinand_manufacturer; extern const struct spinand_manufacturer ato_spinand_manufacturer; extern const struct spinand_manufacturer esmt_c8_spinand_manufacturer; +extern const struct spinand_manufacturer fmsh_spinand_manufacturer; extern const struct spinand_manufacturer foresee_spinand_manufacturer; extern const struct spinand_manufacturer gigadevice_spinand_manufacturer; extern const struct spinand_manufacturer macronix_spinand_manufacturer; -- cgit v1.2.3 From 30c2b98aa84c76f2ae60e66dd4ec2d9497713359 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 12:33:17 +0530 Subject: x86/apic: Add new driver for Secure AVIC The Secure AVIC feature provides SEV-SNP guests hardware acceleration for performance sensitive APIC accesses while securely managing the guest-owned APIC state through the use of a private APIC backing page. This helps prevent the hypervisor from generating unexpected interrupts for a vCPU or otherwise violate architectural assumptions around the APIC behavior. Add a new x2APIC driver that will serve as the base of the Secure AVIC support. It is initially the same as the x2APIC physical driver (without IPI callbacks), but will be modified as features are implemented. As the new driver does not implement Secure AVIC features yet, if the hypervisor sets the Secure AVIC bit in SEV_STATUS, maintain the existing behavior to enforce the guest termination. [ bp: Massage commit message. ] Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828070334.208401-2-Neeraj.Upadhyay@amd.com --- arch/x86/Kconfig | 13 ++++++++ arch/x86/boot/compressed/sev.c | 1 + arch/x86/coco/core.c | 3 ++ arch/x86/coco/sev/core.c | 1 + arch/x86/include/asm/msr-index.h | 4 ++- arch/x86/kernel/apic/Makefile | 1 + arch/x86/kernel/apic/x2apic_savic.c | 63 +++++++++++++++++++++++++++++++++++++ include/linux/cc_platform.h | 8 +++++ 8 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/apic/x2apic_savic.c (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100..e32952728d9a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -483,6 +483,19 @@ config X86_X2APIC If in doubt, say Y. +config AMD_SECURE_AVIC + bool "AMD Secure AVIC" + depends on AMD_MEM_ENCRYPT && X86_X2APIC + help + Enable this to get AMD Secure AVIC support on guests that have this feature. + + AMD Secure AVIC provides hardware acceleration for performance sensitive + APIC accesses and support for managing guest owned APIC state for SEV-SNP + guests. Secure AVIC does not support xAPIC mode. It has functional + dependency on x2apic being enabled in the guest. + + If you don't know what to do here, say N. + config X86_POSTED_MSI bool "Enable MSI and MSI-x delivery by posted interrupts" depends on X86_64 && IRQ_REMAP diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index fd1b67dfea22..74e083feb2d9 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -235,6 +235,7 @@ bool sev_es_check_ghcb_fault(unsigned long address) MSR_AMD64_SNP_VMSA_REG_PROT | \ MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ + MSR_AMD64_SNP_SECURE_AVIC | \ MSR_AMD64_SNP_RESERVED_MASK) /* diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index d4610af68114..989ca9f72ba3 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -104,6 +104,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_HOST_SEV_SNP: return cc_flags.host_sev_snp; + case CC_ATTR_SNP_SECURE_AVIC: + return sev_status & MSR_AMD64_SNP_SECURE_AVIC; + default: return false; } diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 14ef5908fb27..f7a549f650e9 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -79,6 +79,7 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", + [MSR_AMD64_SNP_SECURE_AVIC_BIT] = "SecureAVIC", }; /* diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..2a6d4fd8659a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -699,7 +699,9 @@ #define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) #define MSR_AMD64_SNP_SMT_PROT_BIT 17 #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) -#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) +#define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52d1808ee360..581db89477f9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,6 +18,7 @@ ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c new file mode 100644 index 000000000000..bea844f28192 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure AVIC Support (SEV-SNP Guests) + * + * Copyright (C) 2024 Advanced Micro Devices, Inc. + * + * Author: Neeraj Upadhyay + */ + +#include + +#include +#include + +#include "local.h" + +static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); +} + +static int savic_probe(void) +{ + if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return 0; + + if (!x2apic_mode) { + pr_err("Secure AVIC enabled in non x2APIC mode\n"); + snp_abort(); + /* unreachable */ + } + + return 1; +} + +static struct apic apic_x2apic_savic __ro_after_init = { + + .name = "secure avic x2apic", + .probe = savic_probe, + .acpi_madt_oem_check = savic_acpi_madt_oem_check, + + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, + .get_apic_id = x2apic_get_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .nmi_to_offline_cpu = true, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi = native_apic_msr_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, +}; + +apic_driver(apic_x2apic_savic); diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 0bf7d33a1048..7fcec025c5e0 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -96,6 +96,14 @@ enum cc_attr { * enabled to run SEV-SNP guests. */ CC_ATTR_HOST_SEV_SNP, + + /** + * @CC_ATTR_SNP_SECURE_AVIC: Secure AVIC mode is active. + * + * The host kernel is running with the necessary features enabled + * to run SEV-SNP guests with full Secure AVIC capabilities. + */ + CC_ATTR_SNP_SECURE_AVIC, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM -- cgit v1.2.3 From 13d8e05adf9dd06c74fcc6ba42ec4bf780fd557f Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Wed, 27 Aug 2025 17:39:55 +0300 Subject: queue_api: add support for fetching per queue DMA dev For zerocopy (io_uring, devmem), there is an assumption that the parent device can do DMA. However that is not always the case: - Scalable Function netdevs [1] have the DMA device in the grandparent. - For Multi-PF netdevs [2] queues can be associated to different DMA devices. This patch introduces the a queue based interface for allowing drivers to expose a different DMA device for zerocopy. [1] Documentation/networking/device_drivers/ethernet/mellanox/mlx5/switchdev.rst [2] Documentation/networking/multi-pf-netdev.rst Signed-off-by: Dragos Tatulea Reviewed-by: Pavel Begunkov Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250827144017.1529208-3-dtatulea@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/netdev_queues.h | 7 +++++++ net/core/Makefile | 1 + net/core/netdev_queues.c | 27 +++++++++++++++++++++++++++ 3 files changed, 35 insertions(+) create mode 100644 net/core/netdev_queues.c (limited to 'include') diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 6e835972abd1..b9d02bc65c97 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -127,6 +127,9 @@ void netdev_stat_queue_sum(struct net_device *netdev, * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped * queue's memory is written at the specified address. * + * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used + * for this queue. Return NULL on error. + * * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only * be called for an interface which is open. @@ -144,6 +147,8 @@ struct netdev_queue_mgmt_ops { int (*ndo_queue_stop)(struct net_device *dev, void *per_queue_mem, int idx); + struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, + int idx); }; /** @@ -321,4 +326,6 @@ static inline void netif_subqueue_sent(const struct net_device *dev, get_desc, start_thrs); \ }) +struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); + #endif diff --git a/net/core/Makefile b/net/core/Makefile index b2a76ce33932..9ef2099c5426 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o obj-y += netdev_rx_queue.o +obj-y += netdev_queues.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c new file mode 100644 index 000000000000..251f27a8307f --- /dev/null +++ b/net/core/netdev_queues.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +/** + * netdev_queue_get_dma_dev() - get dma device for zero-copy operations + * @dev: net_device + * @idx: queue index + * + * Get dma device for zero-copy operations to be used for this queue. + * When such device is not available or valid, the function will return NULL. + * + * Return: Device or NULL on error + */ +struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) +{ + const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; + struct device *dma_dev; + + if (queue_ops && queue_ops->ndo_queue_get_dma_dev) + dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); + else + dma_dev = dev->dev.parent; + + return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; +} + -- cgit v1.2.3 From 48b5e5dbdb234ffc951cacceaec7f8ee37c83b2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Aug 2025 12:53:47 +0000 Subject: net_sched: act_vlan: use RCU in tcf_vlan_dump() Also storing tcf_action into struct tcf_vlan_params makes sure there is no discrepancy in tcf_vlan_act(). No longer block BH in tcf_vlan_init() when acquiring tcf_lock. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250827125349.3505302-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_vlan.h | 1 + net/sched/act_vlan.c | 20 +++++++++----------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 3f5e9242b5e8..beadee41669a 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -10,6 +10,7 @@ #include struct tcf_vlan_params { + int action; int tcfv_action; unsigned char tcfv_push_dst[ETH_ALEN]; unsigned char tcfv_push_src[ETH_ALEN]; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 383bf18b6862..b46f980f3b2a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -25,7 +25,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; - int action; int err; u16 tci; @@ -38,8 +37,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - action = READ_ONCE(v->tcf_action); - p = rcu_dereference_bh(v->vlan_p); switch (p->tcfv_action) { @@ -97,7 +94,7 @@ out: skb_pull_rcsum(skb, skb->mac_len); skb_reset_mac_len(skb); - return action; + return p->action; drop: tcf_action_inc_drop_qstats(&v->common); @@ -255,10 +252,11 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, ETH_ALEN); } - spin_lock_bh(&v->tcf_lock); + p->action = parm->action; + spin_lock(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); - spin_unlock_bh(&v->tcf_lock); + spin_unlock(&v->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); @@ -297,9 +295,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&v->tcf_lock); - opt.action = v->tcf_action; - p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(v->vlan_p); + opt.action = p->action; opt.v_action = p->tcfv_action; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -325,12 +323,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; - spin_unlock_bh(&v->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&v->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From e97ae742972f6cb57986a5ebb846048f80b90003 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Aug 2025 12:53:48 +0000 Subject: net_sched: act_tunnel_key: use RCU in tunnel_key_dump() Also storing tcf_action into struct tcf_tunnel_key_params makes sure there is no discrepancy in tunnel_key_act(). No longer block BH in tunnel_key_init() when acquiring tcf_lock. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250827125349.3505302-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_tunnel_key.h | 1 + net/sched/act_tunnel_key.c | 20 +++++++++----------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index 879fe8cff581..0f1925f97520 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -14,6 +14,7 @@ struct tcf_tunnel_key_params { struct rcu_head rcu; int tcft_action; + int action; struct metadata_dst *tcft_enc_metadata; }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2cef4b08befb..e1c8b48c217c 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -29,13 +29,11 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; - int action; params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); tcf_action_update_bstats(&t->common, skb); - action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: @@ -51,7 +49,7 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, break; } - return action; + return params->action; } static const struct nla_policy @@ -532,11 +530,12 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; - spin_lock_bh(&t->tcf_lock); + params_new->action = parm->action; + spin_lock(&t->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(t->params, params_new, lockdep_is_held(&t->tcf_lock)); - spin_unlock_bh(&t->tcf_lock); + spin_unlock(&t->tcf_lock); tunnel_key_release_params(params_new); if (goto_ch) tcf_chain_put_by_act(goto_ch); @@ -726,10 +725,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t tm; - spin_lock_bh(&t->tcf_lock); - params = rcu_dereference_protected(t->params, - lockdep_is_held(&t->tcf_lock)); - opt.action = t->tcf_action; + rcu_read_lock(); + params = rcu_dereference(t->params); + opt.action = params->action; opt.t_action = params->tcft_action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) @@ -766,12 +764,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; - spin_unlock_bh(&t->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&t->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 53df77e7859042a92914d664c860f65d9689f88d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Aug 2025 12:53:49 +0000 Subject: net_sched: act_skbmod: use RCU in tcf_skbmod_dump() Also storing tcf_action into struct tcf_skbmod_params makes sure there is no discrepancy in tcf_skbmod_act(). No longer block BH in tcf_skbmod_init() when acquiring tcf_lock. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250827125349.3505302-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_skbmod.h | 1 + net/sched/act_skbmod.c | 26 ++++++++++++-------------- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h index 7c240d2fed4e..626704cd6241 100644 --- a/include/net/tc_act/tc_skbmod.h +++ b/include/net/tc_act/tc_skbmod.h @@ -12,6 +12,7 @@ struct tcf_skbmod_params { struct rcu_head rcu; u64 flags; /*up to 64 types of operations; extend if needed */ + int action; u8 eth_dst[ETH_ALEN]; u16 eth_type; u8 eth_src[ETH_ALEN]; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index dc0229693461..fce625eafcb2 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -27,19 +27,18 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb, struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); - int action, max_edit_len, err; struct tcf_skbmod_params *p; + int max_edit_len, err; u64 flags; tcf_lastuse_update(&d->tcf_tm); bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); - action = READ_ONCE(d->tcf_action); - if (unlikely(action == TC_ACT_SHOT)) + p = rcu_dereference_bh(d->skbmod_p); + if (unlikely(p->action == TC_ACT_SHOT)) goto drop; max_edit_len = skb_mac_header_len(skb); - p = rcu_dereference_bh(d->skbmod_p); flags = p->flags; /* tcf_skbmod_init() guarantees "flags" to be one of the following: @@ -85,7 +84,7 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb, INET_ECN_set_ce(skb); out: - return action; + return p->action; drop: qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); @@ -193,9 +192,9 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, } p->flags = lflags; - + p->action = parm->action; if (ovr) - spin_lock_bh(&d->tcf_lock); + spin_lock(&d->tcf_lock); /* Protected by tcf_lock if overwriting existing action. */ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p_old = rcu_dereference_protected(d->skbmod_p, 1); @@ -209,7 +208,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, rcu_assign_pointer(d->skbmod_p, p); if (ovr) - spin_unlock_bh(&d->tcf_lock); + spin_unlock(&d->tcf_lock); if (p_old) kfree_rcu(p_old, rcu); @@ -248,10 +247,9 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, opt.index = d->tcf_index; opt.refcnt = refcount_read(&d->tcf_refcnt) - ref; opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind; - spin_lock_bh(&d->tcf_lock); - opt.action = d->tcf_action; - p = rcu_dereference_protected(d->skbmod_p, - lockdep_is_held(&d->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(d->skbmod_p); + opt.action = p->action; opt.flags = p->flags; if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -269,10 +267,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) goto nla_put_failure; - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 4247053aaacda75480e1918e0d58a687ae5a266a Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Tue, 5 Aug 2025 22:47:17 +0200 Subject: media: uapi: Move colorimetry controls at the end of the file The colorimetry controls class is defined after the stateless codec class at the top of the controls header. It is currently defined in the middle of stateless codec controls. Move the colorimetry controls after the stateless codec controls, at the end of the file. Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil --- include/uapi/linux/v4l2-controls.h | 68 +++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index f836512e9deb..4a483ff1c418 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -2549,40 +2549,6 @@ struct v4l2_ctrl_hevc_scaling_matrix { __u8 scaling_list_dc_coef_32x32[2]; }; -#define V4L2_CID_COLORIMETRY_CLASS_BASE (V4L2_CTRL_CLASS_COLORIMETRY | 0x900) -#define V4L2_CID_COLORIMETRY_CLASS (V4L2_CTRL_CLASS_COLORIMETRY | 1) - -#define V4L2_CID_COLORIMETRY_HDR10_CLL_INFO (V4L2_CID_COLORIMETRY_CLASS_BASE + 0) - -struct v4l2_ctrl_hdr10_cll_info { - __u16 max_content_light_level; - __u16 max_pic_average_light_level; -}; - -#define V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY (V4L2_CID_COLORIMETRY_CLASS_BASE + 1) - -#define V4L2_HDR10_MASTERING_PRIMARIES_X_LOW 5 -#define V4L2_HDR10_MASTERING_PRIMARIES_X_HIGH 37000 -#define V4L2_HDR10_MASTERING_PRIMARIES_Y_LOW 5 -#define V4L2_HDR10_MASTERING_PRIMARIES_Y_HIGH 42000 -#define V4L2_HDR10_MASTERING_WHITE_POINT_X_LOW 5 -#define V4L2_HDR10_MASTERING_WHITE_POINT_X_HIGH 37000 -#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_LOW 5 -#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_HIGH 42000 -#define V4L2_HDR10_MASTERING_MAX_LUMA_LOW 50000 -#define V4L2_HDR10_MASTERING_MAX_LUMA_HIGH 100000000 -#define V4L2_HDR10_MASTERING_MIN_LUMA_LOW 1 -#define V4L2_HDR10_MASTERING_MIN_LUMA_HIGH 50000 - -struct v4l2_ctrl_hdr10_mastering_display { - __u16 display_primaries_x[3]; - __u16 display_primaries_y[3]; - __u16 white_point_x; - __u16 white_point_y; - __u32 max_display_mastering_luminance; - __u32 min_display_mastering_luminance; -}; - /* Stateless VP9 controls */ #define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED 0x1 @@ -3515,4 +3481,38 @@ struct v4l2_ctrl_av1_film_grain { #define V4L2_CID_MPEG_MFC51_BASE V4L2_CID_CODEC_MFC51_BASE #endif +#define V4L2_CID_COLORIMETRY_CLASS_BASE (V4L2_CTRL_CLASS_COLORIMETRY | 0x900) +#define V4L2_CID_COLORIMETRY_CLASS (V4L2_CTRL_CLASS_COLORIMETRY | 1) + +#define V4L2_CID_COLORIMETRY_HDR10_CLL_INFO (V4L2_CID_COLORIMETRY_CLASS_BASE + 0) + +struct v4l2_ctrl_hdr10_cll_info { + __u16 max_content_light_level; + __u16 max_pic_average_light_level; +}; + +#define V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY (V4L2_CID_COLORIMETRY_CLASS_BASE + 1) + +#define V4L2_HDR10_MASTERING_PRIMARIES_X_LOW 5 +#define V4L2_HDR10_MASTERING_PRIMARIES_X_HIGH 37000 +#define V4L2_HDR10_MASTERING_PRIMARIES_Y_LOW 5 +#define V4L2_HDR10_MASTERING_PRIMARIES_Y_HIGH 42000 +#define V4L2_HDR10_MASTERING_WHITE_POINT_X_LOW 5 +#define V4L2_HDR10_MASTERING_WHITE_POINT_X_HIGH 37000 +#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_LOW 5 +#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_HIGH 42000 +#define V4L2_HDR10_MASTERING_MAX_LUMA_LOW 50000 +#define V4L2_HDR10_MASTERING_MAX_LUMA_HIGH 100000000 +#define V4L2_HDR10_MASTERING_MIN_LUMA_LOW 1 +#define V4L2_HDR10_MASTERING_MIN_LUMA_HIGH 50000 + +struct v4l2_ctrl_hdr10_mastering_display { + __u16 display_primaries_x[3]; + __u16 display_primaries_y[3]; + __u16 white_point_x; + __u16 white_point_y; + __u32 max_display_mastering_luminance; + __u32 min_display_mastering_luminance; +}; + #endif -- cgit v1.2.3 From 481c12018c252f7fc88b4bd05e882b9e1bf260c3 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Tue, 5 Aug 2025 22:47:18 +0200 Subject: media: uapi: Cleanup tab after define in headers Some definitions use a tab after the define keyword instead of the usual single space. Replace it for better consistency. Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil --- include/uapi/linux/v4l2-controls.h | 30 +++++++++++++++--------------- include/uapi/linux/videodev2.h | 18 +++++++++--------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 4a483ff1c418..7aef88465d04 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1193,7 +1193,7 @@ enum v4l2_flash_strobe_source { #define V4L2_CID_JPEG_CLASS_BASE (V4L2_CTRL_CLASS_JPEG | 0x900) #define V4L2_CID_JPEG_CLASS (V4L2_CTRL_CLASS_JPEG | 1) -#define V4L2_CID_JPEG_CHROMA_SUBSAMPLING (V4L2_CID_JPEG_CLASS_BASE + 1) +#define V4L2_CID_JPEG_CHROMA_SUBSAMPLING (V4L2_CID_JPEG_CLASS_BASE + 1) enum v4l2_jpeg_chroma_subsampling { V4L2_JPEG_CHROMA_SUBSAMPLING_444 = 0, V4L2_JPEG_CHROMA_SUBSAMPLING_422 = 1, @@ -1202,15 +1202,15 @@ enum v4l2_jpeg_chroma_subsampling { V4L2_JPEG_CHROMA_SUBSAMPLING_410 = 4, V4L2_JPEG_CHROMA_SUBSAMPLING_GRAY = 5, }; -#define V4L2_CID_JPEG_RESTART_INTERVAL (V4L2_CID_JPEG_CLASS_BASE + 2) -#define V4L2_CID_JPEG_COMPRESSION_QUALITY (V4L2_CID_JPEG_CLASS_BASE + 3) +#define V4L2_CID_JPEG_RESTART_INTERVAL (V4L2_CID_JPEG_CLASS_BASE + 2) +#define V4L2_CID_JPEG_COMPRESSION_QUALITY (V4L2_CID_JPEG_CLASS_BASE + 3) -#define V4L2_CID_JPEG_ACTIVE_MARKER (V4L2_CID_JPEG_CLASS_BASE + 4) -#define V4L2_JPEG_ACTIVE_MARKER_APP0 (1 << 0) -#define V4L2_JPEG_ACTIVE_MARKER_APP1 (1 << 1) -#define V4L2_JPEG_ACTIVE_MARKER_COM (1 << 16) -#define V4L2_JPEG_ACTIVE_MARKER_DQT (1 << 17) -#define V4L2_JPEG_ACTIVE_MARKER_DHT (1 << 18) +#define V4L2_CID_JPEG_ACTIVE_MARKER (V4L2_CID_JPEG_CLASS_BASE + 4) +#define V4L2_JPEG_ACTIVE_MARKER_APP0 (1 << 0) +#define V4L2_JPEG_ACTIVE_MARKER_APP1 (1 << 1) +#define V4L2_JPEG_ACTIVE_MARKER_COM (1 << 16) +#define V4L2_JPEG_ACTIVE_MARKER_DQT (1 << 17) +#define V4L2_JPEG_ACTIVE_MARKER_DHT (1 << 18) /* Image source controls */ @@ -1243,10 +1243,10 @@ enum v4l2_jpeg_chroma_subsampling { #define V4L2_CID_DV_CLASS_BASE (V4L2_CTRL_CLASS_DV | 0x900) #define V4L2_CID_DV_CLASS (V4L2_CTRL_CLASS_DV | 1) -#define V4L2_CID_DV_TX_HOTPLUG (V4L2_CID_DV_CLASS_BASE + 1) -#define V4L2_CID_DV_TX_RXSENSE (V4L2_CID_DV_CLASS_BASE + 2) -#define V4L2_CID_DV_TX_EDID_PRESENT (V4L2_CID_DV_CLASS_BASE + 3) -#define V4L2_CID_DV_TX_MODE (V4L2_CID_DV_CLASS_BASE + 4) +#define V4L2_CID_DV_TX_HOTPLUG (V4L2_CID_DV_CLASS_BASE + 1) +#define V4L2_CID_DV_TX_RXSENSE (V4L2_CID_DV_CLASS_BASE + 2) +#define V4L2_CID_DV_TX_EDID_PRESENT (V4L2_CID_DV_CLASS_BASE + 3) +#define V4L2_CID_DV_TX_MODE (V4L2_CID_DV_CLASS_BASE + 4) enum v4l2_dv_tx_mode { V4L2_DV_TX_MODE_DVI_D = 0, V4L2_DV_TX_MODE_HDMI = 1, @@ -1267,7 +1267,7 @@ enum v4l2_dv_it_content_type { V4L2_DV_IT_CONTENT_TYPE_NO_ITC = 4, }; -#define V4L2_CID_DV_RX_POWER_PRESENT (V4L2_CID_DV_CLASS_BASE + 100) +#define V4L2_CID_DV_RX_POWER_PRESENT (V4L2_CID_DV_CLASS_BASE + 100) #define V4L2_CID_DV_RX_RGB_RANGE (V4L2_CID_DV_CLASS_BASE + 101) #define V4L2_CID_DV_RX_IT_CONTENT_TYPE (V4L2_CID_DV_CLASS_BASE + 102) @@ -2552,7 +2552,7 @@ struct v4l2_ctrl_hevc_scaling_matrix { /* Stateless VP9 controls */ #define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED 0x1 -#define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE 0x2 +#define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE 0x2 /** * struct v4l2_vp9_loop_filter - VP9 loop filter parameters diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 3dd9fa45dde1..64943f1a6149 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1607,8 +1607,8 @@ struct v4l2_bt_timings { } __attribute__ ((packed)); /* Interlaced or progressive format */ -#define V4L2_DV_PROGRESSIVE 0 -#define V4L2_DV_INTERLACED 1 +#define V4L2_DV_PROGRESSIVE 0 +#define V4L2_DV_INTERLACED 1 /* Polarities. If bit is not set, it is assumed to be negative polarity */ #define V4L2_DV_VSYNC_POS_POL 0x00000001 @@ -2788,15 +2788,15 @@ struct v4l2_remove_buffers { * Only implemented if CONFIG_VIDEO_ADV_DEBUG is defined. * You must be root to use these ioctls. Never use these in applications! */ -#define VIDIOC_DBG_S_REGISTER _IOW('V', 79, struct v4l2_dbg_register) -#define VIDIOC_DBG_G_REGISTER _IOWR('V', 80, struct v4l2_dbg_register) +#define VIDIOC_DBG_S_REGISTER _IOW('V', 79, struct v4l2_dbg_register) +#define VIDIOC_DBG_G_REGISTER _IOWR('V', 80, struct v4l2_dbg_register) #define VIDIOC_S_HW_FREQ_SEEK _IOW('V', 82, struct v4l2_hw_freq_seek) -#define VIDIOC_S_DV_TIMINGS _IOWR('V', 87, struct v4l2_dv_timings) -#define VIDIOC_G_DV_TIMINGS _IOWR('V', 88, struct v4l2_dv_timings) -#define VIDIOC_DQEVENT _IOR('V', 89, struct v4l2_event) -#define VIDIOC_SUBSCRIBE_EVENT _IOW('V', 90, struct v4l2_event_subscription) -#define VIDIOC_UNSUBSCRIBE_EVENT _IOW('V', 91, struct v4l2_event_subscription) +#define VIDIOC_S_DV_TIMINGS _IOWR('V', 87, struct v4l2_dv_timings) +#define VIDIOC_G_DV_TIMINGS _IOWR('V', 88, struct v4l2_dv_timings) +#define VIDIOC_DQEVENT _IOR('V', 89, struct v4l2_event) +#define VIDIOC_SUBSCRIBE_EVENT _IOW('V', 90, struct v4l2_event_subscription) +#define VIDIOC_UNSUBSCRIBE_EVENT _IOW('V', 91, struct v4l2_event_subscription) #define VIDIOC_CREATE_BUFS _IOWR('V', 92, struct v4l2_create_buffers) #define VIDIOC_PREPARE_BUF _IOWR('V', 93, struct v4l2_buffer) #define VIDIOC_G_SELECTION _IOWR('V', 94, struct v4l2_selection) -- cgit v1.2.3 From 039b9302d64ec35f70c91919cd7bcdbc1aef3707 Mon Sep 17 00:00:00 2001 From: Jammy Huang Date: Tue, 26 Aug 2025 10:25:01 +0800 Subject: media: aspeed: Allow to capture from SoC display (GFX) ASPEED BMC IC has 2 different display engines. Please find AST2600's datasheet to get detailed information. 1. VGA on PCIe 2. SoC Display (GFX) By default, video engine (VE) will capture video from VGA. This patch adds an option to capture video from GFX with standard ioctl, vidioc_s_input. An enum, aspeed_video_input, is added for this purpose. enum aspeed_video_input { VIDEO_INPUT_VGA = 0, VIDEO_INPUT_GFX, VIDEO_INPUT_MAX }; To test this feature, you will need to enable GFX first. Please refer to ASPEED's SDK_User_Guide, 6.3.x Soc Display driver, for more information. In your application, you will need to use v4l2 ioctl, VIDIOC_S_INPUT, as below to select before start streaming. int rc; struct v4l2_input input; input.index = VIDEO_INPUT_GFX; rc = ioctl(fd, VIDIOC_S_INPUT, &input); if (rc < 0) { ... } Link: https://github.com/AspeedTech-BMC/openbmc/releases Signed-off-by: Jammy Huang Signed-off-by: Hans Verkuil [hverkuil: split up three overly long lines] --- drivers/media/platform/aspeed/aspeed-video.c | 199 +++++++++++++++++++++++---- include/uapi/linux/aspeed-video.h | 7 + 2 files changed, 178 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/media/platform/aspeed/aspeed-video.c b/drivers/media/platform/aspeed/aspeed-video.c index 54cae0da9aca..b83e43245277 100644 --- a/drivers/media/platform/aspeed/aspeed-video.c +++ b/drivers/media/platform/aspeed/aspeed-video.c @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include #include +#include +#include #include #include #include @@ -203,6 +206,25 @@ #define VE_MEM_RESTRICT_START 0x310 #define VE_MEM_RESTRICT_END 0x314 +/* SCU's registers */ +#define SCU_MISC_CTRL 0xC0 +#define SCU_DPLL_SOURCE BIT(20) + +/* GFX's registers */ +#define GFX_CTRL 0x60 +#define GFX_CTRL_ENABLE BIT(0) +#define GFX_CTRL_FMT GENMASK(9, 7) + +#define GFX_H_DISPLAY 0x70 +#define GFX_H_DISPLAY_DE GENMASK(28, 16) +#define GFX_H_DISPLAY_TOTAL GENMASK(12, 0) + +#define GFX_V_DISPLAY 0x78 +#define GFX_V_DISPLAY_DE GENMASK(27, 16) +#define GFX_V_DISPLAY_TOTAL GENMASK(11, 0) + +#define GFX_DISPLAY_ADDR 0x80 + /* * VIDEO_MODE_DETECT_DONE: a flag raised if signal lock * VIDEO_RES_CHANGE: a flag raised if res_change work on-going @@ -262,6 +284,7 @@ struct aspeed_video_perf { /* * struct aspeed_video - driver data * + * version: holds the version of aspeed SoC * res_work: holds the delayed_work for res-detection if unlock * buffers: holds the list of buffer queued from user * flags: holds the state of video @@ -273,6 +296,7 @@ struct aspeed_video_perf { * yuv420: a flag raised if JPEG subsampling is 420 * format: holds the video format * hq_mode: a flag raised if HQ is enabled. Only for VIDEO_FMT_ASPEED + * input: holds the video input * frame_rate: holds the frame_rate * jpeg_quality: holds jpeq's quality (0~11) * jpeg_hq_quality: holds hq's quality (1~12) only if hq_mode enabled @@ -298,6 +322,9 @@ struct aspeed_video { struct video_device vdev; struct mutex video_lock; /* v4l2 and videobuf2 lock */ + struct regmap *scu; + struct regmap *gfx; + u32 version; u32 jpeg_mode; u32 comp_size_read; @@ -316,6 +343,7 @@ struct aspeed_video { bool yuv420; enum aspeed_video_format format; bool hq_mode; + enum aspeed_video_input input; unsigned int frame_rate; unsigned int jpeg_quality; unsigned int jpeg_hq_quality; @@ -331,21 +359,25 @@ struct aspeed_video { #define to_aspeed_video(x) container_of((x), struct aspeed_video, v4l2_dev) struct aspeed_video_config { + u32 version; u32 jpeg_mode; u32 comp_size_read; }; static const struct aspeed_video_config ast2400_config = { + .version = 4, .jpeg_mode = AST2400_VE_SEQ_CTRL_JPEG_MODE, .comp_size_read = AST2400_VE_COMP_SIZE_READ_BACK, }; static const struct aspeed_video_config ast2500_config = { + .version = 5, .jpeg_mode = AST2500_VE_SEQ_CTRL_JPEG_MODE, .comp_size_read = AST2400_VE_COMP_SIZE_READ_BACK, }; static const struct aspeed_video_config ast2600_config = { + .version = 6, .jpeg_mode = AST2500_VE_SEQ_CTRL_JPEG_MODE, .comp_size_read = AST2600_VE_COMP_SIZE_READ_BACK, }; @@ -485,6 +517,7 @@ static const struct v4l2_dv_timings_cap aspeed_video_timings_cap = { static const char * const format_str[] = {"Standard JPEG", "Aspeed JPEG"}; +static const char * const input_str[] = {"HOST VGA", "BMC GFX"}; static unsigned int debug; @@ -609,6 +642,14 @@ static int aspeed_video_start_frame(struct aspeed_video *video) aspeed_video_free_buf(video, &video->bcd); } + if (video->input == VIDEO_INPUT_GFX) { + u32 val; + + // update input buffer address as gfx's + regmap_read(video->gfx, GFX_DISPLAY_ADDR, &val); + aspeed_video_write(video, VE_TGS_0, val); + } + spin_lock_irqsave(&video->lock, flags); buf = list_first_entry_or_null(&video->buffers, struct aspeed_video_buffer, link); @@ -1026,9 +1067,23 @@ static void aspeed_video_get_timings(struct aspeed_video *v, } } +static void aspeed_video_get_resolution_gfx(struct aspeed_video *video, + struct v4l2_bt_timings *det) +{ + u32 h_val, v_val; + + regmap_read(video->gfx, GFX_H_DISPLAY, &h_val); + regmap_read(video->gfx, GFX_V_DISPLAY, &v_val); + + det->width = FIELD_GET(GFX_H_DISPLAY_DE, h_val) + 1; + det->height = FIELD_GET(GFX_V_DISPLAY_DE, v_val) + 1; + video->v4l2_input_status = 0; +} + #define res_check(v) test_and_clear_bit(VIDEO_MODE_DETECT_DONE, &(v)->flags) -static void aspeed_video_get_resolution(struct aspeed_video *video) +static void aspeed_video_get_resolution_vga(struct aspeed_video *video, + struct v4l2_bt_timings *det) { bool invalid_resolution = true; int rc; @@ -1036,7 +1091,6 @@ static void aspeed_video_get_resolution(struct aspeed_video *video) u32 mds; u32 src_lr_edge; u32 src_tb_edge; - struct v4l2_bt_timings *det = &video->detected_timings; det->width = MIN_WIDTH; det->height = MIN_HEIGHT; @@ -1113,14 +1167,20 @@ static void aspeed_video_get_resolution(struct aspeed_video *video) aspeed_video_get_timings(video, det); - /* - * Enable mode-detect watchdog, resolution-change watchdog and - * automatic compression after frame capture. - */ + /* Enable mode-detect watchdog, resolution-change watchdog */ aspeed_video_update(video, VE_INTERRUPT_CTRL, 0, VE_INTERRUPT_MODE_DETECT_WD); - aspeed_video_update(video, VE_SEQ_CTRL, 0, - VE_SEQ_CTRL_AUTO_COMP | VE_SEQ_CTRL_EN_WATCHDOG); + aspeed_video_update(video, VE_SEQ_CTRL, 0, VE_SEQ_CTRL_EN_WATCHDOG); +} + +static void aspeed_video_get_resolution(struct aspeed_video *video) +{ + struct v4l2_bt_timings *det = &video->detected_timings; + + if (video->input == VIDEO_INPUT_GFX) + aspeed_video_get_resolution_gfx(video, det); + else + aspeed_video_get_resolution_vga(video, det); v4l2_dbg(1, debug, &video->v4l2_dev, "Got resolution: %dx%d\n", det->width, det->height); @@ -1156,7 +1216,7 @@ static void aspeed_video_set_resolution(struct aspeed_video *video) aspeed_video_write(video, VE_SRC_SCANLINE_OFFSET, act->width * 4); /* Don't use direct mode below 1024 x 768 (irqs don't fire) */ - if (size < DIRECT_FETCH_THRESHOLD) { + if (video->input == VIDEO_INPUT_VGA && size < DIRECT_FETCH_THRESHOLD) { v4l2_dbg(1, debug, &video->v4l2_dev, "Capture: Sync Mode\n"); aspeed_video_write(video, VE_TGS_0, FIELD_PREP(VE_TGS_FIRST, @@ -1171,10 +1231,20 @@ static void aspeed_video_set_resolution(struct aspeed_video *video) VE_CTRL_INT_DE | VE_CTRL_DIRECT_FETCH, VE_CTRL_INT_DE); } else { + u32 ctrl, val, bpp; + v4l2_dbg(1, debug, &video->v4l2_dev, "Capture: Direct Mode\n"); + ctrl = VE_CTRL_DIRECT_FETCH; + if (video->input == VIDEO_INPUT_GFX) { + regmap_read(video->gfx, GFX_CTRL, &val); + bpp = FIELD_GET(GFX_CTRL_FMT, val) ? 32 : 16; + if (bpp == 16) + ctrl |= VE_CTRL_INT_DE; + aspeed_video_write(video, VE_TGS_1, act->width * (bpp >> 3)); + } aspeed_video_update(video, VE_CTRL, VE_CTRL_INT_DE | VE_CTRL_DIRECT_FETCH, - VE_CTRL_DIRECT_FETCH); + ctrl); } size *= 4; @@ -1207,6 +1277,22 @@ err_mem: aspeed_video_free_buf(video, &video->srcs[0]); } +/* + * Update relative parameters when timing changed. + * + * @video: the struct of aspeed_video + * @timings: the new timings + */ +static void aspeed_video_update_timings(struct aspeed_video *video, struct v4l2_bt_timings *timings) +{ + video->active_timings = *timings; + aspeed_video_set_resolution(video); + + video->pix_fmt.width = timings->width; + video->pix_fmt.height = timings->height; + video->pix_fmt.sizeimage = video->max_compressed_size; +} + static void aspeed_video_update_regs(struct aspeed_video *video) { u8 jpeg_hq_quality = clamp((int)video->jpeg_hq_quality - 1, 0, @@ -1219,6 +1305,8 @@ static void aspeed_video_update_regs(struct aspeed_video *video) u32 ctrl = 0; u32 seq_ctrl = 0; + v4l2_dbg(1, debug, &video->v4l2_dev, "input(%s)\n", + input_str[video->input]); v4l2_dbg(1, debug, &video->v4l2_dev, "framerate(%d)\n", video->frame_rate); v4l2_dbg(1, debug, &video->v4l2_dev, "jpeg format(%s) subsample(%s)\n", @@ -1234,6 +1322,9 @@ static void aspeed_video_update_regs(struct aspeed_video *video) else aspeed_video_update(video, VE_BCD_CTRL, VE_BCD_CTRL_EN_BCD, 0); + if (video->input == VIDEO_INPUT_VGA) + ctrl |= VE_CTRL_AUTO_OR_CURSOR; + if (video->frame_rate) ctrl |= FIELD_PREP(VE_CTRL_FRC, video->frame_rate); @@ -1252,7 +1343,9 @@ static void aspeed_video_update_regs(struct aspeed_video *video) aspeed_video_update(video, VE_SEQ_CTRL, video->jpeg_mode | VE_SEQ_CTRL_YUV420, seq_ctrl); - aspeed_video_update(video, VE_CTRL, VE_CTRL_FRC, ctrl); + aspeed_video_update(video, VE_CTRL, + VE_CTRL_FRC | VE_CTRL_AUTO_OR_CURSOR | + VE_CTRL_SOURCE, ctrl); aspeed_video_update(video, VE_COMP_CTRL, VE_COMP_CTRL_DCT_LUM | VE_COMP_CTRL_DCT_CHR | VE_COMP_CTRL_EN_HQ | VE_COMP_CTRL_HQ_DCT_LUM | @@ -1280,6 +1373,7 @@ static void aspeed_video_init_regs(struct aspeed_video *video) aspeed_video_write(video, VE_JPEG_ADDR, video->jpeg.dma); /* Set control registers */ + aspeed_video_write(video, VE_SEQ_CTRL, VE_SEQ_CTRL_AUTO_COMP); aspeed_video_write(video, VE_CTRL, ctrl); aspeed_video_write(video, VE_COMP_CTRL, VE_COMP_CTRL_RSVD); @@ -1311,12 +1405,7 @@ static void aspeed_video_start(struct aspeed_video *video) aspeed_video_get_resolution(video); /* Set timings since the device is being opened for the first time */ - video->active_timings = video->detected_timings; - aspeed_video_set_resolution(video); - - video->pix_fmt.width = video->active_timings.width; - video->pix_fmt.height = video->active_timings.height; - video->pix_fmt.sizeimage = video->max_compressed_size; + aspeed_video_update_timings(video, &video->detected_timings); } static void aspeed_video_stop(struct aspeed_video *video) @@ -1401,10 +1490,10 @@ static int aspeed_video_enum_input(struct file *file, void *fh, { struct aspeed_video *video = video_drvdata(file); - if (inp->index) + if (inp->index >= VIDEO_INPUT_MAX) return -EINVAL; - strscpy(inp->name, "Host VGA capture", sizeof(inp->name)); + sprintf(inp->name, "%s capture", input_str[inp->index]); inp->type = V4L2_INPUT_TYPE_CAMERA; inp->capabilities = V4L2_IN_CAP_DV_TIMINGS; inp->status = video->v4l2_input_status; @@ -1414,16 +1503,57 @@ static int aspeed_video_enum_input(struct file *file, void *fh, static int aspeed_video_get_input(struct file *file, void *fh, unsigned int *i) { - *i = 0; + struct aspeed_video *video = video_drvdata(file); + + *i = video->input; return 0; } static int aspeed_video_set_input(struct file *file, void *fh, unsigned int i) { - if (i) + struct aspeed_video *video = video_drvdata(file); + + if (i >= VIDEO_INPUT_MAX) return -EINVAL; + if (i == video->input) + return 0; + + if (vb2_is_busy(&video->queue)) + return -EBUSY; + + if (IS_ERR(video->scu)) { + v4l2_dbg(1, debug, &video->v4l2_dev, + "%s: scu isn't ready for input-control\n", __func__); + return -EINVAL; + } + + if (IS_ERR(video->gfx) && i == VIDEO_INPUT_GFX) { + v4l2_dbg(1, debug, &video->v4l2_dev, + "%s: gfx isn't ready for GFX input\n", __func__); + return -EINVAL; + } + + video->input = i; + + if (video->version == 6) { + /* modify dpll source per current input */ + if (video->input == VIDEO_INPUT_VGA) + regmap_update_bits(video->scu, SCU_MISC_CTRL, + SCU_DPLL_SOURCE, 0); + else + regmap_update_bits(video->scu, SCU_MISC_CTRL, + SCU_DPLL_SOURCE, SCU_DPLL_SOURCE); + } + + aspeed_video_update_regs(video); + + /* update signal status */ + aspeed_video_get_resolution(video); + if (!video->v4l2_input_status) + aspeed_video_update_timings(video, &video->detected_timings); + return 0; } @@ -1527,13 +1657,7 @@ static int aspeed_video_set_dv_timings(struct file *file, void *fh, if (vb2_is_busy(&video->queue)) return -EBUSY; - video->active_timings = timings->bt; - - aspeed_video_set_resolution(video); - - video->pix_fmt.width = timings->bt.width; - video->pix_fmt.height = timings->bt.height; - video->pix_fmt.sizeimage = video->max_compressed_size; + aspeed_video_update_timings(video, &timings->bt); timings->type = V4L2_DV_BT_656_1120; @@ -1909,6 +2033,7 @@ static int aspeed_video_debugfs_show(struct seq_file *s, void *data) val08 = aspeed_video_read(v, VE_CTRL); if (FIELD_GET(VE_CTRL_DIRECT_FETCH, val08)) { seq_printf(s, " %-20s:\tDirect fetch\n", "Mode"); + seq_printf(s, " %-20s:\t%s\n", "Input", input_str[v->input]); seq_printf(s, " %-20s:\t%s\n", "VGA bpp mode", FIELD_GET(VE_CTRL_INT_DE, val08) ? "16" : "32"); } else { @@ -2068,12 +2193,29 @@ static int aspeed_video_setup_video(struct aspeed_video *video) return 0; } +/* + * Get regmap without checking res, such as clk/reset, that could lead to + * conflict. + */ +static struct regmap *aspeed_regmap_lookup(struct device_node *np, const char *property) +{ + struct device_node *syscon_np __free(device_node) = of_parse_phandle(np, property, 0); + + if (!syscon_np) + return ERR_PTR(-ENODEV); + + return device_node_to_regmap(syscon_np); +} + static int aspeed_video_init(struct aspeed_video *video) { int irq; int rc; struct device *dev = video->dev; + video->scu = aspeed_regmap_lookup(dev->of_node, "aspeed,scu"); + video->gfx = aspeed_regmap_lookup(dev->of_node, "aspeed,gfx"); + irq = irq_of_parse_and_map(dev->of_node, 0); if (!irq) { dev_err(dev, "Unable to find IRQ\n"); @@ -2165,6 +2307,7 @@ static int aspeed_video_probe(struct platform_device *pdev) if (!config) return -ENODEV; + video->version = config->version; video->jpeg_mode = config->jpeg_mode; video->comp_size_read = config->comp_size_read; diff --git a/include/uapi/linux/aspeed-video.h b/include/uapi/linux/aspeed-video.h index 6586a65548c4..15168e8c931e 100644 --- a/include/uapi/linux/aspeed-video.h +++ b/include/uapi/linux/aspeed-video.h @@ -8,6 +8,13 @@ #include +/* aspeed video's input types */ +enum aspeed_video_input { + VIDEO_INPUT_VGA = 0, + VIDEO_INPUT_GFX, + VIDEO_INPUT_MAX +}; + #define V4L2_CID_ASPEED_HQ_MODE (V4L2_CID_USER_ASPEED_BASE + 1) #define V4L2_CID_ASPEED_HQ_JPEG_QUALITY (V4L2_CID_USER_ASPEED_BASE + 2) -- cgit v1.2.3 From 5d8c9c987fbdd65677315198c2b1f35a440d7cdf Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 27 Aug 2025 09:28:41 +0200 Subject: ALSA: hda: Introduce auto cleanup macros for PM The temporary power up/down of the codec via snd_hda_power_up() and _down() (or snd_hda_power_up_pm() and _down_pm()) is seen in various places. This patch introduces simple auto-cleanup macros for those call patterns, so that the drivers don't have to call the corresponding power-down calls explicitly. Namely, err = snd_hda_power_up(codec); if (err < 0) return err; .... snd_power_down(codec); can drop the *_down() call by replacing with CLASS(snd_hda_power, pm)(codec); if (pm.err < 0) return pm.err; Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250827072916.31933-2-tiwai@suse.de --- include/sound/hda_codec.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/sound/hda_codec.h b/include/sound/hda_codec.h index 006d4e4a8195..5d9f0ef228af 100644 --- a/include/sound/hda_codec.h +++ b/include/sound/hda_codec.h @@ -503,6 +503,36 @@ static inline bool hda_codec_need_resume(struct hda_codec *codec) return !codec->relaxed_resume && codec->jacktbl.used; } +/* + * PM with auto-cleanup: call like CLASS(snd_hda_power, pm)(codec) + * If the error handling is needed, refer pm.err. + */ +struct __hda_power_obj { + struct hda_codec *codec; + int err; +}; + +static inline struct __hda_power_obj __snd_hda_power_up(struct hda_codec *codec) +{ + struct __hda_power_obj T = { .codec = codec }; + T.err = snd_hda_power_up(codec); + return T; +} + +static inline struct __hda_power_obj __snd_hda_power_up_pm(struct hda_codec *codec) +{ + struct __hda_power_obj T = { .codec = codec }; + T.err = snd_hda_power_up_pm(codec); + return T; +} + +DEFINE_CLASS(snd_hda_power, struct __hda_power_obj, + snd_hda_power_down((_T).codec), __snd_hda_power_up(codec), + struct hda_codec *codec) +DEFINE_CLASS(snd_hda_power_pm, struct __hda_power_obj, + snd_hda_power_down_pm((_T).codec), __snd_hda_power_up_pm(codec), + struct hda_codec *codec) + #ifdef CONFIG_SND_HDA_PATCH_LOADER /* * patch firmware -- cgit v1.2.3 From a23160c87986732590e68c1788e9b4929950ef67 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 27 Aug 2025 09:28:46 +0200 Subject: ALSA: hda: Use auto cleanup macros for DSP loader locks There are temporary DSP locking/unlocking patterns found in various places, and those can be cleaned up nicely with the guard() macro calling snd_hdac_dsp_lock() and *_unlock(). Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250827072916.31933-7-tiwai@suse.de --- include/sound/hdaudio.h | 1 + sound/hda/common/controller.c | 47 ++++++++++++++++++------------------------- sound/hda/core/stream.c | 11 +++------- 3 files changed, 24 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h index d38234f8fe44..4e0c1d8af09f 100644 --- a/include/sound/hdaudio.h +++ b/include/sound/hdaudio.h @@ -651,6 +651,7 @@ int snd_hdac_stream_set_lpib(struct hdac_stream *azx_dev, u32 value); #define snd_hdac_dsp_lock(dev) mutex_lock(&(dev)->dsp_mutex) #define snd_hdac_dsp_unlock(dev) mutex_unlock(&(dev)->dsp_mutex) #define snd_hdac_stream_is_locked(dev) ((dev)->locked) +DEFINE_GUARD(snd_hdac_dsp_lock, struct hdac_stream *, snd_hdac_dsp_lock(_T), snd_hdac_dsp_unlock(_T)) /* DSP loader helpers */ int snd_hdac_dsp_prepare(struct hdac_stream *azx_dev, unsigned int format, unsigned int byte_size, struct snd_dma_buffer *bufp); diff --git a/sound/hda/common/controller.c b/sound/hda/common/controller.c index 84387ed761be..ceab8625bb1f 100644 --- a/sound/hda/common/controller.c +++ b/sound/hda/common/controller.c @@ -32,8 +32,11 @@ #include "controller_trace.h" /* DSP lock helpers */ -#define dsp_lock(dev) snd_hdac_dsp_lock(azx_stream(dev)) -#define dsp_unlock(dev) snd_hdac_dsp_unlock(azx_stream(dev)) +#ifdef CONFIG_SND_HDA_DSP_LOADER +#define guard_dsp_lock(dev) guard(snd_hdac_dsp_lock)(azx_stream(dev)) +#else +#define guard_dsp_lock(dev) do {} while (0) +#endif #define dsp_is_locked(dev) snd_hdac_stream_is_locked(azx_stream(dev)) /* assign a stream for the PCM */ @@ -110,14 +113,11 @@ static int azx_pcm_hw_params(struct snd_pcm_substream *substream, struct azx *chip = apcm->chip; struct azx_dev *azx_dev = get_azx_dev(substream); struct hdac_stream *hdas = azx_stream(azx_dev); - int ret = 0; trace_azx_pcm_hw_params(chip, azx_dev); - dsp_lock(azx_dev); - if (dsp_is_locked(azx_dev)) { - ret = -EBUSY; - goto unlock; - } + guard_dsp_lock(azx_dev); + if (dsp_is_locked(azx_dev)) + return -EBUSY; /* Set up BDLEs here, return -ENOMEM if too many BDLEs are required */ hdas->bufsize = params_buffer_bytes(hw_params); @@ -127,11 +127,9 @@ static int azx_pcm_hw_params(struct snd_pcm_substream *substream, (hw_params->info & SNDRV_PCM_INFO_NO_PERIOD_WAKEUP) && (hw_params->flags & SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP); if (snd_hdac_stream_setup_periods(hdas) < 0) - ret = -ENOMEM; + return -ENOMEM; -unlock: - dsp_unlock(azx_dev); - return ret; + return 0; } static int azx_pcm_hw_free(struct snd_pcm_substream *substream) @@ -141,14 +139,13 @@ static int azx_pcm_hw_free(struct snd_pcm_substream *substream) struct hda_pcm_stream *hinfo = to_hda_pcm_stream(substream); /* reset BDL address */ - dsp_lock(azx_dev); + guard_dsp_lock(azx_dev); if (!dsp_is_locked(azx_dev)) snd_hdac_stream_cleanup(azx_stream(azx_dev)); snd_hda_codec_cleanup(apcm->codec, hinfo, substream); azx_stream(azx_dev)->prepared = 0; - dsp_unlock(azx_dev); return 0; } @@ -166,11 +163,9 @@ static int azx_pcm_prepare(struct snd_pcm_substream *substream) unsigned short ctls = spdif ? spdif->ctls : 0; trace_azx_pcm_prepare(chip, azx_dev); - dsp_lock(azx_dev); - if (dsp_is_locked(azx_dev)) { - err = -EBUSY; - goto unlock; - } + guard_dsp_lock(azx_dev); + if (dsp_is_locked(azx_dev)) + return -EBUSY; snd_hdac_stream_reset(azx_stream(azx_dev)); bits = snd_hdac_stream_format_bits(runtime->format, SNDRV_PCM_SUBFORMAT_STD, hinfo->maxbps); @@ -180,13 +175,12 @@ static int azx_pcm_prepare(struct snd_pcm_substream *substream) dev_err(chip->card->dev, "invalid format_val, rate=%d, ch=%d, format=%d\n", runtime->rate, runtime->channels, runtime->format); - err = -EINVAL; - goto unlock; + return -EINVAL; } err = snd_hdac_stream_set_params(azx_stream(azx_dev), format_val); if (err < 0) - goto unlock; + return err; snd_hdac_stream_setup(azx_stream(azx_dev), false); @@ -197,12 +191,11 @@ static int azx_pcm_prepare(struct snd_pcm_substream *substream) stream_tag -= chip->capture_streams; err = snd_hda_codec_prepare(apcm->codec, hinfo, stream_tag, azx_dev->core.format_val, substream); + if (err < 0) + return err; - unlock: - if (!err) - azx_stream(azx_dev)->prepared = 1; - dsp_unlock(azx_dev); - return err; + azx_stream(azx_dev)->prepared = 1; + return 0; } static int azx_pcm_trigger(struct snd_pcm_substream *substream, int cmd) diff --git a/sound/hda/core/stream.c b/sound/hda/core/stream.c index 4a87bef8834f..0caeebcc591a 100644 --- a/sound/hda/core/stream.c +++ b/sound/hda/core/stream.c @@ -922,12 +922,11 @@ int snd_hdac_dsp_prepare(struct hdac_stream *azx_dev, unsigned int format, struct hdac_bus *bus = azx_dev->bus; int err; - snd_hdac_dsp_lock(azx_dev); + guard(snd_hdac_dsp_lock)(azx_dev); spin_lock_irq(&bus->reg_lock); if (azx_dev->running || azx_dev->locked) { spin_unlock_irq(&bus->reg_lock); - err = -EBUSY; - goto unlock; + return -EBUSY; } azx_dev->locked = true; spin_unlock_irq(&bus->reg_lock); @@ -951,7 +950,6 @@ int snd_hdac_dsp_prepare(struct hdac_stream *azx_dev, unsigned int format, goto error; snd_hdac_stream_setup(azx_dev, true); - snd_hdac_dsp_unlock(azx_dev); return azx_dev->stream_tag; error: @@ -960,8 +958,6 @@ int snd_hdac_dsp_prepare(struct hdac_stream *azx_dev, unsigned int format, spin_lock_irq(&bus->reg_lock); azx_dev->locked = false; spin_unlock_irq(&bus->reg_lock); - unlock: - snd_hdac_dsp_unlock(azx_dev); return err; } EXPORT_SYMBOL_GPL(snd_hdac_dsp_prepare); @@ -993,7 +989,7 @@ void snd_hdac_dsp_cleanup(struct hdac_stream *azx_dev, if (!dmab->area || !azx_dev->locked) return; - snd_hdac_dsp_lock(azx_dev); + guard(snd_hdac_dsp_lock)(azx_dev); /* reset BDL address */ snd_hdac_stream_writel(azx_dev, SD_BDLPL, 0); snd_hdac_stream_writel(azx_dev, SD_BDLPU, 0); @@ -1008,7 +1004,6 @@ void snd_hdac_dsp_cleanup(struct hdac_stream *azx_dev, spin_lock_irq(&bus->reg_lock); azx_dev->locked = false; spin_unlock_irq(&bus->reg_lock); - snd_hdac_dsp_unlock(azx_dev); } EXPORT_SYMBOL_GPL(snd_hdac_dsp_cleanup); #endif /* CONFIG_SND_HDA_DSP_LOADER */ -- cgit v1.2.3 From 782d4613171e271b1e28ee1db9616beb8e6ad8a1 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Fri, 29 Aug 2025 12:30:19 +0300 Subject: ASoC: SOF: IPC4: Add GET macros for module id and module instance id Add SOF_IPC4_MOD_INSTANCE_GET() and SOF_IPC4_MOD_ID_GET() for getting the ids from ipc4 header presentation. Signed-off-by: Jyri Sarha Reviewed-by: Liam Girdwood Signed-off-by: Peter Ujfalusi Message-ID: <20250829093022.32094-3-peter.ujfalusi@linux.intel.com> Signed-off-by: Mark Brown --- include/sound/sof/ipc4/header.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/sound/sof/ipc4/header.h b/include/sound/sof/ipc4/header.h index e85c7afd85a4..15fac532688e 100644 --- a/include/sound/sof/ipc4/header.h +++ b/include/sound/sof/ipc4/header.h @@ -326,10 +326,14 @@ struct sof_ipc4_base_module_cfg { #define SOF_IPC4_MOD_INSTANCE_SHIFT 16 #define SOF_IPC4_MOD_INSTANCE_MASK GENMASK(23, 16) #define SOF_IPC4_MOD_INSTANCE(x) ((x) << SOF_IPC4_MOD_INSTANCE_SHIFT) +#define SOF_IPC4_MOD_INSTANCE_GET(x) (((x) & SOF_IPC4_MOD_INSTANCE_MASK) \ + >> SOF_IPC4_MOD_INSTANCE_SHIFT) #define SOF_IPC4_MOD_ID_SHIFT 0 #define SOF_IPC4_MOD_ID_MASK GENMASK(15, 0) #define SOF_IPC4_MOD_ID(x) ((x) << SOF_IPC4_MOD_ID_SHIFT) +#define SOF_IPC4_MOD_ID_GET(x) (((x) & SOF_IPC4_MOD_ID_MASK) \ + >> SOF_IPC4_MOD_ID_SHIFT) /* init module ipc msg */ #define SOF_IPC4_MOD_EXT_PARAM_SIZE_SHIFT 0 -- cgit v1.2.3 From 9a98f9e84cfbeaa51af42ba2b8bbbde046c709a7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Aug 2025 11:39:01 -0400 Subject: fs: make the i_state flags an enum Adjusting i_state flags always means updating the values manually. Bring these forward into the 2020's and make a nice clean macro for defining the i_state values as an enum, providing __ variants for the cases where we need the bit position instead of the actual value, and leaving the actual NAME as the 1U << bit value. Reviewed-by: Christian Brauner Signed-off-by: Josef Bacik Link: https://lore.kernel.org/0da9348da6ece0dce12fccec07b1dd2b8e4cfdab.1756222464.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 231 +++++++++++++++++++++++++++-------------------------- 1 file changed, 119 insertions(+), 112 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 12ecc6b0e6f9..c34554d8c4fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -664,6 +664,124 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 +/* + * Inode state bits. Protected by inode->i_lock + * + * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, + * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. + * + * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, + * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at + * various stages of removing an inode. + * + * Two bits are used for locking and completion notification, I_NEW and I_SYNC. + * + * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on + * fdatasync() (unless I_DIRTY_DATASYNC is also set). + * Timestamp updates are the usual cause. + * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of + * these changes separately from I_DIRTY_SYNC so that we + * don't have to write inode on fdatasync() when only + * e.g. the timestamps have changed. + * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. + * I_DIRTY_TIME The inode itself has dirty timestamps, and the + * lazytime mount option is enabled. We keep track of this + * separately from I_DIRTY_SYNC in order to implement + * lazytime. This gets cleared if I_DIRTY_INODE + * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But + * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already + * in place because writeback might already be in progress + * and we don't want to lose the time update + * I_NEW Serves as both a mutex and completion notification. + * New inodes set I_NEW. If two processes both create + * the same inode, one of them will release its inode and + * wait for I_NEW to be released before returning. + * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can + * also cause waiting on I_NEW, without I_NEW actually + * being set. find_inode() uses this to prevent returning + * nearly-dead inodes. + * I_WILL_FREE Must be set when calling write_inode_now() if i_count + * is zero. I_FREEING must be set when I_WILL_FREE is + * cleared. + * I_FREEING Set when inode is about to be freed but still has dirty + * pages or buffers attached or the inode itself is still + * dirty. + * I_CLEAR Added by clear_inode(). In this state the inode is + * clean and can be destroyed. Inode keeps I_FREEING. + * + * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are + * prohibited for many purposes. iget() must wait for + * the inode to be completely released, then create it + * anew. Other functions will just ignore such inodes, + * if appropriate. I_NEW is used for waiting. + * + * I_SYNC Writeback of inode is running. The bit is set during + * data writeback, and cleared with a wakeup on the bit + * address once it is done. The bit is also used to pin + * the inode in memory for flusher thread. + * + * I_REFERENCED Marks the inode as recently references on the LRU list. + * + * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to + * synchronize competing switching instances and to tell + * wb stat updates to grab the i_pages lock. See + * inode_switch_wbs_work_fn() for details. + * + * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper + * and work dirs among overlayfs mounts. + * + * I_CREATING New object's inode in the middle of setting up. + * + * I_DONTCACHE Evict inode as soon as it is not used anymore. + * + * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. + * Used to detect that mark_inode_dirty() should not move + * inode between dirty lists. + * + * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. + * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * + * Q: What is the difference between I_WILL_FREE and I_FREEING? + * + * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait + * upon. There's one free address left. + */ + +enum inode_state_bits { + __I_NEW = 0U, + __I_SYNC = 1U, + __I_LRU_ISOLATING = 2U + /* reserved wait address bit 3 */ +}; + +enum inode_state_flags_t { + I_NEW = (1U << __I_NEW), + I_SYNC = (1U << __I_SYNC), + I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), + /* reserved flag bit 3 */ + I_DIRTY_SYNC = (1U << 4), + I_DIRTY_DATASYNC = (1U << 5), + I_DIRTY_PAGES = (1U << 6), + I_WILL_FREE = (1U << 7), + I_FREEING = (1U << 8), + I_CLEAR = (1U << 9), + I_REFERENCED = (1U << 10), + I_LINKABLE = (1U << 11), + I_DIRTY_TIME = (1U << 12), + I_WB_SWITCH = (1U << 13), + I_OVL_INUSE = (1U << 14), + I_CREATING = (1U << 15), + I_DONTCACHE = (1U << 16), + I_SYNC_QUEUED = (1U << 17), + I_PINNING_NETFS_WB = (1U << 18) +}; + +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) +#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) + /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -722,7 +840,7 @@ struct inode { #endif /* Misc */ - u32 i_state; + enum inode_state_flags_t i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -2482,117 +2600,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, }; } -/* - * Inode state bits. Protected by inode->i_lock - * - * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, - * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. - * - * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, - * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at - * various stages of removing an inode. - * - * Two bits are used for locking and completion notification, I_NEW and I_SYNC. - * - * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on - * fdatasync() (unless I_DIRTY_DATASYNC is also set). - * Timestamp updates are the usual cause. - * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of - * these changes separately from I_DIRTY_SYNC so that we - * don't have to write inode on fdatasync() when only - * e.g. the timestamps have changed. - * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. - * I_DIRTY_TIME The inode itself has dirty timestamps, and the - * lazytime mount option is enabled. We keep track of this - * separately from I_DIRTY_SYNC in order to implement - * lazytime. This gets cleared if I_DIRTY_INODE - * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But - * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already - * in place because writeback might already be in progress - * and we don't want to lose the time update - * I_NEW Serves as both a mutex and completion notification. - * New inodes set I_NEW. If two processes both create - * the same inode, one of them will release its inode and - * wait for I_NEW to be released before returning. - * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can - * also cause waiting on I_NEW, without I_NEW actually - * being set. find_inode() uses this to prevent returning - * nearly-dead inodes. - * I_WILL_FREE Must be set when calling write_inode_now() if i_count - * is zero. I_FREEING must be set when I_WILL_FREE is - * cleared. - * I_FREEING Set when inode is about to be freed but still has dirty - * pages or buffers attached or the inode itself is still - * dirty. - * I_CLEAR Added by clear_inode(). In this state the inode is - * clean and can be destroyed. Inode keeps I_FREEING. - * - * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are - * prohibited for many purposes. iget() must wait for - * the inode to be completely released, then create it - * anew. Other functions will just ignore such inodes, - * if appropriate. I_NEW is used for waiting. - * - * I_SYNC Writeback of inode is running. The bit is set during - * data writeback, and cleared with a wakeup on the bit - * address once it is done. The bit is also used to pin - * the inode in memory for flusher thread. - * - * I_REFERENCED Marks the inode as recently references on the LRU list. - * - * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to - * synchronize competing switching instances and to tell - * wb stat updates to grab the i_pages lock. See - * inode_switch_wbs_work_fn() for details. - * - * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper - * and work dirs among overlayfs mounts. - * - * I_CREATING New object's inode in the middle of setting up. - * - * I_DONTCACHE Evict inode as soon as it is not used anymore. - * - * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. - * Used to detect that mark_inode_dirty() should not move - * inode between dirty lists. - * - * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. - * - * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding - * i_count. - * - * Q: What is the difference between I_WILL_FREE and I_FREEING? - * - * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait - * upon. There's one free address left. - */ -#define __I_NEW 0 -#define I_NEW (1 << __I_NEW) -#define __I_SYNC 1 -#define I_SYNC (1 << __I_SYNC) -#define __I_LRU_ISOLATING 2 -#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) - -#define I_DIRTY_SYNC (1 << 3) -#define I_DIRTY_DATASYNC (1 << 4) -#define I_DIRTY_PAGES (1 << 5) -#define I_WILL_FREE (1 << 6) -#define I_FREEING (1 << 7) -#define I_CLEAR (1 << 8) -#define I_REFERENCED (1 << 9) -#define I_LINKABLE (1 << 10) -#define I_DIRTY_TIME (1 << 11) -#define I_WB_SWITCH (1 << 12) -#define I_OVL_INUSE (1 << 13) -#define I_CREATING (1 << 14) -#define I_DONTCACHE (1 << 15) -#define I_SYNC_QUEUED (1 << 16) -#define I_PINNING_NETFS_WB (1 << 17) - -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) -#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) - extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { -- cgit v1.2.3 From db2ab24a341ce89351a1bede37a96a3e3ce1726a Mon Sep 17 00:00:00 2001 From: Lauri Vasama Date: Wed, 27 Aug 2025 16:39:00 +0300 Subject: Add RWF_NOSIGNAL flag for pwritev2 For a user mode library to avoid generating SIGPIPE signals (e.g. because this behaviour is not portable across operating systems) is cumbersome. It is generally bad form to change the process-wide signal mask in a library, so a local solution is needed instead. For I/O performed directly using system calls (synchronous or readiness based asynchronous) this currently involves applying a thread-specific signal mask before the operation and reverting it afterwards. This can be avoided when it is known that the file descriptor refers to neither a pipe nor a socket, but a conservative implementation must always apply the mask. This incurs the cost of two additional system calls. In the case of sockets, the existing MSG_NOSIGNAL flag can be used with send. For asynchronous I/O performed using io_uring, currently the only option (apart from MSG_NOSIGNAL for sockets), is to mask SIGPIPE entirely in the call to io_uring_enter. Thankfully io_uring_enter takes a signal mask, so only a single syscall is needed. However, copying the signal mask on every call incurs a non-zero performance penalty. Furthermore, this mask applies to all completions, meaning that if the non-signaling behaviour is desired only for some subset of operations, the desired signals must be raised manually from user-mode depending on the completed operation. Add RWF_NOSIGNAL flag for pwritev2. This flag prevents the SIGPIPE signal from being raised when writing on disconnected pipes or sockets. The flag is handled directly by the pipe filesystem and converted to the existing MSG_NOSIGNAL flag for sockets. Signed-off-by: Lauri Vasama Link: https://lore.kernel.org/20250827133901.1820771-1-git@vasama.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/pipe.c | 6 ++++-- include/linux/fs.h | 1 + include/uapi/linux/fs.h | 5 ++++- net/socket.c | 3 +++ 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/pipe.c b/fs/pipe.c index 731622d0738d..42fead1efe52 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&pipe->mutex); if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } @@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; diff --git a/include/linux/fs.h b/include/linux/fs.h index 780e9c774c54..34693cae15a2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,6 +356,7 @@ struct readahead_control; #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE +#define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0bd678a4a10e..beb4c2d1e41c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t; /* buffered IO that drops the cache after reading or writing data */ #define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) +/* prevent pipe and socket writes from raising SIGPIPE */ +#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ - RWF_DONTCACHE) + RWF_DONTCACHE | RWF_NOSIGNAL) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/net/socket.c b/net/socket.c index 682969deaed3..bac335ecee4c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1176,6 +1176,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; + if (iocb->ki_flags & IOCB_NOSIGNAL) + msg.msg_flags |= MSG_NOSIGNAL; + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; -- cgit v1.2.3 From df220cc5e689213c34a0eec7ef26d25f503c77ae Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Aug 2025 08:25:11 -0700 Subject: lib/crypto: poly1305: Remove unused function poly1305_is_arch_optimized() poly1305_is_arch_optimized() is unused, so remove it. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250829152513.92459-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/poly1305.h | 9 --------- lib/crypto/arm/poly1305-glue.c | 7 ------- lib/crypto/arm64/poly1305-glue.c | 7 ------- lib/crypto/mips/poly1305-glue.c | 6 ------ lib/crypto/powerpc/poly1305-p10-glue.c | 6 ------ lib/crypto/x86/poly1305_glue.c | 6 ------ 6 files changed, 41 deletions(-) (limited to 'include') diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index e54abda8cfe9..d4daeec8da19 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -64,13 +64,4 @@ void poly1305_update(struct poly1305_desc_ctx *desc, const u8 *src, unsigned int nbytes); void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest); -#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305) -bool poly1305_is_arch_optimized(void); -#else -static inline bool poly1305_is_arch_optimized(void) -{ - return false; -} -#endif - #endif diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c index 2d86c78af883..9e513e319e37 100644 --- a/lib/crypto/arm/poly1305-glue.c +++ b/lib/crypto/arm/poly1305-glue.c @@ -51,13 +51,6 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, } EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -bool poly1305_is_arch_optimized(void) -{ - /* We always can use at least the ARM scalar implementation. */ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - static int __init arm_poly1305_mod_init(void) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c index 31aea21ce42f..d4a522e7d25a 100644 --- a/lib/crypto/arm64/poly1305-glue.c +++ b/lib/crypto/arm64/poly1305-glue.c @@ -50,13 +50,6 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, } EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -bool poly1305_is_arch_optimized(void) -{ - /* We always can use at least the ARM64 scalar implementation. */ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - static int __init neon_poly1305_mod_init(void) { if (cpu_have_named_feature(ASIMD)) diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c index 764a38a65200..002f50f710ab 100644 --- a/lib/crypto/mips/poly1305-glue.c +++ b/lib/crypto/mips/poly1305-glue.c @@ -23,11 +23,5 @@ asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, const u32 nonce[4]); EXPORT_SYMBOL_GPL(poly1305_emit_arch); -bool poly1305_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/powerpc/poly1305-p10-glue.c b/lib/crypto/powerpc/poly1305-p10-glue.c index 3f1664a724b6..184a71f9c1de 100644 --- a/lib/crypto/powerpc/poly1305-p10-glue.c +++ b/lib/crypto/powerpc/poly1305-p10-glue.c @@ -72,12 +72,6 @@ void poly1305_emit_arch(const struct poly1305_state *state, } EXPORT_SYMBOL_GPL(poly1305_emit_arch); -bool poly1305_is_arch_optimized(void) -{ - return static_key_enabled(&have_p10); -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - static int __init poly1305_p10_init(void) { if (cpu_has_feature(CPU_FTR_ARCH_31)) diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305_glue.c index 856d48fd422b..deb5841cb0ad 100644 --- a/lib/crypto/x86/poly1305_glue.c +++ b/lib/crypto/x86/poly1305_glue.c @@ -141,12 +141,6 @@ void poly1305_emit_arch(const struct poly1305_state *ctx, } EXPORT_SYMBOL_GPL(poly1305_emit_arch); -bool poly1305_is_arch_optimized(void) -{ - return static_key_enabled(&poly1305_use_avx); -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - static int __init poly1305_simd_mod_init(void) { if (boot_cpu_has(X86_FEATURE_AVX) && -- cgit v1.2.3 From b646b782e522da3509e61f971e5502fccb3a3723 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Aug 2025 08:25:12 -0700 Subject: lib/crypto: poly1305: Consolidate into single module Consolidate the Poly1305 code into a single module, similar to various other algorithms (SHA-1, SHA-256, SHA-512, etc.): - Each arch now provides a header file lib/crypto/$(SRCARCH)/poly1305.h, replacing lib/crypto/$(SRCARCH)/poly1305*.c. The header defines poly1305_block_init(), poly1305_blocks(), poly1305_emit(), and optionally poly1305_mod_init_arch(). It is included by lib/crypto/poly1305.c, and thus the code gets built into the single libpoly1305 module, with improved inlining in some cases. - Whether arch-optimized Poly1305 is buildable is now controlled centrally by lib/crypto/Kconfig instead of by lib/crypto/$(SRCARCH)/Kconfig. The conditions for enabling it remain the same as before, and it remains enabled by default. (The PPC64 one remains unconditionally disabled due to 'depends on BROKEN'.) - Any additional arch-specific translation units for the optimized Poly1305 code, such as assembly files, are now compiled by lib/crypto/Makefile instead of lib/crypto/$(SRCARCH)/Makefile. A special consideration is needed because the Adiantum code uses the poly1305_core_*() functions directly. For now, just carry forward that approach. This means retaining the CRYPTO_LIB_POLY1305_GENERIC kconfig symbol, and keeping the poly1305_core_*() functions in separate translation units. So it's not quite as streamlined I've done with the other hash functions, but we still get a single libpoly1305 module. Note: to see the diff from the arm, arm64, and x86 .c files to the new .h files, view this commit with 'git show -M10'. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250829152513.92459-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 2 + include/crypto/internal/poly1305.h | 16 ++- lib/crypto/Kconfig | 50 ++++---- lib/crypto/Makefile | 59 +++++++++- lib/crypto/arm/Kconfig | 5 - lib/crypto/arm/Makefile | 18 --- lib/crypto/arm/poly1305-armv4.pl | 3 +- lib/crypto/arm/poly1305-glue.c | 69 ----------- lib/crypto/arm/poly1305.h | 53 +++++++++ lib/crypto/arm64/Kconfig | 6 - lib/crypto/arm64/Makefile | 13 --- lib/crypto/arm64/poly1305-armv8.pl | 3 + lib/crypto/arm64/poly1305-glue.c | 67 ----------- lib/crypto/arm64/poly1305.h | 50 ++++++++ lib/crypto/mips/Kconfig | 5 - lib/crypto/mips/Makefile | 14 --- lib/crypto/mips/poly1305-glue.c | 27 ----- lib/crypto/mips/poly1305-mips.pl | 8 +- lib/crypto/mips/poly1305.h | 14 +++ lib/crypto/poly1305-generic.c | 25 ---- lib/crypto/poly1305.c | 81 ++++++++----- lib/crypto/powerpc/Kconfig | 8 -- lib/crypto/powerpc/Makefile | 3 - lib/crypto/powerpc/poly1305-p10-glue.c | 90 -------------- lib/crypto/powerpc/poly1305.h | 74 ++++++++++++ lib/crypto/x86/Kconfig | 6 - lib/crypto/x86/Makefile | 10 -- lib/crypto/x86/poly1305-x86_64-cryptogams.pl | 33 ++---- lib/crypto/x86/poly1305.h | 158 +++++++++++++++++++++++++ lib/crypto/x86/poly1305_glue.c | 169 --------------------------- 30 files changed, 507 insertions(+), 632 deletions(-) delete mode 100644 lib/crypto/arm/poly1305-glue.c create mode 100644 lib/crypto/arm/poly1305.h delete mode 100644 lib/crypto/arm64/poly1305-glue.c create mode 100644 lib/crypto/arm64/poly1305.h delete mode 100644 lib/crypto/mips/poly1305-glue.c create mode 100644 lib/crypto/mips/poly1305.h delete mode 100644 lib/crypto/poly1305-generic.c delete mode 100644 lib/crypto/powerpc/poly1305-p10-glue.c create mode 100644 lib/crypto/powerpc/poly1305.h create mode 100644 lib/crypto/x86/poly1305.h delete mode 100644 lib/crypto/x86/poly1305_glue.c (limited to 'include') diff --git a/crypto/Kconfig b/crypto/Kconfig index 1575dbec084d..e8ccf5f51b85 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -609,6 +609,7 @@ menu "Length-preserving ciphers and modes" config CRYPTO_ADIANTUM tristate "Adiantum" select CRYPTO_CHACHA20 + select CRYPTO_LIB_POLY1305 select CRYPTO_LIB_POLY1305_GENERIC select CRYPTO_NHPOLY1305 select CRYPTO_MANAGER @@ -770,6 +771,7 @@ config CRYPTO_XTS config CRYPTO_NHPOLY1305 tristate select CRYPTO_HASH + select CRYPTO_LIB_POLY1305 select CRYPTO_LIB_POLY1305_GENERIC endmenu diff --git a/include/crypto/internal/poly1305.h b/include/crypto/internal/poly1305.h index c60315f47562..a72fff409ab8 100644 --- a/include/crypto/internal/poly1305.h +++ b/include/crypto/internal/poly1305.h @@ -30,12 +30,13 @@ void poly1305_core_blocks(struct poly1305_state *state, void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4], void *dst); -void poly1305_block_init_arch(struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -void poly1305_block_init_generic(struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit); +static inline void +poly1305_block_init_generic(struct poly1305_block_state *desc, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + poly1305_core_init(&desc->h); + poly1305_core_setkey(&desc->core_r, raw_key); +} static inline void poly1305_blocks_generic(struct poly1305_block_state *state, const u8 *src, unsigned int len, @@ -45,9 +46,6 @@ static inline void poly1305_blocks_generic(struct poly1305_block_state *state, len / POLY1305_BLOCK_SIZE, padbit); } -void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4]); - static inline void poly1305_emit_generic(const struct poly1305_state *state, u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4]) diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 79b848448e07..9991118c41a9 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -114,6 +114,33 @@ config CRYPTO_LIB_MD5_ARCH default y if PPC default y if SPARC64 +config CRYPTO_LIB_POLY1305 + tristate + help + The Poly1305 library functions. Select this if your module uses any + of the functions from . + +config CRYPTO_LIB_POLY1305_ARCH + bool + depends on CRYPTO_LIB_POLY1305 && !UML + default y if ARM + default y if ARM64 && KERNEL_MODE_NEON + default y if MIPS + # The PPC64 code needs to be fixed to work in softirq context. + default y if PPC64 && CPU_LITTLE_ENDIAN && VSX && BROKEN + default y if X86_64 + +# This symbol controls the inclusion of the Poly1305 generic code. This differs +# from most of the other algorithms, which handle the generic code +# "automatically" via __maybe_unused. This is needed so that the Adiantum code, +# which calls the poly1305_core_*() functions directly, can enable them. +config CRYPTO_LIB_POLY1305_GENERIC + bool + depends on CRYPTO_LIB_POLY1305 + # Enable if there's no arch impl or the arch impl requires the generic + # impl as a fallback. (Or if selected explicitly.) + default y if !CRYPTO_LIB_POLY1305_ARCH || PPC64 + config CRYPTO_LIB_POLY1305_RSIZE int default 2 if MIPS @@ -121,29 +148,6 @@ config CRYPTO_LIB_POLY1305_RSIZE default 9 if ARM || ARM64 default 1 -config CRYPTO_ARCH_HAVE_LIB_POLY1305 - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the Poly1305 library interface, - either builtin or as a module. - -config CRYPTO_LIB_POLY1305_GENERIC - tristate - default CRYPTO_LIB_POLY1305 if !CRYPTO_ARCH_HAVE_LIB_POLY1305 - help - This symbol can be selected by arch implementations of the Poly1305 - library interface that require the generic code as a fallback, e.g., - for SIMD implementations. If no arch specific implementation is - enabled, this implementation serves the users of CRYPTO_LIB_POLY1305. - -config CRYPTO_LIB_POLY1305 - tristate - help - Enable the Poly1305 library interface. This interface may be fulfilled - by either the generic implementation or an arch-specific one, if one - is available and enabled. - config CRYPTO_LIB_CHACHA20POLY1305 tristate select CRYPTO_LIB_CHACHA diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index d362636a22d3..e0536e3b3a04 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -71,13 +71,60 @@ endif # CONFIG_CRYPTO_LIB_MD5_ARCH ################################################################################ -obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o -libpoly1305-y += poly1305.o +obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o +libpoly1305-y := poly1305.o +ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y) +libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna64.o +else +libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna32.o +endif + +ifeq ($(CONFIG_CRYPTO_LIB_POLY1305_ARCH),y) +CFLAGS_poly1305.o += -I$(src)/$(SRCARCH) + +ifeq ($(CONFIG_ARM),y) +libpoly1305-y += arm/poly1305-core.o +$(obj)/arm/poly1305-core.S: $(src)/arm/poly1305-armv4.pl + $(call cmd,perlasm) +# massage the perlasm code a bit so we only get the NEON routine if we need it +poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 +poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 +AFLAGS_arm/poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) +endif + +ifeq ($(CONFIG_ARM64),y) +libpoly1305-y += arm64/poly1305-core.o +$(obj)/arm64/poly1305-core.S: $(src)/arm64/poly1305-armv8.pl + $(call cmd,perlasm_with_args) +endif + +ifeq ($(CONFIG_MIPS),y) +libpoly1305-y += mips/poly1305-core.o +poly1305-perlasm-flavour-$(CONFIG_32BIT) := o32 +poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64 +quiet_cmd_perlasm_poly1305 = PERLASM $@ + cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@ +# Use if_changed instead of cmd, in case the flavour changed. +$(obj)/mips/poly1305-core.S: $(src)/mips/poly1305-mips.pl FORCE + $(call if_changed,perlasm_poly1305) +targets += mips/poly1305-core.S +endif -obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305-generic.o -libpoly1305-generic-y := poly1305-donna32.o -libpoly1305-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o -libpoly1305-generic-y += poly1305-generic.o +libpoly1305-$(CONFIG_PPC) += powerpc/poly1305-p10le_64.o + +ifeq ($(CONFIG_X86),y) +libpoly1305-y += x86/poly1305-x86_64-cryptogams.o +$(obj)/x86/poly1305-x86_64-cryptogams.S: $(src)/x86/poly1305-x86_64-cryptogams.pl + $(call cmd,perlasm) +endif + +endif # CONFIG_CRYPTO_LIB_POLY1305_ARCH + +# clean-files must be defined unconditionally +clean-files += arm/poly1305-core.S \ + arm64/poly1305-core.S \ + mips/poly1305-core.S \ + x86/poly1305-x86_64-cryptogams.S ################################################################################ diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig index e8444fd0aae3..0d821e282c64 100644 --- a/lib/crypto/arm/Kconfig +++ b/lib/crypto/arm/Kconfig @@ -17,8 +17,3 @@ config CRYPTO_CHACHA20_NEON tristate default CRYPTO_LIB_CHACHA select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_ARM - tristate - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile index 4c042a4c77ed..9f70e61d419e 100644 --- a/lib/crypto/arm/Makefile +++ b/lib/crypto/arm/Makefile @@ -6,21 +6,3 @@ libblake2s-arm-y := blake2s-core.o blake2s-glue.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o chacha-neon-y := chacha-scalar-core.o chacha-glue.o chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o - -obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o -poly1305-arm-y := poly1305-core.o poly1305-glue.o - -quiet_cmd_perl = PERL $@ - cmd_perl = $(PERL) $(<) > $(@) - -$(obj)/%-core.S: $(src)/%-armv4.pl - $(call cmd,perl) - -clean-files += poly1305-core.S - -aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 - -# massage the perlasm code a bit so we only get the NEON routine if we need it -poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 -poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 -AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl index dd7a996361a7..34c11b7b44bd 100644 --- a/lib/crypto/arm/poly1305-armv4.pl +++ b/lib/crypto/arm/poly1305-armv4.pl @@ -43,9 +43,8 @@ $code.=<<___; #else # define __ARM_ARCH__ __LINUX_ARM_ARCH__ # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ -# define poly1305_init poly1305_block_init_arch +# define poly1305_init poly1305_block_init # define poly1305_blocks poly1305_blocks_arm -# define poly1305_emit poly1305_emit_arch #endif #if defined(__thumb2__) diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c deleted file mode 100644 index 9e513e319e37..000000000000 --- a/lib/crypto/arm/poly1305-glue.c +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM - * - * Copyright (C) 2019 Linaro Ltd. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - len = round_down(len, POLY1305_BLOCK_SIZE); - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && likely(may_use_simd())) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else - poly1305_blocks_arm(state, src, len, padbit); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -static int __init arm_poly1305_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - (elf_hwcap & HWCAP_NEON)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(arm_poly1305_mod_init); - -static void __exit arm_poly1305_mod_exit(void) -{ -} -module_exit(arm_poly1305_mod_exit); - -MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/poly1305.h b/lib/crypto/arm/poly1305.h new file mode 100644 index 000000000000..0021cf368307 --- /dev/null +++ b/lib/crypto/arm/poly1305.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM + * + * Copyright (C) 2019 Linaro Ltd. + */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks_arm(state, src, len, padbit); +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) + static_branch_enable(&have_neon); +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig index 0b903ef524d8..07c8a4f0ab03 100644 --- a/lib/crypto/arm64/Kconfig +++ b/lib/crypto/arm64/Kconfig @@ -6,9 +6,3 @@ config CRYPTO_CHACHA20_NEON default CRYPTO_LIB_CHACHA select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_NEON - tristate - depends on KERNEL_MODE_NEON - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile index 6207088397a7..d49cceca3d1c 100644 --- a/lib/crypto/arm64/Makefile +++ b/lib/crypto/arm64/Makefile @@ -2,16 +2,3 @@ obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o -poly1305-neon-y := poly1305-core.o poly1305-glue.o -AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch -AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) void $(@) - -$(obj)/%-core.S: $(src)/%-armv8.pl - $(call cmd,perlasm) - -clean-files += poly1305-core.S diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl index 22c9069c0650..f1930c6b55ce 100644 --- a/lib/crypto/arm64/poly1305-armv8.pl +++ b/lib/crypto/arm64/poly1305-armv8.pl @@ -50,6 +50,9 @@ $code.=<<___; #ifndef __KERNEL__ # include "arm_arch.h" .extern OPENSSL_armcap_P +#else +# define poly1305_init poly1305_block_init +# define poly1305_blocks poly1305_blocks_arm64 #endif .text diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c deleted file mode 100644 index d4a522e7d25a..000000000000 --- a/lib/crypto/arm64/poly1305-glue.c +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 - * - * Copyright (C) 2019 Linaro Ltd. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - len = round_down(len, POLY1305_BLOCK_SIZE); - if (static_branch_likely(&have_neon) && likely(may_use_simd())) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else - poly1305_blocks(state, src, len, padbit); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -static int __init neon_poly1305_mod_init(void) -{ - if (cpu_have_named_feature(ASIMD)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(neon_poly1305_mod_init); - -static void __exit neon_poly1305_mod_exit(void) -{ -} -module_exit(neon_poly1305_mod_exit); - -MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/poly1305.h b/lib/crypto/arm64/poly1305.h new file mode 100644 index 000000000000..aed5921ccd9a --- /dev/null +++ b/lib/crypto/arm64/poly1305.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 + * + * Copyright (C) 2019 Linaro Ltd. + */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks_arm64(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks_arm64(state, src, len, padbit); +} + +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); +} diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig index 0670a170c1be..94c1a0892c20 100644 --- a/lib/crypto/mips/Kconfig +++ b/lib/crypto/mips/Kconfig @@ -5,8 +5,3 @@ config CRYPTO_CHACHA_MIPS depends on CPU_MIPS32_R2 default CRYPTO_LIB_CHACHA select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_MIPS - tristate - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile index 804488c7aded..b5ea0e25c21e 100644 --- a/lib/crypto/mips/Makefile +++ b/lib/crypto/mips/Makefile @@ -3,17 +3,3 @@ obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o chacha-mips-y := chacha-core.o chacha-glue.o AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots - -obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o -poly1305-mips-y := poly1305-core.o poly1305-glue.o - -perlasm-flavour-$(CONFIG_32BIT) := o32 -perlasm-flavour-$(CONFIG_64BIT) := 64 - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) - -$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE - $(call if_changed,perlasm) - -targets += poly1305-core.S diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c deleted file mode 100644 index 002f50f710ab..000000000000 --- a/lib/crypto/mips/poly1305-glue.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS - * - * Copyright (C) 2019 Linaro Ltd. - */ - -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl index 399f10c3e385..71347f34f4f9 100644 --- a/lib/crypto/mips/poly1305-mips.pl +++ b/lib/crypto/mips/poly1305-mips.pl @@ -93,9 +93,7 @@ $code.=<<___; #endif #ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch +# define poly1305_init poly1305_block_init #endif #if defined(__MIPSEB__) && !defined(MIPSEB) @@ -565,9 +563,7 @@ $code.=<<___; #endif #ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch +# define poly1305_init poly1305_block_init #endif #if defined(__MIPSEB__) && !defined(MIPSEB) diff --git a/lib/crypto/mips/poly1305.h b/lib/crypto/mips/poly1305.h new file mode 100644 index 000000000000..85de450f1a93 --- /dev/null +++ b/lib/crypto/mips/poly1305.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS + * + * Copyright (C) 2019 Linaro Ltd. + */ + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); diff --git a/lib/crypto/poly1305-generic.c b/lib/crypto/poly1305-generic.c deleted file mode 100644 index 71a16c5c538b..000000000000 --- a/lib/crypto/poly1305-generic.c +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Poly1305 authenticator algorithm, RFC7539 - * - * Copyright (C) 2015 Martin Willi - * - * Based on public domain code by Andrew Moon and Daniel J. Bernstein. - */ - -#include -#include -#include -#include - -void poly1305_block_init_generic(struct poly1305_block_state *desc, - const u8 raw_key[POLY1305_BLOCK_SIZE]) -{ - poly1305_core_init(&desc->h); - poly1305_core_setkey(&desc->core_r, raw_key); -} -EXPORT_SYMBOL_GPL(poly1305_block_init_generic); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("Poly1305 algorithm (generic implementation)"); diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c index a6dc182b6c22..f313ccc4b4dd 100644 --- a/lib/crypto/poly1305.c +++ b/lib/crypto/poly1305.c @@ -7,7 +7,6 @@ * Based on public domain code by Andrew Moon and Daniel J. Bernstein. */ -#include #include #include #include @@ -15,6 +14,14 @@ #include #include +#ifdef CONFIG_CRYPTO_LIB_POLY1305_ARCH +#include "poly1305.h" /* $(SRCARCH)/poly1305.h */ +#else +#define poly1305_block_init poly1305_block_init_generic +#define poly1305_blocks poly1305_blocks_generic +#define poly1305_emit poly1305_emit_generic +#endif + void poly1305_init(struct poly1305_desc_ctx *desc, const u8 key[POLY1305_KEY_SIZE]) { @@ -23,28 +30,40 @@ void poly1305_init(struct poly1305_desc_ctx *desc, desc->s[2] = get_unaligned_le32(key + 24); desc->s[3] = get_unaligned_le32(key + 28); desc->buflen = 0; - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_block_init_arch(&desc->state, key); - else - poly1305_block_init_generic(&desc->state, key); + poly1305_block_init(&desc->state, key); } EXPORT_SYMBOL(poly1305_init); -static inline void poly1305_blocks(struct poly1305_block_state *state, - const u8 *src, unsigned int len) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_blocks_arch(state, src, len, 1); - else - poly1305_blocks_generic(state, src, len, 1); -} - void poly1305_update(struct poly1305_desc_ctx *desc, const u8 *src, unsigned int nbytes) { - desc->buflen = BLOCK_HASH_UPDATE(poly1305_blocks, &desc->state, - src, nbytes, POLY1305_BLOCK_SIZE, - desc->buf, desc->buflen); + if (desc->buflen + nbytes >= POLY1305_BLOCK_SIZE) { + unsigned int bulk_len; + + if (desc->buflen) { + unsigned int l = POLY1305_BLOCK_SIZE - desc->buflen; + + memcpy(&desc->buf[desc->buflen], src, l); + src += l; + nbytes -= l; + + poly1305_blocks(&desc->state, desc->buf, + POLY1305_BLOCK_SIZE, 1); + desc->buflen = 0; + } + + bulk_len = round_down(nbytes, POLY1305_BLOCK_SIZE); + nbytes %= POLY1305_BLOCK_SIZE; + + if (bulk_len) { + poly1305_blocks(&desc->state, src, bulk_len, 1); + src += bulk_len; + } + } + if (nbytes) { + memcpy(&desc->buf[desc->buflen], src, nbytes); + desc->buflen += nbytes; + } } EXPORT_SYMBOL(poly1305_update); @@ -54,22 +73,28 @@ void poly1305_final(struct poly1305_desc_ctx *desc, u8 *dst) desc->buf[desc->buflen++] = 1; memset(desc->buf + desc->buflen, 0, POLY1305_BLOCK_SIZE - desc->buflen); - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_blocks_arch(&desc->state, desc->buf, - POLY1305_BLOCK_SIZE, 0); - else - poly1305_blocks_generic(&desc->state, desc->buf, - POLY1305_BLOCK_SIZE, 0); + poly1305_blocks(&desc->state, desc->buf, POLY1305_BLOCK_SIZE, + 0); } - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_emit_arch(&desc->state.h, dst, desc->s); - else - poly1305_emit_generic(&desc->state.h, dst, desc->s); + poly1305_emit(&desc->state.h, dst, desc->s); *desc = (struct poly1305_desc_ctx){}; } EXPORT_SYMBOL(poly1305_final); +#ifdef poly1305_mod_init_arch +static int __init poly1305_mod_init(void) +{ + poly1305_mod_init_arch(); + return 0; +} +subsys_initcall(poly1305_mod_init); + +static void __exit poly1305_mod_exit(void) +{ +} +module_exit(poly1305_mod_exit); +#endif + MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539"); diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig index 2eaeb7665a6a..e41012a61876 100644 --- a/lib/crypto/powerpc/Kconfig +++ b/lib/crypto/powerpc/Kconfig @@ -6,11 +6,3 @@ config CRYPTO_CHACHA20_P10 default CRYPTO_LIB_CHACHA select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_P10 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN && VSX - depends on BROKEN # Needs to be fixed to work in softirq context - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - select CRYPTO_LIB_POLY1305_GENERIC diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile index 5709ae14258a..778a04edd226 100644 --- a/lib/crypto/powerpc/Makefile +++ b/lib/crypto/powerpc/Makefile @@ -2,6 +2,3 @@ obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o - -obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o -poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o diff --git a/lib/crypto/powerpc/poly1305-p10-glue.c b/lib/crypto/powerpc/poly1305-p10-glue.c deleted file mode 100644 index 184a71f9c1de..000000000000 --- a/lib/crypto/powerpc/poly1305-p10-glue.c +++ /dev/null @@ -1,90 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Poly1305 authenticator algorithm, RFC7539. - * - * Copyright 2023- IBM Corp. All rights reserved. - */ -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen); -asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit); -asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); - -static void vsx_begin(void) -{ - preempt_disable(); - enable_kernel_vsx(); -} - -static void vsx_end(void) -{ - disable_kernel_vsx(); - preempt_enable(); -} - -void poly1305_block_init_arch(struct poly1305_block_state *dctx, - const u8 raw_key[POLY1305_BLOCK_SIZE]) -{ - if (!static_key_enabled(&have_p10)) - return poly1305_block_init_generic(dctx, raw_key); - - dctx->h = (struct poly1305_state){}; - dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0); - dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8); -} -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - if (!static_key_enabled(&have_p10)) - return poly1305_blocks_generic(state, src, len, padbit); - vsx_begin(); - if (len >= POLY1305_BLOCK_SIZE * 4) { - poly1305_p10le_4blocks(state, src, len); - src += len - (len % (POLY1305_BLOCK_SIZE * 4)); - len %= POLY1305_BLOCK_SIZE * 4; - } - while (len >= POLY1305_BLOCK_SIZE) { - poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit); - len -= POLY1305_BLOCK_SIZE; - src += POLY1305_BLOCK_SIZE; - } - vsx_end(); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]) -{ - if (!static_key_enabled(&have_p10)) - return poly1305_emit_generic(state, digest, nonce); - poly1305_emit_64(state, nonce, digest); -} -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -static int __init poly1305_p10_init(void) -{ - if (cpu_has_feature(CPU_FTR_ARCH_31)) - static_branch_enable(&have_p10); - return 0; -} -subsys_initcall(poly1305_p10_init); - -static void __exit poly1305_p10_exit(void) -{ -} -module_exit(poly1305_p10_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Danny Tsen "); -MODULE_DESCRIPTION("Optimized Poly1305 for P10"); diff --git a/lib/crypto/powerpc/poly1305.h b/lib/crypto/powerpc/poly1305.h new file mode 100644 index 000000000000..b8ed098a0e95 --- /dev/null +++ b/lib/crypto/powerpc/poly1305.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Poly1305 authenticator algorithm, RFC7539. + * + * Copyright 2023- IBM Corp. All rights reserved. + */ +#include +#include +#include +#include +#include + +asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen); +asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit); +asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static void poly1305_block_init(struct poly1305_block_state *dctx, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_block_init_generic(dctx, raw_key); + + dctx->h = (struct poly1305_state){}; + dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0); + dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8); +} + +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_blocks_generic(state, src, len, padbit); + vsx_begin(); + if (len >= POLY1305_BLOCK_SIZE * 4) { + poly1305_p10le_4blocks(state, src, len); + src += len - (len % (POLY1305_BLOCK_SIZE * 4)); + len %= POLY1305_BLOCK_SIZE * 4; + } + while (len >= POLY1305_BLOCK_SIZE) { + poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit); + len -= POLY1305_BLOCK_SIZE; + src += POLY1305_BLOCK_SIZE; + } + vsx_end(); +} + +static void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4]) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_emit_generic(state, digest, nonce); + poly1305_emit_64(state, nonce, digest); +} + +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + static_branch_enable(&have_p10); +} diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig index 546fe2afe0b5..24dc9a59b272 100644 --- a/lib/crypto/x86/Kconfig +++ b/lib/crypto/x86/Kconfig @@ -18,9 +18,3 @@ config CRYPTO_CHACHA20_X86_64 default CRYPTO_LIB_CHACHA select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile index c2ff8c5f1046..16c9d76f9947 100644 --- a/lib/crypto/x86/Makefile +++ b/lib/crypto/x86/Makefile @@ -5,13 +5,3 @@ libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o -poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o -targets += poly1305-x86_64-cryptogams.S - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $< > $@ - -$(obj)/%.S: $(src)/%.pl FORCE - $(call if_changed,perlasm) diff --git a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl index 501827254fed..409ec6955733 100644 --- a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl +++ b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl @@ -118,19 +118,6 @@ sub declare_function() { } } -sub declare_typed_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_TYPED_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - sub end_function() { my ($name) = @_; if($kernel) { @@ -141,7 +128,7 @@ sub end_function() { } $code.=<<___ if $kernel; -#include +#include ___ if ($avx) { @@ -249,14 +236,14 @@ ___ $code.=<<___ if (!$kernel); .extern OPENSSL_ia32cap_P -.globl poly1305_block_init_arch -.hidden poly1305_block_init_arch +.globl poly1305_init_x86_64 +.hidden poly1305_init_x86_64 .globl poly1305_blocks_x86_64 .hidden poly1305_blocks_x86_64 .globl poly1305_emit_x86_64 .hidden poly1305_emit_x86_64 ___ -&declare_typed_function("poly1305_block_init_arch", 32, 3); +&declare_function("poly1305_init_x86_64", 32, 3); $code.=<<___; xor %eax,%eax mov %rax,0($ctx) # initialize hash value @@ -311,7 +298,7 @@ $code.=<<___; .Lno_key: RET ___ -&end_function("poly1305_block_init_arch"); +&end_function("poly1305_init_x86_64"); &declare_function("poly1305_blocks_x86_64", 32, 4); $code.=<<___; @@ -4118,9 +4105,9 @@ avx_handler: .section .pdata .align 4 - .rva .LSEH_begin_poly1305_block_init_arch - .rva .LSEH_end_poly1305_block_init_arch - .rva .LSEH_info_poly1305_block_init_arch + .rva .LSEH_begin_poly1305_init_x86_64 + .rva .LSEH_end_poly1305_init_x86_64 + .rva .LSEH_info_poly1305_init_x86_64 .rva .LSEH_begin_poly1305_blocks_x86_64 .rva .LSEH_end_poly1305_blocks_x86_64 @@ -4168,10 +4155,10 @@ ___ $code.=<<___; .section .xdata .align 8 -.LSEH_info_poly1305_block_init_arch: +.LSEH_info_poly1305_init_x86_64: .byte 9,0,0,0 .rva se_handler - .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 .LSEH_info_poly1305_blocks_x86_64: .byte 9,0,0,0 diff --git a/lib/crypto/x86/poly1305.h b/lib/crypto/x86/poly1305.h new file mode 100644 index 000000000000..ee92e3740a78 --- /dev/null +++ b/lib/crypto/x86/poly1305.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + */ + +#include +#include +#include +#include +#include + +struct poly1305_arch_internal { + union { + struct { + u32 h[5]; + u32 is_base2_26; + }; + u64 hs[3]; + }; + u64 r[2]; + u64 pad; + struct { u32 r2, r1, r4, r3; } rn[9]; +}; + +/* + * The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit + * the unfortunate situation of using AVX and then having to go back to scalar + * -- because the user is silly and has called the update function from two + * separate contexts -- then we need to convert back to the original base before + * proceeding. It is possible to reason that the initial reduction below is + * sufficient given the implementation invariants. However, for an avoidance of + * doubt and because this is not performance critical, we do the full reduction + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py + */ +static void convert_to_base2_64(void *ctx) +{ + struct poly1305_arch_internal *state = ctx; + u32 cy; + + if (!state->is_base2_26) + return; + + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); + state->hs[2] = state->h[4] >> 24; + /* Unsigned Less Than: branchlessly produces 1 if a < b, else 0. */ +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); + state->hs[2] &= 3; + state->hs[0] += cy; + state->hs[1] += (cy = ULT(state->hs[0], cy)); + state->hs[2] += ULT(state->hs[1], cy); +#undef ULT + state->is_base2_26 = 0; +} + +asmlinkage void poly1305_init_x86_64(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, + const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx, + const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx, + const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx, + const u8 *inp, + const size_t len, const u32 padbit); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); + +static void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + poly1305_init_x86_64(state, raw_key); +} + +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *inp, + unsigned int len, u32 padbit) +{ + struct poly1305_arch_internal *ctx = + container_of(&state->h.h, struct poly1305_arch_internal, h); + + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || + SZ_4K % POLY1305_BLOCK_SIZE); + + /* + * The AVX implementations have significant setup overhead (e.g. key + * power computation, kernel FPU enabling) which makes them slower for + * short messages. Fall back to the scalar implementation for messages + * shorter than 288 bytes, unless the AVX-specific key setup has already + * been performed (indicated by ctx->is_base2_26). + */ + if (!static_branch_likely(&poly1305_use_avx) || + (len < POLY1305_BLOCK_SIZE * 18 && !ctx->is_base2_26) || + unlikely(!irq_fpu_usable())) { + convert_to_base2_64(ctx); + poly1305_blocks_x86_64(ctx, inp, len, padbit); + return; + } + + do { + const unsigned int bytes = min(len, SZ_4K); + + kernel_fpu_begin(); + if (static_branch_likely(&poly1305_use_avx512)) + poly1305_blocks_avx512(ctx, inp, bytes, padbit); + else if (static_branch_likely(&poly1305_use_avx2)) + poly1305_blocks_avx2(ctx, inp, bytes, padbit); + else + poly1305_blocks_avx(ctx, inp, bytes, padbit); + kernel_fpu_end(); + + len -= bytes; + inp += bytes; + } while (len); +} + +static void poly1305_emit(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) +{ + if (!static_branch_likely(&poly1305_use_avx)) + poly1305_emit_x86_64(ctx, mac, nonce); + else + poly1305_emit_avx(ctx, mac, nonce); +} + +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx); + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx2); + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ + boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) + static_branch_enable(&poly1305_use_avx512); +} diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305_glue.c deleted file mode 100644 index deb5841cb0ad..000000000000 --- a/lib/crypto/x86/poly1305_glue.c +++ /dev/null @@ -1,169 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct poly1305_arch_internal { - union { - struct { - u32 h[5]; - u32 is_base2_26; - }; - u64 hs[3]; - }; - u64 r[2]; - u64 pad; - struct { u32 r2, r1, r4, r3; } rn[9]; -}; - -/* - * The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit - * the unfortunate situation of using AVX and then having to go back to scalar - * -- because the user is silly and has called the update function from two - * separate contexts -- then we need to convert back to the original base before - * proceeding. It is possible to reason that the initial reduction below is - * sufficient given the implementation invariants. However, for an avoidance of - * doubt and because this is not performance critical, we do the full reduction - * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py - */ -static void convert_to_base2_64(void *ctx) -{ - struct poly1305_arch_internal *state = ctx; - u32 cy; - - if (!state->is_base2_26) - return; - - cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; - cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; - cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; - cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; - state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; - state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); - state->hs[2] = state->h[4] >> 24; - /* Unsigned Less Than: branchlessly produces 1 if a < b, else 0. */ -#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) - cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); - state->hs[2] &= 3; - state->hs[0] += cy; - state->hs[1] += (cy = ULT(state->hs[0], cy)); - state->hs[2] += ULT(state->hs[1], cy); -#undef ULT - state->is_base2_26 = 0; -} - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, - const u8 *inp, - const size_t len, const u32 padbit); -asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx, - const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx, - const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx, - const u8 *inp, - const size_t len, const u32 padbit); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, - unsigned int len, u32 padbit) -{ - struct poly1305_arch_internal *ctx = - container_of(&state->h.h, struct poly1305_arch_internal, h); - - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || - SZ_4K % POLY1305_BLOCK_SIZE); - - /* - * The AVX implementations have significant setup overhead (e.g. key - * power computation, kernel FPU enabling) which makes them slower for - * short messages. Fall back to the scalar implementation for messages - * shorter than 288 bytes, unless the AVX-specific key setup has already - * been performed (indicated by ctx->is_base2_26). - */ - if (!static_branch_likely(&poly1305_use_avx) || - (len < POLY1305_BLOCK_SIZE * 18 && !ctx->is_base2_26) || - unlikely(!irq_fpu_usable())) { - convert_to_base2_64(ctx); - poly1305_blocks_x86_64(ctx, inp, len, padbit); - return; - } - - do { - const unsigned int bytes = min(len, SZ_4K); - - kernel_fpu_begin(); - if (static_branch_likely(&poly1305_use_avx512)) - poly1305_blocks_avx512(ctx, inp, bytes, padbit); - else if (static_branch_likely(&poly1305_use_avx2)) - poly1305_blocks_avx2(ctx, inp, bytes, padbit); - else - poly1305_blocks_avx(ctx, inp, bytes, padbit); - kernel_fpu_end(); - - len -= bytes; - inp += bytes; - } while (len); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -void poly1305_emit_arch(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) -{ - if (!static_branch_likely(&poly1305_use_avx)) - poly1305_emit_x86_64(ctx, mac, nonce); - else - poly1305_emit_avx(ctx, mac, nonce); -} -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -static int __init poly1305_simd_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_AVX) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx2); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && - /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ - boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) - static_branch_enable(&poly1305_use_avx512); - return 0; -} -subsys_initcall(poly1305_simd_mod_init); - -static void __exit poly1305_simd_mod_exit(void) -{ -} -module_exit(poly1305_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jason A. Donenfeld "); -MODULE_DESCRIPTION("Poly1305 authenticator"); -- cgit v1.2.3 From c4b846ff6ecab0427cc7dcccbe0af60b244a6d56 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:22 -0700 Subject: lib/crypto: chacha: Remove unused function chacha_is_arch_optimized() chacha_is_arch_optimized() is no longer used, so remove it. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/chacha.h | 9 --------- lib/crypto/arm/chacha-glue.c | 7 ------- lib/crypto/arm64/chacha-neon-glue.c | 6 ------ lib/crypto/mips/chacha-glue.c | 6 ------ lib/crypto/powerpc/chacha-p10-glue.c | 6 ------ lib/crypto/riscv/chacha-riscv64-glue.c | 6 ------ lib/crypto/s390/chacha-glue.c | 6 ------ lib/crypto/x86/chacha_glue.c | 6 ------ 8 files changed, 52 deletions(-) (limited to 'include') diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index 91f6b4cf561c..be25a0b65a05 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -119,13 +119,4 @@ static inline void chacha_zeroize_state(struct chacha_state *state) memzero_explicit(state, sizeof(*state)); } -#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA) -bool chacha_is_arch_optimized(void); -#else -static inline bool chacha_is_arch_optimized(void) -{ - return false; -} -#endif - #endif /* _CRYPTO_CHACHA_H */ diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha-glue.c index 88ec96415283..67ba045cae35 100644 --- a/lib/crypto/arm/chacha-glue.c +++ b/lib/crypto/arm/chacha-glue.c @@ -101,13 +101,6 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - /* We always can use at least the ARM scalar implementation. */ - return true; -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - static int __init chacha_arm_mod_init(void) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha-neon-glue.c index d0188f974ca5..48097aa34af7 100644 --- a/lib/crypto/arm64/chacha-neon-glue.c +++ b/lib/crypto/arm64/chacha-neon-glue.c @@ -95,12 +95,6 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&have_neon); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - static int __init chacha_simd_mod_init(void) { if (cpu_have_named_feature(ASIMD)) diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha-glue.c index 88c097594eb0..f8390af21dc9 100644 --- a/lib/crypto/mips/chacha-glue.c +++ b/lib/crypto/mips/chacha-glue.c @@ -18,12 +18,6 @@ asmlinkage void hchacha_block_arch(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); EXPORT_SYMBOL(hchacha_block_arch); -bool chacha_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/powerpc/chacha-p10-glue.c b/lib/crypto/powerpc/chacha-p10-glue.c index fcd23c6f1590..5d3d5506d7f9 100644 --- a/lib/crypto/powerpc/chacha-p10-glue.c +++ b/lib/crypto/powerpc/chacha-p10-glue.c @@ -76,12 +76,6 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&have_p10); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - static int __init chacha_p10_init(void) { if (cpu_has_feature(CPU_FTR_ARCH_31)) diff --git a/lib/crypto/riscv/chacha-riscv64-glue.c b/lib/crypto/riscv/chacha-riscv64-glue.c index 8c3f11d79be3..a15f0aca3fc4 100644 --- a/lib/crypto/riscv/chacha-riscv64-glue.c +++ b/lib/crypto/riscv/chacha-riscv64-glue.c @@ -50,12 +50,6 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&use_zvkb); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - static int __init riscv64_chacha_mod_init(void) { if (riscv_isa_extension_available(NULL, ZVKB) && diff --git a/lib/crypto/s390/chacha-glue.c b/lib/crypto/s390/chacha-glue.c index c57dc851214f..d8137387fe28 100644 --- a/lib/crypto/s390/chacha-glue.c +++ b/lib/crypto/s390/chacha-glue.c @@ -47,11 +47,5 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return cpu_has_vx(); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)"); MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/x86/chacha_glue.c b/lib/crypto/x86/chacha_glue.c index 10b2c945f541..de7da9d512af 100644 --- a/lib/crypto/x86/chacha_glue.c +++ b/lib/crypto/x86/chacha_glue.c @@ -160,12 +160,6 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&chacha_use_simd); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - static int __init chacha_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_SSSE3)) -- cgit v1.2.3 From 13cecc526d8fe7eeb9b136159738688a1a10cd82 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:25 -0700 Subject: lib/crypto: chacha: Consolidate into single module Consolidate the ChaCha code into a single module (excluding chacha-block-generic.c which remains always built-in for random.c), similar to various other algorithms: - Each arch now provides a header file lib/crypto/$(SRCARCH)/chacha.h, replacing lib/crypto/$(SRCARCH)/chacha*.c. The header defines chacha_crypt_arch() and hchacha_block_arch(). It is included by lib/crypto/chacha.c, and thus the code gets built into the single libchacha module, with improved inlining in some cases. - Whether arch-optimized ChaCha is buildable is now controlled centrally by lib/crypto/Kconfig instead of by lib/crypto/$(SRCARCH)/Kconfig. The conditions for enabling it remain the same as before, and it remains enabled by default. - Any additional arch-specific translation units for the optimized ChaCha code, such as assembly files, are now compiled by lib/crypto/Makefile instead of lib/crypto/$(SRCARCH)/Makefile. This removes the last use for the Makefile and Kconfig files in the arm64, mips, powerpc, riscv, and s390 subdirectories of lib/crypto/. So also remove those files and the references to them. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/chacha.h | 28 +---- lib/crypto/Kconfig | 47 +++----- lib/crypto/Makefile | 43 ++++++-- lib/crypto/arm/Kconfig | 5 - lib/crypto/arm/Makefile | 4 - lib/crypto/arm/chacha-glue.c | 131 ----------------------- lib/crypto/arm/chacha.h | 117 ++++++++++++++++++++ lib/crypto/arm64/Kconfig | 8 -- lib/crypto/arm64/Makefile | 4 - lib/crypto/arm64/chacha-neon-glue.c | 113 -------------------- lib/crypto/arm64/chacha.h | 99 +++++++++++++++++ lib/crypto/chacha.c | 41 ++++++- lib/crypto/mips/Kconfig | 7 -- lib/crypto/mips/Makefile | 5 - lib/crypto/mips/chacha-glue.c | 23 ---- lib/crypto/mips/chacha.h | 14 +++ lib/crypto/powerpc/Kconfig | 8 -- lib/crypto/powerpc/Makefile | 4 - lib/crypto/powerpc/chacha-p10-glue.c | 94 ---------------- lib/crypto/powerpc/chacha.h | 76 +++++++++++++ lib/crypto/riscv/Kconfig | 8 -- lib/crypto/riscv/Makefile | 4 - lib/crypto/riscv/chacha-riscv64-glue.c | 69 ------------ lib/crypto/riscv/chacha.h | 51 +++++++++ lib/crypto/s390/Kconfig | 7 -- lib/crypto/s390/Makefile | 4 - lib/crypto/s390/chacha-glue.c | 51 --------- lib/crypto/s390/chacha.h | 36 +++++++ lib/crypto/x86/Kconfig | 7 -- lib/crypto/x86/Makefile | 3 - lib/crypto/x86/chacha.h | 176 ++++++++++++++++++++++++++++++ lib/crypto/x86/chacha_glue.c | 190 --------------------------------- 32 files changed, 657 insertions(+), 820 deletions(-) delete mode 100644 lib/crypto/arm/chacha-glue.c create mode 100644 lib/crypto/arm/chacha.h delete mode 100644 lib/crypto/arm64/Kconfig delete mode 100644 lib/crypto/arm64/Makefile delete mode 100644 lib/crypto/arm64/chacha-neon-glue.c create mode 100644 lib/crypto/arm64/chacha.h delete mode 100644 lib/crypto/mips/Kconfig delete mode 100644 lib/crypto/mips/Makefile delete mode 100644 lib/crypto/mips/chacha-glue.c create mode 100644 lib/crypto/mips/chacha.h delete mode 100644 lib/crypto/powerpc/Kconfig delete mode 100644 lib/crypto/powerpc/Makefile delete mode 100644 lib/crypto/powerpc/chacha-p10-glue.c create mode 100644 lib/crypto/powerpc/chacha.h delete mode 100644 lib/crypto/riscv/Kconfig delete mode 100644 lib/crypto/riscv/Makefile delete mode 100644 lib/crypto/riscv/chacha-riscv64-glue.c create mode 100644 lib/crypto/riscv/chacha.h delete mode 100644 lib/crypto/s390/Kconfig delete mode 100644 lib/crypto/s390/Makefile delete mode 100644 lib/crypto/s390/chacha-glue.c create mode 100644 lib/crypto/s390/chacha.h create mode 100644 lib/crypto/x86/chacha.h delete mode 100644 lib/crypto/x86/chacha_glue.c (limited to 'include') diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index be25a0b65a05..38e26dff27b0 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -45,19 +45,11 @@ static inline void chacha20_block(struct chacha_state *state, chacha_block_generic(state, out, 20); } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); void hchacha_block_generic(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); -static inline void hchacha_block(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) - hchacha_block_arch(state, out, nrounds); - else - hchacha_block_generic(state, out, nrounds); -} +void hchacha_block(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); enum chacha_constants { /* expand 32-byte k */ CHACHA_CONSTANT_EXPA = 0x61707865U, @@ -93,20 +85,8 @@ static inline void chacha_init(struct chacha_state *state, state->x[15] = get_unaligned_le32(iv + 12); } -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); -void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); - -static inline void chacha_crypt(struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) - chacha_crypt_arch(state, dst, src, bytes, nrounds); - else - chacha_crypt_generic(state, dst, src, bytes, nrounds); -} +void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); static inline void chacha20_crypt(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes) diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index cb4e056a98fa..c1db483bc230 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -44,29 +44,23 @@ config CRYPTO_LIB_BLAKE2S_GENERIC implementation is enabled, this implementation serves the users of CRYPTO_LIB_BLAKE2S. -config CRYPTO_ARCH_HAVE_LIB_CHACHA - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the ChaCha library interface, - either builtin or as a module. - -config CRYPTO_LIB_CHACHA_GENERIC +config CRYPTO_LIB_CHACHA tristate - default CRYPTO_LIB_CHACHA if !CRYPTO_ARCH_HAVE_LIB_CHACHA select CRYPTO_LIB_UTILS help - This symbol can be selected by arch implementations of the ChaCha - library interface that require the generic code as a fallback, e.g., - for SIMD implementations. If no arch specific implementation is - enabled, this implementation serves the users of CRYPTO_LIB_CHACHA. + Enable the ChaCha library interface. Select this if your module uses + chacha_crypt() or hchacha_block(). -config CRYPTO_LIB_CHACHA - tristate - help - Enable the ChaCha library interface. This interface may be fulfilled - by either the generic implementation or an arch-specific one, if one - is available and enabled. +config CRYPTO_LIB_CHACHA_ARCH + bool + depends on CRYPTO_LIB_CHACHA && !UML && !KMSAN + default y if ARM + default y if ARM64 && KERNEL_MODE_NEON + default y if MIPS && CPU_MIPS32_R2 + default y if PPC64 && CPU_LITTLE_ENDIAN && VSX + default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if S390 + default y if X86_64 config CRYPTO_ARCH_HAVE_LIB_CURVE25519 bool @@ -218,21 +212,6 @@ if !KMSAN # avoid false positives from assembly if ARM source "lib/crypto/arm/Kconfig" endif -if ARM64 -source "lib/crypto/arm64/Kconfig" -endif -if MIPS -source "lib/crypto/mips/Kconfig" -endif -if PPC -source "lib/crypto/powerpc/Kconfig" -endif -if RISCV -source "lib/crypto/riscv/Kconfig" -endif -if S390 -source "lib/crypto/s390/Kconfig" -endif if X86 source "lib/crypto/x86/Kconfig" endif diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index a006048ba2bd..baafd58b5dfd 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -15,11 +15,6 @@ obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o libcryptoutils-y := memneq.o utils.o -# chacha20_block() is used by the /dev/random driver which is always builtin -obj-y += chacha-block-generic.o -obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o -libchacha-y := chacha.o - obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o libaes-y := aes.o @@ -40,6 +35,39 @@ libblake2s-y := blake2s.o libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += blake2s-generic.o libblake2s-$(CONFIG_CRYPTO_SELFTESTS) += blake2s-selftest.o +################################################################################ + +# chacha20_block() is used by the /dev/random driver which is always builtin +obj-y += chacha-block-generic.o + +obj-$(CONFIG_CRYPTO_LIB_CHACHA) += libchacha.o +libchacha-y := chacha.o + +ifeq ($(CONFIG_CRYPTO_LIB_CHACHA_ARCH),y) +CFLAGS_chacha.o += -I$(src)/$(SRCARCH) + +ifeq ($(CONFIG_ARM),y) +libchacha-y += arm/chacha-scalar-core.o +libchacha-$(CONFIG_KERNEL_MODE_NEON) += arm/chacha-neon-core.o +endif + +libchacha-$(CONFIG_ARM64) += arm64/chacha-neon-core.o + +ifeq ($(CONFIG_MIPS),y) +libchacha-y += mips/chacha-core.o +AFLAGS_mips/chacha-core.o += -O2 # needed to fill branch delay slots +endif + +libchacha-$(CONFIG_PPC) += powerpc/chacha-p10le-8x.o +libchacha-$(CONFIG_RISCV) += riscv/chacha-riscv64-zvkb.o +libchacha-$(CONFIG_S390) += s390/chacha-s390.o +libchacha-$(CONFIG_X86) += x86/chacha-ssse3-x86_64.o \ + x86/chacha-avx2-x86_64.o \ + x86/chacha-avx512vl-x86_64.o +endif # CONFIG_CRYPTO_LIB_CHACHA_ARCH + +################################################################################ + obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o libchacha20poly1305-y += chacha20poly1305.o libchacha20poly1305-$(CONFIG_CRYPTO_SELFTESTS) += chacha20poly1305-selftest.o @@ -231,11 +259,6 @@ obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o libsm3-y := sm3.o obj-$(CONFIG_ARM) += arm/ -obj-$(CONFIG_ARM64) += arm64/ -obj-$(CONFIG_MIPS) += mips/ -obj-$(CONFIG_PPC) += powerpc/ -obj-$(CONFIG_RISCV) += riscv/ -obj-$(CONFIG_S390) += s390/ obj-$(CONFIG_X86) += x86/ # clean-files must be defined unconditionally diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig index 0d821e282c64..740341aa35d2 100644 --- a/lib/crypto/arm/Kconfig +++ b/lib/crypto/arm/Kconfig @@ -12,8 +12,3 @@ config CRYPTO_BLAKE2S_ARM BLAKE2b, but slower than the NEON implementation of BLAKE2b. There is no NEON implementation of BLAKE2s, since NEON doesn't really help with it. - -config CRYPTO_CHACHA20_NEON - tristate - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile index 9f70e61d419e..0574b0e9739e 100644 --- a/lib/crypto/arm/Makefile +++ b/lib/crypto/arm/Makefile @@ -2,7 +2,3 @@ obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o libblake2s-arm-y := blake2s-core.o blake2s-glue.o - -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-scalar-core.o chacha-glue.o -chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha-glue.c deleted file mode 100644 index 67ba045cae35..000000000000 --- a/lib/crypto/arm/chacha-glue.c +++ /dev/null @@ -1,131 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ChaCha and HChaCha functions (ARM optimized) - * - * Copyright (C) 2016-2019 Linaro, Ltd. - * Copyright (C) 2015 Martin Willi - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, - u8 *dst, const u8 *src, int nrounds); -asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, - u8 *dst, const u8 *src, - int nrounds, unsigned int nbytes); -asmlinkage void hchacha_block_arm(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); -asmlinkage void hchacha_block_neon(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); - -asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, - const struct chacha_state *state, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon); - -static inline bool neon_usable(void) -{ - return static_branch_likely(&use_neon) && crypto_simd_usable(); -} - -static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - u8 buf[CHACHA_BLOCK_SIZE]; - - while (bytes > CHACHA_BLOCK_SIZE) { - unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); - - chacha_4block_xor_neon(state, dst, src, nrounds, l); - bytes -= l; - src += l; - dst += l; - state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); - } - if (bytes) { - const u8 *s = src; - u8 *d = dst; - - if (bytes != CHACHA_BLOCK_SIZE) - s = d = memcpy(buf, src, bytes); - chacha_block_xor_neon(state, d, s, nrounds); - if (d != dst) - memcpy(dst, buf, bytes); - state->x[12]++; - } -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { - hchacha_block_arm(state, out, nrounds); - } else { - kernel_neon_begin(); - hchacha_block_neon(state, out, nrounds); - kernel_neon_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || - bytes <= CHACHA_BLOCK_SIZE) { - chacha_doarm(dst, src, bytes, state, nrounds); - state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE); - return; - } - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, nrounds); - kernel_neon_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int __init chacha_arm_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { - switch (read_cpuid_part()) { - case ARM_CPU_PART_CORTEX_A7: - case ARM_CPU_PART_CORTEX_A5: - /* - * The Cortex-A7 and Cortex-A5 do not perform well with - * the NEON implementation but do incredibly with the - * scalar one and use less power. - */ - break; - default: - static_branch_enable(&use_neon); - } - } - return 0; -} -subsys_initcall(chacha_arm_mod_init); - -static void __exit chacha_arm_mod_exit(void) -{ -} -module_exit(chacha_arm_mod_exit); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/chacha.h b/lib/crypto/arm/chacha.h new file mode 100644 index 000000000000..0cae30f8ee5d --- /dev/null +++ b/lib/crypto/arm/chacha.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ChaCha and HChaCha functions (ARM optimized) + * + * Copyright (C) 2016-2019 Linaro, Ltd. + * Copyright (C) 2015 Martin Willi + */ + +#include +#include +#include + +#include +#include +#include +#include + +asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, int nrounds); +asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, + int nrounds, unsigned int nbytes); +asmlinkage void hchacha_block_arm(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); +asmlinkage void hchacha_block_neon(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + const struct chacha_state *state, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon); + +static inline bool neon_usable(void) +{ + return static_branch_likely(&use_neon) && crypto_simd_usable(); +} + +static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 buf[CHACHA_BLOCK_SIZE]; + + while (bytes > CHACHA_BLOCK_SIZE) { + unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); + + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= l; + src += l; + dst += l; + state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } + if (bytes) { + const u8 *s = src; + u8 *d = dst; + + if (bytes != CHACHA_BLOCK_SIZE) + s = d = memcpy(buf, src, bytes); + chacha_block_xor_neon(state, d, s, nrounds); + if (d != dst) + memcpy(dst, buf, bytes); + state->x[12]++; + } +} + +static void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { + hchacha_block_arm(state, out, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, out, nrounds); + kernel_neon_end(); + } +} + +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || + bytes <= CHACHA_BLOCK_SIZE) { + chacha_doarm(dst, src, bytes, state, nrounds); + state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE); + return; + } + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} + +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { + switch (read_cpuid_part()) { + case ARM_CPU_PART_CORTEX_A7: + case ARM_CPU_PART_CORTEX_A5: + /* + * The Cortex-A7 and Cortex-A5 do not perform well with + * the NEON implementation but do incredibly with the + * scalar one and use less power. + */ + break; + default: + static_branch_enable(&use_neon); + } + } +} diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig deleted file mode 100644 index 07c8a4f0ab03..000000000000 --- a/lib/crypto/arm64/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA20_NEON - tristate - depends on KERNEL_MODE_NEON - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile deleted file mode 100644 index d49cceca3d1c..000000000000 --- a/lib/crypto/arm64/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha-neon-glue.c deleted file mode 100644 index 48097aa34af7..000000000000 --- a/lib/crypto/arm64/chacha-neon-glue.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * ChaCha and HChaCha functions (ARM64 optimized) - * - * Copyright (C) 2016 - 2017 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, - u8 *dst, const u8 *src, int nrounds); -asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, - u8 *dst, const u8 *src, - int nrounds, int bytes); -asmlinkage void hchacha_block_neon(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, - int bytes, int nrounds) -{ - while (bytes > 0) { - int l = min(bytes, CHACHA_BLOCK_SIZE * 5); - - if (l <= CHACHA_BLOCK_SIZE) { - u8 buf[CHACHA_BLOCK_SIZE]; - - memcpy(buf, src, l); - chacha_block_xor_neon(state, buf, buf, nrounds); - memcpy(dst, buf, l); - state->x[12] += 1; - break; - } - chacha_4block_xor_neon(state, dst, src, nrounds, l); - bytes -= l; - src += l; - dst += l; - state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); - } -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { - hchacha_block_generic(state, out, nrounds); - } else { - kernel_neon_begin(); - hchacha_block_neon(state, out, nrounds); - kernel_neon_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || - !crypto_simd_usable()) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, nrounds); - kernel_neon_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int __init chacha_simd_mod_init(void) -{ - if (cpu_have_named_feature(ASIMD)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} -module_exit(chacha_simd_mod_exit); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/chacha.h b/lib/crypto/arm64/chacha.h new file mode 100644 index 000000000000..ba6c22d46086 --- /dev/null +++ b/lib/crypto/arm64/chacha.h @@ -0,0 +1,99 @@ +/* + * ChaCha and HChaCha functions (ARM64 optimized) + * + * Copyright (C) 2016 - 2017 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include + +#include +#include +#include + +asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, int nrounds); +asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, + int nrounds, int bytes); +asmlinkage void hchacha_block_neon(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, + int bytes, int nrounds) +{ + while (bytes > 0) { + int l = min(bytes, CHACHA_BLOCK_SIZE * 5); + + if (l <= CHACHA_BLOCK_SIZE) { + u8 buf[CHACHA_BLOCK_SIZE]; + + memcpy(buf, src, l); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, l); + state->x[12] += 1; + break; + } + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= l; + src += l; + dst += l; + state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } +} + +static void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { + hchacha_block_generic(state, out, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, out, nrounds); + kernel_neon_end(); + } +} + +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} + +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); +} diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c index 26862ad90a96..e0c7cb4af318 100644 --- a/lib/crypto/chacha.c +++ b/lib/crypto/chacha.c @@ -11,8 +11,9 @@ #include #include -void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void __maybe_unused +chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) { /* aligned to potentially speed up crypto_xor() */ u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long)); @@ -29,7 +30,41 @@ void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, crypto_xor_cpy(dst, src, stream, bytes); } } -EXPORT_SYMBOL(chacha_crypt_generic); + +#ifdef CONFIG_CRYPTO_LIB_CHACHA_ARCH +#include "chacha.h" /* $(SRCARCH)/chacha.h */ +#else +#define chacha_crypt_arch chacha_crypt_generic +#define hchacha_block_arch hchacha_block_generic +#endif + +void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + chacha_crypt_arch(state, dst, src, bytes, nrounds); +} +EXPORT_SYMBOL_GPL(chacha_crypt); + +void hchacha_block(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + hchacha_block_arch(state, out, nrounds); +} +EXPORT_SYMBOL_GPL(hchacha_block); + +#ifdef chacha_mod_init_arch +static int __init chacha_mod_init(void) +{ + chacha_mod_init_arch(); + return 0; +} +subsys_initcall(chacha_mod_init); + +static void __exit chacha_mod_exit(void) +{ +} +module_exit(chacha_mod_exit); +#endif MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig deleted file mode 100644 index 94c1a0892c20..000000000000 --- a/lib/crypto/mips/Kconfig +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_MIPS - tristate - depends on CPU_MIPS32_R2 - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile deleted file mode 100644 index b5ea0e25c21e..000000000000 --- a/lib/crypto/mips/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o -chacha-mips-y := chacha-core.o chacha-glue.o -AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha-glue.c deleted file mode 100644 index f8390af21dc9..000000000000 --- a/lib/crypto/mips/chacha-glue.c +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ChaCha and HChaCha functions (MIPS optimized) - * - * Copyright (C) 2019 Linaro, Ltd. - */ - -#include -#include -#include - -asmlinkage void chacha_crypt_arch(struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); -EXPORT_SYMBOL(chacha_crypt_arch); - -asmlinkage void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); -EXPORT_SYMBOL(hchacha_block_arch); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/chacha.h b/lib/crypto/mips/chacha.h new file mode 100644 index 000000000000..0c18c0dc2a40 --- /dev/null +++ b/lib/crypto/mips/chacha.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ChaCha and HChaCha functions (MIPS optimized) + * + * Copyright (C) 2019 Linaro, Ltd. + */ + +#include + +asmlinkage void chacha_crypt_arch(struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); +asmlinkage void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig deleted file mode 100644 index e41012a61876..000000000000 --- a/lib/crypto/powerpc/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA20_P10 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN && VSX - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile deleted file mode 100644 index 778a04edd226..000000000000 --- a/lib/crypto/powerpc/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o -chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o diff --git a/lib/crypto/powerpc/chacha-p10-glue.c b/lib/crypto/powerpc/chacha-p10-glue.c deleted file mode 100644 index 5d3d5506d7f9..000000000000 --- a/lib/crypto/powerpc/chacha-p10-glue.c +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * ChaCha stream cipher (P10 accelerated) - * - * Copyright 2023- IBM Corp. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, - const u8 *src, unsigned int len, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); - -static void vsx_begin(void) -{ - preempt_disable(); - enable_kernel_vsx(); -} - -static void vsx_end(void) -{ - disable_kernel_vsx(); - preempt_enable(); -} - -static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - unsigned int l = bytes & ~0x0FF; - - if (l > 0) { - chacha_p10le_8x(state, dst, src, l, nrounds); - bytes -= l; - src += l; - dst += l; - state->x[12] += l / CHACHA_BLOCK_SIZE; - } - - if (bytes > 0) - chacha_crypt_generic(state, dst, src, bytes, nrounds); -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || - !crypto_simd_usable()) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - vsx_begin(); - chacha_p10_do_8x(state, dst, src, todo, nrounds); - vsx_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int __init chacha_p10_init(void) -{ - if (cpu_has_feature(CPU_FTR_ARCH_31)) - static_branch_enable(&have_p10); - return 0; -} -subsys_initcall(chacha_p10_init); - -static void __exit chacha_p10_exit(void) -{ -} -module_exit(chacha_p10_exit); - -MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)"); -MODULE_AUTHOR("Danny Tsen "); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/powerpc/chacha.h b/lib/crypto/powerpc/chacha.h new file mode 100644 index 000000000000..1df6e1ce31c4 --- /dev/null +++ b/lib/crypto/powerpc/chacha.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ChaCha stream cipher (P10 accelerated) + * + * Copyright 2023- IBM Corp. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + unsigned int l = bytes & ~0x0FF; + + if (l > 0) { + chacha_p10le_8x(state, dst, src, l, nrounds); + bytes -= l; + src += l; + dst += l; + state->x[12] += l / CHACHA_BLOCK_SIZE; + } + + if (bytes > 0) + chacha_crypt_generic(state, dst, src, bytes, nrounds); +} + +#define hchacha_block_arch hchacha_block_generic /* not implemented yet */ + +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + vsx_begin(); + chacha_p10_do_8x(state, dst, src, todo, nrounds); + vsx_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} + +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + static_branch_enable(&have_p10); +} diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig deleted file mode 100644 index bc7a43f33eb3..000000000000 --- a/lib/crypto/riscv/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_RISCV64 - tristate - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC diff --git a/lib/crypto/riscv/Makefile b/lib/crypto/riscv/Makefile deleted file mode 100644 index e27b78f317fc..000000000000 --- a/lib/crypto/riscv/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o -chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o diff --git a/lib/crypto/riscv/chacha-riscv64-glue.c b/lib/crypto/riscv/chacha-riscv64-glue.c deleted file mode 100644 index a15f0aca3fc4..000000000000 --- a/lib/crypto/riscv/chacha-riscv64-glue.c +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * ChaCha stream cipher (RISC-V optimized) - * - * Copyright (C) 2023 SiFive, Inc. - * Author: Jerry Shih - */ - -#include -#include -#include -#include -#include -#include - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb); - -asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, - size_t nblocks, int nrounds); - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - u8 block_buffer[CHACHA_BLOCK_SIZE]; - unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE; - unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE; - - if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable()) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - kernel_vector_begin(); - if (full_blocks) { - chacha_zvkb(state, src, dst, full_blocks, nrounds); - src += full_blocks * CHACHA_BLOCK_SIZE; - dst += full_blocks * CHACHA_BLOCK_SIZE; - } - if (tail_bytes) { - memcpy(block_buffer, src, tail_bytes); - chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds); - memcpy(dst, block_buffer, tail_bytes); - } - kernel_vector_end(); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int __init riscv64_chacha_mod_init(void) -{ - if (riscv_isa_extension_available(NULL, ZVKB) && - riscv_vector_vlen() >= 128) - static_branch_enable(&use_zvkb); - return 0; -} -subsys_initcall(riscv64_chacha_mod_init); - -static void __exit riscv64_chacha_mod_exit(void) -{ -} -module_exit(riscv64_chacha_mod_exit); - -MODULE_DESCRIPTION("ChaCha stream cipher (RISC-V optimized)"); -MODULE_AUTHOR("Jerry Shih "); -MODULE_LICENSE("GPL"); diff --git a/lib/crypto/riscv/chacha.h b/lib/crypto/riscv/chacha.h new file mode 100644 index 000000000000..5c000c6aef4b --- /dev/null +++ b/lib/crypto/riscv/chacha.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * ChaCha stream cipher (RISC-V optimized) + * + * Copyright (C) 2023 SiFive, Inc. + * Author: Jerry Shih + */ + +#include +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb); + +asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, + size_t nblocks, int nrounds); + +#define hchacha_block_arch hchacha_block_generic /* not implemented yet */ + +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) +{ + u8 block_buffer[CHACHA_BLOCK_SIZE]; + unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE; + unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE; + + if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + kernel_vector_begin(); + if (full_blocks) { + chacha_zvkb(state, src, dst, full_blocks, nrounds); + src += full_blocks * CHACHA_BLOCK_SIZE; + dst += full_blocks * CHACHA_BLOCK_SIZE; + } + if (tail_bytes) { + memcpy(block_buffer, src, tail_bytes); + chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds); + memcpy(dst, block_buffer, tail_bytes); + } + kernel_vector_end(); +} + +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) +{ + if (riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&use_zvkb); +} diff --git a/lib/crypto/s390/Kconfig b/lib/crypto/s390/Kconfig deleted file mode 100644 index 069b355fe51a..000000000000 --- a/lib/crypto/s390/Kconfig +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_S390 - tristate - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/s390/Makefile b/lib/crypto/s390/Makefile deleted file mode 100644 index 06c2cf77178e..000000000000 --- a/lib/crypto/s390/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o -chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/lib/crypto/s390/chacha-glue.c b/lib/crypto/s390/chacha-glue.c deleted file mode 100644 index d8137387fe28..000000000000 --- a/lib/crypto/s390/chacha-glue.c +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ChaCha stream cipher (s390 optimized) - * - * Copyright IBM Corp. 2021 - */ - -#define KMSG_COMPONENT "chacha_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include "chacha-s390.h" - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - /* TODO: implement hchacha_block_arch() in assembly */ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - /* s390 chacha20 implementation has 20 rounds hard-coded, - * it cannot handle a block of data or less, but otherwise - * it can handle data of arbitrary size - */ - if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) { - chacha_crypt_generic(state, dst, src, bytes, nrounds); - } else { - DECLARE_KERNEL_FPU_ONSTACK32(vxstate); - - kernel_fpu_begin(&vxstate, KERNEL_VXR); - chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]); - kernel_fpu_end(&vxstate, KERNEL_VXR); - - state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) / - CHACHA_BLOCK_SIZE; - } -} -EXPORT_SYMBOL(chacha_crypt_arch); - -MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/s390/chacha.h b/lib/crypto/s390/chacha.h new file mode 100644 index 000000000000..fd9c4a422365 --- /dev/null +++ b/lib/crypto/s390/chacha.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ChaCha stream cipher (s390 optimized) + * + * Copyright IBM Corp. 2021 + */ + +#include +#include +#include +#include +#include +#include "chacha-s390.h" + +#define hchacha_block_arch hchacha_block_generic /* not implemented yet */ + +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) +{ + /* s390 chacha20 implementation has 20 rounds hard-coded, + * it cannot handle a block of data or less, but otherwise + * it can handle data of arbitrary size + */ + if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) { + chacha_crypt_generic(state, dst, src, bytes, nrounds); + } else { + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]); + kernel_fpu_end(&vxstate, KERNEL_VXR); + + state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) / + CHACHA_BLOCK_SIZE; + } +} diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig index 24dc9a59b272..eb47da71aa6b 100644 --- a/lib/crypto/x86/Kconfig +++ b/lib/crypto/x86/Kconfig @@ -11,10 +11,3 @@ config CRYPTO_BLAKE2S_X86 Architecture: x86_64 using: - SSSE3 (Supplemental SSE3) - AVX-512 (Advanced Vector Extensions-512) - -config CRYPTO_CHACHA20_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile index 16c9d76f9947..4454556b243e 100644 --- a/lib/crypto/x86/Makefile +++ b/lib/crypto/x86/Makefile @@ -2,6 +2,3 @@ obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o - -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o -chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o diff --git a/lib/crypto/x86/chacha.h b/lib/crypto/x86/chacha.h new file mode 100644 index 000000000000..10cf8f1c569d --- /dev/null +++ b/lib/crypto/x86/chacha.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ChaCha and HChaCha functions (x86_64 optimized) + * + * Copyright (C) 2015 Martin Willi + */ + +#include +#include +#include +#include + +asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); + +static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) +{ + len = min(len, maxblocks * CHACHA_BLOCK_SIZE); + return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (static_branch_likely(&chacha_use_avx512vl)) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state->x[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_2block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 2); + return; + } + } + + if (static_branch_likely(&chacha_use_avx2)) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state->x[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 2); + return; + } + } + + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state->x[12] += 4; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); + state->x[12]++; + } +} + +static void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!static_branch_likely(&chacha_use_simd)) { + hchacha_block_generic(state, out, nrounds); + } else { + kernel_fpu_begin(); + hchacha_block_ssse3(state, out, nrounds); + kernel_fpu_end(); + } +} + +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&chacha_use_simd) || + bytes <= CHACHA_BLOCK_SIZE) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, todo, nrounds); + kernel_fpu_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} + +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return; + + static_branch_enable(&chacha_use_simd); + + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + static_branch_enable(&chacha_use_avx2); + + if (boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ + static_branch_enable(&chacha_use_avx512vl); + } +} diff --git a/lib/crypto/x86/chacha_glue.c b/lib/crypto/x86/chacha_glue.c deleted file mode 100644 index de7da9d512af..000000000000 --- a/lib/crypto/x86/chacha_glue.c +++ /dev/null @@ -1,190 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * ChaCha and HChaCha functions (x86_64 optimized) - * - * Copyright (C) 2015 Martin Willi - */ - -#include -#include -#include -#include -#include -#include - -asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); - -asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); - -static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) -{ - len = min(len, maxblocks * CHACHA_BLOCK_SIZE); - return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (static_branch_likely(&chacha_use_avx512vl)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state->x[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_2block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 2); - return; - } - } - - if (static_branch_likely(&chacha_use_avx2)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state->x[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 2); - return; - } - } - - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state->x[12] += 4; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); - state->x[12]++; - } -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd)) { - hchacha_block_generic(state, out, nrounds); - } else { - kernel_fpu_begin(); - hchacha_block_ssse3(state, out, nrounds); - kernel_fpu_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd) || - bytes <= CHACHA_BLOCK_SIZE) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_fpu_begin(); - chacha_dosimd(state, dst, src, todo, nrounds); - kernel_fpu_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int __init chacha_simd_mod_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; - - static_branch_enable(&chacha_use_simd); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - static_branch_enable(&chacha_use_avx2); - - if (boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ - static_branch_enable(&chacha_use_avx512vl); - } - return 0; -} -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} -module_exit(chacha_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); -- cgit v1.2.3 From 126f5d90f6c855b39eebec17f93c2f9d2ce01ebb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:27 -0700 Subject: lib/crypto: blake2s: Remove obsolete self-test Remove the original BLAKE2s self-test, since it will be superseded by blake2s_kunit. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-9-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/internal/blake2s.h | 2 - lib/crypto/Makefile | 1 - lib/crypto/blake2s-selftest.c | 651 -------------------------------------- lib/crypto/blake2s.c | 10 - 4 files changed, 664 deletions(-) delete mode 100644 lib/crypto/blake2s-selftest.c (limited to 'include') diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h index 506d56530ca9..3b82572c9433 100644 --- a/include/crypto/internal/blake2s.h +++ b/include/crypto/internal/blake2s.h @@ -16,6 +16,4 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, void blake2s_compress(struct blake2s_state *state, const u8 *block, size_t nblocks, const u32 inc); -bool blake2s_selftest(void); - #endif /* _CRYPTO_INTERNAL_BLAKE2S_H */ diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index baafd58b5dfd..b2d2745879d1 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o obj-y += libblake2s.o libblake2s-y := blake2s.o libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += blake2s-generic.o -libblake2s-$(CONFIG_CRYPTO_SELFTESTS) += blake2s-selftest.o ################################################################################ diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c deleted file mode 100644 index d0634ed6a937..000000000000 --- a/lib/crypto/blake2s-selftest.c +++ /dev/null @@ -1,651 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - */ - -#include -#include -#include -#include - -/* - * blake2s_testvecs[] generated with the program below (using libb2-dev and - * libssl-dev [OpenSSL]) - * - * #include - * #include - * #include - * - * #include - * - * #define BLAKE2S_TESTVEC_COUNT 256 - * - * static void print_vec(const uint8_t vec[], int len) - * { - * int i; - * - * printf(" { "); - * for (i = 0; i < len; i++) { - * if (i && (i % 12) == 0) - * printf("\n "); - * printf("0x%02x, ", vec[i]); - * } - * printf("},\n"); - * } - * - * int main(void) - * { - * uint8_t key[BLAKE2S_KEYBYTES]; - * uint8_t buf[BLAKE2S_TESTVEC_COUNT]; - * uint8_t hash[BLAKE2S_OUTBYTES]; - * int i, j; - * - * key[0] = key[1] = 1; - * for (i = 2; i < BLAKE2S_KEYBYTES; ++i) - * key[i] = key[i - 2] + key[i - 1]; - * - * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) - * buf[i] = (uint8_t)i; - * - * printf("static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n"); - * - * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) { - * int outlen = 1 + i % BLAKE2S_OUTBYTES; - * int keylen = (13 * i) % (BLAKE2S_KEYBYTES + 1); - * - * blake2s(hash, buf, key + BLAKE2S_KEYBYTES - keylen, outlen, i, - * keylen); - * print_vec(hash, outlen); - * } - * printf("};\n\n"); - * - * return 0; - *} - */ -static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = { - { 0xa1, }, - { 0x7c, 0x89, }, - { 0x74, 0x0e, 0xd4, }, - { 0x47, 0x0c, 0x21, 0x15, }, - { 0x18, 0xd6, 0x9c, 0xa6, 0xc4, }, - { 0x13, 0x5d, 0x16, 0x63, 0x2e, 0xf9, }, - { 0x2c, 0xb5, 0x04, 0xb7, 0x99, 0xe2, 0x73, }, - { 0x9a, 0x0f, 0xd2, 0x39, 0xd6, 0x68, 0x1b, 0x92, }, - { 0xc8, 0xde, 0x7a, 0xea, 0x2f, 0xf4, 0xd2, 0xe3, 0x2b, }, - { 0x5b, 0xf9, 0x43, 0x52, 0x0c, 0x12, 0xba, 0xb5, 0x93, 0x9f, }, - { 0xc6, 0x2c, 0x4e, 0x80, 0xfc, 0x32, 0x5b, 0x33, 0xb8, 0xb8, 0x0a, }, - { 0xa7, 0x5c, 0xfd, 0x3a, 0xcc, 0xbf, 0x90, 0xca, 0xb7, 0x97, 0xde, 0xd8, }, - { 0x66, 0xca, 0x3c, 0xc4, 0x19, 0xef, 0x92, 0x66, 0x3f, 0x21, 0x8f, 0xda, - 0xb7, }, - { 0xba, 0xe5, 0xbb, 0x30, 0x25, 0x94, 0x6d, 0xc3, 0x89, 0x09, 0xc4, 0x25, - 0x52, 0x3e, }, - { 0xa2, 0xef, 0x0e, 0x52, 0x0b, 0x5f, 0xa2, 0x01, 0x6d, 0x0a, 0x25, 0xbc, - 0x57, 0xe2, 0x27, }, - { 0x4f, 0xe0, 0xf9, 0x52, 0x12, 0xda, 0x84, 0xb7, 0xab, 0xae, 0xb0, 0xa6, - 0x47, 0x2a, 0xc7, 0xf5, }, - { 0x56, 0xe7, 0xa8, 0x1c, 0x4c, 0xca, 0xed, 0x90, 0x31, 0xec, 0x87, 0x43, - 0xe7, 0x72, 0x08, 0xec, 0xbe, }, - { 0x7e, 0xdf, 0x80, 0x1c, 0x93, 0x33, 0xfd, 0x53, 0x44, 0xba, 0xfd, 0x96, - 0xe1, 0xbb, 0xb5, 0x65, 0xa5, 0x00, }, - { 0xec, 0x6b, 0xed, 0xf7, 0x7b, 0x62, 0x1d, 0x7d, 0xf4, 0x82, 0xf3, 0x1e, - 0x18, 0xff, 0x2b, 0xc4, 0x06, 0x20, 0x2a, }, - { 0x74, 0x98, 0xd7, 0x68, 0x63, 0xed, 0x87, 0xe4, 0x5d, 0x8d, 0x9e, 0x1d, - 0xfd, 0x2a, 0xbb, 0x86, 0xac, 0xe9, 0x2a, 0x89, }, - { 0x89, 0xc3, 0x88, 0xce, 0x2b, 0x33, 0x1e, 0x10, 0xd1, 0x37, 0x20, 0x86, - 0x28, 0x43, 0x70, 0xd9, 0xfb, 0x96, 0xd9, 0xb5, 0xd3, }, - { 0xcb, 0x56, 0x74, 0x41, 0x8d, 0x80, 0x01, 0x9a, 0x6b, 0x38, 0xe1, 0x41, - 0xad, 0x9c, 0x62, 0x74, 0xce, 0x35, 0xd5, 0x6c, 0x89, 0x6e, }, - { 0x79, 0xaf, 0x94, 0x59, 0x99, 0x26, 0xe1, 0xc9, 0x34, 0xfe, 0x7c, 0x22, - 0xf7, 0x43, 0xd7, 0x65, 0xd4, 0x48, 0x18, 0xac, 0x3d, 0xfd, 0x93, }, - { 0x85, 0x0d, 0xff, 0xb8, 0x3e, 0x87, 0x41, 0xb0, 0x95, 0xd3, 0x3d, 0x00, - 0x47, 0x55, 0x9e, 0xd2, 0x69, 0xea, 0xbf, 0xe9, 0x7a, 0x2d, 0x61, 0x45, }, - { 0x03, 0xe0, 0x85, 0xec, 0x54, 0xb5, 0x16, 0x53, 0xa8, 0xc4, 0x71, 0xe9, - 0x6a, 0xe7, 0xcb, 0xc4, 0x15, 0x02, 0xfc, 0x34, 0xa4, 0xa4, 0x28, 0x13, - 0xd1, }, - { 0xe3, 0x34, 0x4b, 0xe1, 0xd0, 0x4b, 0x55, 0x61, 0x8f, 0xc0, 0x24, 0x05, - 0xe6, 0xe0, 0x3d, 0x70, 0x24, 0x4d, 0xda, 0xb8, 0x91, 0x05, 0x29, 0x07, - 0x01, 0x3e, }, - { 0x61, 0xff, 0x01, 0x72, 0xb1, 0x4d, 0xf6, 0xfe, 0xd1, 0xd1, 0x08, 0x74, - 0xe6, 0x91, 0x44, 0xeb, 0x61, 0xda, 0x40, 0xaf, 0xfc, 0x8c, 0x91, 0x6b, - 0xec, 0x13, 0xed, }, - { 0xd4, 0x40, 0xd2, 0xa0, 0x7f, 0xc1, 0x58, 0x0c, 0x85, 0xa0, 0x86, 0xc7, - 0x86, 0xb9, 0x61, 0xc9, 0xea, 0x19, 0x86, 0x1f, 0xab, 0x07, 0xce, 0x37, - 0x72, 0x67, 0x09, 0xfc, }, - { 0x9e, 0xf8, 0x18, 0x67, 0x93, 0x10, 0x9b, 0x39, 0x75, 0xe8, 0x8b, 0x38, - 0x82, 0x7d, 0xb8, 0xb7, 0xa5, 0xaf, 0xe6, 0x6a, 0x22, 0x5e, 0x1f, 0x9c, - 0x95, 0x29, 0x19, 0xf2, 0x4b, }, - { 0xc8, 0x62, 0x25, 0xf5, 0x98, 0xc9, 0xea, 0xe5, 0x29, 0x3a, 0xd3, 0x22, - 0xeb, 0xeb, 0x07, 0x7c, 0x15, 0x07, 0xee, 0x15, 0x61, 0xbb, 0x05, 0x30, - 0x99, 0x7f, 0x11, 0xf6, 0x0a, 0x1d, }, - { 0x68, 0x70, 0xf7, 0x90, 0xa1, 0x8b, 0x1f, 0x0f, 0xbb, 0xce, 0xd2, 0x0e, - 0x33, 0x1f, 0x7f, 0xa9, 0x78, 0xa8, 0xa6, 0x81, 0x66, 0xab, 0x8d, 0xcd, - 0x58, 0x55, 0x3a, 0x0b, 0x7a, 0xdb, 0xb5, }, - { 0xdd, 0x35, 0xd2, 0xb4, 0xf6, 0xc7, 0xea, 0xab, 0x64, 0x24, 0x4e, 0xfe, - 0xe5, 0x3d, 0x4e, 0x95, 0x8b, 0x6d, 0x6c, 0xbc, 0xb0, 0xf8, 0x88, 0x61, - 0x09, 0xb7, 0x78, 0xa3, 0x31, 0xfe, 0xd9, 0x2f, }, - { 0x0a, }, - { 0x6e, 0xd4, }, - { 0x64, 0xe9, 0xd1, }, - { 0x30, 0xdd, 0x71, 0xef, }, - { 0x11, 0xb5, 0x0c, 0x87, 0xc9, }, - { 0x06, 0x1c, 0x6d, 0x04, 0x82, 0xd0, }, - { 0x5c, 0x42, 0x0b, 0xee, 0xc5, 0x9c, 0xb2, }, - { 0xe8, 0x29, 0xd6, 0xb4, 0x5d, 0xf7, 0x2b, 0x93, }, - { 0x18, 0xca, 0x27, 0x72, 0x43, 0x39, 0x16, 0xbc, 0x6a, }, - { 0x39, 0x8f, 0xfd, 0x64, 0xf5, 0x57, 0x23, 0xb0, 0x45, 0xf8, }, - { 0xbb, 0x3a, 0x78, 0x6b, 0x02, 0x1d, 0x0b, 0x16, 0xe3, 0xb2, 0x9a, }, - { 0xb8, 0xb4, 0x0b, 0xe5, 0xd4, 0x1d, 0x0d, 0x85, 0x49, 0x91, 0x35, 0xfa, }, - { 0x6d, 0x48, 0x2a, 0x0c, 0x42, 0x08, 0xbd, 0xa9, 0x78, 0x6f, 0x18, 0xaf, - 0xe2, }, - { 0x10, 0x45, 0xd4, 0x58, 0x88, 0xec, 0x4e, 0x1e, 0xf6, 0x14, 0x92, 0x64, - 0x7e, 0xb0, }, - { 0x8b, 0x0b, 0x95, 0xee, 0x92, 0xc6, 0x3b, 0x91, 0xf1, 0x1e, 0xeb, 0x51, - 0x98, 0x0a, 0x8d, }, - { 0xa3, 0x50, 0x4d, 0xa5, 0x1d, 0x03, 0x68, 0xe9, 0x57, 0x78, 0xd6, 0x04, - 0xf1, 0xc3, 0x94, 0xd8, }, - { 0xb8, 0x66, 0x6e, 0xdd, 0x46, 0x15, 0xae, 0x3d, 0x83, 0x7e, 0xcf, 0xe7, - 0x2c, 0xe8, 0x8f, 0xc7, 0x34, }, - { 0x2e, 0xc0, 0x1f, 0x29, 0xea, 0xf6, 0xb9, 0xe2, 0xc2, 0x93, 0xeb, 0x41, - 0x0d, 0xf0, 0x0a, 0x13, 0x0e, 0xa2, }, - { 0x71, 0xb8, 0x33, 0xa9, 0x1b, 0xac, 0xf1, 0xb5, 0x42, 0x8f, 0x5e, 0x81, - 0x34, 0x43, 0xb7, 0xa4, 0x18, 0x5c, 0x47, }, - { 0xda, 0x45, 0xb8, 0x2e, 0x82, 0x1e, 0xc0, 0x59, 0x77, 0x9d, 0xfa, 0xb4, - 0x1c, 0x5e, 0xa0, 0x2b, 0x33, 0x96, 0x5a, 0x58, }, - { 0xe3, 0x09, 0x05, 0xa9, 0xeb, 0x48, 0x13, 0xad, 0x71, 0x88, 0x81, 0x9a, - 0x3e, 0x2c, 0xe1, 0x23, 0x99, 0x13, 0x35, 0x9f, 0xb5, }, - { 0xb7, 0x86, 0x2d, 0x16, 0xe1, 0x04, 0x00, 0x47, 0x47, 0x61, 0x31, 0xfb, - 0x14, 0xac, 0xd8, 0xe9, 0xe3, 0x49, 0xbd, 0xf7, 0x9c, 0x3f, }, - { 0x7f, 0xd9, 0x95, 0xa8, 0xa7, 0xa0, 0xcc, 0xba, 0xef, 0xb1, 0x0a, 0xa9, - 0x21, 0x62, 0x08, 0x0f, 0x1b, 0xff, 0x7b, 0x9d, 0xae, 0xb2, 0x95, }, - { 0x85, 0x99, 0xea, 0x33, 0xe0, 0x56, 0xff, 0x13, 0xc6, 0x61, 0x8c, 0xf9, - 0x57, 0x05, 0x03, 0x11, 0xf9, 0xfb, 0x3a, 0xf7, 0xce, 0xbb, 0x52, 0x30, }, - { 0xb2, 0x72, 0x9c, 0xf8, 0x77, 0x4e, 0x8f, 0x6b, 0x01, 0x6c, 0xff, 0x4e, - 0x4f, 0x02, 0xd2, 0xbc, 0xeb, 0x51, 0x28, 0x99, 0x50, 0xab, 0xc4, 0x42, - 0xe3, }, - { 0x8b, 0x0a, 0xb5, 0x90, 0x8f, 0xf5, 0x7b, 0xdd, 0xba, 0x47, 0x37, 0xc9, - 0x2a, 0xd5, 0x4b, 0x25, 0x08, 0x8b, 0x02, 0x17, 0xa7, 0x9e, 0x6b, 0x6e, - 0xe3, 0x90, }, - { 0x90, 0xdd, 0xf7, 0x75, 0xa7, 0xa3, 0x99, 0x5e, 0x5b, 0x7d, 0x75, 0xc3, - 0x39, 0x6b, 0xa0, 0xe2, 0x44, 0x53, 0xb1, 0x9e, 0xc8, 0xf1, 0x77, 0x10, - 0x58, 0x06, 0x9a, }, - { 0x99, 0x52, 0xf0, 0x49, 0xa8, 0x8c, 0xec, 0xa6, 0x97, 0x32, 0x13, 0xb5, - 0xf7, 0xa3, 0x8e, 0xfb, 0x4b, 0x59, 0x31, 0x3d, 0x01, 0x59, 0x98, 0x5d, - 0x53, 0x03, 0x1a, 0x39, }, - { 0x9f, 0xe0, 0xc2, 0xe5, 0x5d, 0x93, 0xd6, 0x9b, 0x47, 0x8f, 0x9b, 0xe0, - 0x26, 0x35, 0x84, 0x20, 0x1d, 0xc5, 0x53, 0x10, 0x0f, 0x22, 0xb9, 0xb5, - 0xd4, 0x36, 0xb1, 0xac, 0x73, }, - { 0x30, 0x32, 0x20, 0x3b, 0x10, 0x28, 0xec, 0x1f, 0x4f, 0x9b, 0x47, 0x59, - 0xeb, 0x7b, 0xee, 0x45, 0xfb, 0x0c, 0x49, 0xd8, 0x3d, 0x69, 0xbd, 0x90, - 0x2c, 0xf0, 0x9e, 0x8d, 0xbf, 0xd5, }, - { 0x2a, 0x37, 0x73, 0x7f, 0xf9, 0x96, 0x19, 0xaa, 0x25, 0xd8, 0x13, 0x28, - 0x01, 0x29, 0x89, 0xdf, 0x6e, 0x0c, 0x9b, 0x43, 0x44, 0x51, 0xe9, 0x75, - 0x26, 0x0c, 0xb7, 0x87, 0x66, 0x0b, 0x5f, }, - { 0x23, 0xdf, 0x96, 0x68, 0x91, 0x86, 0xd0, 0x93, 0x55, 0x33, 0x24, 0xf6, - 0xba, 0x08, 0x75, 0x5b, 0x59, 0x11, 0x69, 0xb8, 0xb9, 0xe5, 0x2c, 0x77, - 0x02, 0xf6, 0x47, 0xee, 0x81, 0xdd, 0xb9, 0x06, }, - { 0x9d, }, - { 0x9d, 0x7d, }, - { 0xfd, 0xc3, 0xda, }, - { 0xe8, 0x82, 0xcd, 0x21, }, - { 0xc3, 0x1d, 0x42, 0x4c, 0x74, }, - { 0xe9, 0xda, 0xf1, 0xa2, 0xe5, 0x7c, }, - { 0x52, 0xb8, 0x6f, 0x81, 0x5c, 0x3a, 0x4c, }, - { 0x5b, 0x39, 0x26, 0xfc, 0x92, 0x5e, 0xe0, 0x49, }, - { 0x59, 0xe4, 0x7c, 0x93, 0x1c, 0xf9, 0x28, 0x93, 0xde, }, - { 0xde, 0xdf, 0xb2, 0x43, 0x61, 0x0b, 0x86, 0x16, 0x4c, 0x2e, }, - { 0x14, 0x8f, 0x75, 0x51, 0xaf, 0xb9, 0xee, 0x51, 0x5a, 0xae, 0x23, }, - { 0x43, 0x5f, 0x50, 0xd5, 0x70, 0xb0, 0x5b, 0x87, 0xf5, 0xd9, 0xb3, 0x6d, }, - { 0x66, 0x0a, 0x64, 0x93, 0x79, 0x71, 0x94, 0x40, 0xb7, 0x68, 0x2d, 0xd3, - 0x63, }, - { 0x15, 0x00, 0xc4, 0x0c, 0x7d, 0x1b, 0x10, 0xa9, 0x73, 0x1b, 0x90, 0x6f, - 0xe6, 0xa9, }, - { 0x34, 0x75, 0xf3, 0x86, 0x8f, 0x56, 0xcf, 0x2a, 0x0a, 0xf2, 0x62, 0x0a, - 0xf6, 0x0e, 0x20, }, - { 0xb1, 0xde, 0xc9, 0xf5, 0xdb, 0xf3, 0x2f, 0x4c, 0xd6, 0x41, 0x7d, 0x39, - 0x18, 0x3e, 0xc7, 0xc3, }, - { 0xc5, 0x89, 0xb2, 0xf8, 0xb8, 0xc0, 0xa3, 0xb9, 0x3b, 0x10, 0x6d, 0x7c, - 0x92, 0xfc, 0x7f, 0x34, 0x41, }, - { 0xc4, 0xd8, 0xef, 0xba, 0xef, 0xd2, 0xaa, 0xc5, 0x6c, 0x8e, 0x3e, 0xbb, - 0x12, 0xfc, 0x0f, 0x72, 0xbf, 0x0f, }, - { 0xdd, 0x91, 0xd1, 0x15, 0x9e, 0x7d, 0xf8, 0xc1, 0xb9, 0x14, 0x63, 0x96, - 0xb5, 0xcb, 0x83, 0x1d, 0x35, 0x1c, 0xec, }, - { 0xa9, 0xf8, 0x52, 0xc9, 0x67, 0x76, 0x2b, 0xad, 0xfb, 0xd8, 0x3a, 0xa6, - 0x74, 0x02, 0xae, 0xb8, 0x25, 0x2c, 0x63, 0x49, }, - { 0x77, 0x1f, 0x66, 0x70, 0xfd, 0x50, 0x29, 0xaa, 0xeb, 0xdc, 0xee, 0xba, - 0x75, 0x98, 0xdc, 0x93, 0x12, 0x3f, 0xdc, 0x7c, 0x38, }, - { 0xe2, 0xe1, 0x89, 0x5c, 0x37, 0x38, 0x6a, 0xa3, 0x40, 0xac, 0x3f, 0xb0, - 0xca, 0xfc, 0xa7, 0xf3, 0xea, 0xf9, 0x0f, 0x5d, 0x8e, 0x39, }, - { 0x0f, 0x67, 0xc8, 0x38, 0x01, 0xb1, 0xb7, 0xb8, 0xa2, 0xe7, 0x0a, 0x6d, - 0xd2, 0x63, 0x69, 0x9e, 0xcc, 0xf0, 0xf2, 0xbe, 0x9b, 0x98, 0xdd, }, - { 0x13, 0xe1, 0x36, 0x30, 0xfe, 0xc6, 0x01, 0x8a, 0xa1, 0x63, 0x96, 0x59, - 0xc2, 0xa9, 0x68, 0x3f, 0x58, 0xd4, 0x19, 0x0c, 0x40, 0xf3, 0xde, 0x02, }, - { 0xa3, 0x9e, 0xce, 0xda, 0x42, 0xee, 0x8c, 0x6c, 0x5a, 0x7d, 0xdc, 0x89, - 0x02, 0x77, 0xdd, 0xe7, 0x95, 0xbb, 0xff, 0x0d, 0xa4, 0xb5, 0x38, 0x1e, - 0xaf, }, - { 0x9a, 0xf6, 0xb5, 0x9a, 0x4f, 0xa9, 0x4f, 0x2c, 0x35, 0x3c, 0x24, 0xdc, - 0x97, 0x6f, 0xd9, 0xa1, 0x7d, 0x1a, 0x85, 0x0b, 0xf5, 0xda, 0x2e, 0xe7, - 0xb1, 0x1d, }, - { 0x84, 0x1e, 0x8e, 0x3d, 0x45, 0xa5, 0xf2, 0x27, 0xf3, 0x31, 0xfe, 0xb9, - 0xfb, 0xc5, 0x45, 0x99, 0x99, 0xdd, 0x93, 0x43, 0x02, 0xee, 0x58, 0xaf, - 0xee, 0x6a, 0xbe, }, - { 0x07, 0x2f, 0xc0, 0xa2, 0x04, 0xc4, 0xab, 0x7c, 0x26, 0xbb, 0xa8, 0xd8, - 0xe3, 0x1c, 0x75, 0x15, 0x64, 0x5d, 0x02, 0x6a, 0xf0, 0x86, 0xe9, 0xcd, - 0x5c, 0xef, 0xa3, 0x25, }, - { 0x2f, 0x3b, 0x1f, 0xb5, 0x91, 0x8f, 0x86, 0xe0, 0xdc, 0x31, 0x48, 0xb6, - 0xa1, 0x8c, 0xfd, 0x75, 0xbb, 0x7d, 0x3d, 0xc1, 0xf0, 0x10, 0x9a, 0xd8, - 0x4b, 0x0e, 0xe3, 0x94, 0x9f, }, - { 0x29, 0xbb, 0x8f, 0x6c, 0xd1, 0xf2, 0xb6, 0xaf, 0xe5, 0xe3, 0x2d, 0xdc, - 0x6f, 0xa4, 0x53, 0x88, 0xd8, 0xcf, 0x4d, 0x45, 0x42, 0x62, 0xdb, 0xdf, - 0xf8, 0x45, 0xc2, 0x13, 0xec, 0x35, }, - { 0x06, 0x3c, 0xe3, 0x2c, 0x15, 0xc6, 0x43, 0x03, 0x81, 0xfb, 0x08, 0x76, - 0x33, 0xcb, 0x02, 0xc1, 0xba, 0x33, 0xe5, 0xe0, 0xd1, 0x92, 0xa8, 0x46, - 0x28, 0x3f, 0x3e, 0x9d, 0x2c, 0x44, 0x54, }, - { 0xea, 0xbb, 0x96, 0xf8, 0xd1, 0x8b, 0x04, 0x11, 0x40, 0x78, 0x42, 0x02, - 0x19, 0xd1, 0xbc, 0x65, 0x92, 0xd3, 0xc3, 0xd6, 0xd9, 0x19, 0xe7, 0xc3, - 0x40, 0x97, 0xbd, 0xd4, 0xed, 0xfa, 0x5e, 0x28, }, - { 0x02, }, - { 0x52, 0xa8, }, - { 0x38, 0x25, 0x0d, }, - { 0xe3, 0x04, 0xd4, 0x92, }, - { 0x97, 0xdb, 0xf7, 0x81, 0xca, }, - { 0x8a, 0x56, 0x9d, 0x62, 0x56, 0xcc, }, - { 0xa1, 0x8e, 0x3c, 0x72, 0x8f, 0x63, 0x03, }, - { 0xf7, 0xf3, 0x39, 0x09, 0x0a, 0xa1, 0xbb, 0x23, }, - { 0x6b, 0x03, 0xc0, 0xe9, 0xd9, 0x83, 0x05, 0x22, 0x01, }, - { 0x1b, 0x4b, 0xf5, 0xd6, 0x4f, 0x05, 0x75, 0x91, 0x4c, 0x7f, }, - { 0x4c, 0x8c, 0x25, 0x20, 0x21, 0xcb, 0xc2, 0x4b, 0x3a, 0x5b, 0x8d, }, - { 0x56, 0xe2, 0x77, 0xa0, 0xb6, 0x9f, 0x81, 0xec, 0x83, 0x75, 0xc4, 0xf9, }, - { 0x71, 0x70, 0x0f, 0xad, 0x4d, 0x35, 0x81, 0x9d, 0x88, 0x69, 0xf9, 0xaa, - 0xd3, }, - { 0x50, 0x6e, 0x86, 0x6e, 0x43, 0xc0, 0xc2, 0x44, 0xc2, 0xe2, 0xa0, 0x1c, - 0xb7, 0x9a, }, - { 0xe4, 0x7e, 0x72, 0xc6, 0x12, 0x8e, 0x7c, 0xfc, 0xbd, 0xe2, 0x08, 0x31, - 0x3d, 0x47, 0x3d, }, - { 0x08, 0x97, 0x5b, 0x80, 0xae, 0xc4, 0x1d, 0x50, 0x77, 0xdf, 0x1f, 0xd0, - 0x24, 0xf0, 0x17, 0xc0, }, - { 0x01, 0xb6, 0x29, 0xf4, 0xaf, 0x78, 0x5f, 0xb6, 0x91, 0xdd, 0x76, 0x76, - 0xd2, 0xfd, 0x0c, 0x47, 0x40, }, - { 0xa1, 0xd8, 0x09, 0x97, 0x7a, 0xa6, 0xc8, 0x94, 0xf6, 0x91, 0x7b, 0xae, - 0x2b, 0x9f, 0x0d, 0x83, 0x48, 0xf7, }, - { 0x12, 0xd5, 0x53, 0x7d, 0x9a, 0xb0, 0xbe, 0xd9, 0xed, 0xe9, 0x9e, 0xee, - 0x61, 0x5b, 0x42, 0xf2, 0xc0, 0x73, 0xc0, }, - { 0xd5, 0x77, 0xd6, 0x5c, 0x6e, 0xa5, 0x69, 0x2b, 0x3b, 0x8c, 0xd6, 0x7d, - 0x1d, 0xbe, 0x2c, 0xa1, 0x02, 0x21, 0xcd, 0x29, }, - { 0xa4, 0x98, 0x80, 0xca, 0x22, 0xcf, 0x6a, 0xab, 0x5e, 0x40, 0x0d, 0x61, - 0x08, 0x21, 0xef, 0xc0, 0x6c, 0x52, 0xb4, 0xb0, 0x53, }, - { 0xbf, 0xaf, 0x8f, 0x3b, 0x7a, 0x97, 0x33, 0xe5, 0xca, 0x07, 0x37, 0xfd, - 0x15, 0xdf, 0xce, 0x26, 0x2a, 0xb1, 0xa7, 0x0b, 0xb3, 0xac, }, - { 0x16, 0x22, 0xe1, 0xbc, 0x99, 0x4e, 0x01, 0xf0, 0xfa, 0xff, 0x8f, 0xa5, - 0x0c, 0x61, 0xb0, 0xad, 0xcc, 0xb1, 0xe1, 0x21, 0x46, 0xfa, 0x2e, }, - { 0x11, 0x5b, 0x0b, 0x2b, 0xe6, 0x14, 0xc1, 0xd5, 0x4d, 0x71, 0x5e, 0x17, - 0xea, 0x23, 0xdd, 0x6c, 0xbd, 0x1d, 0xbe, 0x12, 0x1b, 0xee, 0x4c, 0x1a, }, - { 0x40, 0x88, 0x22, 0xf3, 0x20, 0x6c, 0xed, 0xe1, 0x36, 0x34, 0x62, 0x2c, - 0x98, 0x83, 0x52, 0xe2, 0x25, 0xee, 0xe9, 0xf5, 0xe1, 0x17, 0xf0, 0x5c, - 0xae, }, - { 0xc3, 0x76, 0x37, 0xde, 0x95, 0x8c, 0xca, 0x2b, 0x0c, 0x23, 0xe7, 0xb5, - 0x38, 0x70, 0x61, 0xcc, 0xff, 0xd3, 0x95, 0x7b, 0xf3, 0xff, 0x1f, 0x9d, - 0x59, 0x00, }, - { 0x0c, 0x19, 0x52, 0x05, 0x22, 0x53, 0xcb, 0x48, 0xd7, 0x10, 0x0e, 0x7e, - 0x14, 0x69, 0xb5, 0xa2, 0x92, 0x43, 0xa3, 0x9e, 0x4b, 0x8f, 0x51, 0x2c, - 0x5a, 0x2c, 0x3b, }, - { 0xe1, 0x9d, 0x70, 0x70, 0x28, 0xec, 0x86, 0x40, 0x55, 0x33, 0x56, 0xda, - 0x88, 0xca, 0xee, 0xc8, 0x6a, 0x20, 0xb1, 0xe5, 0x3d, 0x57, 0xf8, 0x3c, - 0x10, 0x07, 0x2a, 0xc4, }, - { 0x0b, 0xae, 0xf1, 0xc4, 0x79, 0xee, 0x1b, 0x3d, 0x27, 0x35, 0x8d, 0x14, - 0xd6, 0xae, 0x4e, 0x3c, 0xe9, 0x53, 0x50, 0xb5, 0xcc, 0x0c, 0xf7, 0xdf, - 0xee, 0xa1, 0x74, 0xd6, 0x71, }, - { 0xe6, 0xa4, 0xf4, 0x99, 0x98, 0xb9, 0x80, 0xea, 0x96, 0x7f, 0x4f, 0x33, - 0xcf, 0x74, 0x25, 0x6f, 0x17, 0x6c, 0xbf, 0xf5, 0x5c, 0x38, 0xd0, 0xff, - 0x96, 0xcb, 0x13, 0xf9, 0xdf, 0xfd, }, - { 0xbe, 0x92, 0xeb, 0xba, 0x44, 0x2c, 0x24, 0x74, 0xd4, 0x03, 0x27, 0x3c, - 0x5d, 0x5b, 0x03, 0x30, 0x87, 0x63, 0x69, 0xe0, 0xb8, 0x94, 0xf4, 0x44, - 0x7e, 0xad, 0xcd, 0x20, 0x12, 0x16, 0x79, }, - { 0x30, 0xf1, 0xc4, 0x8e, 0x05, 0x90, 0x2a, 0x97, 0x63, 0x94, 0x46, 0xff, - 0xce, 0xd8, 0x67, 0xa7, 0xac, 0x33, 0x8c, 0x95, 0xb7, 0xcd, 0xa3, 0x23, - 0x98, 0x9d, 0x76, 0x6c, 0x9d, 0xa8, 0xd6, 0x8a, }, - { 0xbe, }, - { 0x17, 0x6c, }, - { 0x1a, 0x42, 0x4f, }, - { 0xba, 0xaf, 0xb7, 0x65, }, - { 0xc2, 0x63, 0x43, 0x6a, 0xea, }, - { 0xe4, 0x4d, 0xad, 0xf2, 0x0b, 0x02, }, - { 0x04, 0xc7, 0xc4, 0x7f, 0xa9, 0x2b, 0xce, }, - { 0x66, 0xf6, 0x67, 0xcb, 0x03, 0x53, 0xc8, 0xf1, }, - { 0x56, 0xa3, 0x60, 0x78, 0xc9, 0x5f, 0x70, 0x1b, 0x5e, }, - { 0x99, 0xff, 0x81, 0x7c, 0x13, 0x3c, 0x29, 0x79, 0x4b, 0x65, }, - { 0x51, 0x10, 0x50, 0x93, 0x01, 0x93, 0xb7, 0x01, 0xc9, 0x18, 0xb7, }, - { 0x8e, 0x3c, 0x42, 0x1e, 0x5e, 0x7d, 0xc1, 0x50, 0x70, 0x1f, 0x00, 0x98, }, - { 0x5f, 0xd9, 0x9b, 0xc8, 0xd7, 0xb2, 0x72, 0x62, 0x1a, 0x1e, 0xba, 0x92, - 0xe9, }, - { 0x70, 0x2b, 0xba, 0xfe, 0xad, 0x5d, 0x96, 0x3f, 0x27, 0xc2, 0x41, 0x6d, - 0xc4, 0xb3, }, - { 0xae, 0xe0, 0xd5, 0xd4, 0xc7, 0xae, 0x15, 0x5e, 0xdc, 0xdd, 0x33, 0x60, - 0xd7, 0xd3, 0x5e, }, - { 0x79, 0x8e, 0xbc, 0x9e, 0x20, 0xb9, 0x19, 0x4b, 0x63, 0x80, 0xf3, 0x16, - 0xaf, 0x39, 0xbd, 0x92, }, - { 0xc2, 0x0e, 0x85, 0xa0, 0x0b, 0x9a, 0xb0, 0xec, 0xde, 0x38, 0xd3, 0x10, - 0xd9, 0xa7, 0x66, 0x27, 0xcf, }, - { 0x0e, 0x3b, 0x75, 0x80, 0x67, 0x14, 0x0c, 0x02, 0x90, 0xd6, 0xb3, 0x02, - 0x81, 0xf6, 0xa6, 0x87, 0xce, 0x58, }, - { 0x79, 0xb5, 0xe9, 0x5d, 0x52, 0x4d, 0xf7, 0x59, 0xf4, 0x2e, 0x27, 0xdd, - 0xb3, 0xed, 0x57, 0x5b, 0x82, 0xea, 0x6f, }, - { 0xa2, 0x97, 0xf5, 0x80, 0x02, 0x3d, 0xde, 0xa3, 0xf9, 0xf6, 0xab, 0xe3, - 0x57, 0x63, 0x7b, 0x9b, 0x10, 0x42, 0x6f, 0xf2, }, - { 0x12, 0x7a, 0xfc, 0xb7, 0x67, 0x06, 0x0c, 0x78, 0x1a, 0xfe, 0x88, 0x4f, - 0xc6, 0xac, 0x52, 0x96, 0x64, 0x28, 0x97, 0x84, 0x06, }, - { 0xc5, 0x04, 0x44, 0x6b, 0xb2, 0xa5, 0xa4, 0x66, 0xe1, 0x76, 0xa2, 0x51, - 0xf9, 0x59, 0x69, 0x97, 0x56, 0x0b, 0xbf, 0x50, 0xb3, 0x34, }, - { 0x21, 0x32, 0x6b, 0x42, 0xb5, 0xed, 0x71, 0x8d, 0xf7, 0x5a, 0x35, 0xe3, - 0x90, 0xe2, 0xee, 0xaa, 0x89, 0xf6, 0xc9, 0x9c, 0x4d, 0x73, 0xf4, }, - { 0x4c, 0xa6, 0x09, 0xf4, 0x48, 0xe7, 0x46, 0xbc, 0x49, 0xfc, 0xe5, 0xda, - 0xd1, 0x87, 0x13, 0x17, 0x4c, 0x59, 0x71, 0x26, 0x5b, 0x2c, 0x42, 0xb7, }, - { 0x13, 0x63, 0xf3, 0x40, 0x02, 0xe5, 0xa3, 0x3a, 0x5e, 0x8e, 0xf8, 0xb6, - 0x8a, 0x49, 0x60, 0x76, 0x34, 0x72, 0x94, 0x73, 0xf6, 0xd9, 0x21, 0x6a, - 0x26, }, - { 0xdf, 0x75, 0x16, 0x10, 0x1b, 0x5e, 0x81, 0xc3, 0xc8, 0xde, 0x34, 0x24, - 0xb0, 0x98, 0xeb, 0x1b, 0x8f, 0xa1, 0x9b, 0x05, 0xee, 0xa5, 0xe9, 0x35, - 0xf4, 0x1d, }, - { 0xcd, 0x21, 0x93, 0x6e, 0x5b, 0xa0, 0x26, 0x2b, 0x21, 0x0e, 0xa0, 0xb9, - 0x1c, 0xb5, 0xbb, 0xb8, 0xf8, 0x1e, 0xff, 0x5c, 0xa8, 0xf9, 0x39, 0x46, - 0x4e, 0x29, 0x26, }, - { 0x73, 0x7f, 0x0e, 0x3b, 0x0b, 0x5c, 0xf9, 0x60, 0xaa, 0x88, 0xa1, 0x09, - 0xb1, 0x5d, 0x38, 0x7b, 0x86, 0x8f, 0x13, 0x7a, 0x8d, 0x72, 0x7a, 0x98, - 0x1a, 0x5b, 0xff, 0xc9, }, - { 0xd3, 0x3c, 0x61, 0x71, 0x44, 0x7e, 0x31, 0x74, 0x98, 0x9d, 0x9a, 0xd2, - 0x27, 0xf3, 0x46, 0x43, 0x42, 0x51, 0xd0, 0x5f, 0xe9, 0x1c, 0x5c, 0x69, - 0xbf, 0xf6, 0xbe, 0x3c, 0x40, }, - { 0x31, 0x99, 0x31, 0x9f, 0xaa, 0x43, 0x2e, 0x77, 0x3e, 0x74, 0x26, 0x31, - 0x5e, 0x61, 0xf1, 0x87, 0xe2, 0xeb, 0x9b, 0xcd, 0xd0, 0x3a, 0xee, 0x20, - 0x7e, 0x10, 0x0a, 0x0b, 0x7e, 0xfa, }, - { 0xa4, 0x27, 0x80, 0x67, 0x81, 0x2a, 0xa7, 0x62, 0xf7, 0x6e, 0xda, 0xd4, - 0x5c, 0x39, 0x74, 0xad, 0x7e, 0xbe, 0xad, 0xa5, 0x84, 0x7f, 0xa9, 0x30, - 0x5d, 0xdb, 0xe2, 0x05, 0x43, 0xf7, 0x1b, }, - { 0x0b, 0x37, 0xd8, 0x02, 0xe1, 0x83, 0xd6, 0x80, 0xf2, 0x35, 0xc2, 0xb0, - 0x37, 0xef, 0xef, 0x5e, 0x43, 0x93, 0xf0, 0x49, 0x45, 0x0a, 0xef, 0xb5, - 0x76, 0x70, 0x12, 0x44, 0xc4, 0xdb, 0xf5, 0x7a, }, - { 0x1f, }, - { 0x82, 0x60, }, - { 0xcc, 0xe3, 0x08, }, - { 0x56, 0x17, 0xe4, 0x59, }, - { 0xe2, 0xd7, 0x9e, 0xc4, 0x4c, }, - { 0xb2, 0xad, 0xd3, 0x78, 0x58, 0x5a, }, - { 0xce, 0x43, 0xb4, 0x02, 0x96, 0xab, 0x3c, }, - { 0xe6, 0x05, 0x1a, 0x73, 0x22, 0x32, 0xbb, 0x77, }, - { 0x23, 0xe7, 0xda, 0xfe, 0x2c, 0xef, 0x8c, 0x22, 0xec, }, - { 0xe9, 0x8e, 0x55, 0x38, 0xd1, 0xd7, 0x35, 0x23, 0x98, 0xc7, }, - { 0xb5, 0x81, 0x1a, 0xe5, 0xb5, 0xa5, 0xd9, 0x4d, 0xca, 0x41, 0xe7, }, - { 0x41, 0x16, 0x16, 0x95, 0x8d, 0x9e, 0x0c, 0xea, 0x8c, 0x71, 0x9a, 0xc1, }, - { 0x7c, 0x33, 0xc0, 0xa4, 0x00, 0x62, 0xea, 0x60, 0x67, 0xe4, 0x20, 0xbc, - 0x5b, }, - { 0xdb, 0xb1, 0xdc, 0xfd, 0x08, 0xc0, 0xde, 0x82, 0xd1, 0xde, 0x38, 0xc0, - 0x90, 0x48, }, - { 0x37, 0x18, 0x2e, 0x0d, 0x61, 0xaa, 0x61, 0xd7, 0x86, 0x20, 0x16, 0x60, - 0x04, 0xd9, 0xd5, }, - { 0xb0, 0xcf, 0x2c, 0x4c, 0x5e, 0x5b, 0x4f, 0x2a, 0x23, 0x25, 0x58, 0x47, - 0xe5, 0x31, 0x06, 0x70, }, - { 0x91, 0xa0, 0xa3, 0x86, 0x4e, 0xe0, 0x72, 0x38, 0x06, 0x67, 0x59, 0x5c, - 0x70, 0x25, 0xdb, 0x33, 0x27, }, - { 0x44, 0x58, 0x66, 0xb8, 0x58, 0xc7, 0x13, 0xed, 0x4c, 0xc0, 0xf4, 0x9a, - 0x1e, 0x67, 0x75, 0x33, 0xb6, 0xb8, }, - { 0x7f, 0x98, 0x4a, 0x8e, 0x50, 0xa2, 0x5c, 0xcd, 0x59, 0xde, 0x72, 0xb3, - 0x9d, 0xc3, 0x09, 0x8a, 0xab, 0x56, 0xf1, }, - { 0x80, 0x96, 0x49, 0x1a, 0x59, 0xa2, 0xc5, 0xd5, 0xa7, 0x20, 0x8a, 0xb7, - 0x27, 0x62, 0x84, 0x43, 0xc6, 0xe1, 0x1b, 0x5d, }, - { 0x6b, 0xb7, 0x2b, 0x26, 0x62, 0x14, 0x70, 0x19, 0x3d, 0x4d, 0xac, 0xac, - 0x63, 0x58, 0x5e, 0x94, 0xb5, 0xb7, 0xe8, 0xe8, 0xa2, }, - { 0x20, 0xa8, 0xc0, 0xfd, 0x63, 0x3d, 0x6e, 0x98, 0xcf, 0x0c, 0x49, 0x98, - 0xe4, 0x5a, 0xfe, 0x8c, 0xaa, 0x70, 0x82, 0x1c, 0x7b, 0x74, }, - { 0xc8, 0xe8, 0xdd, 0xdf, 0x69, 0x30, 0x01, 0xc2, 0x0f, 0x7e, 0x2f, 0x11, - 0xcc, 0x3e, 0x17, 0xa5, 0x69, 0x40, 0x3f, 0x0e, 0x79, 0x7f, 0xcf, }, - { 0xdb, 0x61, 0xc0, 0xe2, 0x2e, 0x49, 0x07, 0x31, 0x1d, 0x91, 0x42, 0x8a, - 0xfc, 0x5e, 0xd3, 0xf8, 0x56, 0x1f, 0x2b, 0x73, 0xfd, 0x9f, 0xb2, 0x8e, }, - { 0x0c, 0x89, 0x55, 0x0c, 0x1f, 0x59, 0x2c, 0x9d, 0x1b, 0x29, 0x1d, 0x41, - 0x1d, 0xe6, 0x47, 0x8f, 0x8c, 0x2b, 0xea, 0x8f, 0xf0, 0xff, 0x21, 0x70, - 0x88, }, - { 0x12, 0x18, 0x95, 0xa6, 0x59, 0xb1, 0x31, 0x24, 0x45, 0x67, 0x55, 0xa4, - 0x1a, 0x2d, 0x48, 0x67, 0x1b, 0x43, 0x88, 0x2d, 0x8e, 0xa0, 0x70, 0xb3, - 0xc6, 0xbb, }, - { 0xe7, 0xb1, 0x1d, 0xb2, 0x76, 0x4d, 0x68, 0x68, 0x68, 0x23, 0x02, 0x55, - 0x3a, 0xe2, 0xe5, 0xd5, 0x4b, 0x43, 0xf9, 0x34, 0x77, 0x5c, 0xa1, 0xf5, - 0x55, 0xfd, 0x4f, }, - { 0x8c, 0x87, 0x5a, 0x08, 0x3a, 0x73, 0xad, 0x61, 0xe1, 0xe7, 0x99, 0x7e, - 0xf0, 0x5d, 0xe9, 0x5d, 0x16, 0x43, 0x80, 0x2f, 0xd0, 0x66, 0x34, 0xe2, - 0x42, 0x64, 0x3b, 0x1a, }, - { 0x39, 0xc1, 0x99, 0xcf, 0x22, 0xbf, 0x16, 0x8f, 0x9f, 0x80, 0x7f, 0x95, - 0x0a, 0x05, 0x67, 0x27, 0xe7, 0x15, 0xdf, 0x9d, 0xb2, 0xfe, 0x1c, 0xb5, - 0x1d, 0x60, 0x8f, 0x8a, 0x1d, }, - { 0x9b, 0x6e, 0x08, 0x09, 0x06, 0x73, 0xab, 0x68, 0x02, 0x62, 0x1a, 0xe4, - 0xd4, 0xdf, 0xc7, 0x02, 0x4c, 0x6a, 0x5f, 0xfd, 0x23, 0xac, 0xae, 0x6d, - 0x43, 0xa4, 0x7a, 0x50, 0x60, 0x3c, }, - { 0x1d, 0xb4, 0xc6, 0xe1, 0xb1, 0x4b, 0xe3, 0xf2, 0xe2, 0x1a, 0x73, 0x1b, - 0xa0, 0x92, 0xa7, 0xf5, 0xff, 0x8f, 0x8b, 0x5d, 0xdf, 0xa8, 0x04, 0xb3, - 0xb0, 0xf7, 0xcc, 0x12, 0xfa, 0x35, 0x46, }, - { 0x49, 0x45, 0x97, 0x11, 0x0f, 0x1c, 0x60, 0x8e, 0xe8, 0x47, 0x30, 0xcf, - 0x60, 0xa8, 0x71, 0xc5, 0x1b, 0xe9, 0x39, 0x4d, 0x49, 0xb6, 0x12, 0x1f, - 0x24, 0xab, 0x37, 0xff, 0x83, 0xc2, 0xe1, 0x3a, }, - { 0x60, }, - { 0x24, 0x26, }, - { 0x47, 0xeb, 0xc9, }, - { 0x4a, 0xd0, 0xbc, 0xf0, }, - { 0x8e, 0x2b, 0xc9, 0x85, 0x3c, }, - { 0xa2, 0x07, 0x15, 0xb8, 0x12, 0x74, }, - { 0x0f, 0xdb, 0x5b, 0x33, 0x69, 0xfe, 0x4b, }, - { 0xa2, 0x86, 0x54, 0xf4, 0xfd, 0xb2, 0xd4, 0xe6, }, - { 0xbb, 0x84, 0x78, 0x49, 0x27, 0x8e, 0x61, 0xda, 0x60, }, - { 0x04, 0xc3, 0xcd, 0xaa, 0x8f, 0xa7, 0x03, 0xc9, 0xf9, 0xb6, }, - { 0xf8, 0x27, 0x1d, 0x61, 0xdc, 0x21, 0x42, 0xdd, 0xad, 0x92, 0x40, }, - { 0x12, 0x87, 0xdf, 0xc2, 0x41, 0x45, 0x5a, 0x36, 0x48, 0x5b, 0x51, 0x2b, }, - { 0xbb, 0x37, 0x5d, 0x1f, 0xf1, 0x68, 0x7a, 0xc4, 0xa5, 0xd2, 0xa4, 0x91, - 0x8d, }, - { 0x5b, 0x27, 0xd1, 0x04, 0x54, 0x52, 0x9f, 0xa3, 0x47, 0x86, 0x33, 0x33, - 0xbf, 0xa0, }, - { 0xcf, 0x04, 0xea, 0xf8, 0x03, 0x2a, 0x43, 0xff, 0xa6, 0x68, 0x21, 0x4c, - 0xd5, 0x4b, 0xed, }, - { 0xaf, 0xb8, 0xbc, 0x63, 0x0f, 0x18, 0x4d, 0xe2, 0x7a, 0xdd, 0x46, 0x44, - 0xc8, 0x24, 0x0a, 0xb7, }, - { 0x3e, 0xdc, 0x36, 0xe4, 0x89, 0xb1, 0xfa, 0xc6, 0x40, 0x93, 0x2e, 0x75, - 0xb2, 0x15, 0xd1, 0xb1, 0x10, }, - { 0x6c, 0xd8, 0x20, 0x3b, 0x82, 0x79, 0xf9, 0xc8, 0xbc, 0x9d, 0xe0, 0x35, - 0xbe, 0x1b, 0x49, 0x1a, 0xbc, 0x3a, }, - { 0x78, 0x65, 0x2c, 0xbe, 0x35, 0x67, 0xdc, 0x78, 0xd4, 0x41, 0xf6, 0xc9, - 0xde, 0xde, 0x1f, 0x18, 0x13, 0x31, 0x11, }, - { 0x8a, 0x7f, 0xb1, 0x33, 0x8f, 0x0c, 0x3c, 0x0a, 0x06, 0x61, 0xf0, 0x47, - 0x29, 0x1b, 0x29, 0xbc, 0x1c, 0x47, 0xef, 0x7a, }, - { 0x65, 0x91, 0xf1, 0xe6, 0xb3, 0x96, 0xd3, 0x8c, 0xc2, 0x4a, 0x59, 0x35, - 0x72, 0x8e, 0x0b, 0x9a, 0x87, 0xca, 0x34, 0x7b, 0x63, }, - { 0x5f, 0x08, 0x87, 0x80, 0x56, 0x25, 0x89, 0x77, 0x61, 0x8c, 0x64, 0xa1, - 0x59, 0x6d, 0x59, 0x62, 0xe8, 0x4a, 0xc8, 0x58, 0x99, 0xd1, }, - { 0x23, 0x87, 0x1d, 0xed, 0x6f, 0xf2, 0x91, 0x90, 0xe2, 0xfe, 0x43, 0x21, - 0xaf, 0x97, 0xc6, 0xbc, 0xd7, 0x15, 0xc7, 0x2d, 0x08, 0x77, 0x91, }, - { 0x90, 0x47, 0x9a, 0x9e, 0x3a, 0xdf, 0xf3, 0xc9, 0x4c, 0x1e, 0xa7, 0xd4, - 0x6a, 0x32, 0x90, 0xfe, 0xb7, 0xb6, 0x7b, 0xfa, 0x96, 0x61, 0xfb, 0xa4, }, - { 0xb1, 0x67, 0x60, 0x45, 0xb0, 0x96, 0xc5, 0x15, 0x9f, 0x4d, 0x26, 0xd7, - 0x9d, 0xf1, 0xf5, 0x6d, 0x21, 0x00, 0x94, 0x31, 0x64, 0x94, 0xd3, 0xa7, - 0xd3, }, - { 0x02, 0x3e, 0xaf, 0xf3, 0x79, 0x73, 0xa5, 0xf5, 0xcc, 0x7a, 0x7f, 0xfb, - 0x79, 0x2b, 0x85, 0x8c, 0x88, 0x72, 0x06, 0xbe, 0xfe, 0xaf, 0xc1, 0x16, - 0xa6, 0xd6, }, - { 0x2a, 0xb0, 0x1a, 0xe5, 0xaa, 0x6e, 0xb3, 0xae, 0x53, 0x85, 0x33, 0x80, - 0x75, 0xae, 0x30, 0xe6, 0xb8, 0x72, 0x42, 0xf6, 0x25, 0x4f, 0x38, 0x88, - 0x55, 0xd1, 0xa9, }, - { 0x90, 0xd8, 0x0c, 0xc0, 0x93, 0x4b, 0x4f, 0x9e, 0x65, 0x6c, 0xa1, 0x54, - 0xa6, 0xf6, 0x6e, 0xca, 0xd2, 0xbb, 0x7e, 0x6a, 0x1c, 0xd3, 0xce, 0x46, - 0xef, 0xb0, 0x00, 0x8d, }, - { 0xed, 0x9c, 0x49, 0xcd, 0xc2, 0xde, 0x38, 0x0e, 0xe9, 0x98, 0x6c, 0xc8, - 0x90, 0x9e, 0x3c, 0xd4, 0xd3, 0xeb, 0x88, 0x32, 0xc7, 0x28, 0xe3, 0x94, - 0x1c, 0x9f, 0x8b, 0xf3, 0xcb, }, - { 0xac, 0xe7, 0x92, 0x16, 0xb4, 0x14, 0xa0, 0xe4, 0x04, 0x79, 0xa2, 0xf4, - 0x31, 0xe6, 0x0c, 0x26, 0xdc, 0xbf, 0x2f, 0x69, 0x1b, 0x55, 0x94, 0x67, - 0xda, 0x0c, 0xd7, 0x32, 0x1f, 0xef, }, - { 0x68, 0x63, 0x85, 0x57, 0x95, 0x9e, 0x42, 0x27, 0x41, 0x43, 0x42, 0x02, - 0xa5, 0x78, 0xa7, 0xc6, 0x43, 0xc1, 0x6a, 0xba, 0x70, 0x80, 0xcd, 0x04, - 0xb6, 0x78, 0x76, 0x29, 0xf3, 0xe8, 0xa0, }, - { 0xe6, 0xac, 0x8d, 0x9d, 0xf0, 0xc0, 0xf7, 0xf7, 0xe3, 0x3e, 0x4e, 0x28, - 0x0f, 0x59, 0xb2, 0x67, 0x9e, 0x84, 0x34, 0x42, 0x96, 0x30, 0x2b, 0xca, - 0x49, 0xb6, 0xc5, 0x9a, 0x84, 0x59, 0xa7, 0x81, }, - { 0x7e, }, - { 0x1e, 0x21, }, - { 0x26, 0xd3, 0xdd, }, - { 0x2c, 0xd4, 0xb3, 0x3d, }, - { 0x86, 0x7b, 0x76, 0x3c, 0xf0, }, - { 0x12, 0xc3, 0x70, 0x1d, 0x55, 0x18, }, - { 0x96, 0xc2, 0xbd, 0x61, 0x55, 0xf4, 0x24, }, - { 0x20, 0x51, 0xf7, 0x86, 0x58, 0x8f, 0x07, 0x2a, }, - { 0x93, 0x15, 0xa8, 0x1d, 0xda, 0x97, 0xee, 0x0e, 0x6c, }, - { 0x39, 0x93, 0xdf, 0xd5, 0x0e, 0xca, 0xdc, 0x7a, 0x92, 0xce, }, - { 0x60, 0xd5, 0xfd, 0xf5, 0x1b, 0x26, 0x82, 0x26, 0x73, 0x02, 0xbc, }, - { 0x98, 0xf2, 0x34, 0xe1, 0xf5, 0xfb, 0x00, 0xac, 0x10, 0x4a, 0x38, 0x9f, }, - { 0xda, 0x3a, 0x92, 0x8a, 0xd0, 0xcd, 0x12, 0xcd, 0x15, 0xbb, 0xab, 0x77, - 0x66, }, - { 0xa2, 0x92, 0x1a, 0xe5, 0xca, 0x0c, 0x30, 0x75, 0xeb, 0xaf, 0x00, 0x31, - 0x55, 0x66, }, - { 0x06, 0xea, 0xfd, 0x3e, 0x86, 0x38, 0x62, 0x4e, 0xa9, 0x12, 0xa4, 0x12, - 0x43, 0xbf, 0xa1, }, - { 0xe4, 0x71, 0x7b, 0x94, 0xdb, 0xa0, 0xd2, 0xff, 0x9b, 0xeb, 0xad, 0x8e, - 0x95, 0x8a, 0xc5, 0xed, }, - { 0x25, 0x5a, 0x77, 0x71, 0x41, 0x0e, 0x7a, 0xe9, 0xed, 0x0c, 0x10, 0xef, - 0xf6, 0x2b, 0x3a, 0xba, 0x60, }, - { 0xee, 0xe2, 0xa3, 0x67, 0x64, 0x1d, 0xc6, 0x04, 0xc4, 0xe1, 0x68, 0xd2, - 0x6e, 0xd2, 0x91, 0x75, 0x53, 0x07, }, - { 0xe0, 0xf6, 0x4d, 0x8f, 0x68, 0xfc, 0x06, 0x7e, 0x18, 0x79, 0x7f, 0x2b, - 0x6d, 0xef, 0x46, 0x7f, 0xab, 0xb2, 0xad, }, - { 0x3d, 0x35, 0x88, 0x9f, 0x2e, 0xcf, 0x96, 0x45, 0x07, 0x60, 0x71, 0x94, - 0x00, 0x8d, 0xbf, 0xf4, 0xef, 0x46, 0x2e, 0x3c, }, - { 0x43, 0xcf, 0x98, 0xf7, 0x2d, 0xf4, 0x17, 0xe7, 0x8c, 0x05, 0x2d, 0x9b, - 0x24, 0xfb, 0x4d, 0xea, 0x4a, 0xec, 0x01, 0x25, 0x29, }, - { 0x8e, 0x73, 0x9a, 0x78, 0x11, 0xfe, 0x48, 0xa0, 0x3b, 0x1a, 0x26, 0xdf, - 0x25, 0xe9, 0x59, 0x1c, 0x70, 0x07, 0x9f, 0xdc, 0xa0, 0xa6, }, - { 0xe8, 0x47, 0x71, 0xc7, 0x3e, 0xdf, 0xb5, 0x13, 0xb9, 0x85, 0x13, 0xa8, - 0x54, 0x47, 0x6e, 0x59, 0x96, 0x09, 0x13, 0x5f, 0x82, 0x16, 0x0b, }, - { 0xfb, 0xc0, 0x8c, 0x03, 0x21, 0xb3, 0xc4, 0xb5, 0x43, 0x32, 0x6c, 0xea, - 0x7f, 0xa8, 0x43, 0x91, 0xe8, 0x4e, 0x3f, 0xbf, 0x45, 0x58, 0x6a, 0xa3, }, - { 0x55, 0xf8, 0xf3, 0x00, 0x76, 0x09, 0xef, 0x69, 0x5d, 0xd2, 0x8a, 0xf2, - 0x65, 0xc3, 0xcb, 0x9b, 0x43, 0xfd, 0xb1, 0x7e, 0x7f, 0xa1, 0x94, 0xb0, - 0xd7, }, - { 0xaa, 0x13, 0xc1, 0x51, 0x40, 0x6d, 0x8d, 0x4c, 0x0a, 0x95, 0x64, 0x7b, - 0xd1, 0x96, 0xb6, 0x56, 0xb4, 0x5b, 0xcf, 0xd6, 0xd9, 0x15, 0x97, 0xdd, - 0xb6, 0xef, }, - { 0xaf, 0xb7, 0x36, 0xb0, 0x04, 0xdb, 0xd7, 0x9c, 0x9a, 0x44, 0xc4, 0xf6, - 0x1f, 0x12, 0x21, 0x2d, 0x59, 0x30, 0x54, 0xab, 0x27, 0x61, 0xa3, 0x57, - 0xef, 0xf8, 0x53, }, - { 0x97, 0x34, 0x45, 0x3e, 0xce, 0x7c, 0x35, 0xa2, 0xda, 0x9f, 0x4b, 0x46, - 0x6c, 0x11, 0x67, 0xff, 0x2f, 0x76, 0x58, 0x15, 0x71, 0xfa, 0x44, 0x89, - 0x89, 0xfd, 0xf7, 0x99, }, - { 0x1f, 0xb1, 0x62, 0xeb, 0x83, 0xc5, 0x9c, 0x89, 0xf9, 0x2c, 0xd2, 0x03, - 0x61, 0xbc, 0xbb, 0xa5, 0x74, 0x0e, 0x9b, 0x7e, 0x82, 0x3e, 0x70, 0x0a, - 0xa9, 0x8f, 0x2b, 0x59, 0xfb, }, - { 0xf8, 0xca, 0x5e, 0x3a, 0x4f, 0x9e, 0x10, 0x69, 0x10, 0xd5, 0x4c, 0xeb, - 0x1a, 0x0f, 0x3c, 0x6a, 0x98, 0xf5, 0xb0, 0x97, 0x5b, 0x37, 0x2f, 0x0d, - 0xbd, 0x42, 0x4b, 0x69, 0xa1, 0x82, }, - { 0x12, 0x8c, 0x6d, 0x52, 0x08, 0xef, 0x74, 0xb2, 0xe6, 0xaa, 0xd3, 0xb0, - 0x26, 0xb0, 0xd9, 0x94, 0xb6, 0x11, 0x45, 0x0e, 0x36, 0x71, 0x14, 0x2d, - 0x41, 0x8c, 0x21, 0x53, 0x31, 0xe9, 0x68, }, - { 0xee, 0xea, 0x0d, 0x89, 0x47, 0x7e, 0x72, 0xd1, 0xd8, 0xce, 0x58, 0x4c, - 0x94, 0x1f, 0x0d, 0x51, 0x08, 0xa3, 0xb6, 0x3d, 0xe7, 0x82, 0x46, 0x92, - 0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, }, -}; - -static bool __init noinline_for_stack blake2s_digest_test(void) -{ - u8 key[BLAKE2S_KEY_SIZE]; - u8 buf[ARRAY_SIZE(blake2s_testvecs)]; - u8 hash[BLAKE2S_HASH_SIZE]; - struct blake2s_state state; - bool success = true; - int i, l; - - key[0] = key[1] = 1; - for (i = 2; i < sizeof(key); ++i) - key[i] = key[i - 2] + key[i - 1]; - - for (i = 0; i < sizeof(buf); ++i) - buf[i] = (u8)i; - - for (i = l = 0; i < ARRAY_SIZE(blake2s_testvecs); l = (l + 37) % ++i) { - int outlen = 1 + i % BLAKE2S_HASH_SIZE; - int keylen = (13 * i) % (BLAKE2S_KEY_SIZE + 1); - - blake2s(hash, buf, key + BLAKE2S_KEY_SIZE - keylen, outlen, i, - keylen); - if (memcmp(hash, blake2s_testvecs[i], outlen)) { - pr_err("blake2s self-test %d: FAIL\n", i + 1); - success = false; - } - - if (!keylen) - blake2s_init(&state, outlen); - else - blake2s_init_key(&state, outlen, - key + BLAKE2S_KEY_SIZE - keylen, - keylen); - - blake2s_update(&state, buf, l); - blake2s_update(&state, buf + l, i - l); - blake2s_final(&state, hash); - if (memcmp(hash, blake2s_testvecs[i], outlen)) { - pr_err("blake2s init/update/final self-test %d: FAIL\n", - i + 1); - success = false; - } - } - - return success; -} - -static bool __init noinline_for_stack blake2s_random_test(void) -{ - struct blake2s_state state; - bool success = true; - int i, l; - - for (i = 0; i < 32; ++i) { - enum { TEST_ALIGNMENT = 16 }; - u8 blocks[BLAKE2S_BLOCK_SIZE * 2 + TEST_ALIGNMENT - 1] - __aligned(TEST_ALIGNMENT); - u8 *unaligned_block = blocks + BLAKE2S_BLOCK_SIZE; - struct blake2s_state state1, state2; - - get_random_bytes(blocks, sizeof(blocks)); - get_random_bytes(&state, sizeof(state)); - -#if defined(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) && \ - defined(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S) - memcpy(&state1, &state, sizeof(state1)); - memcpy(&state2, &state, sizeof(state2)); - blake2s_compress(&state1, blocks, 2, BLAKE2S_BLOCK_SIZE); - blake2s_compress_generic(&state2, blocks, 2, BLAKE2S_BLOCK_SIZE); - if (memcmp(&state1, &state2, sizeof(state1))) { - pr_err("blake2s random compress self-test %d: FAIL\n", - i + 1); - success = false; - } -#endif - - memcpy(&state1, &state, sizeof(state1)); - blake2s_compress(&state1, blocks, 1, BLAKE2S_BLOCK_SIZE); - for (l = 1; l < TEST_ALIGNMENT; ++l) { - memcpy(unaligned_block + l, blocks, - BLAKE2S_BLOCK_SIZE); - memcpy(&state2, &state, sizeof(state2)); - blake2s_compress(&state2, unaligned_block + l, 1, - BLAKE2S_BLOCK_SIZE); - if (memcmp(&state1, &state2, sizeof(state1))) { - pr_err("blake2s random compress align %d self-test %d: FAIL\n", - l, i + 1); - success = false; - } - } - } - - return success; -} - -bool __init blake2s_selftest(void) -{ - bool success; - - success = blake2s_digest_test(); - success &= blake2s_random_test(); - - return success; -} diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index f6ec68c3dcda..51f2dd7a38a4 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -59,14 +58,5 @@ void blake2s_final(struct blake2s_state *state, u8 *out) } EXPORT_SYMBOL(blake2s_final); -static int __init blake2s_mod_init(void) -{ - if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) && - WARN_ON(!blake2s_selftest())) - return -ENODEV; - return 0; -} - -module_init(blake2s_mod_init); MODULE_DESCRIPTION("BLAKE2s hash function"); MODULE_AUTHOR("Jason A. Donenfeld "); -- cgit v1.2.3 From 39ee3970f26d55b57343da392d45117d7f893205 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 27 Aug 2025 08:11:30 -0700 Subject: lib/crypto: blake2s: Consolidate into single C translation unit As was done with the other algorithms, reorganize the BLAKE2s code so that the generic implementation and the arch-specific "glue" code is consolidated into a single translation unit, so that the compiler will inline the functions and automatically decide whether to include the generic code in the resulting binary or not. Similarly, also consolidate the build rules into lib/crypto/{Makefile,Kconfig}. This removes the last uses of lib/crypto/{arm,x86}/{Makefile,Kconfig}, so remove those too. Don't keep the !KMSAN dependency. It was needed only for other algorithms such as ChaCha that initialize memory from assembly code. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250827151131.27733-12-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/internal/blake2s.h | 19 ----------- lib/crypto/Kconfig | 29 ++++------------ lib/crypto/Makefile | 13 +++++--- lib/crypto/arm/Kconfig | 14 -------- lib/crypto/arm/Makefile | 4 --- lib/crypto/arm/blake2s-core.S | 5 ++- lib/crypto/arm/blake2s-glue.c | 7 ---- lib/crypto/arm/blake2s.h | 5 +++ lib/crypto/blake2s.c | 29 ++++++++++------ lib/crypto/x86/Kconfig | 13 -------- lib/crypto/x86/Makefile | 4 --- lib/crypto/x86/blake2s-glue.c | 70 --------------------------------------- lib/crypto/x86/blake2s.h | 64 +++++++++++++++++++++++++++++++++++ 13 files changed, 106 insertions(+), 170 deletions(-) delete mode 100644 include/crypto/internal/blake2s.h delete mode 100644 lib/crypto/arm/Kconfig delete mode 100644 lib/crypto/arm/Makefile delete mode 100644 lib/crypto/arm/blake2s-glue.c create mode 100644 lib/crypto/arm/blake2s.h delete mode 100644 lib/crypto/x86/Kconfig delete mode 100644 lib/crypto/x86/Makefile delete mode 100644 lib/crypto/x86/blake2s-glue.c create mode 100644 lib/crypto/x86/blake2s.h (limited to 'include') diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h deleted file mode 100644 index 3b82572c9433..000000000000 --- a/include/crypto/internal/blake2s.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Helper functions for BLAKE2s implementations. - * Keep this in sync with the corresponding BLAKE2b header. - */ - -#ifndef _CRYPTO_INTERNAL_BLAKE2S_H -#define _CRYPTO_INTERNAL_BLAKE2S_H - -#include -#include - -void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc); - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc); - -#endif /* _CRYPTO_INTERNAL_BLAKE2S_H */ diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index c1db483bc230..37d85e0c9b97 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -28,21 +28,13 @@ config CRYPTO_LIB_ARC4 config CRYPTO_LIB_GF128MUL tristate -config CRYPTO_ARCH_HAVE_LIB_BLAKE2S - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the Blake2s library interface, - either builtin or as a module. +# BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option. -config CRYPTO_LIB_BLAKE2S_GENERIC - def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - This symbol can be depended upon by arch implementations of the - Blake2s library interface that require the generic code as a - fallback, e.g., for SIMD implementations. If no arch specific - implementation is enabled, this implementation serves the users - of CRYPTO_LIB_BLAKE2S. +config CRYPTO_LIB_BLAKE2S_ARCH + bool + depends on !UML + default y if ARM + default y if X86_64 config CRYPTO_LIB_CHACHA tristate @@ -208,13 +200,4 @@ config CRYPTO_LIB_SM3 source "lib/crypto/tests/Kconfig" -if !KMSAN # avoid false positives from assembly -if ARM -source "lib/crypto/arm/Kconfig" -endif -if X86 -source "lib/crypto/x86/Kconfig" -endif -endif - endmenu diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index c9193f0604d9..ad27c5bf99e1 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -29,9 +29,15 @@ libarc4-y := arc4.o obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o +################################################################################ + # blake2s is used by the /dev/random driver which is always builtin -obj-y += libblake2s.o -libblake2s-y := blake2s.o +obj-y += blake2s.o +ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y) +CFLAGS_blake2s.o += -I$(src)/$(SRCARCH) +obj-$(CONFIG_ARM) += arm/blake2s-core.o +obj-$(CONFIG_X86) += x86/blake2s-core.o +endif ################################################################################ @@ -256,9 +262,6 @@ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o libsm3-y := sm3.o -obj-$(CONFIG_ARM) += arm/ -obj-$(CONFIG_X86) += x86/ - # clean-files must be defined unconditionally clean-files += arm/sha256-core.S arm/sha512-core.S clean-files += arm64/sha256-core.S arm64/sha512-core.S diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig deleted file mode 100644 index a5607ad079c4..000000000000 --- a/lib/crypto/arm/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_BLAKE2S_ARM - def_bool y - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: arm - - This is faster than the generic implementations of BLAKE2s and - BLAKE2b, but slower than the NEON implementation of BLAKE2b. - There is no NEON implementation of BLAKE2s, since NEON doesn't - really help with it. diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile deleted file mode 100644 index 0574b0e9739e..000000000000 --- a/lib/crypto/arm/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o -libblake2s-arm-y := blake2s-core.o blake2s-glue.o diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S index df40e46601f1..293f44fa8f31 100644 --- a/lib/crypto/arm/blake2s-core.S +++ b/lib/crypto/arm/blake2s-core.S @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * BLAKE2s digest algorithm, ARM scalar implementation + * BLAKE2s digest algorithm, ARM scalar implementation. This is faster + * than the generic implementations of BLAKE2s and BLAKE2b, but slower + * than the NEON implementation of BLAKE2b. There is no NEON + * implementation of BLAKE2s, since NEON doesn't really help with it. * * Copyright 2020 Google LLC * diff --git a/lib/crypto/arm/blake2s-glue.c b/lib/crypto/arm/blake2s-glue.c deleted file mode 100644 index 0238a70d9581..000000000000 --- a/lib/crypto/arm/blake2s-glue.c +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include - -/* defined in blake2s-core.S */ -EXPORT_SYMBOL(blake2s_compress); diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h new file mode 100644 index 000000000000..aa7a97139ea7 --- /dev/null +++ b/lib/crypto/arm/blake2s.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +/* defined in blake2s-core.S */ +void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, u32 inc); diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index b5b75ade4658..5638ed9d882d 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -8,7 +8,7 @@ * */ -#include +#include #include #include #include @@ -16,7 +16,6 @@ #include #include -#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC static const u8 blake2s_sigma[10][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, @@ -37,12 +36,9 @@ static inline void blake2s_increment_counter(struct blake2s_state *state, state->t[1] += (state->t[0] < inc); } -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) - __weak __alias(blake2s_compress_generic); - -void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) +static void __maybe_unused +blake2s_compress_generic(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) { u32 m[16]; u32 v[16]; @@ -107,8 +103,12 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, --nblocks; } } -EXPORT_SYMBOL(blake2s_compress_generic); -#endif /* CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC */ + +#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_ARCH +#include "blake2s.h" /* $(SRCARCH)/blake2s.h */ +#else +#define blake2s_compress blake2s_compress_generic +#endif static inline void blake2s_set_lastblock(struct blake2s_state *state) { @@ -152,5 +152,14 @@ void blake2s_final(struct blake2s_state *state, u8 *out) } EXPORT_SYMBOL(blake2s_final); +#ifdef blake2s_mod_init_arch +static int __init blake2s_mod_init(void) +{ + blake2s_mod_init_arch(); + return 0; +} +subsys_initcall(blake2s_mod_init); +#endif + MODULE_DESCRIPTION("BLAKE2s hash function"); MODULE_AUTHOR("Jason A. Donenfeld "); diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig deleted file mode 100644 index ffa718321369..000000000000 --- a/lib/crypto/x86/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_BLAKE2S_X86 - def_bool y - depends on 64BIT - select CRYPTO_LIB_BLAKE2S_GENERIC - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX-512 (Advanced Vector Extensions-512) diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile deleted file mode 100644 index 4454556b243e..000000000000 --- a/lib/crypto/x86/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o -libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o diff --git a/lib/crypto/x86/blake2s-glue.c b/lib/crypto/x86/blake2s-glue.c deleted file mode 100644 index adc296cd17c9..000000000000 --- a/lib/crypto/x86/blake2s-glue.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); -asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) -{ - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); - - if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { - blake2s_compress_generic(state, block, nblocks, inc); - return; - } - - do { - const size_t blocks = min_t(size_t, nblocks, - SZ_4K / BLAKE2S_BLOCK_SIZE); - - kernel_fpu_begin(); - if (static_branch_likely(&blake2s_use_avx512)) - blake2s_compress_avx512(state, block, blocks, inc); - else - blake2s_compress_ssse3(state, block, blocks, inc); - kernel_fpu_end(); - - nblocks -= blocks; - block += blocks * BLAKE2S_BLOCK_SIZE; - } while (nblocks); -} -EXPORT_SYMBOL(blake2s_compress); - -static int __init blake2s_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - static_branch_enable(&blake2s_use_ssse3); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512VL) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | - XFEATURE_MASK_AVX512, NULL)) - static_branch_enable(&blake2s_use_avx512); - - return 0; -} - -subsys_initcall(blake2s_mod_init); diff --git a/lib/crypto/x86/blake2s.h b/lib/crypto/x86/blake2s.h new file mode 100644 index 000000000000..b6d30d2fa045 --- /dev/null +++ b/lib/crypto/x86/blake2s.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); +asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); + +static void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) +{ + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); + + if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { + blake2s_compress_generic(state, block, nblocks, inc); + return; + } + + do { + const size_t blocks = min_t(size_t, nblocks, + SZ_4K / BLAKE2S_BLOCK_SIZE); + + kernel_fpu_begin(); + if (static_branch_likely(&blake2s_use_avx512)) + blake2s_compress_avx512(state, block, blocks, inc); + else + blake2s_compress_ssse3(state, block, blocks, inc); + kernel_fpu_end(); + + nblocks -= blocks; + block += blocks * BLAKE2S_BLOCK_SIZE; + } while (nblocks); +} + +#define blake2s_mod_init_arch blake2s_mod_init_arch +static void blake2s_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + static_branch_enable(&blake2s_use_ssse3); + + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + static_branch_enable(&blake2s_use_avx512); +} -- cgit v1.2.3 From 3d439e1ec3368fae17db379354bd7a9e568ca0ab Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Fri, 29 Aug 2025 18:11:01 +0300 Subject: ASoC: sof: ipc4-topology: Add support to sched_domain attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SOF_TKN_COMP_SCHED_DOMAIN and connect it to struct snd_sof_widget comp_domain member, with new get_token_comp_domain() function. The logic is such that if the topology attribute is not present in the widget node the corresponding IPC4 extension value is taken from the module's manifest like before. But if the attribute is found and recognized its value overrides what is there in the manifest. Signed-off-by: Jyri Sarha Reviewed-by: Péter Ujfalusi Reviewed-by: Ranjani Sridharan Signed-off-by: Peter Ujfalusi Message-ID: <20250829151101.27327-1-peter.ujfalusi@linux.intel.com> Signed-off-by: Mark Brown --- include/uapi/sound/sof/tokens.h | 2 ++ sound/soc/sof/ipc4-topology.c | 44 ++++++++++++++++++++++++++++++++++++++++- sound/soc/sof/ipc4-topology.h | 7 +++++++ sound/soc/sof/sof-audio.h | 3 +++ 4 files changed, 55 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h index c28c766270de..9ce72fbd6f11 100644 --- a/include/uapi/sound/sof/tokens.h +++ b/include/uapi/sound/sof/tokens.h @@ -106,6 +106,8 @@ */ #define SOF_TKN_COMP_NO_WNAME_IN_KCONTROL_NAME 417 +#define SOF_TKN_COMP_SCHED_DOMAIN 418 + /* SSP */ #define SOF_TKN_INTEL_SSP_CLKS_CONTROL 500 #define SOF_TKN_INTEL_SSP_MCLK_ID 501 diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index 591ee30551ba..74a1319d4bd2 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -38,6 +38,36 @@ MODULE_PARM_DESC(ipc4_ignore_cpc, static DEFINE_IDA(alh_group_ida); static DEFINE_IDA(pipeline_ida); +struct sof_comp_domains { + const char *name; + enum sof_comp_domain domain; +}; + +static const struct sof_comp_domains sof_domains[] = { + { "LL", SOF_COMP_DOMAIN_LL, }, + { "DP", SOF_COMP_DOMAIN_DP, } +}; + +static enum sof_comp_domain find_domain(const char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(sof_domains); i++) { + if (strcmp(name, sof_domains[i].name) == 0) + return sof_domains[i].domain; + } + /* No valid value found, fall back to manifest value */ + return SOF_COMP_DOMAIN_UNSET; +} + +static int get_token_comp_domain(void *elem, void *object, u32 offset) +{ + u32 *val = (u32 *)((u8 *)object + offset); + + *val = find_domain((const char *)elem); + return 0; +} + static const struct sof_topology_token ipc4_sched_tokens[] = { {SOF_TKN_SCHED_LP_MODE, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32, offsetof(struct sof_ipc4_pipeline, lp_mode)}, @@ -127,6 +157,8 @@ static const struct sof_topology_token comp_ext_tokens[] = { offsetof(struct snd_sof_widget, uuid)}, {SOF_TKN_COMP_CORE_ID, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32, offsetof(struct snd_sof_widget, core)}, + {SOF_TKN_COMP_SCHED_DOMAIN, SND_SOC_TPLG_TUPLE_TYPE_STRING, get_token_comp_domain, + offsetof(struct snd_sof_widget, comp_domain)}, }; static const struct sof_topology_token gain_tokens[] = { @@ -497,7 +529,17 @@ static int sof_ipc4_widget_setup_msg(struct snd_sof_widget *swidget, struct sof_ msg->extension = SOF_IPC4_MOD_EXT_CORE_ID(swidget->core); - type = (fw_module->man4_module_entry.type & SOF_IPC4_MODULE_DP) ? 1 : 0; + switch (swidget->comp_domain) { + case SOF_COMP_DOMAIN_LL: + type = 0; + break; + case SOF_COMP_DOMAIN_DP: + type = 1; + break; + default: + type = (fw_module->man4_module_entry.type & SOF_IPC4_MODULE_DP) ? 1 : 0; + break; + } msg->extension |= SOF_IPC4_MOD_EXT_DOMAIN(type); return 0; diff --git a/sound/soc/sof/ipc4-topology.h b/sound/soc/sof/ipc4-topology.h index 14ba58d2be03..e8e848233314 100644 --- a/sound/soc/sof/ipc4-topology.h +++ b/sound/soc/sof/ipc4-topology.h @@ -109,6 +109,13 @@ enum sof_ipc4_copier_module_config_params { SOF_IPC4_COPIER_MODULE_CFG_ATTENUATION, }; +/* Scheduling domain, unset, Low Latency, or Data Processing */ +enum sof_comp_domain { + SOF_COMP_DOMAIN_UNSET = 0, /* Take domain value from manifest */ + SOF_COMP_DOMAIN_LL, /* Low Latency scheduling domain */ + SOF_COMP_DOMAIN_DP, /* Data Processing scheduling domain */ +}; + struct sof_ipc4_copier_config_set_sink_format { /* Id of sink */ u32 sink_id; diff --git a/sound/soc/sof/sof-audio.h b/sound/soc/sof/sof-audio.h index 36ab75e11779..db6973c8eac3 100644 --- a/sound/soc/sof/sof-audio.h +++ b/sound/soc/sof/sof-audio.h @@ -451,6 +451,9 @@ struct snd_sof_widget { */ bool dynamic_pipeline_widget; + /* Scheduling domain (enum sof_comp_domain), unset, Low Latency, or Data Processing */ + u32 comp_domain; + struct snd_soc_dapm_widget *widget; struct list_head list; /* list in sdev widget list */ struct snd_sof_pipeline *spipe; -- cgit v1.2.3 From 72cdc67e7fa74931b055df3a76852bab551f1a04 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Thu, 28 Aug 2025 09:20:16 +0800 Subject: pppoe: remove rwlock usage Like ppp_generic.c, convert the PPPoE socket hash table to use RCU for lookups and a spinlock for updates. This removes rwlock usage and allows lockless readers on the fast path. - Mark hash table and list pointers as __rcu. - Use spin_lock() to protect writers. - Readers use rcu_dereference() under rcu_read_lock(). All known callers of get_item() already hold the RCU read lock, so no additional locking is needed. - get_item() now uses refcount_inc_not_zero() instead of sock_hold() to safely take a reference. This prevents crashes if a socket is already in the process of being freed (sk_refcnt == 0). - Set SOCK_RCU_FREE to defer socket freeing until after an RCU grace period. - Move skb_queue_purge() into sk_destruct callback to ensure purge happens after an RCU grace period. Signed-off-by: Qingfang Deng Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250828012018.15922-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ppp/pppoe.c | 94 +++++++++++++++++++++++++++--------------------- include/linux/if_pppox.h | 2 +- 2 files changed, 54 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 410effa42ade..54522b26b728 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -100,8 +100,8 @@ struct pppoe_net { * as well, moreover in case of SMP less locking * controversy here */ - struct pppox_sock *hash_table[PPPOE_HASH_SIZE]; - rwlock_t hash_lock; + struct pppox_sock __rcu *hash_table[PPPOE_HASH_SIZE]; + spinlock_t hash_lock; }; /* @@ -162,13 +162,13 @@ static struct pppox_sock *__get_item(struct pppoe_net *pn, __be16 sid, int hash = hash_item(sid, addr); struct pppox_sock *ret; - ret = pn->hash_table[hash]; + ret = rcu_dereference(pn->hash_table[hash]); while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) return ret; - ret = ret->next; + ret = rcu_dereference(ret->next); } return NULL; @@ -177,19 +177,20 @@ static struct pppox_sock *__get_item(struct pppoe_net *pn, __be16 sid, static int __set_item(struct pppoe_net *pn, struct pppox_sock *po) { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); - struct pppox_sock *ret; + struct pppox_sock *ret, *first; - ret = pn->hash_table[hash]; + first = rcu_dereference_protected(pn->hash_table[hash], lockdep_is_held(&pn->hash_lock)); + ret = first; while (ret) { if (cmp_2_addr(&ret->pppoe_pa, &po->pppoe_pa) && ret->pppoe_ifindex == po->pppoe_ifindex) return -EALREADY; - ret = ret->next; + ret = rcu_dereference_protected(ret->next, lockdep_is_held(&pn->hash_lock)); } - po->next = pn->hash_table[hash]; - pn->hash_table[hash] = po; + RCU_INIT_POINTER(po->next, first); + rcu_assign_pointer(pn->hash_table[hash], po); return 0; } @@ -198,20 +199,24 @@ static void __delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { int hash = hash_item(sid, addr); - struct pppox_sock *ret, **src; + struct pppox_sock *ret, __rcu **src; - ret = pn->hash_table[hash]; + ret = rcu_dereference_protected(pn->hash_table[hash], lockdep_is_held(&pn->hash_lock)); src = &pn->hash_table[hash]; while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) { - *src = ret->next; + struct pppox_sock *next; + + next = rcu_dereference_protected(ret->next, + lockdep_is_held(&pn->hash_lock)); + rcu_assign_pointer(*src, next); break; } src = &ret->next; - ret = ret->next; + ret = rcu_dereference_protected(ret->next, lockdep_is_held(&pn->hash_lock)); } } @@ -225,11 +230,9 @@ static inline struct pppox_sock *get_item(struct pppoe_net *pn, __be16 sid, { struct pppox_sock *po; - read_lock_bh(&pn->hash_lock); po = __get_item(pn, sid, addr, ifindex); - if (po) - sock_hold(sk_pppox(po)); - read_unlock_bh(&pn->hash_lock); + if (po && !refcount_inc_not_zero(&sk_pppox(po)->sk_refcnt)) + po = NULL; return po; } @@ -258,9 +261,9 @@ static inline struct pppox_sock *get_item_by_addr(struct net *net, static inline void delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { - write_lock_bh(&pn->hash_lock); + spin_lock(&pn->hash_lock); __delete_item(pn, sid, addr, ifindex); - write_unlock_bh(&pn->hash_lock); + spin_unlock(&pn->hash_lock); } /*************************************************************************** @@ -276,14 +279,16 @@ static void pppoe_flush_dev(struct net_device *dev) int i; pn = pppoe_pernet(dev_net(dev)); - write_lock_bh(&pn->hash_lock); + spin_lock(&pn->hash_lock); for (i = 0; i < PPPOE_HASH_SIZE; i++) { - struct pppox_sock *po = pn->hash_table[i]; + struct pppox_sock *po = rcu_dereference_protected(pn->hash_table[i], + lockdep_is_held(&pn->hash_lock)); struct sock *sk; while (po) { while (po && po->pppoe_dev != dev) { - po = po->next; + po = rcu_dereference_protected(po->next, + lockdep_is_held(&pn->hash_lock)); } if (!po) @@ -300,7 +305,7 @@ static void pppoe_flush_dev(struct net_device *dev) */ sock_hold(sk); - write_unlock_bh(&pn->hash_lock); + spin_unlock(&pn->hash_lock); lock_sock(sk); if (po->pppoe_dev == dev && @@ -320,11 +325,12 @@ static void pppoe_flush_dev(struct net_device *dev) */ BUG_ON(pppoe_pernet(dev_net(dev)) == NULL); - write_lock_bh(&pn->hash_lock); - po = pn->hash_table[i]; + spin_lock(&pn->hash_lock); + po = rcu_dereference_protected(pn->hash_table[i], + lockdep_is_held(&pn->hash_lock)); } } - write_unlock_bh(&pn->hash_lock); + spin_unlock(&pn->hash_lock); } static int pppoe_device_event(struct notifier_block *this, @@ -528,6 +534,11 @@ static struct proto pppoe_sk_proto __read_mostly = { .obj_size = sizeof(struct pppox_sock), }; +static void pppoe_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); +} + /*********************************************************************** * * Initialize a new struct sock. @@ -542,11 +553,13 @@ static int pppoe_create(struct net *net, struct socket *sock, int kern) return -ENOMEM; sock_init_data(sock, sk); + sock_set_flag(sk, SOCK_RCU_FREE); sock->state = SS_UNCONNECTED; sock->ops = &pppoe_ops; sk->sk_backlog_rcv = pppoe_rcv_core; + sk->sk_destruct = pppoe_destruct; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_family = PF_PPPOX; @@ -599,7 +612,6 @@ static int pppoe_release(struct socket *sock) sock_orphan(sk); sock->sk = NULL; - skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); @@ -681,9 +693,9 @@ static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr, &sp->sa_addr.pppoe, sizeof(struct pppoe_addr)); - write_lock_bh(&pn->hash_lock); + spin_lock(&pn->hash_lock); error = __set_item(pn, po); - write_unlock_bh(&pn->hash_lock); + spin_unlock(&pn->hash_lock); if (error < 0) goto err_put; @@ -1052,11 +1064,11 @@ static inline struct pppox_sock *pppoe_get_idx(struct pppoe_net *pn, loff_t pos) int i; for (i = 0; i < PPPOE_HASH_SIZE; i++) { - po = pn->hash_table[i]; + po = rcu_dereference(pn->hash_table[i]); while (po) { if (!pos--) goto out; - po = po->next; + po = rcu_dereference(po->next); } } @@ -1065,19 +1077,19 @@ out: } static void *pppoe_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(pn->hash_lock) + __acquires(RCU) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); loff_t l = *pos; - read_lock_bh(&pn->hash_lock); + rcu_read_lock(); return l ? pppoe_get_idx(pn, --l) : SEQ_START_TOKEN; } static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); - struct pppox_sock *po; + struct pppox_sock *po, *next; ++*pos; if (v == SEQ_START_TOKEN) { @@ -1085,14 +1097,15 @@ static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos) goto out; } po = v; - if (po->next) - po = po->next; + next = rcu_dereference(po->next); + if (next) + po = next; else { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); po = NULL; while (++hash < PPPOE_HASH_SIZE) { - po = pn->hash_table[hash]; + po = rcu_dereference(pn->hash_table[hash]); if (po) break; } @@ -1103,10 +1116,9 @@ out: } static void pppoe_seq_stop(struct seq_file *seq, void *v) - __releases(pn->hash_lock) + __releases(RCU) { - struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); - read_unlock_bh(&pn->hash_lock); + rcu_read_unlock(); } static const struct seq_operations pppoe_seq_ops = { @@ -1149,7 +1161,7 @@ static __net_init int pppoe_init_net(struct net *net) struct pppoe_net *pn = pppoe_pernet(net); struct proc_dir_entry *pde; - rwlock_init(&pn->hash_lock); + spin_lock_init(&pn->hash_lock); pde = proc_create_net("pppoe", 0444, net->proc_net, &pppoe_seq_ops, sizeof(struct seq_net_private)); diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index ff3beda1312c..db45d6f1c4f4 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -43,7 +43,7 @@ struct pppox_sock { /* struct sock must be the first member of pppox_sock */ struct sock sk; struct ppp_channel chan; - struct pppox_sock *next; /* for hash table */ + struct pppox_sock __rcu *next; /* for hash table */ union { struct pppoe_opt pppoe; struct pptp_opt pptp; -- cgit v1.2.3 From 9529320ad64e614cfaf96e6b8e3d8c0a1245160c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 10:27:37 +0000 Subject: inet_diag: change inet_diag_bc_sk() first argument We want to have access to the inet_diag_dump_data structure in the following patch. This patch removes duplication in callers. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250828102738.2065992-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/inet_diag.h | 2 +- net/ipv4/inet_diag.c | 3 ++- net/ipv4/raw_diag.c | 10 +++------- net/ipv4/tcp_diag.c | 8 +++----- net/ipv4/udp_diag.c | 10 +++------- net/mptcp/mptcp_diag.c | 15 ++++----------- 6 files changed, 16 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 30bf8f7ea62b..86a0641ec36e 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -46,7 +46,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin); -int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); +int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk); void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3827e9979d4f..117103042687 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -591,8 +591,9 @@ static void entry_fill_addrs(struct inet_diag_entry *entry, } } -int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) +int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk) { + const struct nlattr *bc = cb_data->inet_diag_nla_bc; const struct inet_sock *inet = inet_sk(sk); struct inet_diag_entry entry; diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index cc793bd8de25..943e5998e0ad 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -126,9 +126,9 @@ static int raw_diag_dump_one(struct netlink_callback *cb, static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); @@ -140,17 +140,13 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct hlist_head *hlist; struct sock *sk = NULL; - struct nlattr *bc; if (IS_ERR(hashinfo)) return; - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -174,7 +170,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) + if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) goto out_unlock; next: num++; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 4ed6b93527f4..d83efd91f461 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -320,11 +320,9 @@ static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, u32 idiag_states = r->idiag_states; struct inet_hashinfo *hashinfo; int i, num, s_i, s_num; - struct nlattr *bc; struct sock *sk; hashinfo = net->ipv4.tcp_death_row.hashinfo; - bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; @@ -365,7 +363,7 @@ static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, r->id.idiag_sport) goto next_listen; - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb_data, sk)) goto next_listen; if (inet_sk_diag_fill(sk, inet_csk(sk), skb, @@ -432,7 +430,7 @@ resume_bind_walk: r->sdiag_family != sk->sk_family) goto next_bind; - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb_data, sk)) goto next_bind; sock_hold(sk); @@ -519,7 +517,7 @@ next_chunk: goto next_normal; twsk_build_assert(); - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb_data, sk)) goto next_normal; if (!refcount_inc_not_zero(&sk->sk_refcnt)) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 38cb3a28e4ed..6e491c720c90 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -16,9 +16,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, @@ -92,12 +92,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; - struct nlattr *bc; - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -130,7 +126,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { + if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 0566dd793810..ac974299de71 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -15,9 +15,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, @@ -76,9 +76,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba const struct inet_diag_req_v2 *r, bool net_admin) { - struct inet_diag_dump_data *cb_data = cb->data; struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; - struct nlattr *bc = cb_data->inet_diag_nla_bc; struct net *net = sock_net(skb->sk); struct inet_hashinfo *hinfo; int i; @@ -121,7 +119,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto next_listen; - ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); + ret = sk_diag_dump(sk, skb, cb, r, net_admin); sock_put(sk); @@ -154,15 +152,10 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; struct mptcp_sock *msk; - struct nlattr *bc; BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx)); - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; - while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot, &diag_ctx->s_num)) != NULL) { struct inet_sock *inet = (struct inet_sock *)msk; @@ -181,7 +174,7 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, r->id.idiag_dport) goto next; - ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); + ret = sk_diag_dump(sk, skb, cb, r, net_admin); next: sock_put(sk); if (ret < 0) { -- cgit v1.2.3 From 95fa78830e5b2eb2041174c7f9549c746e003dd6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 10:27:38 +0000 Subject: inet_diag: avoid cache line misses in inet_diag_bc_sk() inet_diag_bc_sk() pulls five cache lines per socket, while most filters only need the two first ones. Add three booleans to struct inet_diag_dump_data, that are selectively set if a filter needs specific socket fields. - mark_needed /* INET_DIAG_BC_MARK_COND present. */ - cgroup_needed /* INET_DIAG_BC_CGROUP_COND present. */ - userlocks_needed /* INET_DIAG_BC_AUTO present. */ This removes millions of cache lines misses per ss invocation when simple filters are specified on busy servers. offsetof(struct sock, sk_userlocks) = 0xf3 offsetof(struct sock, sk_mark) = 0x20c offsetof(struct sock, sk_cgrp_data) = 0x298 Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250828102738.2065992-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/inet_diag.h | 5 +++++ net/ipv4/inet_diag.c | 52 ++++++++++++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 86a0641ec36e..704fd415c2b4 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -38,6 +38,11 @@ struct inet_diag_dump_data { #define inet_diag_nla_bpf_stgs req_nlas[INET_DIAG_REQ_SK_BPF_STORAGES] struct bpf_sk_storage_diag *bpf_stg_diag; + bool mark_needed; /* INET_DIAG_BC_MARK_COND present. */ +#ifdef CONFIG_SOCK_CGROUP_DATA + bool cgroup_needed; /* INET_DIAG_BC_CGROUP_COND present. */ +#endif + bool userlocks_needed; /* INET_DIAG_BC_AUTO present. */ }; struct inet_connection_sock; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 117103042687..f0b6c5a411a2 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -605,18 +605,22 @@ int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk) entry.sport = READ_ONCE(inet->inet_num); entry.dport = ntohs(READ_ONCE(inet->inet_dport)); entry.ifindex = READ_ONCE(sk->sk_bound_dev_if); - entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0; - if (sk_fullsock(sk)) - entry.mark = READ_ONCE(sk->sk_mark); - else if (sk->sk_state == TCP_NEW_SYN_RECV) - entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; - else if (sk->sk_state == TCP_TIME_WAIT) - entry.mark = inet_twsk(sk)->tw_mark; - else - entry.mark = 0; + if (cb_data->userlocks_needed) + entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0; + if (cb_data->mark_needed) { + if (sk_fullsock(sk)) + entry.mark = READ_ONCE(sk->sk_mark); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else if (sk->sk_state == TCP_TIME_WAIT) + entry.mark = inet_twsk(sk)->tw_mark; + else + entry.mark = 0; + } #ifdef CONFIG_SOCK_CGROUP_DATA - entry.cgroup_id = sk_fullsock(sk) ? - cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; + if (cb_data->cgroup_needed) + entry.cgroup_id = sk_fullsock(sk) ? + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; #endif return inet_diag_bc_run(bc, &entry); @@ -716,16 +720,21 @@ static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, } #endif -static int inet_diag_bc_audit(const struct nlattr *attr, +static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data, const struct sk_buff *skb) { - bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); + const struct nlattr *attr = cb_data->inet_diag_nla_bc; const void *bytecode, *bc; int bytecode_len, len; + bool net_admin; + + if (!attr) + return 0; - if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op)) + if (nla_len(attr) < sizeof(struct inet_diag_bc_op)) return -EINVAL; + net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); bytecode = bc = nla_data(attr); len = bytecode_len = nla_len(attr); @@ -757,14 +766,18 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return -EPERM; if (!valid_markcond(bc, len, &min_len)) return -EINVAL; + cb_data->mark_needed = true; break; #ifdef CONFIG_SOCK_CGROUP_DATA case INET_DIAG_BC_CGROUP_COND: if (!valid_cgroupcond(bc, len, &min_len)) return -EINVAL; + cb_data->cgroup_needed = true; break; #endif case INET_DIAG_BC_AUTO: + cb_data->userlocks_needed = true; + fallthrough; case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: break; @@ -841,13 +854,10 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) kfree(cb_data); return err; } - nla = cb_data->inet_diag_nla_bc; - if (nla) { - err = inet_diag_bc_audit(nla, skb); - if (err) { - kfree(cb_data); - return err; - } + err = inet_diag_bc_audit(cb_data, skb); + if (err) { + kfree(cb_data); + return err; } nla = cb_data->inet_diag_nla_bpf_stgs; -- cgit v1.2.3 From caedcc5b6df1b2e2b5f39079e3369c1d4d5c5f50 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 19:58:16 +0000 Subject: net: dst: introduce dst->dev_rcu Followup of commit 88fe14253e18 ("net: dst: add four helpers to annotate data-races around dst->dev"). We want to gradually add explicit RCU protection to dst->dev, including lockdep support. Add an union to alias dst->dev_rcu and dst->dev. Add dst_dev_net_rcu() helper. Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 16 +++++++++++----- net/core/dst.c | 2 +- net/ipv4/route.c | 4 ++-- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index bab01363bb97..f8aa1239b4db 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -24,7 +24,10 @@ struct sk_buff; struct dst_entry { - struct net_device *dev; + union { + struct net_device *dev; + struct net_device __rcu *dev_rcu; + }; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; @@ -570,9 +573,12 @@ static inline struct net_device *dst_dev(const struct dst_entry *dst) static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) { - /* In the future, use rcu_dereference(dst->dev) */ - WARN_ON_ONCE(!rcu_read_lock_held()); - return READ_ONCE(dst->dev); + return rcu_dereference(dst->dev_rcu); +} + +static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst) +{ + return dev_net_rcu(dst_dev_rcu(dst)); } static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) @@ -592,7 +598,7 @@ static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) { - return dev_net_rcu(skb_dst_dev(skb)); + return dev_net_rcu(skb_dst_dev_rcu(skb)); } struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); diff --git a/net/core/dst.c b/net/core/dst.c index e2de8b68c41d..e9d35f49c9e7 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, dev); WRITE_ONCE(dst->input, dst_discard); WRITE_ONCE(dst->output, dst_discard_out); - WRITE_ONCE(dst->dev, blackhole_netdev); + rcu_assign_pointer(dst->dev_rcu, blackhole_netdev); netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cc86a917a1bb..44382d175589 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1027,7 +1027,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1327,7 +1327,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); -- cgit v1.2.3 From 99a2ace61b211b0be861b07fbaa062fca4b58879 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 19:58:20 +0000 Subject: net: use dst_dev_rcu() in sk_setup_caps() Use RCU to protect accesses to dst->dev from sk_setup_caps() and sk_dst_gso_max_size(). Also use dst_dev_rcu() in ip6_dst_mtu_maybe_forward(), and ip_dst_mtu_maybe_forward(). ip4_dst_hoplimit() can use dst_dev_net_rcu(). Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 6 ++++-- include/net/ip6_route.h | 2 +- include/net/route.h | 2 +- net/core/sock.c | 16 ++++++++++------ 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index befcba575129..6dbd2bf8fa9c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -467,12 +467,14 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); + const struct net_device *dev; unsigned int mtu, res; struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + dev = dst_dev_rcu(dst); + net = dev_net_rcu(dev); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { @@ -486,7 +488,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, if (mtu) goto out; - mtu = READ_ONCE(dst_dev(dst)->mtu); + mtu = READ_ONCE(dev->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9255f21818ee..59f48ca3abdf 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -337,7 +337,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst mtu = IPV6_MIN_MTU; rcu_read_lock(); - idev = __in6_dev_get(dst_dev(dst)); + idev = __in6_dev_get(dst_dev_rcu(dst)); if (idev) mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); diff --git a/include/net/route.h b/include/net/route.h index c71998f464f8..f90106f383c5 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -390,7 +390,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) const struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); rcu_read_unlock(); } diff --git a/net/core/sock.c b/net/core/sock.c index e66ad1ec3a2d..9a8290fcc35d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2587,7 +2587,7 @@ free: } EXPORT_SYMBOL_GPL(sk_clone_lock); -static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) +static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) { bool is_ipv6 = false; u32 max_size; @@ -2597,8 +2597,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ - max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) : - READ_ONCE(dst_dev(dst)->gso_ipv4_max_size); + max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : + READ_ONCE(dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; @@ -2607,9 +2607,12 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + const struct net_device *dev; u32 max_segs = 1; - sk->sk_route_caps = dst_dev(dst)->features; + rcu_read_lock(); + dev = dst_dev_rcu(dst); + sk->sk_route_caps = dev->features; if (sk_is_tcp(sk)) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2625,13 +2628,14 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ - max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1); + max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(sk_setup_caps); -- cgit v1.2.3 From a59076f2669ec23a122549e1f4114e8d4255b632 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:57 -0700 Subject: lsm: security_lsmblob_to_secctx module selection Add a parameter lsmid to security_lsmblob_to_secctx() to identify which of the security modules that may be active should provide the security context. If the value of lsmid is LSM_ID_UNDEF the first LSM providing a hook is used. security_secid_to_secctx() is unchanged, and will always report the first LSM providing a hook. Signed-off-by: Casey Schaufler [PM: subj tweak] Signed-off-by: Paul Moore --- include/linux/security.h | 6 ++++-- kernel/audit.c | 4 ++-- kernel/auditsc.c | 8 +++++--- net/netlabel/netlabel_user.c | 3 ++- security/security.c | 18 ++++++++++++++++-- 5 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b9717..6d1ed6e7387b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -567,7 +567,8 @@ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, struct lsm_context *cp); -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp); +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); @@ -1551,7 +1552,8 @@ static inline int security_secid_to_secctx(u32 secid, struct lsm_context *cp) } static inline int security_lsmprop_to_secctx(struct lsm_prop *prop, - struct lsm_context *cp) + struct lsm_context *cp, + int lsmid) { return -EOPNOTSUPP; } diff --git a/kernel/audit.c b/kernel/audit.c index 547967cb4266..226c8ae00d04 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1473,7 +1473,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, case AUDIT_SIGNAL_INFO: if (lsmprop_is_set(&audit_sig_lsm)) { err = security_lsmprop_to_secctx(&audit_sig_lsm, - &lsmctx); + &lsmctx, LSM_ID_UNDEF); if (err < 0) return err; } @@ -2188,7 +2188,7 @@ int audit_log_task_context(struct audit_buffer *ab) if (!lsmprop_is_set(&prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx); + error = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF); if (error < 0) { if (error != -EINVAL) goto error_path; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8ec768e2c1e5..3b606fd4ae8e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1109,7 +1109,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx) < 0) { + if (security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF) < 0) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { @@ -1395,7 +1395,8 @@ static void show_special(struct audit_context *context, int *call_panic) struct lsm_context lsmctx; if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx) < 0) { + &lsmctx, + LSM_ID_UNDEF) < 0) { *call_panic = 1; } else { audit_log_format(ab, " obj=%s", lsmctx.context); @@ -1560,7 +1561,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, if (lsmprop_is_set(&n->oprop)) { struct lsm_context ctx; - if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) { + if (security_lsmprop_to_secctx(&n->oprop, &ctx, + LSM_ID_UNDEF) < 0) { if (call_panic) *call_panic = 2; } else { diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 0d04d23aafe7..6d6545297ee3 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -98,7 +98,8 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_info->sessionid); if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &ctx) > 0) { + security_lsmprop_to_secctx(&audit_info->prop, &ctx, + LSM_ID_UNDEF) > 0) { audit_log_format(audit_buf, " subj=%s", ctx.context); security_release_secctx(&ctx); } diff --git a/security/security.c b/security/security.c index ad163f06bf7a..dd588f548a2b 100644 --- a/security/security.c +++ b/security/security.c @@ -4342,17 +4342,31 @@ EXPORT_SYMBOL(security_secid_to_secctx); * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx * @prop: lsm specific information * @cp: the LSM context + * @lsmid: which security module to report * * Convert a @prop entry to security context. If @cp is NULL the * length of the result will be returned. This does mean that the * length could change between calls to check the length and the * next call which actually allocates and returns the @cp. * + * @lsmid identifies which LSM should supply the context. + * A value of LSM_ID_UNDEF indicates that the first LSM suppling + * the hook should be used. This is used in cases where the + * ID of the supplying LSM is unambiguous. + * * Return: Return length of data on success, error on failure. */ -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp) +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid) { - return call_int_hook(lsmprop_to_secctx, prop, cp); + struct lsm_static_call *scall; + + lsm_for_each_hook(scall, lsmprop_to_secctx) { + if (lsmid != LSM_ID_UNDEF && lsmid != scall->hl->lsmid->id) + continue; + return scall->hl->hook.lsmprop_to_secctx(prop, cp); + } + return LSM_RET_DEFAULT(lsmprop_to_secctx); } EXPORT_SYMBOL(security_lsmprop_to_secctx); -- cgit v1.2.3 From eb59d494eebd4c5414728a35cdea6a0ba78ff26e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:58 -0700 Subject: audit: add record for multiple task security contexts Replace the single skb pointer in an audit_buffer with a list of skb pointers. Add the audit_stamp information to the audit_buffer as there's no guarantee that there will be an audit_context containing the stamp associated with the event. At audit_log_end() time create auxiliary records as have been added to the list. Functions are created to manage the skb list in the audit_buffer. Create a new audit record AUDIT_MAC_TASK_CONTEXTS. An example of the MAC_TASK_CONTEXTS record is: type=MAC_TASK_CONTEXTS msg=audit(1600880931.832:113) subj_apparmor=unconfined subj_smack=_ When an audit event includes a AUDIT_MAC_TASK_CONTEXTS record the "subj=" field in other records in the event will be "subj=?". An AUDIT_MAC_TASK_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on a subject security context. Refactor audit_log_task_context(), creating a new audit_log_subj_ctx(). This is used in netlabel auditing to provide multiple subject security contexts as necessary. Suggested-by: Paul Moore Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 16 ++++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 208 ++++++++++++++++++++++++++++++++++++------- net/netlabel/netlabel_user.c | 9 +- security/apparmor/lsm.c | 3 + security/selinux/hooks.c | 3 + security/smack/smack_lsm.c | 3 + 7 files changed, 202 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index e3f06eba9c6e..a1f068bcb3a0 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -37,6 +37,8 @@ struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; +struct lsm_id; +struct lsm_prop; struct audit_krule { u32 pflags; @@ -147,6 +149,9 @@ extern unsigned compat_signal_class[]; #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) +/* bit values for audit_cfg_lsm */ +#define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) + struct filename; #define AUDIT_OFF 0 @@ -185,6 +190,7 @@ extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); +extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -210,6 +216,8 @@ extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); + #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, @@ -245,6 +253,11 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } +static inline int audit_log_subj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; @@ -269,6 +282,9 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } +static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ } + #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9a4ecc9f6dc5..8cad2f307719 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -148,6 +148,7 @@ #define AUDIT_IPE_POLICY_LOAD 1422 /* IPE policy load */ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ +#define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index 226c8ae00d04..c924b30f2524 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,11 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; +/* Number of modules that provide a security context. + List of lsms that provide a security context */ +static u32 audit_subj_secctx_cnt; +static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; + /** * struct audit_net - audit private network namespace data * @sk: communication socket @@ -195,8 +201,10 @@ static struct audit_ctl_mutex { * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { - struct sk_buff *skb; /* formatted skb ready to send */ + struct sk_buff *skb; /* the skb for audit_log functions */ + struct sk_buff_head skb_list; /* formatted skbs, ready to send */ struct audit_context *ctx; /* NULL or associated context */ + struct audit_stamp stamp; /* audit stamp for these records */ gfp_t gfp_mask; }; @@ -278,6 +286,27 @@ static pid_t auditd_pid_vnr(void) return pid; } +/** + * audit_cfg_lsm - Identify a security module as providing a secctx. + * @lsmid: LSM identity + * @flags: which contexts are provided + * + * Description: + * Increments the count of the security modules providing a secctx. + * If the LSM id is already in the list leave it alone. + */ +void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ + int i; + + if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) { + for (i = 0 ; i < audit_subj_secctx_cnt; i++) + if (audit_subj_lsms[i] == lsmid) + return; + audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; + } +} + /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace @@ -1776,10 +1805,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { + struct sk_buff *skb; + if (!ab) return; - kfree_skb(ab->skb); + while ((skb = skb_dequeue(&ab->skb_list))) + kfree_skb(skb); kmem_cache_free(audit_buffer_cache, ab); } @@ -1795,6 +1827,10 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + + skb_queue_head_init(&ab->skb_list); + skb_queue_tail(&ab->skb_list, ab->skb); + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; @@ -1860,7 +1896,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct audit_stamp stamp; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1915,14 +1950,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_get_stamp(ab->ctx, &stamp); + audit_get_stamp(ab->ctx, &ab->stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", - (unsigned long long)stamp.ctime.tv_sec, - stamp.ctime.tv_nsec/1000000, - stamp.serial); + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); return ab; } @@ -2178,31 +2213,128 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -int audit_log_task_context(struct audit_buffer *ab) +/** + * audit_buffer_aux_new - Add an aux record buffer to the skb list + * @ab: audit_buffer + * @type: message type + * + * Aux records are allocated and added to the skb list of + * the "main" record. The ab->skb is reset to point to the + * aux record on its creation. When the aux record in complete + * ab->skb has to be reset to point to the "main" record. + * This allows the audit_log_ functions to be ignorant of + * which kind of record it is logging to. It also avoids adding + * special data for aux records. + * + * On success ab->skb will point to the new aux record. + * Returns 0 on success, -ENOMEM should allocation fail. + */ +static int audit_buffer_aux_new(struct audit_buffer *ab, int type) +{ + WARN_ON(ab->skb != skb_peek(&ab->skb_list)); + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask); + if (!ab->skb) + goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; + skb_queue_tail(&ab->skb_list, ab->skb); + + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); + + return 0; + +err: + kfree_skb(ab->skb); + ab->skb = skb_peek(&ab->skb_list); + return -ENOMEM; +} + +/** + * audit_buffer_aux_end - Switch back to the "main" record from an aux record + * @ab: audit_buffer + * + * Restores the "main" audit record to ab->skb. + */ +static void audit_buffer_aux_end(struct audit_buffer *ab) +{ + ab->skb = skb_peek(&ab->skb_list); +} + +/** + * audit_log_subj_ctx - Add LSM subject information + * @ab: audit_buffer + * @prop: LSM subject properties. + * + * Add a subj= field and, if necessary, a AUDIT_MAC_TASK_CONTEXTS record. + */ +int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { - struct lsm_prop prop; struct lsm_context ctx; + char *space = ""; int error; + int i; - security_current_getlsmprop_subj(&prop); - if (!lsmprop_is_set(&prop)) + security_current_getlsmprop_subj(prop); + if (!lsmprop_is_set(prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF); - if (error < 0) { - if (error != -EINVAL) - goto error_path; + if (audit_subj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return 0; + } + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; } - - audit_log_format(ab, " subj=%s", ctx.context); - security_release_secctx(&ctx); + /* Multiple LSMs provide contexts. Include an aux record. */ + audit_log_format(ab, " subj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_subj_secctx_cnt; i++) { + error = security_lsmprop_to_secctx(prop, &ctx, + audit_subj_lsms[i]->id); + if (error < 0) { + /* + * Don't print anything. An LSM like BPF could + * claim to support contexts, but only do so under + * certain conditions. + */ + if (error == -EOPNOTSUPP) + continue; + if (error != -EINVAL) + audit_panic("error in audit_log_subj_ctx"); + } else { + audit_log_format(ab, "%ssubj_%s=%s", space, + audit_subj_lsms[i]->name, ctx.context); + space = " "; + security_release_secctx(&ctx); + } + } + audit_buffer_aux_end(ab); return 0; error_path: - audit_panic("error in audit_log_task_context"); + audit_panic("error in audit_log_subj_ctx"); return error; } +EXPORT_SYMBOL(audit_log_subj_ctx); + +int audit_log_task_context(struct audit_buffer *ab) +{ + struct lsm_prop prop; + + security_current_getlsmprop_subj(&prop); + return audit_log_subj_ctx(ab, &prop); +} EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, @@ -2411,6 +2543,26 @@ int audit_signal_info(int sig, struct task_struct *t) return audit_signal_info_syscall(t); } +/** + * __audit_log_end - enqueue one audit record + * @skb: the buffer to send + */ +static void __audit_log_end(struct sk_buff *skb) +{ + struct nlmsghdr *nlh; + + if (audit_rate_check()) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet */ + skb_queue_tail(&audit_queue, skb); + } else + audit_log_lost("rate limit exceeded"); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer @@ -2423,25 +2575,15 @@ int audit_signal_info(int sig, struct task_struct *t) void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; - struct nlmsghdr *nlh; if (!ab) return; - if (audit_rate_check()) { - skb = ab->skb; - ab->skb = NULL; + while ((skb = skb_dequeue(&ab->skb_list))) + __audit_log_end(skb); - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; - - /* queue the netlink packet and poke the kauditd thread */ - skb_queue_tail(&audit_queue, skb); - wake_up_interruptible(&kauditd_wait); - } else - audit_log_lost("rate limit exceeded"); + /* poke the kauditd thread */ + wake_up_interruptible(&kauditd_wait); audit_buffer_free(ab); } diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 6d6545297ee3..0da652844dd6 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,7 +84,6 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -96,13 +95,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_log_format(audit_buf, "netlabel: auid=%u ses=%u", from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); - - if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &ctx, - LSM_ID_UNDEF) > 0) { - audit_log_format(audit_buf, " subj=%s", ctx.context); - security_release_secctx(&ctx); - } + audit_log_subj_ctx(audit_buf, &audit_info->prop); return audit_buf; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b..220d1684b8d4 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -2530,6 +2530,9 @@ static int __init apparmor_init(void) security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&apparmor_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..975b84b466b4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7618,6 +7618,9 @@ static __init int selinux_init(void) /* Set the security state for the initial task. */ cred_init_security(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fc340a6f0dde..eaff9b8901a7 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5267,6 +5267,9 @@ static __init int smack_init(void) /* initialize the smack_known_list */ init_smack_known_list(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + return 0; } -- cgit v1.2.3 From 0ffbc876d03c80b83d70aeefac7bbb94a9f4e135 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:59 -0700 Subject: audit: add record for multiple object contexts Create a new audit record AUDIT_MAC_OBJ_CONTEXTS. An example of the MAC_OBJ_CONTEXTS record is: type=MAC_OBJ_CONTEXTS msg=audit(1601152467.009:1050): obj_selinux=unconfined_u:object_r:user_home_t:s0 When an audit event includes a AUDIT_MAC_OBJ_CONTEXTS record the "obj=" field in other records in the event will be "obj=?". An AUDIT_MAC_OBJ_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on an object security context. Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 7 ++++++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 58 +++++++++++++++++++++++++++++++++++++++++++++- kernel/auditsc.c | 38 ++++++------------------------ security/selinux/hooks.c | 4 +++- security/smack/smack_lsm.c | 4 +++- 6 files changed, 78 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index a1f068bcb3a0..536f8ee8da81 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -151,6 +151,7 @@ extern unsigned compat_signal_class[]; /* bit values for audit_cfg_lsm */ #define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) +#define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1) struct filename; @@ -191,6 +192,7 @@ extern void audit_log_path_denied(int type, extern void audit_log_lost(const char *message); extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); +extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -258,6 +260,11 @@ static inline int audit_log_subj_ctx(struct audit_buffer *ab, { return 0; } +static inline int audit_log_obj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 8cad2f307719..14a1c1fe013a 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -149,6 +149,7 @@ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ #define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */ +#define AUDIT_MAC_OBJ_CONTEXTS 1426 /* Multiple LSM objext contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index c924b30f2524..bd7474fd8d2c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -85,7 +85,9 @@ static unsigned int audit_net_id; /* Number of modules that provide a security context. List of lsms that provide a security context */ static u32 audit_subj_secctx_cnt; +static u32 audit_obj_secctx_cnt; static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; +static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT]; /** * struct audit_net - audit private network namespace data @@ -305,6 +307,12 @@ void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) return; audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; } + if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) { + for (i = 0 ; i < audit_obj_secctx_cnt; i++) + if (audit_obj_lsms[i] == lsmid) + return; + audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid; + } } /** @@ -1142,7 +1150,6 @@ static int is_audit_feature_set(int i) return af.features & AUDIT_FEATURE_TO_MASK(i); } - static int audit_get_feature(struct sk_buff *skb) { u32 seq; @@ -2337,6 +2344,55 @@ int audit_log_task_context(struct audit_buffer *ab) } EXPORT_SYMBOL(audit_log_task_context); +int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) +{ + int i; + int rc; + int error = 0; + char *space = ""; + struct lsm_context ctx; + + if (audit_obj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return error; + } + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); + return 0; + } + audit_log_format(ab, " obj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_obj_secctx_cnt; i++) { + rc = security_lsmprop_to_secctx(prop, &ctx, + audit_obj_lsms[i]->id); + if (rc < 0) { + audit_log_format(ab, "%sobj_%s=?", space, + audit_obj_lsms[i]->name); + if (rc != -EINVAL) + audit_panic("error in audit_log_obj_ctx"); + error = rc; + } else { + audit_log_format(ab, "%sobj_%s=%s", space, + audit_obj_lsms[i]->name, ctx.context); + security_release_secctx(&ctx); + } + space = " "; + } + + audit_buffer_aux_end(ab); + return error; + +error_path: + audit_panic("error in audit_log_obj_ctx"); + return error; +} + void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3b606fd4ae8e..d1966144bdfe 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF) < 0) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop)) + rc = 1; + audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); @@ -1392,16 +1385,8 @@ static void show_special(struct audit_context *context, int *call_panic) from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { - struct lsm_context lsmctx; - - if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx, - LSM_ID_UNDEF) < 0) { + if (audit_log_obj_ctx(ab, &context->ipc.oprop)) *call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", lsmctx.context); - security_release_secctx(&lsmctx); - } } if (context->ipc.has_perm) { audit_log_end(ab); @@ -1558,18 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - if (lsmprop_is_set(&n->oprop)) { - struct lsm_context ctx; - - if (security_lsmprop_to_secctx(&n->oprop, &ctx, - LSM_ID_UNDEF) < 0) { - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(&n->oprop) && + audit_log_obj_ctx(ab, &n->oprop)) + *call_panic = 2; /* log the audit_names record type */ switch (n->type) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 975b84b466b4..3999f58a1842 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7619,7 +7619,9 @@ static __init int selinux_init(void) cred_init_security(); /* Inform the audit system that secctx is used */ - audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + audit_cfg_lsm(&selinux_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index eaff9b8901a7..fdf2f193a291 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5268,7 +5268,9 @@ static __init int smack_init(void) init_smack_known_list(); /* Inform the audit system that secctx is used */ - audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + audit_cfg_lsm(&smack_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); return 0; } -- cgit v1.2.3 From 44b6169ada7fe3cf4cb91e6b06019eaa22719f28 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 27 Aug 2025 08:10:53 +0200 Subject: scsi: fc: Avoid -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end has been introduced in GCC-14, and we are getting ready to enable it, globally. So, in order to avoid ending up with a flexible-array member in the middle of multiple other structs, we use the '__struct_group()' helper to create a new tagged 'struct fc_df_desc_fpin_reg_hdr'. This structure groups together all the members of the flexible 'struct fc_df_desc_fpin_reg' except the flexible array. As a result, the array is effectively separated from the rest of the members without modifying the memory layout of the flexible structure. We then change the type of the middle struct members currently causing trouble from 'struct fc_df_desc_fpin_reg' to 'struct fc_df_desc_fpin_reg_hdr'. We also want to ensure that in case new members need to be added to the flexible structure, they are always included within the newly created tagged struct. For this, we use '_Static_assert()'. This ensures that the memory layout for both the flexible structure and the new tagged struct is the same after any changes. This approach avoids having to implement 'struct fc_df_desc_fpin_reg_hdr' as a completely separate structure, thus preventing having to maintain two independent but basically identical structures, closing the door to potential bugs in the future. The above is also done for flexible structures 'struct fc_els_rdf' and 'struct fc_els_rdf_resp' So, with these changes, fix the following warnings: drivers/scsi/lpfc/lpfc_hw4.h:4936:41: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] drivers/scsi/lpfc/lpfc_hw4.h:4942:41: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] drivers/scsi/lpfc/lpfc_hw4.h:4947:41: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/aK6hbQLyQlvlySf8@kspp Reviewed-by: Hannes Reinecke Reviewed-by: Justin Tee Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_els.c | 2 +- drivers/scsi/lpfc/lpfc_hw4.h | 6 ++--- include/uapi/scsi/fc/fc_els.h | 58 +++++++++++++++++++++++++++++-------------- 3 files changed, 43 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index fca81e0c7c2e..432761fb49de 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -3762,7 +3762,7 @@ lpfc_issue_els_rdf(struct lpfc_vport *vport, uint8_t retry) memset(prdf, 0, cmdsize); prdf->rdf.fpin_cmd = ELS_RDF; prdf->rdf.desc_len = cpu_to_be32(sizeof(struct lpfc_els_rdf_req) - - sizeof(struct fc_els_rdf)); + sizeof(struct fc_els_rdf_hdr)); prdf->reg_d1.reg_desc.desc_tag = cpu_to_be32(ELS_DTAG_FPIN_REGISTER); prdf->reg_d1.reg_desc.desc_len = cpu_to_be32( FC_TLV_DESC_LENGTH_FROM_SZ(prdf->reg_d1)); diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index bc709786e6af..a7f7ed86d2b0 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -4909,18 +4909,18 @@ struct send_frame_wqe { #define ELS_RDF_REG_TAG_CNT 4 struct lpfc_els_rdf_reg_desc { - struct fc_df_desc_fpin_reg reg_desc; /* descriptor header */ + struct fc_df_desc_fpin_reg_hdr reg_desc; /* descriptor header */ __be32 desc_tags[ELS_RDF_REG_TAG_CNT]; /* tags in reg_desc */ }; struct lpfc_els_rdf_req { - struct fc_els_rdf rdf; /* hdr up to descriptors */ + struct fc_els_rdf_hdr rdf; /* hdr up to descriptors */ struct lpfc_els_rdf_reg_desc reg_d1; /* 1st descriptor */ }; struct lpfc_els_rdf_rsp { - struct fc_els_rdf_resp rdf_resp; /* hdr up to descriptors */ + struct fc_els_rdf_resp_hdr rdf_resp; /* hdr up to descriptors */ struct lpfc_els_rdf_reg_desc reg_d1; /* 1st descriptor */ }; diff --git a/include/uapi/scsi/fc/fc_els.h b/include/uapi/scsi/fc/fc_els.h index 16782c360de3..019096beb179 100644 --- a/include/uapi/scsi/fc/fc_els.h +++ b/include/uapi/scsi/fc/fc_els.h @@ -11,6 +11,12 @@ #include #include +#ifdef __KERNEL__ +#include /* for offsetof */ +#else +#include /* for offsetof */ +#endif + /* * Fibre Channel Switch - Enhanced Link Services definitions. * From T11 FC-LS Rev 1.2 June 7, 2005. @@ -1109,12 +1115,15 @@ struct fc_els_fpin { /* Diagnostic Function Descriptor - FPIN Registration */ struct fc_df_desc_fpin_reg { - __be32 desc_tag; /* FPIN Registration (0x00030001) */ - __be32 desc_len; /* Length of Descriptor (in bytes). - * Size of descriptor excluding - * desc_tag and desc_len fields. - */ - __be32 count; /* Number of desc_tags elements */ + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(fc_df_desc_fpin_reg_hdr, __hdr, /* no attrs */, + __be32 desc_tag; /* FPIN Registration (0x00030001) */ + __be32 desc_len; /* Length of Descriptor (in bytes). + * Size of descriptor excluding + * desc_tag and desc_len fields. + */ + __be32 count; /* Number of desc_tags elements */ + ); __be32 desc_tags[]; /* Array of Descriptor Tags. * Each tag indicates a function * supported by the N_Port (request) @@ -1124,33 +1133,44 @@ struct fc_df_desc_fpin_reg { * See ELS_FN_DTAG_xxx for tag values. */ }; +_Static_assert(offsetof(struct fc_df_desc_fpin_reg, desc_tags) == sizeof(struct fc_df_desc_fpin_reg_hdr), + "struct member likely outside of __struct_group()"); /* * ELS_RDF - Register Diagnostic Functions */ struct fc_els_rdf { - __u8 fpin_cmd; /* command (0x19) */ - __u8 fpin_zero[3]; /* specified as zero - part of cmd */ - __be32 desc_len; /* Length of Descriptor List (in bytes). - * Size of ELS excluding fpin_cmd, - * fpin_zero and desc_len fields. - */ + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(fc_els_rdf_hdr, __hdr, /* no attrs */, + __u8 fpin_cmd; /* command (0x19) */ + __u8 fpin_zero[3]; /* specified as zero - part of cmd */ + __be32 desc_len; /* Length of Descriptor List (in bytes). + * Size of ELS excluding fpin_cmd, + * fpin_zero and desc_len fields. + */ + ); struct fc_tlv_desc desc[]; /* Descriptor list */ }; +_Static_assert(offsetof(struct fc_els_rdf, desc) == sizeof(struct fc_els_rdf_hdr), + "struct member likely outside of __struct_group()"); /* * ELS RDF LS_ACC Response. */ struct fc_els_rdf_resp { - struct fc_els_ls_acc acc_hdr; - __be32 desc_list_len; /* Length of response (in - * bytes). Excludes acc_hdr - * and desc_list_len fields. - */ - struct fc_els_lsri_desc lsri; + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(fc_els_rdf_resp_hdr, __hdr, /* no attrs */, + struct fc_els_ls_acc acc_hdr; + __be32 desc_list_len; /* Length of response (in + * bytes). Excludes acc_hdr + * and desc_list_len fields. + */ + struct fc_els_lsri_desc lsri; + ); struct fc_tlv_desc desc[]; /* Supported Descriptor list */ }; - +_Static_assert(offsetof(struct fc_els_rdf_resp, desc) == sizeof(struct fc_els_rdf_resp_hdr), + "struct member likely outside of __struct_group()"); /* * Diagnostic Capability Descriptors for EDC ELS -- cgit v1.2.3 From b620462bba6655b47d127db70d18123c7af522d4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 29 Aug 2025 08:38:16 -0700 Subject: scsi: ufs: core: Move the tracing enumeration types into a new file The header file defines constants and data structures related to the UFS standard. Move the enumeration types related to tracing into a new header file because these are not defined in the UFS standard. An intended side effect of this patch is that the tracing enumeration types are no longer visible to UFS host drivers. Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20250829153841.2201700-1-bvanassche@acm.org Reviewed-by: Avri Altman Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs_trace.h | 1 + drivers/ufs/core/ufs_trace_types.h | 24 ++++++++++++++++++++++++ include/ufs/ufs.h | 17 ----------------- 3 files changed, 25 insertions(+), 17 deletions(-) create mode 100644 drivers/ufs/core/ufs_trace_types.h (limited to 'include') diff --git a/drivers/ufs/core/ufs_trace.h b/drivers/ufs/core/ufs_trace.h index caa32e23ffa5..584c2b5c6ad9 100644 --- a/drivers/ufs/core/ufs_trace.h +++ b/drivers/ufs/core/ufs_trace.h @@ -11,6 +11,7 @@ #include #include +#include "ufs_trace_types.h" #define str_opcode(opcode) \ __print_symbolic(opcode, \ diff --git a/drivers/ufs/core/ufs_trace_types.h b/drivers/ufs/core/ufs_trace_types.h new file mode 100644 index 000000000000..f2d5ad1d92b9 --- /dev/null +++ b/drivers/ufs/core/ufs_trace_types.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _UFS_TRACE_TYPES_H_ +#define _UFS_TRACE_TYPES_H_ + +enum ufs_trace_str_t { + UFS_CMD_SEND, + UFS_CMD_COMP, + UFS_DEV_COMP, + UFS_QUERY_SEND, + UFS_QUERY_COMP, + UFS_QUERY_ERR, + UFS_TM_SEND, + UFS_TM_COMP, + UFS_TM_ERR +}; + +enum ufs_trace_tsf_t { + UFS_TSF_CDB, + UFS_TSF_OSF, + UFS_TSF_TM_INPUT, + UFS_TSF_TM_OUTPUT +}; + +#endif /* _UFS_TRACE_TYPES_H_ */ diff --git a/include/ufs/ufs.h b/include/ufs/ufs.h index 72fd385037a6..245a6a829ce9 100644 --- a/include/ufs/ufs.h +++ b/include/ufs/ufs.h @@ -653,21 +653,4 @@ struct ufs_dev_info { bool hid_sup; }; -/* - * This enum is used in string mapping in ufs_trace.h. - */ -enum ufs_trace_str_t { - UFS_CMD_SEND, UFS_CMD_COMP, UFS_DEV_COMP, - UFS_QUERY_SEND, UFS_QUERY_COMP, UFS_QUERY_ERR, - UFS_TM_SEND, UFS_TM_COMP, UFS_TM_ERR -}; - -/* - * Transaction Specific Fields (TSF) type in the UPIU package, this enum is - * used in ufs_trace.h for UFS command trace. - */ -enum ufs_trace_tsf_t { - UFS_TSF_CDB, UFS_TSF_OSF, UFS_TSF_TM_INPUT, UFS_TSF_TM_OUTPUT -}; - #endif /* End of Header */ -- cgit v1.2.3 From 6234d0df236ab1a71c6bd75e4f5fa15339d5272b Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 22 Aug 2025 03:27:21 +0300 Subject: media: v4l2-common: Constify media_pad argument to v4l2_get_link_freq() The v4l2_get_link_freq() macro doesn't modify the pad argument. Make it possible to call it with a const media_pad pointer. Link: https://lore.kernel.org/r/20250822002734.23516-2-laurent.pinchart@ideasonboard.com Signed-off-by: Laurent Pinchart Reviewed-by: Frank Li Acked-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/v4l2-core/v4l2-common.c | 2 +- include/media/v4l2-common.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index 0574f5d685f8..6f8bce4d6b62 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -550,7 +550,7 @@ static s64 v4l2_get_link_freq_ctrl(struct v4l2_ctrl_handler *handler, return freq > 0 ? freq : -EINVAL; } -s64 v4l2_get_link_freq(struct media_pad *pad, unsigned int mul, +s64 v4l2_get_link_freq(const struct media_pad *pad, unsigned int mul, unsigned int div) { struct v4l2_mbus_config mbus_config = {}; diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index e31b4434ea5d..d8e23991a656 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -579,7 +579,7 @@ int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt, u32 pixelformat, * * %-EINVAL: Invalid link frequency value */ #ifdef CONFIG_MEDIA_CONTROLLER -s64 v4l2_get_link_freq(struct media_pad *pad, unsigned int mul, +s64 v4l2_get_link_freq(const struct media_pad *pad, unsigned int mul, unsigned int div); #endif -- cgit v1.2.3 From 76f1e2ee545b3165e1e24293b59414699118266a Mon Sep 17 00:00:00 2001 From: Denzeel Oliva Date: Sat, 30 Aug 2025 16:28:41 +0000 Subject: dt-bindings: clock: exynos990: Extend clocks IDs Add missing clock definitions for DPU and CMUREF. Acked-by: Rob Herring (Arm) Signed-off-by: Denzeel Oliva Link: https://lore.kernel.org/r/20250830-fix-cmu-top-v5-4-7c62f608309e@gmail.com Signed-off-by: Krzysztof Kozlowski --- include/dt-bindings/clock/samsung,exynos990.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/samsung,exynos990.h b/include/dt-bindings/clock/samsung,exynos990.h index 6b9df09d2822..c5c79e078f2f 100644 --- a/include/dt-bindings/clock/samsung,exynos990.h +++ b/include/dt-bindings/clock/samsung,exynos990.h @@ -208,6 +208,10 @@ #define CLK_GOUT_CMU_SSP_BUS 197 #define CLK_GOUT_CMU_TNR_BUS 198 #define CLK_GOUT_CMU_VRA_BUS 199 +#define CLK_MOUT_CMU_CMUREF 200 +#define CLK_MOUT_CMU_DPU_BUS 201 +#define CLK_MOUT_CMU_CLK_CMUREF 202 +#define CLK_DOUT_CMU_CLK_CMUREF 203 /* CMU_HSI0 */ #define CLK_MOUT_HSI0_BUS_USER 1 -- cgit v1.2.3 From 91f98de42310c70f9a23595b3b20aa305717d955 Mon Sep 17 00:00:00 2001 From: Hakyeong Kim Date: Mon, 25 Aug 2025 17:14:27 +0530 Subject: dt-bindings: clock: Add ARTPEC-8 clock controller Add dt-schema for Axis ARTPEC-8 SoC clock controller. The Clock Management Unit (CMU) has a top-level block CMU_CMU which generates clocks for other blocks. Add device-tree binding definitions for following CMU blocks: - CMU_CMU - CMU_BUS - CMU_CORE - CMU_CPUCL - CMU_FSYS - CMU_IMEM - CMU_PERI Signed-off-by: Hakyeong Kim Signed-off-by: SeonGu Kang Reviewed-by: Rob Herring (Arm) Signed-off-by: Ravi Patel Link: https://lore.kernel.org/r/20250825114436.46882-2-ravi.patel@samsung.com Signed-off-by: Krzysztof Kozlowski --- .../bindings/clock/axis,artpec8-clock.yaml | 213 +++++++++++++++++++++ include/dt-bindings/clock/axis,artpec8-clk.h | 169 ++++++++++++++++ 2 files changed, 382 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/axis,artpec8-clock.yaml create mode 100644 include/dt-bindings/clock/axis,artpec8-clk.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/axis,artpec8-clock.yaml b/Documentation/devicetree/bindings/clock/axis,artpec8-clock.yaml new file mode 100644 index 000000000000..277af48ac841 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/axis,artpec8-clock.yaml @@ -0,0 +1,213 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/axis,artpec8-clock.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Axis ARTPEC-8 SoC clock controller + +maintainers: + - Jesper Nilsson + +description: | + ARTPEC-8 clock controller is comprised of several CMU (Clock Management Unit) + units, generating clocks for different domains. Those CMU units are modeled + as separate device tree nodes, and might depend on each other. + The root clock in that root tree is an external clock: OSCCLK (25 MHz). + This external clock must be defined as a fixed-rate clock in dts. + + CMU_CMU is a top-level CMU, where all base clocks are prepared using PLLs and + dividers; all other clocks of function blocks (other CMUs) are usually + derived from CMU_CMU. + + Each clock is assigned an identifier and client nodes can use this identifier + to specify the clock which they consume. All clocks available for usage + in clock consumer nodes are defined as preprocessor macros in + 'include/dt-bindings/clock/axis,artpec8-clk.h' header. + +properties: + compatible: + enum: + - axis,artpec8-cmu-cmu + - axis,artpec8-cmu-bus + - axis,artpec8-cmu-core + - axis,artpec8-cmu-cpucl + - axis,artpec8-cmu-fsys + - axis,artpec8-cmu-imem + - axis,artpec8-cmu-peri + + reg: + maxItems: 1 + + clocks: + minItems: 1 + maxItems: 5 + + clock-names: + minItems: 1 + maxItems: 5 + + "#clock-cells": + const: 1 + +required: + - compatible + - reg + - clocks + - clock-names + - "#clock-cells" + +allOf: + - if: + properties: + compatible: + const: axis,artpec8-cmu-cmu + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + + clock-names: + items: + - const: fin_pll + + - if: + properties: + compatible: + const: axis,artpec8-cmu-bus + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_BUS BUS clock (from CMU_CMU) + - description: CMU_BUS DLP clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: bus + - const: dlp + + - if: + properties: + compatible: + const: axis,artpec8-cmu-core + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_CORE main clock (from CMU_CMU) + - description: CMU_CORE DLP clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: main + - const: dlp + + - if: + properties: + compatible: + const: axis,artpec8-cmu-cpucl + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_CPUCL switch clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: switch + + - if: + properties: + compatible: + const: axis,artpec8-cmu-fsys + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_FSYS SCAN0 clock (from CMU_CMU) + - description: CMU_FSYS SCAN1 clock (from CMU_CMU) + - description: CMU_FSYS BUS clock (from CMU_CMU) + - description: CMU_FSYS IP clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: scan0 + - const: scan1 + - const: bus + - const: ip + + - if: + properties: + compatible: + const: axis,artpec8-cmu-imem + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_IMEM ACLK clock (from CMU_CMU) + - description: CMU_IMEM JPEG clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: aclk + - const: jpeg + + - if: + properties: + compatible: + const: axis,artpec8-cmu-peri + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_PERI IP clock (from CMU_CMU) + - description: CMU_PERI AUDIO clock (from CMU_CMU) + - description: CMU_PERI DISP clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: ip + - const: audio + - const: disp + +additionalProperties: false + +examples: + # Clock controller node for CMU_FSYS + - | + #include + + cmu_fsys: clock-controller@16c10000 { + compatible = "axis,artpec8-cmu-fsys"; + reg = <0x16c10000 0x4000>; + #clock-cells = <1>; + clocks = <&fin_pll>, + <&cmu_cmu CLK_DOUT_CMU_FSYS_SCAN0>, + <&cmu_cmu CLK_DOUT_CMU_FSYS_SCAN1>, + <&cmu_cmu CLK_DOUT_CMU_FSYS_BUS>, + <&cmu_cmu CLK_DOUT_CMU_FSYS_IP>; + clock-names = "fin_pll", "scan0", "scan1", "bus", "ip"; + }; + +... diff --git a/include/dt-bindings/clock/axis,artpec8-clk.h b/include/dt-bindings/clock/axis,artpec8-clk.h new file mode 100644 index 000000000000..1e6e1409dd94 --- /dev/null +++ b/include/dt-bindings/clock/axis,artpec8-clk.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2025 Samsung Electronics Co., Ltd. + * https://www.samsung.com + * Copyright (c) 2025 Axis Communications AB. + * https://www.axis.com + * + * Device Tree binding constants for ARTPEC-8 clock controller. + */ + +#ifndef _DT_BINDINGS_CLOCK_ARTPEC8_H +#define _DT_BINDINGS_CLOCK_ARTPEC8_H + +/* CMU_CMU */ +#define CLK_FOUT_SHARED0_PLL 1 +#define CLK_DOUT_SHARED0_DIV2 2 +#define CLK_DOUT_SHARED0_DIV3 3 +#define CLK_DOUT_SHARED0_DIV4 4 +#define CLK_FOUT_SHARED1_PLL 5 +#define CLK_DOUT_SHARED1_DIV2 6 +#define CLK_DOUT_SHARED1_DIV3 7 +#define CLK_DOUT_SHARED1_DIV4 8 +#define CLK_FOUT_AUDIO_PLL 9 +#define CLK_DOUT_CMU_BUS 10 +#define CLK_DOUT_CMU_BUS_DLP 11 +#define CLK_DOUT_CMU_CDC_CORE 12 +#define CLK_DOUT_CMU_OTP 13 +#define CLK_DOUT_CMU_CORE_MAIN 14 +#define CLK_DOUT_CMU_CORE_DLP 15 +#define CLK_DOUT_CMU_CPUCL_SWITCH 16 +#define CLK_DOUT_CMU_DLP_CORE 17 +#define CLK_DOUT_CMU_FSYS_BUS 18 +#define CLK_DOUT_CMU_FSYS_IP 19 +#define CLK_DOUT_CMU_FSYS_SCAN0 20 +#define CLK_DOUT_CMU_FSYS_SCAN1 21 +#define CLK_DOUT_CMU_GPU_3D 22 +#define CLK_DOUT_CMU_GPU_2D 23 +#define CLK_DOUT_CMU_IMEM_ACLK 24 +#define CLK_DOUT_CMU_IMEM_JPEG 25 +#define CLK_DOUT_CMU_MIF_SWITCH 26 +#define CLK_DOUT_CMU_MIF_BUSP 27 +#define CLK_DOUT_CMU_PERI_DISP 28 +#define CLK_DOUT_CMU_PERI_IP 29 +#define CLK_DOUT_CMU_PERI_AUDIO 30 +#define CLK_DOUT_CMU_RSP_CORE 31 +#define CLK_DOUT_CMU_TRFM_CORE 32 +#define CLK_DOUT_CMU_VCA_ACE 33 +#define CLK_DOUT_CMU_VCA_OD 34 +#define CLK_DOUT_CMU_VIO_CORE 35 +#define CLK_DOUT_CMU_VIO_AUDIO 36 +#define CLK_DOUT_CMU_VIP0_CORE 37 +#define CLK_DOUT_CMU_VIP1_CORE 38 +#define CLK_DOUT_CMU_VPP_CORE 39 + +/* CMU_BUS */ +#define CLK_MOUT_BUS_ACLK_USER 1 +#define CLK_MOUT_BUS_DLP_USER 2 +#define CLK_DOUT_BUS_PCLK 3 + +/* CMU_CORE */ +#define CLK_MOUT_CORE_ACLK_USER 1 +#define CLK_MOUT_CORE_DLP_USER 2 +#define CLK_DOUT_CORE_PCLK 3 + +/* CMU_CPUCL */ +#define CLK_FOUT_CPUCL_PLL 1 +#define CLK_MOUT_CPUCL_PLL 2 +#define CLK_MOUT_CPUCL_SWITCH_USER 3 +#define CLK_DOUT_CPUCL_CPU 4 +#define CLK_DOUT_CPUCL_CLUSTER_ACLK 5 +#define CLK_DOUT_CPUCL_CLUSTER_PCLKDBG 6 +#define CLK_DOUT_CPUCL_CLUSTER_CNTCLK 7 +#define CLK_DOUT_CPUCL_CLUSTER_ATCLK 8 +#define CLK_DOUT_CPUCL_PCLK 9 +#define CLK_DOUT_CPUCL_CMUREF 10 +#define CLK_DOUT_CPUCL_DBG 11 +#define CLK_DOUT_CPUCL_PCLKDBG 12 +#define CLK_GOUT_CPUCL_CLUSTER_CPU 13 +#define CLK_GOUT_CPUCL_SHORTSTOP 14 +#define CLK_GOUT_CPUCL_CSSYS_IPCLKPORT_PCLKDBG 15 +#define CLK_GOUT_CPUCL_CSSYS_IPCLKPORT_ATCLK 16 + +/* CMU_FSYS */ +#define CLK_FOUT_FSYS_PLL 1 +#define CLK_MOUT_FSYS_SCAN0_USER 2 +#define CLK_MOUT_FSYS_SCAN1_USER 3 +#define CLK_MOUT_FSYS_BUS_USER 4 +#define CLK_MOUT_FSYS_MMC_USER 5 +#define CLK_DOUT_FSYS_PCIE_PIPE 6 +#define CLK_DOUT_FSYS_ADC 7 +#define CLK_DOUT_FSYS_PCIE_PHY_REFCLK_SYSPLL 8 +#define CLK_DOUT_FSYS_EQOS_INT125 9 +#define CLK_DOUT_FSYS_OTP_MEM 10 +#define CLK_DOUT_FSYS_SCLK_UART 11 +#define CLK_DOUT_FSYS_EQOS_25 12 +#define CLK_DOUT_FSYS_EQOS_2p5 13 +#define CLK_DOUT_FSYS_BUS300 14 +#define CLK_DOUT_FSYS_BUS_QSPI 15 +#define CLK_DOUT_FSYS_MMC_CARD0 16 +#define CLK_DOUT_FSYS_MMC_CARD1 17 +#define CLK_DOUT_SCAN_CLK_FSYS_125 18 +#define CLK_DOUT_FSYS_QSPI 19 +#define CLK_DOUT_FSYS_SFMC_NAND 20 +#define CLK_DOUT_FSYS_SCAN_CLK_MMC 21 +#define CLK_GOUT_FSYS_USB20DRD_IPCLKPORT_ACLK_PHYCTRL_20 22 +#define CLK_GOUT_FSYS_USB20DRD_IPCLKPORT_BUS_CLK_EARLY 23 +#define CLK_GOUT_FSYS_XHB_USB_IPCLKPORT_CLK 24 +#define CLK_GOUT_FSYS_XHB_AHBBR_IPCLKPORT_CLK 25 +#define CLK_GOUT_FSYS_I2C0_IPCLKPORT_I_PCLK 26 +#define CLK_GOUT_FSYS_I2C1_IPCLKPORT_I_PCLK 27 +#define CLK_GOUT_FSYS_PWM_IPCLKPORT_I_PCLK_S0 28 +#define CLK_GOUT_FSYS_DWC_PCIE_CTL_INST_0_MSTR_ACLK_UG 29 +#define CLK_GOUT_FSYS_DWC_PCIE_CTL_INXT_0_SLV_ACLK_UG 30 +#define CLK_GOUT_FSYS_DWC_PCIE_CTL_INST_0_DBI_ACLK_UG 31 +#define CLK_GOUT_FSYS_PIPE_PAL_INST_0_I_APB_PCLK 32 +#define CLK_GOUT_FSYS_EQOS_TOP_IPCLKPORT_ACLK_I 33 +#define CLK_GOUT_FSYS_EQOS_TOP_IPCLKPORT_CLK_CSR_I 34 +#define CLK_GOUT_FSYS_EQOS_TOP_IPCLKPORT_I_RGMII_TXCLK_2P5 35 +#define CLK_GOUT_FSYS_SFMC_IPCLKPORT_I_ACLK_NAND 36 +#define CLK_GOUT_FSYS_MMC0_IPCLKPORT_SDCLKIN 37 +#define CLK_GOUT_FSYS_MMC0_IPCLKPORT_I_ACLK 38 +#define CLK_GOUT_FSYS_MMC1_IPCLKPORT_SDCLKIN 39 +#define CLK_GOUT_FSYS_MMC1_IPCLKPORT_I_ACLK 40 +#define CLK_GOUT_FSYS_PCIE_PHY_REFCLK_IN 41 +#define CLK_GOUT_FSYS_UART0_PCLK 42 +#define CLK_GOUT_FSYS_UART0_SCLK_UART 43 +#define CLK_GOUT_FSYS_BUS_QSPI 44 +#define CLK_GOUT_FSYS_QSPI_IPCLKPORT_HCLK 45 +#define CLK_GOUT_FSYS_QSPI_IPCLKPORT_SSI_CLK 46 + +/* CMU_IMEM */ +#define CLK_MOUT_IMEM_ACLK_USER 1 +#define CLK_MOUT_IMEM_GIC_CA53 2 +#define CLK_MOUT_IMEM_GIC_CA5 3 +#define CLK_MOUT_IMEM_JPEG_USER 4 +#define CLK_GOUT_IMEM_MCT_PCLK 5 +#define CLK_GOUT_IMEM_PCLK_TMU0_APBIF 6 + +/* CMU_PERI */ +#define CLK_MOUT_PERI_IP_USER 1 +#define CLK_MOUT_PERI_AUDIO_USER 2 +#define CLK_MOUT_PERI_I2S0 3 +#define CLK_MOUT_PERI_I2S1 4 +#define CLK_MOUT_PERI_DISP_USER 5 +#define CLK_DOUT_PERI_SPI 6 +#define CLK_DOUT_PERI_UART1 7 +#define CLK_DOUT_PERI_UART2 8 +#define CLK_DOUT_PERI_PCLK 9 +#define CLK_DOUT_PERI_I2S0 10 +#define CLK_DOUT_PERI_I2S1 11 +#define CLK_DOUT_PERI_DSIM 12 +#define CLK_GOUT_PERI_UART1_PCLK 13 +#define CLK_GOUT_PERI_UART1_SCLK_UART 14 +#define CLK_GOUT_PERI_UART2_PCLK 15 +#define CLK_GOUT_PERI_UART2_SCLK_UART 16 +#define CLK_GOUT_PERI_I2C2_IPCLKPORT_I_PCLK 17 +#define CLK_GOUT_PERI_I2C3_IPCLKPORT_I_PCLK 18 +#define CLK_GOUT_PERI_SPI0_PCLK 19 +#define CLK_GOUT_PERI_SPI0_SCLK_SPI 20 +#define CLK_GOUT_PERI_APB_ASYNC_DSIM_IPCLKPORT_PCLKS 21 +#define CLK_GOUT_PERI_I2SSC0_IPCLKPORT_CLK_HST 22 +#define CLK_GOUT_PERI_I2SSC1_IPCLKPORT_CLK_HST 23 +#define CLK_GOUT_PERI_AUDIO_OUT_IPCLKPORT_CLK 24 +#define CLK_GOUT_PERI_I2SSC0_IPCLKPORT_CLK 25 +#define CLK_GOUT_PERI_I2SSC1_IPCLKPORT_CLK 26 +#define CLK_GOUT_PERI_DMA4DSIM_IPCLKPORT_CLK_APB_CLK 27 +#define CLK_GOUT_PERI_DMA4DSIM_IPCLKPORT_CLK_AXI_CLK 28 + +#endif /* _DT_BINDINGS_CLOCK_ARTPEC8_H */ -- cgit v1.2.3 From 1d8fdabe19267338f29b58f968499e5b55e6a3b6 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Fri, 29 Aug 2025 12:25:43 +0100 Subject: iio: frequency: adf4350: Fix ADF4350_REG3_12BIT_CLKDIV_MODE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The clk div bits (2 bits wide) do not start in bit 16 but in bit 15. Fix it accordingly. Fixes: e31166f0fd48 ("iio: frequency: New driver for Analog Devices ADF4350/ADF4351 Wideband Synthesizers") Signed-off-by: Michael Hennerich Signed-off-by: Nuno Sá Link: https://patch.msgid.link/20250829-adf4350-fix-v2-2-0bf543ba797d@analog.com Cc: Signed-off-by: Jonathan Cameron --- include/linux/iio/frequency/adf4350.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iio/frequency/adf4350.h b/include/linux/iio/frequency/adf4350.h index de45cf2ee1e4..ce2086f97e3f 100644 --- a/include/linux/iio/frequency/adf4350.h +++ b/include/linux/iio/frequency/adf4350.h @@ -51,7 +51,7 @@ /* REG3 Bit Definitions */ #define ADF4350_REG3_12BIT_CLKDIV(x) ((x) << 3) -#define ADF4350_REG3_12BIT_CLKDIV_MODE(x) ((x) << 16) +#define ADF4350_REG3_12BIT_CLKDIV_MODE(x) ((x) << 15) #define ADF4350_REG3_12BIT_CSR_EN (1 << 18) #define ADF4351_REG3_CHARGE_CANCELLATION_EN (1 << 21) #define ADF4351_REG3_ANTI_BACKLASH_3ns_EN (1 << 22) -- cgit v1.2.3 From eef6dcbc52fa83c392a2f4a52845f347b233a584 Mon Sep 17 00:00:00 2001 From: Prathamesh Shete Date: Sat, 23 Aug 2025 11:24:19 +0530 Subject: dt-bindings: gpio: Add Tegra256 support Extend the existing Tegra186 GPIO controller device tree bindings with support for the GPIO controller found on Tegra256. The number of pins is slightly different, but the programming model remains the same Add a new header, include/dt-bindings/gpio/tegra256-gpio.h, that defines port IDs as well as the TEGRA256_MAIN_GPIO() helper, both of which are used in conjunction to create a unique specifier for each pin. The OS can reconstruct the port ID and pin from these values to determine the register region for the corresponding GPIO. However, the OS does not use the macro definitions in this file. The symbolic names help associate these GPIO specifiers with the names used in the technical documentation available for the chip. Signed-off-by: Prathamesh Shete Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250823055420.24664-1-pshete@nvidia.com Signed-off-by: Bartosz Golaszewski --- .../bindings/gpio/nvidia,tegra186-gpio.yaml | 2 ++ include/dt-bindings/gpio/tegra256-gpio.h | 28 ++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 include/dt-bindings/gpio/tegra256-gpio.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml b/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml index 065f5761a93f..2bd620a1099b 100644 --- a/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml @@ -85,6 +85,7 @@ properties: - nvidia,tegra194-gpio-aon - nvidia,tegra234-gpio - nvidia,tegra234-gpio-aon + - nvidia,tegra256-gpio reg-names: items: @@ -155,6 +156,7 @@ allOf: - nvidia,tegra186-gpio - nvidia,tegra194-gpio - nvidia,tegra234-gpio + - nvidia,tegra256-gpio then: properties: interrupts: diff --git a/include/dt-bindings/gpio/tegra256-gpio.h b/include/dt-bindings/gpio/tegra256-gpio.h new file mode 100644 index 000000000000..a0353a302aeb --- /dev/null +++ b/include/dt-bindings/gpio/tegra256-gpio.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. */ + +/* + * This header provides constants for the nvidia,tegra256-gpio DT binding. + * + * The first cell in Tegra's GPIO specifier is the GPIO ID. + * The macros below provide names for this. + * + * The second cell contains standard flag values specified in gpio.h. + */ + +#ifndef _DT_BINDINGS_GPIO_TEGRA256_GPIO_H +#define _DT_BINDINGS_GPIO_TEGRA256_GPIO_H + +#include + +/* GPIOs implemented by main GPIO controller */ +#define TEGRA256_MAIN_GPIO_PORT_A 0 +#define TEGRA256_MAIN_GPIO_PORT_B 1 +#define TEGRA256_MAIN_GPIO_PORT_C 2 +#define TEGRA256_MAIN_GPIO_PORT_D 3 + +#define TEGRA256_MAIN_GPIO(port, offset) \ + ((TEGRA256_MAIN_GPIO_PORT_##port * 8) + (offset)) + +#endif + -- cgit v1.2.3 From 37b27bd5d6217b75d315f28b4399aad0a336f299 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Aug 2025 11:39:02 -0400 Subject: fs: add an icount_read helper Instead of doing direct access to ->i_count, add a helper to handle this. This will make it easier to convert i_count to a refcount later. Signed-off-by: Josef Bacik Link: https://lore.kernel.org/9bc62a84c6b9d6337781203f60837bd98fbc4a96.1756222464.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- arch/powerpc/platforms/cell/spufs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/mds_client.c | 2 +- fs/ext4/ialloc.c | 4 ++-- fs/fs-writeback.c | 2 +- fs/hpfs/inode.c | 2 +- fs/inode.c | 8 ++++---- fs/nfs/inode.c | 4 ++-- fs/notify/fsnotify.c | 2 +- fs/smb/client/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_trace.h | 2 +- include/linux/fs.h | 5 +++++ include/trace/events/filelock.h | 2 +- security/landlock/fs.c | 2 +- 16 files changed, 25 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index d5a2c77bc908..ce839783c0df 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1430,7 +1430,7 @@ static int spufs_mfc_open(struct inode *inode, struct file *file) if (ctx->owner != current->mm) return -EINVAL; - if (atomic_read(&inode->i_count) != 1) + if (icount_read(inode) != 1) return -EBUSY; mutex_lock(&ctx->mapping_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index de722b232ec1..5bcd8e25fa78 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4538,7 +4538,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) inode = btrfs_find_first_inode(root, min_ino); while (inode) { - if (atomic_read(&inode->vfs_inode.i_count) > 1) + if (icount_read(&inode->vfs_inode) > 1) d_prune_aliases(&inode->vfs_inode); min_ino = btrfs_ino(inode) + 1; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0f497c39ff82..62dba710504d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2221,7 +2221,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) int count; dput(dentry); d_prune_aliases(inode); - count = atomic_read(&inode->i_count); + count = icount_read(inode); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df4051613b29..ba4fd9aba1c1 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) "nonexistent device\n", __func__, __LINE__); return; } - if (atomic_read(&inode->i_count) > 1) { + if (icount_read(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, - atomic_read(&inode->i_count)); + icount_read(inode)); return; } if (inode->i_nlink) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641..6088a67b2aae 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1767,7 +1767,7 @@ static int writeback_single_inode(struct inode *inode, int ret = 0; spin_lock(&inode->i_lock); - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index a59e8fa630db..34008442ee26 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; - if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (hpfs_inode->i_rddir_off && !icount_read(i)) { if (*hpfs_inode->i_rddir_off) pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); diff --git a/fs/inode.c b/fs/inode.c index 01a554e11279..fe4868e2a954 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -534,7 +534,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate) { if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; @@ -871,11 +871,11 @@ void evict_inodes(struct super_block *sb) again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) continue; spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_count)) { + if (icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } @@ -937,7 +937,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ - if (atomic_read(&inode->i_count) || + if (icount_read(inode) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae423..b52805951856 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - atomic_read(&inode->i_count)); + icount_read(inode)); out: return inode; @@ -2229,7 +2229,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - atomic_read(&inode->i_count), fattr->valid); + icount_read(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 079b868552c2..46bfc543f946 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -66,7 +66,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * removed all zero refcount inodes, in any case. Test to * be sure. */ - if (!atomic_read(&inode->i_count)) { + if (!icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 75be4b46bc6f..211d5b8b42f4 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2749,7 +2749,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) } cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", - full_path, inode, inode->i_count.counter, + full_path, inode, icount_read(inode), dentry, cifs_get_time(dentry), jiffies); again: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b2068608..a0269ba96e3d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode) goto out; dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); - ubifs_assert(c, !atomic_read(&inode->i_count)); + ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a3..df8eab11dc48 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1035,7 +1035,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (atomic_read(&VFS_I(ip)->i_count)) + if (icount_read(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e1794e3e3156..34001503fc8b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1151,7 +1151,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->count = icount_read(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; diff --git a/include/linux/fs.h b/include/linux/fs.h index c34554d8c4fe..c4fd010cf5bf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2611,6 +2611,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +static inline int icount_read(const struct inode *inode) +{ + return atomic_read(&inode->i_count); +} + /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index b8d1e00a7982..fdd36b1daa25 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -189,7 +189,7 @@ TRACE_EVENT(generic_add_lease, __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); - __entry->icount = atomic_read(&inode->i_count); + __entry->icount = icount_read(inode); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; diff --git a/security/landlock/fs.c b/security/landlock/fs.c index c04f8879ad03..0bade2c5aa1d 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1281,7 +1281,7 @@ static void hook_sb_delete(struct super_block *const sb) struct landlock_object *object; /* Only handles referenced inodes. */ - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) continue; /* -- cgit v1.2.3 From e5bca063c150877c45b88ff134b6ef7a5eae8e7a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 30 Aug 2025 12:55:39 +0200 Subject: fs: remove vfs_ioctl export vfs_ioctl() is no longer called by anything outside of fs/ioctl.c, so remove the global symbol and export as it is not needed. Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/2025083038-carving-amuck-a4ae@gregkh Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/ioctl.c | 3 +-- include/linux/fs.h | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/fs/ioctl.c b/fs/ioctl.c index 83d07218b6cd..1c152c2b1b67 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. */ -int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) out: return error; } -EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 34693cae15a2..4daf9b30a641 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2053,8 +2053,6 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); - #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -- cgit v1.2.3 From 7c4a379e0622e7d8e7eb7dbc76445cdd6306aad8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 29 Aug 2025 16:43:02 +0200 Subject: ALSA: emu10k1: Use guard() for emu1010 FPGA locking The snd_emu1010_fpga_lock() and _unlock() call pairs can be simplified gracefully with the introduction of guard(). Only code refactoring, and no functional changes. Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250829144342.4290-28-tiwai@suse.de --- include/sound/emu10k1.h | 3 +-- sound/pci/emu10k1/emu10k1_main.c | 15 +++++---------- sound/pci/emu10k1/emumixer.c | 15 ++++++--------- sound/pci/emu10k1/emuproc.c | 8 ++------ sound/pci/emu10k1/io.c | 3 +-- 5 files changed, 15 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h index 38db50b280eb..4f94565c9d15 100644 --- a/include/sound/emu10k1.h +++ b/include/sound/emu10k1.h @@ -1842,8 +1842,7 @@ unsigned int snd_emu10k1_ptr20_read(struct snd_emu10k1 * emu, unsigned int reg, void snd_emu10k1_ptr20_write(struct snd_emu10k1 *emu, unsigned int reg, unsigned int chn, unsigned int data); int snd_emu10k1_spi_write(struct snd_emu10k1 * emu, unsigned int data); int snd_emu10k1_i2c_write(struct snd_emu10k1 *emu, u32 reg, u32 value); -static inline void snd_emu1010_fpga_lock(struct snd_emu10k1 *emu) { mutex_lock(&emu->emu1010.lock); }; -static inline void snd_emu1010_fpga_unlock(struct snd_emu10k1 *emu) { mutex_unlock(&emu->emu1010.lock); }; +DEFINE_GUARD(snd_emu1010_fpga_lock, struct snd_emu10k1 *, mutex_lock(&(_T)->emu1010.lock), mutex_unlock(&(_T)->emu1010.lock)) void snd_emu1010_fpga_write_lock(struct snd_emu10k1 *emu, u32 reg, u32 value); void snd_emu1010_fpga_write(struct snd_emu10k1 *emu, u32 reg, u32 value); void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value); diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c index bbe252b8916c..a3d028e4a212 100644 --- a/sound/pci/emu10k1/emu10k1_main.c +++ b/sound/pci/emu10k1/emu10k1_main.c @@ -768,7 +768,7 @@ static void emu1010_work(struct work_struct *work) return; #endif - snd_emu1010_fpga_lock(emu); + guard(snd_emu1010_fpga_lock)(emu); snd_emu1010_fpga_read(emu, EMU_HANA_IRQ_STATUS, &sts); @@ -779,8 +779,6 @@ static void emu1010_work(struct work_struct *work) if (sts & EMU_HANA_IRQ_WCLK_CHANGED) emu1010_clock_event(emu); - - snd_emu1010_fpga_unlock(emu); } static void emu1010_interrupt(struct snd_emu10k1 *emu) @@ -814,13 +812,13 @@ static int snd_emu10k1_emu1010_init(struct snd_emu10k1 *emu) * Proper init follows in snd_emu10k1_init(). */ outl(HCFG_LOCKSOUNDCACHE | HCFG_LOCKTANKCACHE_MASK, emu->port + HCFG); - snd_emu1010_fpga_lock(emu); + guard(snd_emu1010_fpga_lock)(emu); dev_info(emu->card->dev, "emu1010: Loading Hana Firmware\n"); err = snd_emu1010_load_firmware(emu, 0, &emu->firmware); if (err < 0) { dev_info(emu->card->dev, "emu1010: Loading Firmware failed\n"); - goto fail; + return err; } /* ID, should read & 0x7f = 0x55 when FPGA programmed. */ @@ -830,8 +828,7 @@ static int snd_emu10k1_emu1010_init(struct snd_emu10k1 *emu) dev_info(emu->card->dev, "emu1010: Loading Hana Firmware file failed, reg = 0x%x\n", reg); - err = -ENODEV; - goto fail; + return -ENODEV; } dev_info(emu->card->dev, "emu1010: Hana Firmware loaded\n"); @@ -891,9 +888,7 @@ static int snd_emu10k1_emu1010_init(struct snd_emu10k1 *emu) // so it is safe to simply enable the outputs. snd_emu1010_fpga_write(emu, EMU_HANA_UNMUTE, EMU_UNMUTE); -fail: - snd_emu1010_fpga_unlock(emu); - return err; + return 0; } /* * Create the EMU10K1 instance diff --git a/sound/pci/emu10k1/emumixer.c b/sound/pci/emu10k1/emumixer.c index d665d5d1ad7c..6d86584be750 100644 --- a/sound/pci/emu10k1/emumixer.c +++ b/sound/pci/emu10k1/emumixer.c @@ -662,9 +662,8 @@ static int snd_emu1010_output_source_put(struct snd_kcontrol *kcontrol, change = (emu->emu1010.output_source[channel] != val); if (change) { emu->emu1010.output_source[channel] = val; - snd_emu1010_fpga_lock(emu); + guard(snd_emu1010_fpga_lock)(emu); snd_emu1010_output_source_apply(emu, channel, val); - snd_emu1010_fpga_unlock(emu); } return change; } @@ -708,9 +707,8 @@ static int snd_emu1010_input_source_put(struct snd_kcontrol *kcontrol, change = (emu->emu1010.input_source[channel] != val); if (change) { emu->emu1010.input_source[channel] = val; - snd_emu1010_fpga_lock(emu); + guard(snd_emu1010_fpga_lock)(emu); snd_emu1010_input_source_apply(emu, channel, val); - snd_emu1010_fpga_unlock(emu); } return change; } @@ -985,7 +983,7 @@ static int snd_emu1010_clock_source_put(struct snd_kcontrol *kcontrol, val = ucontrol->value.enumerated.item[0] ; if (val >= emu_ci->num) return -EINVAL; - snd_emu1010_fpga_lock(emu); + guard(snd_emu1010_fpga_lock)(emu); spin_lock_irq(&emu->reg_lock); change = (emu->emu1010.clock_source != val); if (change) { @@ -1002,7 +1000,6 @@ static int snd_emu1010_clock_source_put(struct snd_kcontrol *kcontrol, } else { spin_unlock_irq(&emu->reg_lock); } - snd_emu1010_fpga_unlock(emu); return change; } @@ -2330,9 +2327,9 @@ int snd_emu10k1_mixer(struct snd_emu10k1 *emu, for (i = 0; i < emu_ri->n_outs; i++) emu->emu1010.output_source[i] = emu1010_map_source(emu_ri, emu_ri->out_dflts[i]); - snd_emu1010_fpga_lock(emu); - snd_emu1010_apply_sources(emu); - snd_emu1010_fpga_unlock(emu); + scoped_guard(snd_emu1010_fpga_lock, emu) { + snd_emu1010_apply_sources(emu); + } kctl = emu->ctl_clock_source = snd_ctl_new1(&snd_emu1010_clock_source, emu); err = snd_ctl_add(card, kctl); diff --git a/sound/pci/emu10k1/emuproc.c b/sound/pci/emu10k1/emuproc.c index bd4734dc04cd..a12518dd4eed 100644 --- a/sound/pci/emu10k1/emuproc.c +++ b/sound/pci/emu10k1/emuproc.c @@ -166,7 +166,7 @@ static void snd_emu10k1_proc_spdif_read(struct snd_info_entry *entry, u32 value2; if (emu->card_capabilities->emu_model) { - snd_emu1010_fpga_lock(emu); + guard(snd_emu1010_fpga_lock)(emu); // This represents the S/PDIF lock status on 0404b, which is // kinda weird and unhelpful, because monitoring it via IRQ is @@ -200,8 +200,6 @@ static void snd_emu10k1_proc_spdif_read(struct snd_info_entry *entry, snd_iprintf(buffer, "\nS/PDIF mode: %s%s\n", value & EMU_HANA_SPDIF_MODE_RX_PRO ? "professional" : "consumer", value & EMU_HANA_SPDIF_MODE_RX_NOCOPY ? ", no copy" : ""); - - snd_emu1010_fpga_unlock(emu); } else { snd_emu10k1_proc_spdif_status(emu, buffer, "CD-ROM S/PDIF In", CDCS, CDSRCS); snd_emu10k1_proc_spdif_status(emu, buffer, "Optical or Coax S/PDIF In", GPSCS, GPSRCS); @@ -464,7 +462,7 @@ static void snd_emu_proc_emu1010_reg_read(struct snd_info_entry *entry, u32 value; int i; - snd_emu1010_fpga_lock(emu); + guard(snd_emu1010_fpga_lock)(emu); snd_iprintf(buffer, "EMU1010 Registers:\n\n"); @@ -504,8 +502,6 @@ static void snd_emu_proc_emu1010_reg_read(struct snd_info_entry *entry, snd_emu_proc_emu1010_link_read(emu, buffer, 0x701); } } - - snd_emu1010_fpga_unlock(emu); } static void snd_emu_proc_io_reg_read(struct snd_info_entry *entry, diff --git a/sound/pci/emu10k1/io.c b/sound/pci/emu10k1/io.c index b60ab5671e00..69debe781177 100644 --- a/sound/pci/emu10k1/io.c +++ b/sound/pci/emu10k1/io.c @@ -297,9 +297,8 @@ void snd_emu1010_fpga_write(struct snd_emu10k1 *emu, u32 reg, u32 value) void snd_emu1010_fpga_write_lock(struct snd_emu10k1 *emu, u32 reg, u32 value) { - snd_emu1010_fpga_lock(emu); + guard(snd_emu1010_fpga_lock)(emu); snd_emu1010_fpga_write_locked(emu, reg, value); - snd_emu1010_fpga_unlock(emu); } void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value) -- cgit v1.2.3 From 3abb538fffc8af73859f16e6e274962d9b53c907 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 29 Aug 2025 16:52:47 +0200 Subject: ALSA: gus: Use guard() for mutex locks Replace the manual mutex lock/unlock pairs with guard() for code simplification. As replaced with the guard(mutex), no longer used snd_gf1_mem_lock() is dropped, too. Only code refactoring, and no behavior change. Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250829145300.5460-8-tiwai@suse.de --- include/sound/gus.h | 1 - sound/isa/gus/gus_dma.c | 10 +++------- sound/isa/gus/gus_main.c | 1 - sound/isa/gus/gus_mem.c | 33 ++++++--------------------------- 4 files changed, 9 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/sound/gus.h b/include/sound/gus.h index 1c8fb6c93e50..321ae93625eb 100644 --- a/include/sound/gus.h +++ b/include/sound/gus.h @@ -515,7 +515,6 @@ struct _SND_IW_LFO_PROGRAM { /* gus_mem.c */ -void snd_gf1_mem_lock(struct snd_gf1_mem * alloc, int xup); int snd_gf1_mem_xfree(struct snd_gf1_mem * alloc, struct snd_gf1_mem_block * block); struct snd_gf1_mem_block *snd_gf1_mem_alloc(struct snd_gf1_mem * alloc, int owner, char *name, int size, int w_16, diff --git a/sound/isa/gus/gus_dma.c b/sound/isa/gus/gus_dma.c index bb5a4e2fbfb3..7b2e7878adc7 100644 --- a/sound/isa/gus/gus_dma.c +++ b/sound/isa/gus/gus_dma.c @@ -143,18 +143,15 @@ static void snd_gf1_dma_interrupt(struct snd_gus_card * gus) int snd_gf1_dma_init(struct snd_gus_card * gus) { - mutex_lock(&gus->dma_mutex); + guard(mutex)(&gus->dma_mutex); gus->gf1.dma_shared++; - if (gus->gf1.dma_shared > 1) { - mutex_unlock(&gus->dma_mutex); + if (gus->gf1.dma_shared > 1) return 0; - } gus->gf1.interrupt_handler_dma_write = snd_gf1_dma_interrupt; gus->gf1.dma_data_pcm = gus->gf1.dma_data_pcm_last = gus->gf1.dma_data_synth = gus->gf1.dma_data_synth_last = NULL; - mutex_unlock(&gus->dma_mutex); return 0; } @@ -162,7 +159,7 @@ int snd_gf1_dma_done(struct snd_gus_card * gus) { struct snd_gf1_dma_block *block; - mutex_lock(&gus->dma_mutex); + guard(mutex)(&gus->dma_mutex); gus->gf1.dma_shared--; if (!gus->gf1.dma_shared) { snd_dma_disable(gus->gf1.dma1); @@ -179,7 +176,6 @@ int snd_gf1_dma_done(struct snd_gus_card * gus) gus->gf1.dma_data_pcm_last = gus->gf1.dma_data_synth_last = NULL; } - mutex_unlock(&gus->dma_mutex); return 0; } diff --git a/sound/isa/gus/gus_main.c b/sound/isa/gus/gus_main.c index 873ef4046cd6..a82185c86143 100644 --- a/sound/isa/gus/gus_main.c +++ b/sound/isa/gus/gus_main.c @@ -447,4 +447,3 @@ EXPORT_SYMBOL(snd_gf1_translate_freq); EXPORT_SYMBOL(snd_gf1_mem_alloc); EXPORT_SYMBOL(snd_gf1_mem_xfree); EXPORT_SYMBOL(snd_gf1_mem_free); -EXPORT_SYMBOL(snd_gf1_mem_lock); diff --git a/sound/isa/gus/gus_mem.c b/sound/isa/gus/gus_mem.c index 054058779db6..8d95d8d5abdf 100644 --- a/sound/isa/gus/gus_mem.c +++ b/sound/isa/gus/gus_mem.c @@ -15,15 +15,6 @@ static void snd_gf1_mem_info_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer); #endif -void snd_gf1_mem_lock(struct snd_gf1_mem * alloc, int xup) -{ - if (!xup) { - mutex_lock(&alloc->memory_mutex); - } else { - mutex_unlock(&alloc->memory_mutex); - } -} - static struct snd_gf1_mem_block * snd_gf1_mem_xalloc(struct snd_gf1_mem *alloc, struct snd_gf1_mem_block *block, const char *name) @@ -50,7 +41,6 @@ snd_gf1_mem_xalloc(struct snd_gf1_mem *alloc, struct snd_gf1_mem_block *block, alloc->first = nblock; else nblock->prev->next = nblock; - mutex_unlock(&alloc->memory_mutex); return nblock; } pblock = pblock->next; @@ -71,7 +61,6 @@ int snd_gf1_mem_xfree(struct snd_gf1_mem * alloc, struct snd_gf1_mem_block * blo { if (block->share) { /* ok.. shared block */ block->share--; - mutex_unlock(&alloc->memory_mutex); return 0; } if (alloc->first == block) { @@ -183,7 +172,7 @@ struct snd_gf1_mem_block *snd_gf1_mem_alloc(struct snd_gf1_mem * alloc, int owne { struct snd_gf1_mem_block block, *nblock; - snd_gf1_mem_lock(alloc, 0); + guard(mutex)(&alloc->memory_mutex); if (share_id != NULL) { nblock = snd_gf1_mem_share(alloc, share_id); if (nblock != NULL) { @@ -193,36 +182,27 @@ struct snd_gf1_mem_block *snd_gf1_mem_alloc(struct snd_gf1_mem * alloc, int owne goto __std; } nblock->share++; - snd_gf1_mem_lock(alloc, 1); return NULL; } } __std: - if (snd_gf1_mem_find(alloc, &block, size, w_16, align) < 0) { - snd_gf1_mem_lock(alloc, 1); + if (snd_gf1_mem_find(alloc, &block, size, w_16, align) < 0) return NULL; - } if (share_id != NULL) memcpy(&block.share_id, share_id, sizeof(block.share_id)); block.owner = owner; nblock = snd_gf1_mem_xalloc(alloc, &block, name); - snd_gf1_mem_lock(alloc, 1); return nblock; } int snd_gf1_mem_free(struct snd_gf1_mem * alloc, unsigned int address) { - int result; struct snd_gf1_mem_block *block; - snd_gf1_mem_lock(alloc, 0); + guard(mutex)(&alloc->memory_mutex); block = snd_gf1_mem_look(alloc, address); - if (block) { - result = snd_gf1_mem_xfree(alloc, block); - snd_gf1_mem_lock(alloc, 1); - return result; - } - snd_gf1_mem_lock(alloc, 1); + if (block) + return snd_gf1_mem_xfree(alloc, block); return -EINVAL; } @@ -282,7 +262,7 @@ static void snd_gf1_mem_info_read(struct snd_info_entry *entry, gus = entry->private_data; alloc = &gus->gf1.mem_alloc; - mutex_lock(&alloc->memory_mutex); + guard(mutex)(&alloc->memory_mutex); snd_iprintf(buffer, "8-bit banks : \n "); for (i = 0; i < 4; i++) snd_iprintf(buffer, "0x%06x (%04ik)%s", alloc->banks_8[i].address, alloc->banks_8[i].size >> 10, i + 1 < 4 ? "," : ""); @@ -326,7 +306,6 @@ static void snd_gf1_mem_info_read(struct snd_info_entry *entry, } snd_iprintf(buffer, " Total: memory = %i, used = %i, free = %i\n", total, used, total - used); - mutex_unlock(&alloc->memory_mutex); #if 0 ultra_iprintf(buffer, " Verify: free = %i, max 8-bit block = %i, max 16-bit block = %i\n", ultra_memory_free_size(card, &card->gf1.mem_alloc), -- cgit v1.2.3 From 826f35b829f43dc62fb847eca6f79e8698b4994d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 29 Aug 2025 17:13:19 +0200 Subject: ALSA: synth: Use guard() for preset locks Define a macro for the preset locking/unlocking pairs for soundfont using guard() macro as a further code cleanup. The new macro is put in soundfont.h (and some function renames) along with it for avoiding unnecessary troubles with clang. Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250829151335.7342-6-tiwai@suse.de --- include/sound/soundfont.h | 18 ++++++++++++++ sound/synth/emux/soundfont.c | 56 ++++++++++---------------------------------- 2 files changed, 30 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/sound/soundfont.h b/include/sound/soundfont.h index 8a40cc15f66d..48f8cf6de3ac 100644 --- a/include/sound/soundfont.h +++ b/include/sound/soundfont.h @@ -114,5 +114,23 @@ int snd_sf_calc_parm_decay(int msec); extern int snd_sf_vol_table[128]; int snd_sf_linear_to_log(unsigned int amount, int offset, int ratio); +/* lock access to sflist */ +static inline void snd_soundfont_lock_preset(struct snd_sf_list *sflist) +{ + mutex_lock(&sflist->presets_mutex); + guard(spinlock_irqsave)(&sflist->lock); + sflist->presets_locked = 1; +} + +/* remove lock */ +static inline void snd_soundfont_unlock_preset(struct snd_sf_list *sflist) +{ + guard(spinlock_irqsave)(&sflist->lock); + sflist->presets_locked = 0; + mutex_unlock(&sflist->presets_mutex); +} + +DEFINE_GUARD(snd_soundfont_lock_preset, struct snd_sf_list *, + snd_soundfont_lock_preset(_T), snd_soundfont_unlock_preset(_T)) #endif /* __SOUND_SOUNDFONT_H */ diff --git a/sound/synth/emux/soundfont.c b/sound/synth/emux/soundfont.c index cbff9f7ad38c..59f3b1b6df4a 100644 --- a/sound/synth/emux/soundfont.c +++ b/sound/synth/emux/soundfont.c @@ -60,30 +60,6 @@ static int get_index(int bank, int instr, int key); static void snd_sf_init(struct snd_sf_list *sflist); static void snd_sf_clear(struct snd_sf_list *sflist); -/* - * lock access to sflist - */ -static void -lock_preset(struct snd_sf_list *sflist) -{ - mutex_lock(&sflist->presets_mutex); - guard(spinlock_irqsave)(&sflist->lock); - sflist->presets_locked = 1; -} - - -/* - * remove lock - */ -static void -unlock_preset(struct snd_sf_list *sflist) -{ - guard(spinlock_irqsave)(&sflist->lock); - sflist->presets_locked = 0; - mutex_unlock(&sflist->presets_mutex); -} - - /* * close the patch if the patch was opened by this client. */ @@ -140,10 +116,8 @@ snd_soundfont_load(struct snd_card *card, if (patch.type == SNDRV_SFNT_OPEN_PATCH) { /* grab sflist to open */ - lock_preset(sflist); - rc = open_patch(sflist, data, count, client); - unlock_preset(sflist); - return rc; + guard(snd_soundfont_lock_preset)(sflist); + return open_patch(sflist, data, count, client); } /* check if other client already opened patch */ @@ -152,7 +126,7 @@ snd_soundfont_load(struct snd_card *card, return -EBUSY; } - lock_preset(sflist); + guard(snd_soundfont_lock_preset)(sflist); rc = -EINVAL; switch (patch.type) { case SNDRV_SFNT_LOAD_INFO: @@ -190,7 +164,6 @@ snd_soundfont_load(struct snd_card *card, } break; } - unlock_preset(sflist); return rc; } @@ -1153,11 +1126,8 @@ snd_soundfont_load_guspatch(struct snd_card *card, struct snd_sf_list *sflist, const char __user *data, long count) { - int rc; - lock_preset(sflist); - rc = load_guspatch(card, sflist, data, count); - unlock_preset(sflist); - return rc; + guard(snd_soundfont_lock_preset)(sflist); + return load_guspatch(card, sflist, data, count); } @@ -1446,11 +1416,11 @@ snd_sf_free(struct snd_sf_list *sflist) if (sflist == NULL) return; - lock_preset(sflist); - if (sflist->callback.sample_reset) - sflist->callback.sample_reset(sflist->callback.private_data); - snd_sf_clear(sflist); - unlock_preset(sflist); + scoped_guard(snd_soundfont_lock_preset, sflist) { + if (sflist->callback.sample_reset) + sflist->callback.sample_reset(sflist->callback.private_data); + snd_sf_clear(sflist); + } kfree(sflist); } @@ -1462,11 +1432,10 @@ snd_sf_free(struct snd_sf_list *sflist) int snd_soundfont_remove_samples(struct snd_sf_list *sflist) { - lock_preset(sflist); + guard(snd_soundfont_lock_preset)(sflist); if (sflist->callback.sample_reset) sflist->callback.sample_reset(sflist->callback.private_data); snd_sf_clear(sflist); - unlock_preset(sflist); return 0; } @@ -1482,7 +1451,7 @@ snd_soundfont_remove_unlocked(struct snd_sf_list *sflist) struct snd_sf_zone *zp, *nextzp; struct snd_sf_sample *sp, *nextsp; - lock_preset(sflist); + guard(snd_soundfont_lock_preset)(sflist); if (sflist->callback.sample_reset) sflist->callback.sample_reset(sflist->callback.private_data); @@ -1516,6 +1485,5 @@ snd_soundfont_remove_unlocked(struct snd_sf_list *sflist) rebuild_presets(sflist); - unlock_preset(sflist); return 0; } -- cgit v1.2.3 From bbf7a84787d0dc5910d121b025dd5f4dea060768 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 26 Aug 2025 06:22:05 +0000 Subject: ASoC: soc-dapm: rename snd_soc_dapm_kcontrol_widget() to snd_soc_dapm_kcontrol_to_widget() snd_soc_dapm_kcontrol_widget() is unclear naming, rename it to snd_soc_dapm_kcontrol_to_widget(). This is prepare for dapm cleanup. This patch keeps compatible by using define, but old name will be replaced on each drivers and removed from ASoC in the future. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/875xeay54j.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 3 ++- sound/soc/soc-dapm.c | 7 +++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 2e9196b6ffba..220eb6d08534 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -719,7 +719,7 @@ int snd_soc_dapm_dai_get_connected_widgets(struct snd_soc_dai *dai, int stream, void snd_soc_dapm_dai_free_widgets(struct snd_soc_dapm_widget_list **list); struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_dapm(struct snd_kcontrol *kcontrol); -struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_widget(struct snd_kcontrol *kcontrol); +struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_to_widget(struct snd_kcontrol *kcontrol); int snd_soc_dapm_force_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level); enum snd_soc_bias_level snd_soc_dapm_get_bias_level(struct snd_soc_dapm_context *dapm); @@ -729,6 +729,7 @@ void snd_soc_dapm_init_bias_level(struct snd_soc_dapm_context *dapm, enum snd_so #define snd_soc_component_force_bias_level(c, l) snd_soc_dapm_force_bias_level(&(c)->dapm, l) #define snd_soc_component_get_bias_level(c) snd_soc_dapm_get_bias_level(&(c)->dapm) #define snd_soc_component_init_bias_level(c, l) snd_soc_dapm_init_bias_level(&(c)->dapm, l) +#define snd_soc_dapm_kcontrol_widget snd_soc_dapm_kcontrol_to_widget #define for_each_dapm_widgets(list, i, widget) \ for ((i) = 0; \ diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 6782a0d6cd47..bfb2edc2f91d 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -877,16 +877,15 @@ static bool dapm_kcontrol_set_value(const struct snd_kcontrol *kcontrol, } /** - * snd_soc_dapm_kcontrol_widget() - Returns the widget associated to a + * snd_soc_dapm_kcontrol_to_widget() - Returns the widget associated to a * kcontrol * @kcontrol: The kcontrol */ -struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_widget( - struct snd_kcontrol *kcontrol) +struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_to_widget(struct snd_kcontrol *kcontrol) { return dapm_kcontrol_get_wlist(kcontrol)->widgets[0]; } -EXPORT_SYMBOL_GPL(snd_soc_dapm_kcontrol_widget); +EXPORT_SYMBOL_GPL(snd_soc_dapm_kcontrol_to_widget); /** * snd_soc_dapm_kcontrol_dapm() - Returns the dapm context associated to a -- cgit v1.2.3 From 2532041865305594e37c4c22bd650d52ea805ec8 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 26 Aug 2025 06:22:12 +0000 Subject: ASoC: soc-dapm: rename snd_soc_dapm_kcontrol_dapm() to snd_soc_dapm_kcontrol_to_dapm() snd_soc_dapm_kcontrol_dapm() is unclear naming, rename it to snd_soc_dapm_kcontrol_to_dapm(). This is prepare for dapm cleanup. This patch keeps compatible by using define, but old name will be replaced on each drivers and removed from ASoC in the future. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/874ituy54c.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 3 ++- sound/soc/soc-dapm.c | 8 +++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 220eb6d08534..8add8de7a8c9 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -718,7 +718,7 @@ int snd_soc_dapm_dai_get_connected_widgets(struct snd_soc_dai *dai, int stream, bool (*custom_stop_condition)(struct snd_soc_dapm_widget *, enum snd_soc_dapm_direction)); void snd_soc_dapm_dai_free_widgets(struct snd_soc_dapm_widget_list **list); -struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_dapm(struct snd_kcontrol *kcontrol); +struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_to_dapm(struct snd_kcontrol *kcontrol); struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_to_widget(struct snd_kcontrol *kcontrol); int snd_soc_dapm_force_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level); @@ -730,6 +730,7 @@ void snd_soc_dapm_init_bias_level(struct snd_soc_dapm_context *dapm, enum snd_so #define snd_soc_component_get_bias_level(c) snd_soc_dapm_get_bias_level(&(c)->dapm) #define snd_soc_component_init_bias_level(c, l) snd_soc_dapm_init_bias_level(&(c)->dapm, l) #define snd_soc_dapm_kcontrol_widget snd_soc_dapm_kcontrol_to_widget +#define snd_soc_dapm_kcontrol_dapm snd_soc_dapm_kcontrol_to_dapm #define for_each_dapm_widgets(list, i, widget) \ for ((i) = 0; \ diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index bfb2edc2f91d..25b2ea14d659 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -888,19 +888,17 @@ struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_to_widget(struct snd_kcontrol EXPORT_SYMBOL_GPL(snd_soc_dapm_kcontrol_to_widget); /** - * snd_soc_dapm_kcontrol_dapm() - Returns the dapm context associated to a - * kcontrol + * snd_soc_dapm_kcontrol_to_dapm() - Returns the dapm context associated to a kcontrol * @kcontrol: The kcontrol * * Note: This function must only be used on kcontrols that are known to have * been registered for a CODEC. Otherwise the behaviour is undefined. */ -struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_dapm( - struct snd_kcontrol *kcontrol) +struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_to_dapm(struct snd_kcontrol *kcontrol) { return dapm_kcontrol_get_wlist(kcontrol)->widgets[0]->dapm; } -EXPORT_SYMBOL_GPL(snd_soc_dapm_kcontrol_dapm); +EXPORT_SYMBOL_GPL(snd_soc_dapm_kcontrol_to_dapm); static void dapm_reset(struct snd_soc_card *card) { -- cgit v1.2.3 From f6883f0f03575ecc8c4c5d2a04339bac91eb33d7 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 26 Aug 2025 06:22:19 +0000 Subject: ASoC: soc-dapm: rename dapm_kcontrol_get_value() to snd_soc_dapm_kcontrol_get_value() dapm_kcontrol_get_value() is global function, adds snd_soc_ prefix This patch keeps compatible by using define, but old name will be replaced on each drivers and removed from ASoC in the future. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87349ey546.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 3 ++- sound/soc/soc-dapm.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 8add8de7a8c9..cd02fedb2624 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -699,7 +699,6 @@ int snd_soc_dapm_sync_unlocked(struct snd_soc_dapm_context *dapm); int snd_soc_dapm_force_enable_pin(struct snd_soc_dapm_context *dapm, const char *pin); int snd_soc_dapm_force_enable_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin); int snd_soc_dapm_ignore_suspend(struct snd_soc_dapm_context *dapm, const char *pin); -unsigned int dapm_kcontrol_get_value(const struct snd_kcontrol *kcontrol); void snd_soc_dapm_mark_endpoints_dirty(struct snd_soc_card *card); /* @@ -720,6 +719,7 @@ void snd_soc_dapm_dai_free_widgets(struct snd_soc_dapm_widget_list **list); struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_to_dapm(struct snd_kcontrol *kcontrol); struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_to_widget(struct snd_kcontrol *kcontrol); +unsigned int snd_soc_dapm_kcontrol_get_value(const struct snd_kcontrol *kcontrol); int snd_soc_dapm_force_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level); enum snd_soc_bias_level snd_soc_dapm_get_bias_level(struct snd_soc_dapm_context *dapm); @@ -731,6 +731,7 @@ void snd_soc_dapm_init_bias_level(struct snd_soc_dapm_context *dapm, enum snd_so #define snd_soc_component_init_bias_level(c, l) snd_soc_dapm_init_bias_level(&(c)->dapm, l) #define snd_soc_dapm_kcontrol_widget snd_soc_dapm_kcontrol_to_widget #define snd_soc_dapm_kcontrol_dapm snd_soc_dapm_kcontrol_to_dapm +#define dapm_kcontrol_get_value snd_soc_dapm_kcontrol_get_value #define for_each_dapm_widgets(list, i, widget) \ for ((i) = 0; \ diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 25b2ea14d659..f8ecf86a3d2b 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -838,13 +838,13 @@ static struct list_head *dapm_kcontrol_get_path_list( list_for_each_entry(path, dapm_kcontrol_get_path_list(kcontrol), \ list_kcontrol) -unsigned int dapm_kcontrol_get_value(const struct snd_kcontrol *kcontrol) +unsigned int snd_soc_dapm_kcontrol_get_value(const struct snd_kcontrol *kcontrol) { struct dapm_kcontrol_data *data = snd_kcontrol_chip(kcontrol); return data->value; } -EXPORT_SYMBOL_GPL(dapm_kcontrol_get_value); +EXPORT_SYMBOL_GPL(snd_soc_dapm_kcontrol_get_value); static bool dapm_kcontrol_set_value(const struct snd_kcontrol *kcontrol, unsigned int value) -- cgit v1.2.3 From 8a9772ec08f87c9e45ab1ad2c8d2b8c1763836eb Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 26 Aug 2025 06:22:26 +0000 Subject: ASoC: soc-dapm: rename snd_soc_kcontrol_component() to snd_soc_kcontrol_to_component() We have 2 similar functions, both converts date from snd_kcontrol to snd_soc_component. (A) snd_soc_kcontrol_component() (B) snd_soc_dapm_kcontrol_component() (A) is just wrapper for snd_kcontrol_chip(). (B) is for more complex conversion. Having similar functions is confusable. So (A) will be replaced to original snd_kcontrol_chip(). (B) will be stay, but the function name should be xx_to_xx(). And it is defined at soc-component.h. It should be implemented at soc-dapm.c. This patch renames it to snd_soc_dapm_kcontrol_to_component(), and move to soc-dapm.c. This patch keeps compatible by using define, but old name will be replaced on each drivers and removed from ASoC in the future. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/871poyy53x.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 14 -------------- include/sound/soc-dapm.h | 2 ++ sound/soc/soc-dapm.c | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 54bfa0cb1085..48e45cbe82e5 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -286,20 +286,6 @@ static inline struct snd_soc_dapm_context *snd_soc_component_get_dapm( return &component->dapm; } -/** - * snd_soc_dapm_kcontrol_component() - Returns the component associated to a - * kcontrol - * @kcontrol: The kcontrol - * - * This function must only be used on DAPM contexts that are known to be part of - * a COMPONENT (e.g. in a COMPONENT driver). Otherwise the behavior is undefined - */ -static inline struct snd_soc_component *snd_soc_dapm_kcontrol_component( - struct snd_kcontrol *kcontrol) -{ - return snd_soc_dapm_to_component(snd_soc_dapm_kcontrol_dapm(kcontrol)); -} - /** * snd_soc_component_cache_sync() - Sync the register cache with the hardware * @component: COMPONENT to sync diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index cd02fedb2624..ed39458b94bf 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -719,6 +719,7 @@ void snd_soc_dapm_dai_free_widgets(struct snd_soc_dapm_widget_list **list); struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_to_dapm(struct snd_kcontrol *kcontrol); struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_to_widget(struct snd_kcontrol *kcontrol); +struct snd_soc_component *snd_soc_dapm_kcontrol_to_component(struct snd_kcontrol *kcontrol); unsigned int snd_soc_dapm_kcontrol_get_value(const struct snd_kcontrol *kcontrol); int snd_soc_dapm_force_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level); @@ -732,6 +733,7 @@ void snd_soc_dapm_init_bias_level(struct snd_soc_dapm_context *dapm, enum snd_so #define snd_soc_dapm_kcontrol_widget snd_soc_dapm_kcontrol_to_widget #define snd_soc_dapm_kcontrol_dapm snd_soc_dapm_kcontrol_to_dapm #define dapm_kcontrol_get_value snd_soc_dapm_kcontrol_get_value +#define snd_soc_dapm_kcontrol_component snd_soc_dapm_kcontrol_to_component #define for_each_dapm_widgets(list, i, widget) \ for ((i) = 0; \ diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index f8ecf86a3d2b..d74c096cc208 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -900,6 +900,20 @@ struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_to_dapm(struct snd_kcontrol * } EXPORT_SYMBOL_GPL(snd_soc_dapm_kcontrol_to_dapm); +/** + * snd_soc_dapm_kcontrol_to_component() - Returns the component associated to a + * kcontrol + * @kcontrol: The kcontrol + * + * This function must only be used on DAPM contexts that are known to be part of + * a COMPONENT (e.g. in a COMPONENT driver). Otherwise the behavior is undefined + */ +struct snd_soc_component *snd_soc_dapm_kcontrol_to_component(struct snd_kcontrol *kcontrol) +{ + return snd_soc_dapm_to_component(snd_soc_dapm_kcontrol_to_dapm(kcontrol)); +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_kcontrol_to_component); + static void dapm_reset(struct snd_soc_card *card) { struct snd_soc_dapm_widget *w; -- cgit v1.2.3 From edd3cb05c00a040dc72bed20b14b5ba865188bce Mon Sep 17 00:00:00 2001 From: Simon Schuster Date: Mon, 1 Sep 2025 15:09:51 +0200 Subject: copy_process: pass clone_flags as u64 across calltree With the introduction of clone3 in commit 7f192e3cd316 ("fork: add clone3") the effective bit width of clone_flags on all architectures was increased from 32-bit to 64-bit, with a new type of u64 for the flags. However, for most consumers of clone_flags the interface was not changed from the previous type of unsigned long. While this works fine as long as none of the new 64-bit flag bits (CLONE_CLEAR_SIGHAND and CLONE_INTO_CGROUP) are evaluated, this is still undesirable in terms of the principle of least surprise. Thus, this commit fixes all relevant interfaces of callees to sys_clone3/copy_process (excluding the architecture-specific copy_thread) to consistently pass clone_flags as u64, so that no truncation to 32-bit integers occurs on 32-bit architectures. Signed-off-by: Simon Schuster Link: https://lore.kernel.org/20250901-nios2-implement-clone3-v2-2-53fcf5577d57@siemens-energy.com Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Signed-off-by: Christian Brauner --- block/blk-ioc.c | 2 +- fs/namespace.c | 2 +- include/linux/cgroup.h | 4 ++-- include/linux/cred.h | 2 +- include/linux/iocontext.h | 6 +++--- include/linux/ipc_namespace.h | 4 ++-- include/linux/lsm_hook_defs.h | 2 +- include/linux/mnt_namespace.h | 2 +- include/linux/nsproxy.h | 2 +- include/linux/pid_namespace.h | 4 ++-- include/linux/rseq.h | 4 ++-- include/linux/sched/task.h | 2 +- include/linux/security.h | 4 ++-- include/linux/sem.h | 4 ++-- include/linux/time_namespace.h | 4 ++-- include/linux/uprobes.h | 4 ++-- include/linux/user_events.h | 4 ++-- include/linux/utsname.h | 4 ++-- include/net/net_namespace.h | 4 ++-- include/trace/events/task.h | 6 +++--- ipc/namespace.c | 2 +- ipc/sem.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/cred.c | 2 +- kernel/events/uprobes.c | 2 +- kernel/fork.c | 8 ++++---- kernel/nsproxy.c | 4 ++-- kernel/pid_namespace.c | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 ++-- kernel/time/namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- security/apparmor/lsm.c | 2 +- security/security.c | 2 +- security/selinux/hooks.c | 2 +- security/tomoyo/tomoyo.c | 2 +- 38 files changed, 59 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9fda3906e5f5..d15918d7fabb 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -286,7 +286,7 @@ out: } EXPORT_SYMBOL_GPL(set_task_ioprio); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d338..d9c190ffa7df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4200,7 +4200,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } __latent_entropy -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, +struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..56d9556a181a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -796,7 +796,7 @@ extern struct cgroup_namespace init_cgroup_ns; void free_cgroup_ns(struct cgroup_namespace *ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); @@ -818,7 +818,7 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns) static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, +copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; diff --git a/include/linux/cred.h b/include/linux/cred.h index a102a10f833f..89ae50ad2ace 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -148,7 +148,7 @@ struct cred { extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); -extern int copy_creds(struct task_struct *, unsigned long); +extern int copy_creds(struct task_struct *, u64); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 14f7eaf1b443..079d8773790c 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -118,8 +118,8 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk); -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk); +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { if (!current->io_context) return 0; @@ -129,7 +129,7 @@ static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611a..4b399893e2b3 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,7 +129,7 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) -extern struct ipc_namespace *copy_ipcs(unsigned long flags, +extern struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) @@ -151,7 +151,7 @@ static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns extern void put_ipc_ns(struct ipc_namespace *ns); #else -static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +static inline struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index fd11fffdd3c3..adbe234a6f6c 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -211,7 +211,7 @@ LSM_HOOK(int, 0, file_open, struct file *file) LSM_HOOK(int, 0, file_post_open, struct file *file, int mask) LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 70b366b64816..ff290c87b2e7 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,7 +11,7 @@ struct fs_struct; struct user_namespace; struct ns_common; -extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, +extern struct mnt_namespace *copy_mnt_ns(u64, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T)) diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a22..82533e899ff4 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -103,7 +103,7 @@ static inline struct cred *nsset_cred(struct nsset *set) * */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk); +int copy_namespaces(u64 flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..0620a3e08e83 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -78,7 +78,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) } #endif -extern struct pid_namespace *copy_pid_ns(unsigned long flags, +extern struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); @@ -97,7 +97,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) return 0; } -static inline struct pid_namespace *copy_pid_ns(unsigned long flags, +static inline struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bc8af3eb5598..a96fd345aa38 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -65,7 +65,7 @@ static inline void rseq_migrate(struct task_struct *t) * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; @@ -107,7 +107,7 @@ static inline void rseq_preempt(struct task_struct *t) static inline void rseq_migrate(struct task_struct *t) { } -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ea41795a352b..34d6a0e108c3 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,7 @@ extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); -extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_fork(u64 clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b9717..9a1d4a6c8673 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -489,7 +489,7 @@ int security_file_receive(struct file *file); int security_file_open(struct file *file); int security_file_post_open(struct file *file, int mask); int security_file_truncate(struct file *file); -int security_task_alloc(struct task_struct *task, unsigned long clone_flags); +int security_task_alloc(struct task_struct *task, u64 clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); @@ -1215,7 +1215,7 @@ static inline int security_file_truncate(struct file *file) } static inline int security_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { return 0; } diff --git a/include/linux/sem.h b/include/linux/sem.h index c4deefe42aeb..275269ce2ec8 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -9,12 +9,12 @@ struct task_struct; #ifdef CONFIG_SYSVIPC -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern int copy_semundo(u64 clone_flags, struct task_struct *tsk); extern void exit_sem(struct task_struct *tsk); #else -static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_semundo(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc94..b6e36525e0be 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -43,7 +43,7 @@ static inline struct time_namespace *get_time_ns(struct time_namespace *ns) return ns; } -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); @@ -129,7 +129,7 @@ static inline void put_time_ns(struct time_namespace *ns) } static inline -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 516217c39094..915303a82d84 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -205,7 +205,7 @@ extern void uprobe_start_dup_mmap(void); extern void uprobe_end_dup_mmap(void); extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); -extern void uprobe_copy_process(struct task_struct *t, unsigned long flags); +extern void uprobe_copy_process(struct task_struct *t, u64 flags); extern int uprobe_post_sstep_notifier(struct pt_regs *regs); extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); @@ -281,7 +281,7 @@ static inline bool uprobe_deny_signal(void) static inline void uprobe_free_utask(struct task_struct *t) { } -static inline void uprobe_copy_process(struct task_struct *t, unsigned long flags) +static inline void uprobe_copy_process(struct task_struct *t, u64 flags) { } static inline void uprobe_clear_state(struct mm_struct *mm) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 8afa8c3a0973..57d1ff006090 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -33,7 +33,7 @@ extern void user_event_mm_dup(struct task_struct *t, extern void user_event_mm_remove(struct task_struct *t); static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { struct user_event_mm *old_mm; @@ -68,7 +68,7 @@ static inline void user_events_exit(struct task_struct *t) } #else static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { } diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412b..ba34ec0e2f95 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -35,7 +35,7 @@ static inline void get_uts_ns(struct uts_namespace *ns) refcount_inc(&ns->ns.count); } -extern struct uts_namespace *copy_utsname(unsigned long flags, +extern struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns); extern void free_uts_ns(struct uts_namespace *ns); @@ -55,7 +55,7 @@ static inline void put_uts_ns(struct uts_namespace *ns) { } -static inline struct uts_namespace *copy_utsname(unsigned long flags, +static inline struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { if (flags & CLONE_NEWUTS) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275..0e008cfe159d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -204,7 +204,7 @@ struct net { extern struct net init_net; #ifdef CONFIG_NET_NS -struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); @@ -218,7 +218,7 @@ extern struct task_struct *cleanup_net_task; #else /* CONFIG_NET_NS */ #include #include -static inline struct net *copy_net_ns(unsigned long flags, +static inline struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) diff --git a/include/trace/events/task.h b/include/trace/events/task.h index af535b053033..4f0759634306 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -8,14 +8,14 @@ TRACE_EVENT(task_newtask, - TP_PROTO(struct task_struct *task, unsigned long clone_flags), + TP_PROTO(struct task_struct *task, u64 clone_flags), TP_ARGS(task, clone_flags), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN) - __field( unsigned long, clone_flags) + __field( u64, clone_flags) __field( short, oom_score_adj) ), @@ -26,7 +26,7 @@ TRACE_EVENT(task_newtask, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd", + TP_printk("pid=%d comm=%s clone_flags=%llx oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->clone_flags, __entry->oom_score_adj) ); diff --git a/ipc/namespace.c b/ipc/namespace.c index 4df91ceeeafe..a712ec27209c 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -106,7 +106,7 @@ fail: return ERR_PTR(err); } -struct ipc_namespace *copy_ipcs(unsigned long flags, +struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) diff --git a/ipc/sem.c b/ipc/sem.c index a39cdc7bf88f..0f06e4bd4673 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2303,7 +2303,7 @@ SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, * parent and child tasks. */ -int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +int copy_semundo(u64 clone_flags, struct task_struct *tsk) { struct sem_undo_list *undo_list; int error; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c6..dedadb525880 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -47,7 +47,7 @@ void free_cgroup_ns(struct cgroup_namespace *ns) } EXPORT_SYMBOL(free_cgroup_ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { diff --git a/kernel/cred.c b/kernel/cred.c index 9676965c0981..dbf6b687dc5c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -287,7 +287,7 @@ struct cred *prepare_exec_creds(void) * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ -int copy_creds(struct task_struct *p, unsigned long clone_flags) +int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd..b2753014c6dd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2160,7 +2160,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. */ -void uprobe_copy_process(struct task_struct *t, unsigned long flags) +void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; diff --git a/kernel/fork.c b/kernel/fork.c index 4e2c5a3e8989..d6e1fb11eff9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1507,7 +1507,7 @@ fail_nomem: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) +static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; @@ -1545,7 +1545,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { @@ -1566,7 +1566,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk, +static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; @@ -1645,7 +1645,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) posix_cputimers_group_init(pct, cpu_limit); } -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) +static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38..8af3b9ec3aa8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -64,7 +64,7 @@ static inline struct nsproxy *create_nsproxy(void) * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(unsigned long flags, +static struct nsproxy *create_new_namespaces(u64 flags, struct task_struct *tsk, struct user_namespace *user_ns, struct fs_struct *new_fs) { @@ -144,7 +144,7 @@ out_ns: * called from clone. This now handles copy for nsproxy and all * namespaces therein. */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) +int copy_namespaces(u64 flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..06bc7c7f78e0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -171,7 +171,7 @@ static void destroy_pid_namespace_work(struct work_struct *work) } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); } -struct pid_namespace *copy_pid_ns(unsigned long flags, +struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..6fa85d30d965 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4472,7 +4472,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * __sched_fork() is basic setup which is also used by sched_init() to * initialize the boot CPU's idle task. */ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +static void __sched_fork(u64 clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -4707,7 +4707,7 @@ late_initcall(sched_core_sysctl_init); /* * fork()/clone()-time setup: */ -int sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(u64 clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..af0866ce2dfc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3542,7 +3542,7 @@ out: } } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +void init_numa_balancing(u64 clone_flags, struct task_struct *p) { int mm_users = 0; struct mm_struct *mm = p->mm; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f7..f9adfc912ddc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1935,12 +1935,12 @@ extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); -extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +extern void init_numa_balancing(u64 clone_flags, struct task_struct *p); #else /* !CONFIG_NUMA_BALANCING: */ static inline void -init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +init_numa_balancing(u64 clone_flags, struct task_struct *p) { } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 667452768ed3..888872bcc5bb 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -130,7 +130,7 @@ fail: * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f2..00d8d7922f86 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -86,7 +86,7 @@ fail: * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, +struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1b6f3826dd0e..8ec9d83475bf 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -539,7 +539,7 @@ void net_drop_ns(void *p) net_passive_dec(net); } -struct net *copy_net_ns(unsigned long flags, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b..ba39cfe0cd08 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -112,7 +112,7 @@ static void apparmor_task_free(struct task_struct *task) } static int apparmor_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct aa_task_ctx *new = task_ctx(task); diff --git a/security/security.c b/security/security.c index ad163f06bf7a..a769140553bc 100644 --- a/security/security.c +++ b/security/security.c @@ -3185,7 +3185,7 @@ int security_file_truncate(struct file *file) * * Return: Returns a zero on success, negative values on failure. */ -int security_task_alloc(struct task_struct *task, unsigned long clone_flags) +int security_task_alloc(struct task_struct *task, u64 clone_flags) { int rc = lsm_task_alloc(task); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..bb016dd511c1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4144,7 +4144,7 @@ static int selinux_file_open(struct file *file) /* task security operations */ static int selinux_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { u32 sid = current_sid(); diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index d6ebcd9db80a..48fc59d38ab2 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -514,7 +514,7 @@ struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { * Returns 0. */ static int tomoyo_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct tomoyo_task *old = tomoyo_task(current); struct tomoyo_task *new = tomoyo_task(task); -- cgit v1.2.3 From 7051b54fb5aa2d0b77657aef7c272471b36c0327 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 29 Aug 2025 21:56:38 +0000 Subject: tcp: Remove sk->sk_prot->orphan_count. TCP tracks the number of orphaned (SOCK_DEAD but not yet destructed) sockets in tcp_orphan_count. In some code that was shared with DCCP, tcp_orphan_count is referenced via sk->sk_prot->orphan_count. Let's reference tcp_orphan_count directly. inet_csk_prepare_for_destroy_sock() is moved to inet_connection_sock.c due to header dependency. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250829215641.711664-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 4 ++-- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h | 1 - include/net/inet_connection_sock.h | 8 +------- include/net/sock.h | 2 -- include/net/tcp.h | 10 ++++++++++ net/ipv4/inet_connection_sock.c | 11 +++++++++-- net/ipv4/inet_hashtables.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_ipv4.c | 1 - net/ipv6/tcp_ipv6.c | 1 - 10 files changed, 24 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 000116e47e38..4ee970f3bad6 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -505,7 +505,7 @@ static void reset_listen_child(struct sock *child) chtls_send_reset(child, CPL_ABORT_SEND_RST, skb); sock_orphan(child); - INC_ORPHAN_COUNT(child); + tcp_orphan_count_inc(); if (child->sk_state == TCP_CLOSE) inet_csk_destroy_sock(child); } @@ -870,7 +870,7 @@ static void do_abort_syn_rcv(struct sock *child, struct sock *parent) * created only after 3 way handshake is done. */ sock_orphan(child); - INC_ORPHAN_COUNT(child); + tcp_orphan_count_inc(); chtls_release_resources(child); chtls_conn_done(child); } else { diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h index 667effc2a23c..29ceff5a5fcb 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h @@ -95,7 +95,6 @@ struct deferred_skb_cb { #define WSCALE_OK(tp) ((tp)->rx_opt.wscale_ok) #define TSTAMP_OK(tp) ((tp)->rx_opt.tstamp_ok) #define SACK_OK(tp) ((tp)->rx_opt.sack_ok) -#define INC_ORPHAN_COUNT(sk) this_cpu_inc(*(sk)->sk_prot->orphan_count) /* TLS SKB */ #define skb_ulp_tls_inline(skb) (ULP_SKB_CB(skb)->ulp.tls.ofld) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 1735db332aab..0737d8e178dd 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -299,14 +299,8 @@ reqsk_timeout(struct request_sock *req, unsigned long max_timeout) return (unsigned long)min_t(u64, timeout, max_timeout); } -static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) -{ - /* The below has to be done to allow calling inet_csk_destroy_sock */ - sock_set_flag(sk, SOCK_DEAD); - this_cpu_inc(*sk->sk_prot->orphan_count); -} - void inet_csk_destroy_sock(struct sock *sk); +void inet_csk_prepare_for_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); /* diff --git a/include/net/sock.h b/include/net/sock.h index 73cd3316e288..1e7f124871d2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1353,8 +1353,6 @@ struct proto { unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ - unsigned int __percpu *orphan_count; - struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; diff --git a/include/net/tcp.h b/include/net/tcp.h index 16dc9cebb9d2..0fb7923b8367 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -54,6 +54,16 @@ extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); +static inline void tcp_orphan_count_inc(void) +{ + this_cpu_inc(tcp_orphan_count); +} + +static inline void tcp_orphan_count_dec(void) +{ + this_cpu_dec(tcp_orphan_count); +} + DECLARE_PER_CPU(u32, tcp_tw_isn); void tcp_time_wait(struct sock *sk, int state, int timeo); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0ef1eacd539d..142ff8d86fc2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1296,12 +1296,19 @@ void inet_csk_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); - this_cpu_dec(*sk->sk_prot->orphan_count); + tcp_orphan_count_dec(); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); +void inet_csk_prepare_for_destroy_sock(struct sock *sk) +{ + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + tcp_orphan_count_inc(); +} + /* This function allows to force a closure of a socket after the call to * tcp_create_openreq_child(). */ @@ -1369,7 +1376,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, sock_orphan(child); - this_cpu_inc(*sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4bc2b1921d2b..ef4ccfd46ff6 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -707,7 +707,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { - this_cpu_inc(*sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9bc8317e92b7..40b774b4f587 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3195,7 +3195,7 @@ adjudge_to_death: /* remove backlog if any, without releasing ownership. */ __release_sock(sk); - this_cpu_inc(tcp_orphan_count); + tcp_orphan_count_inc(); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7c1d612afca1..1e58a8a9ff7a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3517,7 +3517,6 @@ struct proto tcp_prot = { .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, - .orphan_count = &tcp_orphan_count, .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b4e56b877273..07ba32156770 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2353,7 +2353,6 @@ struct proto tcpv6_prot = { .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, - .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), -- cgit v1.2.3 From 10343e7e6c7c6558217b56fb44a538ad04752adb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Aug 2025 15:30:52 +0000 Subject: inet: ping: remove ping_hash() There is no point in keeping ping_hash(). Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Yue Haibing Link: https://patch.msgid.link/20250829153054.474201-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ping.h | 1 - net/ipv4/ping.c | 10 ---------- net/ipv6/ping.c | 1 - 3 files changed, 12 deletions(-) (limited to 'include') diff --git a/include/net/ping.h b/include/net/ping.h index bc7779262e60..9634b8800814 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -54,7 +54,6 @@ struct pingfakehdr { }; int ping_get_port(struct sock *sk, unsigned short ident); -int ping_hash(struct sock *sk); void ping_unhash(struct sock *sk); int ping_init_sock(struct sock *sk); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 74a0beddfcc4..75e1b0f5c697 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -67,7 +67,6 @@ static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) pr_debug("hash(%u) = %u\n", num, res); return res; } -EXPORT_SYMBOL_GPL(ping_hash); static inline struct hlist_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) @@ -144,14 +143,6 @@ fail: } EXPORT_SYMBOL_GPL(ping_get_port); -int ping_hash(struct sock *sk) -{ - pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); - BUG(); /* "Please do not press this button again." */ - - return 0; -} - void ping_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); @@ -1008,7 +999,6 @@ struct proto ping_prot = { .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 82b0492923d4..d7a2cdaa2631 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -208,7 +208,6 @@ struct proto pingv6_prot = { .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, -- cgit v1.2.3 From 689adb36bd433b24390080606a07d664cca2982e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Aug 2025 15:30:53 +0000 Subject: inet: ping: make ping_port_rover per netns Provide isolation between netns for ping idents. Randomize initial ping_port_rover value at netns creation. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250829153054.474201-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + net/ipv4/ping.c | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 6373e3f17da8..54a7d187f62a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -251,6 +251,7 @@ struct netns_ipv4 { int sysctl_igmp_qrv; struct ping_group_range ping_group_range; + u16 ping_port_rover; atomic_t dev_addr_genid; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 75e1b0f5c697..98ccd4f9ed65 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -58,8 +58,6 @@ static struct ping_table ping_table; struct pingv6_ops pingv6_ops; EXPORT_SYMBOL_GPL(pingv6_ops); -static u16 ping_port_rover; - static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { u32 res = (num + net_hash_mix(net)) & mask; @@ -84,12 +82,12 @@ int ping_get_port(struct sock *sk, unsigned short ident) isk = inet_sk(sk); spin_lock(&ping_table.lock); if (ident == 0) { + u16 result = net->ipv4.ping_port_rover + 1; u32 i; - u16 result = ping_port_rover + 1; for (i = 0; i < (1L << 16); i++, result++) { if (!result) - result++; /* avoid zero */ + continue; /* avoid zero */ hlist = ping_hashslot(&ping_table, net, result); sk_for_each(sk2, hlist) { if (!net_eq(sock_net(sk2), net)) @@ -101,7 +99,7 @@ int ping_get_port(struct sock *sk, unsigned short ident) } /* found */ - ping_port_rover = ident = result; + net->ipv4.ping_port_rover = ident = result; break; next_port: ; @@ -1146,6 +1144,8 @@ static int __net_init ping_v4_proc_init_net(struct net *net) if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; + + net->ipv4.ping_port_rover = get_random_u16(); return 0; } -- cgit v1.2.3 From d9a3e9929452780df16f3414f0d59b5f69d058cf Mon Sep 17 00:00:00 2001 From: Thomas Andreatta Date: Wed, 27 Aug 2025 17:24:43 +0200 Subject: dmaengine: sh: setup_xref error handling This patch modifies the type of setup_xref from void to int and handles errors since the function can fail. `setup_xref` now returns the (eventual) error from `dmae_set_dmars`|`dmae_set_chcr`, while `shdma_tx_submit` handles the result, removing the chunks from the queue and marking PM as idle in case of an error. Signed-off-by: Thomas Andreatta Link: https://lore.kernel.org/r/20250827152442.90962-1-thomas.andreatta2000@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/sh/shdma-base.c | 25 +++++++++++++++++++------ drivers/dma/sh/shdmac.c | 17 +++++++++++++---- include/linux/shdma-base.h | 2 +- 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/dma/sh/shdma-base.c b/drivers/dma/sh/shdma-base.c index 6b4fce453c85..834741adadaa 100644 --- a/drivers/dma/sh/shdma-base.c +++ b/drivers/dma/sh/shdma-base.c @@ -129,12 +129,25 @@ static dma_cookie_t shdma_tx_submit(struct dma_async_tx_descriptor *tx) const struct shdma_ops *ops = sdev->ops; dev_dbg(schan->dev, "Bring up channel %d\n", schan->id); - /* - * TODO: .xfer_setup() might fail on some platforms. - * Make it int then, on error remove chunks from the - * queue again - */ - ops->setup_xfer(schan, schan->slave_id); + + ret = ops->setup_xfer(schan, schan->slave_id); + if (ret < 0) { + dev_err(schan->dev, "setup_xfer failed: %d\n", ret); + + /* Remove chunks from the queue and mark them as idle */ + list_for_each_entry_safe(chunk, c, &schan->ld_queue, node) { + if (chunk->cookie == cookie) { + chunk->mark = DESC_IDLE; + list_move(&chunk->node, &schan->ld_free); + } + } + + schan->pm_state = SHDMA_PM_ESTABLISHED; + ret = pm_runtime_put(schan->dev); + + spin_unlock_irq(&schan->chan_lock); + return ret; + } if (schan->pm_state == SHDMA_PM_PENDING) shdma_chan_xfer_ld_queue(schan); diff --git a/drivers/dma/sh/shdmac.c b/drivers/dma/sh/shdmac.c index 093e449e19ee..603e15102e45 100644 --- a/drivers/dma/sh/shdmac.c +++ b/drivers/dma/sh/shdmac.c @@ -300,21 +300,30 @@ static bool sh_dmae_channel_busy(struct shdma_chan *schan) return dmae_is_busy(sh_chan); } -static void sh_dmae_setup_xfer(struct shdma_chan *schan, - int slave_id) +static int sh_dmae_setup_xfer(struct shdma_chan *schan, int slave_id) { struct sh_dmae_chan *sh_chan = container_of(schan, struct sh_dmae_chan, shdma_chan); + int ret = 0; if (slave_id >= 0) { const struct sh_dmae_slave_config *cfg = sh_chan->config; - dmae_set_dmars(sh_chan, cfg->mid_rid); - dmae_set_chcr(sh_chan, cfg->chcr); + ret = dmae_set_dmars(sh_chan, cfg->mid_rid); + if (ret < 0) + goto END; + + ret = dmae_set_chcr(sh_chan, cfg->chcr); + if (ret < 0) + goto END; + } else { dmae_init(sh_chan); } + +END: + return ret; } /* diff --git a/include/linux/shdma-base.h b/include/linux/shdma-base.h index 6dfd05ef5c2d..03ba4dab2ef7 100644 --- a/include/linux/shdma-base.h +++ b/include/linux/shdma-base.h @@ -96,7 +96,7 @@ struct shdma_ops { int (*desc_setup)(struct shdma_chan *, struct shdma_desc *, dma_addr_t, dma_addr_t, size_t *); int (*set_slave)(struct shdma_chan *, int, dma_addr_t, bool); - void (*setup_xfer)(struct shdma_chan *, int); + int (*setup_xfer)(struct shdma_chan *, int); void (*start_xfer)(struct shdma_chan *, struct shdma_desc *); struct shdma_desc *(*embedded_desc)(void *, int); bool (*chan_irq)(struct shdma_chan *, int); -- cgit v1.2.3 From 7ea95d55e63176899eb96f7aaa34a5646f501b2c Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 26 Aug 2025 11:07:38 -0500 Subject: dmaengine: Fix dma_async_tx_descriptor->tx_submit documentation Commit 790fb9956eea ("linux/dmaengine.h: fix a few kernel-doc warnings") inserted new documentation for @desc_free in the middle of @tx_submit's description. Put @tx_submit's description back together, matching the indentation style of the rest of the documentation for dma_async_tx_descriptor. Fixes: 790fb9956eea ("linux/dmaengine.h: fix a few kernel-doc warnings") Reviewed-by: Dave Jiang Signed-off-by: Nathan Lynch Link: https://lore.kernel.org/r/20250826-dma_async_tx_desc-tx_submit-doc-fix-v1-1-18a4b51697db@amd.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 6de7c05d6bd8..99efe2b9b4ea 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -594,9 +594,9 @@ struct dma_descriptor_metadata_ops { * @phys: physical address of the descriptor * @chan: target channel for this operation * @tx_submit: accept the descriptor, assign ordered cookie and mark the + * descriptor pending. To be pushed on .issue_pending() call * @desc_free: driver's callback function to free a resusable descriptor * after completion - * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete * @callback_result: error result from a DMA transaction * @callback_param: general parameter to pass to the callback routine -- cgit v1.2.3 From c9f62564252c21d739a5003e9b2d6ad0828aa7bd Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Aug 2025 19:01:11 +0200 Subject: mtd: rawnand: s3c2410: Drop driver (no actual S3C64xx user) The s3c2410 NAND driver still supports S3C64xx platform, which in general is supported in the kernel. There are however no references of "s3c6400-nand" platform device ID or "s3c24xx-nand" driver, thus this driver cannot be instantiated for S3C64xx platform and is basically unused. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/Kconfig | 26 - drivers/mtd/nand/raw/Makefile | 1 - drivers/mtd/nand/raw/s3c2410.c | 1192 ------------------------ include/linux/platform_data/mtd-nand-s3c2410.h | 70 -- 4 files changed, 1289 deletions(-) delete mode 100644 drivers/mtd/nand/raw/s3c2410.c delete mode 100644 include/linux/platform_data/mtd-nand-s3c2410.h (limited to 'include') diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index 4b99d9c422c3..f053806333c3 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -77,32 +77,6 @@ config MTD_NAND_NDFC help NDFC Nand Flash Controllers are integrated in IBM/AMCC's 4xx SoCs -config MTD_NAND_S3C2410 - tristate "Samsung S3C NAND controller" - depends on ARCH_S3C64XX - help - This enables the NAND flash controller on the S3C24xx and S3C64xx - SoCs - - No board specific support is done by this driver, each board - must advertise a platform_device for the driver to attach. - -config MTD_NAND_S3C2410_DEBUG - bool "Samsung S3C NAND controller debug" - depends on MTD_NAND_S3C2410 - help - Enable debugging of the S3C NAND driver - -config MTD_NAND_S3C2410_CLKSTOP - bool "Samsung S3C NAND IDLE clock stop" - depends on MTD_NAND_S3C2410 - default n - help - Stop the clock to the NAND controller when there is no chip - selected to save power. This will mean there is a small delay - when the is NAND chip selected or released, but will save - approximately 5mA of power when there is nothing happening. - config MTD_NAND_SHARPSL tristate "Sharp SL Series (C7xx + others) NAND controller" depends on ARCH_PXA || COMPILE_TEST diff --git a/drivers/mtd/nand/raw/Makefile b/drivers/mtd/nand/raw/Makefile index 711d043ad4f8..363cd18eaf16 100644 --- a/drivers/mtd/nand/raw/Makefile +++ b/drivers/mtd/nand/raw/Makefile @@ -9,7 +9,6 @@ obj-$(CONFIG_MTD_NAND_DENALI) += denali.o obj-$(CONFIG_MTD_NAND_DENALI_PCI) += denali_pci.o obj-$(CONFIG_MTD_NAND_DENALI_DT) += denali_dt.o obj-$(CONFIG_MTD_NAND_AU1550) += au1550nd.o -obj-$(CONFIG_MTD_NAND_S3C2410) += s3c2410.o obj-$(CONFIG_MTD_NAND_DAVINCI) += davinci_nand.o obj-$(CONFIG_MTD_NAND_DISKONCHIP) += diskonchip.o obj-$(CONFIG_MTD_NAND_FSMC) += fsmc_nand.o diff --git a/drivers/mtd/nand/raw/s3c2410.c b/drivers/mtd/nand/raw/s3c2410.c deleted file mode 100644 index 33130d75c5ba..000000000000 --- a/drivers/mtd/nand/raw/s3c2410.c +++ /dev/null @@ -1,1192 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright © 2004-2008 Simtec Electronics - * http://armlinux.simtec.co.uk/ - * Ben Dooks - * - * Samsung S3C2410/S3C2440/S3C2412 NAND driver -*/ - -#define pr_fmt(fmt) "nand-s3c2410: " fmt - -#ifdef CONFIG_MTD_NAND_S3C2410_DEBUG -#define DEBUG -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -#define S3C2410_NFREG(x) (x) - -#define S3C2410_NFCONF S3C2410_NFREG(0x00) -#define S3C2410_NFCMD S3C2410_NFREG(0x04) -#define S3C2410_NFADDR S3C2410_NFREG(0x08) -#define S3C2410_NFDATA S3C2410_NFREG(0x0C) -#define S3C2410_NFSTAT S3C2410_NFREG(0x10) -#define S3C2410_NFECC S3C2410_NFREG(0x14) -#define S3C2440_NFCONT S3C2410_NFREG(0x04) -#define S3C2440_NFCMD S3C2410_NFREG(0x08) -#define S3C2440_NFADDR S3C2410_NFREG(0x0C) -#define S3C2440_NFDATA S3C2410_NFREG(0x10) -#define S3C2440_NFSTAT S3C2410_NFREG(0x20) -#define S3C2440_NFMECC0 S3C2410_NFREG(0x2C) -#define S3C2412_NFSTAT S3C2410_NFREG(0x28) -#define S3C2412_NFMECC0 S3C2410_NFREG(0x34) -#define S3C2410_NFCONF_EN (1<<15) -#define S3C2410_NFCONF_INITECC (1<<12) -#define S3C2410_NFCONF_nFCE (1<<11) -#define S3C2410_NFCONF_TACLS(x) ((x)<<8) -#define S3C2410_NFCONF_TWRPH0(x) ((x)<<4) -#define S3C2410_NFCONF_TWRPH1(x) ((x)<<0) -#define S3C2410_NFSTAT_BUSY (1<<0) -#define S3C2440_NFCONF_TACLS(x) ((x)<<12) -#define S3C2440_NFCONF_TWRPH0(x) ((x)<<8) -#define S3C2440_NFCONF_TWRPH1(x) ((x)<<4) -#define S3C2440_NFCONT_INITECC (1<<4) -#define S3C2440_NFCONT_nFCE (1<<1) -#define S3C2440_NFCONT_ENABLE (1<<0) -#define S3C2440_NFSTAT_READY (1<<0) -#define S3C2412_NFCONF_NANDBOOT (1<<31) -#define S3C2412_NFCONT_INIT_MAIN_ECC (1<<5) -#define S3C2412_NFCONT_nFCE0 (1<<1) -#define S3C2412_NFSTAT_READY (1<<0) - -/* new oob placement block for use with hardware ecc generation - */ -static int s3c2410_ooblayout_ecc(struct mtd_info *mtd, int section, - struct mtd_oob_region *oobregion) -{ - if (section) - return -ERANGE; - - oobregion->offset = 0; - oobregion->length = 3; - - return 0; -} - -static int s3c2410_ooblayout_free(struct mtd_info *mtd, int section, - struct mtd_oob_region *oobregion) -{ - if (section) - return -ERANGE; - - oobregion->offset = 8; - oobregion->length = 8; - - return 0; -} - -static const struct mtd_ooblayout_ops s3c2410_ooblayout_ops = { - .ecc = s3c2410_ooblayout_ecc, - .free = s3c2410_ooblayout_free, -}; - -/* controller and mtd information */ - -struct s3c2410_nand_info; - -/** - * struct s3c2410_nand_mtd - driver MTD structure - * @chip: The NAND chip information. - * @set: The platform information supplied for this set of NAND chips. - * @info: Link back to the hardware information. -*/ -struct s3c2410_nand_mtd { - struct nand_chip chip; - struct s3c2410_nand_set *set; - struct s3c2410_nand_info *info; -}; - -enum s3c_cpu_type { - TYPE_S3C2410, - TYPE_S3C2412, - TYPE_S3C2440, -}; - -enum s3c_nand_clk_state { - CLOCK_DISABLE = 0, - CLOCK_ENABLE, - CLOCK_SUSPEND, -}; - -/* overview of the s3c2410 nand state */ - -/** - * struct s3c2410_nand_info - NAND controller state. - * @controller: Base controller structure. - * @mtds: An array of MTD instances on this controller. - * @platform: The platform data for this board. - * @device: The platform device we bound to. - * @clk: The clock resource for this controller. - * @regs: The area mapped for the hardware registers. - * @sel_reg: Pointer to the register controlling the NAND selection. - * @sel_bit: The bit in @sel_reg to select the NAND chip. - * @mtd_count: The number of MTDs created from this controller. - * @save_sel: The contents of @sel_reg to be saved over suspend. - * @clk_rate: The clock rate from @clk. - * @clk_state: The current clock state. - * @cpu_type: The exact type of this controller. - */ -struct s3c2410_nand_info { - /* mtd info */ - struct nand_controller controller; - struct s3c2410_nand_mtd *mtds; - struct s3c2410_platform_nand *platform; - - /* device info */ - struct device *device; - struct clk *clk; - void __iomem *regs; - void __iomem *sel_reg; - int sel_bit; - int mtd_count; - unsigned long save_sel; - unsigned long clk_rate; - enum s3c_nand_clk_state clk_state; - - enum s3c_cpu_type cpu_type; -}; - -struct s3c24XX_nand_devtype_data { - enum s3c_cpu_type type; -}; - -/* conversion functions */ - -static struct s3c2410_nand_mtd *s3c2410_nand_mtd_toours(struct mtd_info *mtd) -{ - return container_of(mtd_to_nand(mtd), struct s3c2410_nand_mtd, - chip); -} - -static struct s3c2410_nand_info *s3c2410_nand_mtd_toinfo(struct mtd_info *mtd) -{ - return s3c2410_nand_mtd_toours(mtd)->info; -} - -static struct s3c2410_nand_info *to_nand_info(struct platform_device *dev) -{ - return platform_get_drvdata(dev); -} - -static struct s3c2410_platform_nand *to_nand_plat(struct platform_device *dev) -{ - return dev_get_platdata(&dev->dev); -} - -static inline int allow_clk_suspend(struct s3c2410_nand_info *info) -{ -#ifdef CONFIG_MTD_NAND_S3C2410_CLKSTOP - return 1; -#else - return 0; -#endif -} - -/** - * s3c2410_nand_clk_set_state - Enable, disable or suspend NAND clock. - * @info: The controller instance. - * @new_state: State to which clock should be set. - */ -static void s3c2410_nand_clk_set_state(struct s3c2410_nand_info *info, - enum s3c_nand_clk_state new_state) -{ - if (!allow_clk_suspend(info) && new_state == CLOCK_SUSPEND) - return; - - if (info->clk_state == CLOCK_ENABLE) { - if (new_state != CLOCK_ENABLE) - clk_disable_unprepare(info->clk); - } else { - if (new_state == CLOCK_ENABLE) - clk_prepare_enable(info->clk); - } - - info->clk_state = new_state; -} - -/* timing calculations */ - -#define NS_IN_KHZ 1000000 - -/** - * s3c_nand_calc_rate - calculate timing data. - * @wanted: The cycle time in nanoseconds. - * @clk: The clock rate in kHz. - * @max: The maximum divider value. - * - * Calculate the timing value from the given parameters. - */ -static int s3c_nand_calc_rate(int wanted, unsigned long clk, int max) -{ - int result; - - result = DIV_ROUND_UP((wanted * clk), NS_IN_KHZ); - - pr_debug("result %d from %ld, %d\n", result, clk, wanted); - - if (result > max) { - pr_err("%d ns is too big for current clock rate %ld\n", - wanted, clk); - return -1; - } - - if (result < 1) - result = 1; - - return result; -} - -#define to_ns(ticks, clk) (((ticks) * NS_IN_KHZ) / (unsigned int)(clk)) - -/* controller setup */ - -/** - * s3c2410_nand_setrate - setup controller timing information. - * @info: The controller instance. - * - * Given the information supplied by the platform, calculate and set - * the necessary timing registers in the hardware to generate the - * necessary timing cycles to the hardware. - */ -static int s3c2410_nand_setrate(struct s3c2410_nand_info *info) -{ - struct s3c2410_platform_nand *plat = info->platform; - int tacls_max = (info->cpu_type == TYPE_S3C2412) ? 8 : 4; - int tacls, twrph0, twrph1; - unsigned long clkrate = clk_get_rate(info->clk); - unsigned long set, cfg, mask; - unsigned long flags; - - /* calculate the timing information for the controller */ - - info->clk_rate = clkrate; - clkrate /= 1000; /* turn clock into kHz for ease of use */ - - if (plat != NULL) { - tacls = s3c_nand_calc_rate(plat->tacls, clkrate, tacls_max); - twrph0 = s3c_nand_calc_rate(plat->twrph0, clkrate, 8); - twrph1 = s3c_nand_calc_rate(plat->twrph1, clkrate, 8); - } else { - /* default timings */ - tacls = tacls_max; - twrph0 = 8; - twrph1 = 8; - } - - if (tacls < 0 || twrph0 < 0 || twrph1 < 0) { - dev_err(info->device, "cannot get suitable timings\n"); - return -EINVAL; - } - - dev_info(info->device, "Tacls=%d, %dns Twrph0=%d %dns, Twrph1=%d %dns\n", - tacls, to_ns(tacls, clkrate), twrph0, to_ns(twrph0, clkrate), - twrph1, to_ns(twrph1, clkrate)); - - switch (info->cpu_type) { - case TYPE_S3C2410: - mask = (S3C2410_NFCONF_TACLS(3) | - S3C2410_NFCONF_TWRPH0(7) | - S3C2410_NFCONF_TWRPH1(7)); - set = S3C2410_NFCONF_EN; - set |= S3C2410_NFCONF_TACLS(tacls - 1); - set |= S3C2410_NFCONF_TWRPH0(twrph0 - 1); - set |= S3C2410_NFCONF_TWRPH1(twrph1 - 1); - break; - - case TYPE_S3C2440: - case TYPE_S3C2412: - mask = (S3C2440_NFCONF_TACLS(tacls_max - 1) | - S3C2440_NFCONF_TWRPH0(7) | - S3C2440_NFCONF_TWRPH1(7)); - - set = S3C2440_NFCONF_TACLS(tacls - 1); - set |= S3C2440_NFCONF_TWRPH0(twrph0 - 1); - set |= S3C2440_NFCONF_TWRPH1(twrph1 - 1); - break; - - default: - BUG(); - } - - local_irq_save(flags); - - cfg = readl(info->regs + S3C2410_NFCONF); - cfg &= ~mask; - cfg |= set; - writel(cfg, info->regs + S3C2410_NFCONF); - - local_irq_restore(flags); - - dev_dbg(info->device, "NF_CONF is 0x%lx\n", cfg); - - return 0; -} - -/** - * s3c2410_nand_inithw - basic hardware initialisation - * @info: The hardware state. - * - * Do the basic initialisation of the hardware, using s3c2410_nand_setrate() - * to setup the hardware access speeds and set the controller to be enabled. -*/ -static int s3c2410_nand_inithw(struct s3c2410_nand_info *info) -{ - int ret; - - ret = s3c2410_nand_setrate(info); - if (ret < 0) - return ret; - - switch (info->cpu_type) { - case TYPE_S3C2410: - default: - break; - - case TYPE_S3C2440: - case TYPE_S3C2412: - /* enable the controller and de-assert nFCE */ - - writel(S3C2440_NFCONT_ENABLE, info->regs + S3C2440_NFCONT); - } - - return 0; -} - -/** - * s3c2410_nand_select_chip - select the given nand chip - * @this: NAND chip object. - * @chip: The chip number. - * - * This is called by the MTD layer to either select a given chip for the - * @mtd instance, or to indicate that the access has finished and the - * chip can be de-selected. - * - * The routine ensures that the nFCE line is correctly setup, and any - * platform specific selection code is called to route nFCE to the specific - * chip. - */ -static void s3c2410_nand_select_chip(struct nand_chip *this, int chip) -{ - struct s3c2410_nand_info *info; - struct s3c2410_nand_mtd *nmtd; - unsigned long cur; - - nmtd = nand_get_controller_data(this); - info = nmtd->info; - - if (chip != -1) - s3c2410_nand_clk_set_state(info, CLOCK_ENABLE); - - cur = readl(info->sel_reg); - - if (chip == -1) { - cur |= info->sel_bit; - } else { - if (nmtd->set != NULL && chip > nmtd->set->nr_chips) { - dev_err(info->device, "invalid chip %d\n", chip); - return; - } - - if (info->platform != NULL) { - if (info->platform->select_chip != NULL) - (info->platform->select_chip) (nmtd->set, chip); - } - - cur &= ~info->sel_bit; - } - - writel(cur, info->sel_reg); - - if (chip == -1) - s3c2410_nand_clk_set_state(info, CLOCK_SUSPEND); -} - -/* s3c2410_nand_hwcontrol - * - * Issue command and address cycles to the chip -*/ - -static void s3c2410_nand_hwcontrol(struct nand_chip *chip, int cmd, - unsigned int ctrl) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - if (cmd == NAND_CMD_NONE) - return; - - if (ctrl & NAND_CLE) - writeb(cmd, info->regs + S3C2410_NFCMD); - else - writeb(cmd, info->regs + S3C2410_NFADDR); -} - -/* command and control functions */ - -static void s3c2440_nand_hwcontrol(struct nand_chip *chip, int cmd, - unsigned int ctrl) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - if (cmd == NAND_CMD_NONE) - return; - - if (ctrl & NAND_CLE) - writeb(cmd, info->regs + S3C2440_NFCMD); - else - writeb(cmd, info->regs + S3C2440_NFADDR); -} - -/* s3c2410_nand_devready() - * - * returns 0 if the nand is busy, 1 if it is ready -*/ - -static int s3c2410_nand_devready(struct nand_chip *chip) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - return readb(info->regs + S3C2410_NFSTAT) & S3C2410_NFSTAT_BUSY; -} - -static int s3c2440_nand_devready(struct nand_chip *chip) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - return readb(info->regs + S3C2440_NFSTAT) & S3C2440_NFSTAT_READY; -} - -static int s3c2412_nand_devready(struct nand_chip *chip) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - return readb(info->regs + S3C2412_NFSTAT) & S3C2412_NFSTAT_READY; -} - -/* ECC handling functions */ - -static int s3c2410_nand_correct_data(struct nand_chip *chip, u_char *dat, - u_char *read_ecc, u_char *calc_ecc) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - unsigned int diff0, diff1, diff2; - unsigned int bit, byte; - - pr_debug("%s(%p,%p,%p,%p)\n", __func__, mtd, dat, read_ecc, calc_ecc); - - diff0 = read_ecc[0] ^ calc_ecc[0]; - diff1 = read_ecc[1] ^ calc_ecc[1]; - diff2 = read_ecc[2] ^ calc_ecc[2]; - - pr_debug("%s: rd %*phN calc %*phN diff %02x%02x%02x\n", - __func__, 3, read_ecc, 3, calc_ecc, - diff0, diff1, diff2); - - if (diff0 == 0 && diff1 == 0 && diff2 == 0) - return 0; /* ECC is ok */ - - /* sometimes people do not think about using the ECC, so check - * to see if we have an 0xff,0xff,0xff read ECC and then ignore - * the error, on the assumption that this is an un-eccd page. - */ - if (read_ecc[0] == 0xff && read_ecc[1] == 0xff && read_ecc[2] == 0xff - && info->platform->ignore_unset_ecc) - return 0; - - /* Can we correct this ECC (ie, one row and column change). - * Note, this is similar to the 256 error code on smartmedia */ - - if (((diff0 ^ (diff0 >> 1)) & 0x55) == 0x55 && - ((diff1 ^ (diff1 >> 1)) & 0x55) == 0x55 && - ((diff2 ^ (diff2 >> 1)) & 0x55) == 0x55) { - /* calculate the bit position of the error */ - - bit = ((diff2 >> 3) & 1) | - ((diff2 >> 4) & 2) | - ((diff2 >> 5) & 4); - - /* calculate the byte position of the error */ - - byte = ((diff2 << 7) & 0x100) | - ((diff1 << 0) & 0x80) | - ((diff1 << 1) & 0x40) | - ((diff1 << 2) & 0x20) | - ((diff1 << 3) & 0x10) | - ((diff0 >> 4) & 0x08) | - ((diff0 >> 3) & 0x04) | - ((diff0 >> 2) & 0x02) | - ((diff0 >> 1) & 0x01); - - dev_dbg(info->device, "correcting error bit %d, byte %d\n", - bit, byte); - - dat[byte] ^= (1 << bit); - return 1; - } - - /* if there is only one bit difference in the ECC, then - * one of only a row or column parity has changed, which - * means the error is most probably in the ECC itself */ - - diff0 |= (diff1 << 8); - diff0 |= (diff2 << 16); - - /* equal to "(diff0 & ~(1 << __ffs(diff0)))" */ - if ((diff0 & (diff0 - 1)) == 0) - return 1; - - return -1; -} - -/* ECC functions - * - * These allow the s3c2410 and s3c2440 to use the controller's ECC - * generator block to ECC the data as it passes through] -*/ - -static void s3c2410_nand_enable_hwecc(struct nand_chip *chip, int mode) -{ - struct s3c2410_nand_info *info; - unsigned long ctrl; - - info = s3c2410_nand_mtd_toinfo(nand_to_mtd(chip)); - ctrl = readl(info->regs + S3C2410_NFCONF); - ctrl |= S3C2410_NFCONF_INITECC; - writel(ctrl, info->regs + S3C2410_NFCONF); -} - -static void s3c2412_nand_enable_hwecc(struct nand_chip *chip, int mode) -{ - struct s3c2410_nand_info *info; - unsigned long ctrl; - - info = s3c2410_nand_mtd_toinfo(nand_to_mtd(chip)); - ctrl = readl(info->regs + S3C2440_NFCONT); - writel(ctrl | S3C2412_NFCONT_INIT_MAIN_ECC, - info->regs + S3C2440_NFCONT); -} - -static void s3c2440_nand_enable_hwecc(struct nand_chip *chip, int mode) -{ - struct s3c2410_nand_info *info; - unsigned long ctrl; - - info = s3c2410_nand_mtd_toinfo(nand_to_mtd(chip)); - ctrl = readl(info->regs + S3C2440_NFCONT); - writel(ctrl | S3C2440_NFCONT_INITECC, info->regs + S3C2440_NFCONT); -} - -static int s3c2410_nand_calculate_ecc(struct nand_chip *chip, - const u_char *dat, u_char *ecc_code) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - ecc_code[0] = readb(info->regs + S3C2410_NFECC + 0); - ecc_code[1] = readb(info->regs + S3C2410_NFECC + 1); - ecc_code[2] = readb(info->regs + S3C2410_NFECC + 2); - - pr_debug("%s: returning ecc %*phN\n", __func__, 3, ecc_code); - - return 0; -} - -static int s3c2412_nand_calculate_ecc(struct nand_chip *chip, - const u_char *dat, u_char *ecc_code) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - unsigned long ecc = readl(info->regs + S3C2412_NFMECC0); - - ecc_code[0] = ecc; - ecc_code[1] = ecc >> 8; - ecc_code[2] = ecc >> 16; - - pr_debug("%s: returning ecc %*phN\n", __func__, 3, ecc_code); - - return 0; -} - -static int s3c2440_nand_calculate_ecc(struct nand_chip *chip, - const u_char *dat, u_char *ecc_code) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - unsigned long ecc = readl(info->regs + S3C2440_NFMECC0); - - ecc_code[0] = ecc; - ecc_code[1] = ecc >> 8; - ecc_code[2] = ecc >> 16; - - pr_debug("%s: returning ecc %06lx\n", __func__, ecc & 0xffffff); - - return 0; -} - -/* over-ride the standard functions for a little more speed. We can - * use read/write block to move the data buffers to/from the controller -*/ - -static void s3c2410_nand_read_buf(struct nand_chip *this, u_char *buf, int len) -{ - readsb(this->legacy.IO_ADDR_R, buf, len); -} - -static void s3c2440_nand_read_buf(struct nand_chip *this, u_char *buf, int len) -{ - struct mtd_info *mtd = nand_to_mtd(this); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - readsl(info->regs + S3C2440_NFDATA, buf, len >> 2); - - /* cleanup if we've got less than a word to do */ - if (len & 3) { - buf += len & ~3; - - for (; len & 3; len--) - *buf++ = readb(info->regs + S3C2440_NFDATA); - } -} - -static void s3c2410_nand_write_buf(struct nand_chip *this, const u_char *buf, - int len) -{ - writesb(this->legacy.IO_ADDR_W, buf, len); -} - -static void s3c2440_nand_write_buf(struct nand_chip *this, const u_char *buf, - int len) -{ - struct mtd_info *mtd = nand_to_mtd(this); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - writesl(info->regs + S3C2440_NFDATA, buf, len >> 2); - - /* cleanup any fractional write */ - if (len & 3) { - buf += len & ~3; - - for (; len & 3; len--, buf++) - writeb(*buf, info->regs + S3C2440_NFDATA); - } -} - -/* device management functions */ - -static void s3c24xx_nand_remove(struct platform_device *pdev) -{ - struct s3c2410_nand_info *info = to_nand_info(pdev); - - if (info == NULL) - return; - - /* Release all our mtds and their partitions, then go through - * freeing the resources used - */ - - if (info->mtds != NULL) { - struct s3c2410_nand_mtd *ptr = info->mtds; - int mtdno; - - for (mtdno = 0; mtdno < info->mtd_count; mtdno++, ptr++) { - pr_debug("releasing mtd %d (%p)\n", mtdno, ptr); - WARN_ON(mtd_device_unregister(nand_to_mtd(&ptr->chip))); - nand_cleanup(&ptr->chip); - } - } - - /* free the common resources */ - - if (!IS_ERR(info->clk)) - s3c2410_nand_clk_set_state(info, CLOCK_DISABLE); -} - -static int s3c2410_nand_add_partition(struct s3c2410_nand_info *info, - struct s3c2410_nand_mtd *mtd, - struct s3c2410_nand_set *set) -{ - if (set) { - struct mtd_info *mtdinfo = nand_to_mtd(&mtd->chip); - - mtdinfo->name = set->name; - - return mtd_device_register(mtdinfo, set->partitions, - set->nr_partitions); - } - - return -ENODEV; -} - -static int s3c2410_nand_setup_interface(struct nand_chip *chip, int csline, - const struct nand_interface_config *conf) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - struct s3c2410_platform_nand *pdata = info->platform; - const struct nand_sdr_timings *timings; - int tacls; - - timings = nand_get_sdr_timings(conf); - if (IS_ERR(timings)) - return -ENOTSUPP; - - tacls = timings->tCLS_min - timings->tWP_min; - if (tacls < 0) - tacls = 0; - - pdata->tacls = DIV_ROUND_UP(tacls, 1000); - pdata->twrph0 = DIV_ROUND_UP(timings->tWP_min, 1000); - pdata->twrph1 = DIV_ROUND_UP(timings->tCLH_min, 1000); - - return s3c2410_nand_setrate(info); -} - -/** - * s3c2410_nand_init_chip - initialise a single instance of an chip - * @info: The base NAND controller the chip is on. - * @nmtd: The new controller MTD instance to fill in. - * @set: The information passed from the board specific platform data. - * - * Initialise the given @nmtd from the information in @info and @set. This - * readies the structure for use with the MTD layer functions by ensuring - * all pointers are setup and the necessary control routines selected. - */ -static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info, - struct s3c2410_nand_mtd *nmtd, - struct s3c2410_nand_set *set) -{ - struct device_node *np = info->device->of_node; - struct nand_chip *chip = &nmtd->chip; - void __iomem *regs = info->regs; - - nand_set_flash_node(chip, set->of_node); - - chip->legacy.write_buf = s3c2410_nand_write_buf; - chip->legacy.read_buf = s3c2410_nand_read_buf; - chip->legacy.select_chip = s3c2410_nand_select_chip; - chip->legacy.chip_delay = 50; - nand_set_controller_data(chip, nmtd); - chip->options = set->options; - chip->controller = &info->controller; - - /* - * let's keep behavior unchanged for legacy boards booting via pdata and - * auto-detect timings only when booting with a device tree. - */ - if (!np) - chip->options |= NAND_KEEP_TIMINGS; - - switch (info->cpu_type) { - case TYPE_S3C2410: - chip->legacy.IO_ADDR_W = regs + S3C2410_NFDATA; - info->sel_reg = regs + S3C2410_NFCONF; - info->sel_bit = S3C2410_NFCONF_nFCE; - chip->legacy.cmd_ctrl = s3c2410_nand_hwcontrol; - chip->legacy.dev_ready = s3c2410_nand_devready; - break; - - case TYPE_S3C2440: - chip->legacy.IO_ADDR_W = regs + S3C2440_NFDATA; - info->sel_reg = regs + S3C2440_NFCONT; - info->sel_bit = S3C2440_NFCONT_nFCE; - chip->legacy.cmd_ctrl = s3c2440_nand_hwcontrol; - chip->legacy.dev_ready = s3c2440_nand_devready; - chip->legacy.read_buf = s3c2440_nand_read_buf; - chip->legacy.write_buf = s3c2440_nand_write_buf; - break; - - case TYPE_S3C2412: - chip->legacy.IO_ADDR_W = regs + S3C2440_NFDATA; - info->sel_reg = regs + S3C2440_NFCONT; - info->sel_bit = S3C2412_NFCONT_nFCE0; - chip->legacy.cmd_ctrl = s3c2440_nand_hwcontrol; - chip->legacy.dev_ready = s3c2412_nand_devready; - - if (readl(regs + S3C2410_NFCONF) & S3C2412_NFCONF_NANDBOOT) - dev_info(info->device, "System booted from NAND\n"); - - break; - } - - chip->legacy.IO_ADDR_R = chip->legacy.IO_ADDR_W; - - nmtd->info = info; - nmtd->set = set; - - chip->ecc.engine_type = info->platform->engine_type; - - /* - * If you use u-boot BBT creation code, specifying this flag will - * let the kernel fish out the BBT from the NAND. - */ - if (set->flash_bbt) - chip->bbt_options |= NAND_BBT_USE_FLASH; -} - -/** - * s3c2410_nand_attach_chip - Init the ECC engine after NAND scan - * @chip: The NAND chip - * - * This hook is called by the core after the identification of the NAND chip, - * once the relevant per-chip information is up to date.. This call ensure that - * we update the internal state accordingly. - * - * The internal state is currently limited to the ECC state information. -*/ -static int s3c2410_nand_attach_chip(struct nand_chip *chip) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - switch (chip->ecc.engine_type) { - - case NAND_ECC_ENGINE_TYPE_NONE: - dev_info(info->device, "ECC disabled\n"); - break; - - case NAND_ECC_ENGINE_TYPE_SOFT: - /* - * This driver expects Hamming based ECC when engine_type is set - * to NAND_ECC_ENGINE_TYPE_SOFT. Force ecc.algo to - * NAND_ECC_ALGO_HAMMING to avoid adding an extra ecc_algo field - * to s3c2410_platform_nand. - */ - chip->ecc.algo = NAND_ECC_ALGO_HAMMING; - dev_info(info->device, "soft ECC\n"); - break; - - case NAND_ECC_ENGINE_TYPE_ON_HOST: - chip->ecc.calculate = s3c2410_nand_calculate_ecc; - chip->ecc.correct = s3c2410_nand_correct_data; - chip->ecc.strength = 1; - - switch (info->cpu_type) { - case TYPE_S3C2410: - chip->ecc.hwctl = s3c2410_nand_enable_hwecc; - chip->ecc.calculate = s3c2410_nand_calculate_ecc; - break; - - case TYPE_S3C2412: - chip->ecc.hwctl = s3c2412_nand_enable_hwecc; - chip->ecc.calculate = s3c2412_nand_calculate_ecc; - break; - - case TYPE_S3C2440: - chip->ecc.hwctl = s3c2440_nand_enable_hwecc; - chip->ecc.calculate = s3c2440_nand_calculate_ecc; - break; - } - - dev_dbg(info->device, "chip %p => page shift %d\n", - chip, chip->page_shift); - - /* change the behaviour depending on whether we are using - * the large or small page nand device */ - if (chip->page_shift > 10) { - chip->ecc.size = 256; - chip->ecc.bytes = 3; - } else { - chip->ecc.size = 512; - chip->ecc.bytes = 3; - mtd_set_ooblayout(nand_to_mtd(chip), - &s3c2410_ooblayout_ops); - } - - dev_info(info->device, "hardware ECC\n"); - break; - - default: - dev_err(info->device, "invalid ECC mode!\n"); - return -EINVAL; - } - - if (chip->bbt_options & NAND_BBT_USE_FLASH) - chip->options |= NAND_SKIP_BBTSCAN; - - return 0; -} - -static const struct nand_controller_ops s3c24xx_nand_controller_ops = { - .attach_chip = s3c2410_nand_attach_chip, - .setup_interface = s3c2410_nand_setup_interface, -}; - -static int s3c24xx_nand_probe_dt(struct platform_device *pdev) -{ - const struct s3c24XX_nand_devtype_data *devtype_data; - struct s3c2410_platform_nand *pdata; - struct s3c2410_nand_info *info = platform_get_drvdata(pdev); - struct device_node *np = pdev->dev.of_node, *child; - struct s3c2410_nand_set *sets; - - devtype_data = of_device_get_match_data(&pdev->dev); - if (!devtype_data) - return -ENODEV; - - info->cpu_type = devtype_data->type; - - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) - return -ENOMEM; - - pdev->dev.platform_data = pdata; - - pdata->nr_sets = of_get_child_count(np); - if (!pdata->nr_sets) - return 0; - - sets = devm_kcalloc(&pdev->dev, pdata->nr_sets, sizeof(*sets), - GFP_KERNEL); - if (!sets) - return -ENOMEM; - - pdata->sets = sets; - - for_each_available_child_of_node(np, child) { - sets->name = (char *)child->name; - sets->of_node = child; - sets->nr_chips = 1; - - of_node_get(child); - - sets++; - } - - return 0; -} - -static int s3c24xx_nand_probe_pdata(struct platform_device *pdev) -{ - struct s3c2410_nand_info *info = platform_get_drvdata(pdev); - - info->cpu_type = platform_get_device_id(pdev)->driver_data; - - return 0; -} - -/* s3c24xx_nand_probe - * - * called by device layer when it finds a device matching - * one our driver can handled. This code checks to see if - * it can allocate all necessary resources then calls the - * nand layer to look for devices -*/ -static int s3c24xx_nand_probe(struct platform_device *pdev) -{ - struct s3c2410_platform_nand *plat; - struct s3c2410_nand_info *info; - struct s3c2410_nand_mtd *nmtd; - struct s3c2410_nand_set *sets; - struct resource *res; - int err = 0; - int size; - int nr_sets; - int setno; - - info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL); - if (info == NULL) { - err = -ENOMEM; - goto exit_error; - } - - platform_set_drvdata(pdev, info); - - nand_controller_init(&info->controller); - info->controller.ops = &s3c24xx_nand_controller_ops; - - /* get the clock source and enable it */ - - info->clk = devm_clk_get(&pdev->dev, "nand"); - if (IS_ERR(info->clk)) { - dev_err(&pdev->dev, "failed to get clock\n"); - err = -ENOENT; - goto exit_error; - } - - s3c2410_nand_clk_set_state(info, CLOCK_ENABLE); - - if (pdev->dev.of_node) - err = s3c24xx_nand_probe_dt(pdev); - else - err = s3c24xx_nand_probe_pdata(pdev); - - if (err) - goto exit_error; - - plat = to_nand_plat(pdev); - - /* allocate and map the resource */ - - /* currently we assume we have the one resource */ - res = pdev->resource; - size = resource_size(res); - - info->device = &pdev->dev; - info->platform = plat; - - info->regs = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(info->regs)) { - err = PTR_ERR(info->regs); - goto exit_error; - } - - dev_dbg(&pdev->dev, "mapped registers at %p\n", info->regs); - - if (!plat->sets || plat->nr_sets < 1) { - err = -EINVAL; - goto exit_error; - } - - sets = plat->sets; - nr_sets = plat->nr_sets; - - info->mtd_count = nr_sets; - - /* allocate our information */ - - size = nr_sets * sizeof(*info->mtds); - info->mtds = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); - if (info->mtds == NULL) { - err = -ENOMEM; - goto exit_error; - } - - /* initialise all possible chips */ - - nmtd = info->mtds; - - for (setno = 0; setno < nr_sets; setno++, nmtd++, sets++) { - struct mtd_info *mtd = nand_to_mtd(&nmtd->chip); - - pr_debug("initialising set %d (%p, info %p)\n", - setno, nmtd, info); - - mtd->dev.parent = &pdev->dev; - s3c2410_nand_init_chip(info, nmtd, sets); - - err = nand_scan(&nmtd->chip, sets ? sets->nr_chips : 1); - if (err) - goto exit_error; - - s3c2410_nand_add_partition(info, nmtd, sets); - } - - /* initialise the hardware */ - err = s3c2410_nand_inithw(info); - if (err != 0) - goto exit_error; - - if (allow_clk_suspend(info)) { - dev_info(&pdev->dev, "clock idle support enabled\n"); - s3c2410_nand_clk_set_state(info, CLOCK_SUSPEND); - } - - return 0; - - exit_error: - s3c24xx_nand_remove(pdev); - - if (err == 0) - err = -EINVAL; - return err; -} - -/* PM Support */ -#ifdef CONFIG_PM - -static int s3c24xx_nand_suspend(struct platform_device *dev, pm_message_t pm) -{ - struct s3c2410_nand_info *info = platform_get_drvdata(dev); - - if (info) { - info->save_sel = readl(info->sel_reg); - - /* For the moment, we must ensure nFCE is high during - * the time we are suspended. This really should be - * handled by suspending the MTDs we are using, but - * that is currently not the case. */ - - writel(info->save_sel | info->sel_bit, info->sel_reg); - - s3c2410_nand_clk_set_state(info, CLOCK_DISABLE); - } - - return 0; -} - -static int s3c24xx_nand_resume(struct platform_device *dev) -{ - struct s3c2410_nand_info *info = platform_get_drvdata(dev); - unsigned long sel; - - if (info) { - s3c2410_nand_clk_set_state(info, CLOCK_ENABLE); - s3c2410_nand_inithw(info); - - /* Restore the state of the nFCE line. */ - - sel = readl(info->sel_reg); - sel &= ~info->sel_bit; - sel |= info->save_sel & info->sel_bit; - writel(sel, info->sel_reg); - - s3c2410_nand_clk_set_state(info, CLOCK_SUSPEND); - } - - return 0; -} - -#else -#define s3c24xx_nand_suspend NULL -#define s3c24xx_nand_resume NULL -#endif - -/* driver device registration */ - -static const struct platform_device_id s3c24xx_driver_ids[] = { - { - .name = "s3c6400-nand", - .driver_data = TYPE_S3C2412, /* compatible with 2412 */ - }, - { } -}; - -MODULE_DEVICE_TABLE(platform, s3c24xx_driver_ids); - -static struct platform_driver s3c24xx_nand_driver = { - .probe = s3c24xx_nand_probe, - .remove = s3c24xx_nand_remove, - .suspend = s3c24xx_nand_suspend, - .resume = s3c24xx_nand_resume, - .id_table = s3c24xx_driver_ids, - .driver = { - .name = "s3c24xx-nand", - }, -}; - -module_platform_driver(s3c24xx_nand_driver); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ben Dooks "); -MODULE_DESCRIPTION("S3C24XX MTD NAND driver"); diff --git a/include/linux/platform_data/mtd-nand-s3c2410.h b/include/linux/platform_data/mtd-nand-s3c2410.h deleted file mode 100644 index 25390fc3e795..000000000000 --- a/include/linux/platform_data/mtd-nand-s3c2410.h +++ /dev/null @@ -1,70 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (c) 2004 Simtec Electronics - * Ben Dooks - * - * S3C2410 - NAND device controller platform_device info -*/ - -#ifndef __MTD_NAND_S3C2410_H -#define __MTD_NAND_S3C2410_H - -#include - -/** - * struct s3c2410_nand_set - define a set of one or more nand chips - * @flash_bbt: Openmoko u-boot can create a Bad Block Table - * Setting this flag will allow the kernel to - * look for it at boot time and also skip the NAND - * scan. - * @options: Default value to set into 'struct nand_chip' options. - * @nr_chips: Number of chips in this set - * @nr_partitions: Number of partitions pointed to by @partitions - * @name: Name of set (optional) - * @nr_map: Map for low-layer logical to physical chip numbers (option) - * @partitions: The mtd partition list - * - * define a set of one or more nand chips registered with an unique mtd. Also - * allows to pass flag to the underlying NAND layer. 'disable_ecc' will trigger - * a warning at boot time. - */ -struct s3c2410_nand_set { - unsigned int flash_bbt:1; - - unsigned int options; - int nr_chips; - int nr_partitions; - char *name; - int *nr_map; - struct mtd_partition *partitions; - struct device_node *of_node; -}; - -struct s3c2410_platform_nand { - /* timing information for controller, all times in nanoseconds */ - - int tacls; /* time for active CLE/ALE to nWE/nOE */ - int twrph0; /* active time for nWE/nOE */ - int twrph1; /* time for release CLE/ALE from nWE/nOE inactive */ - - unsigned int ignore_unset_ecc:1; - - enum nand_ecc_engine_type engine_type; - - int nr_sets; - struct s3c2410_nand_set *sets; - - void (*select_chip)(struct s3c2410_nand_set *, - int chip); -}; - -/** - * s3c_nand_set_platdata() - register NAND platform data. - * @nand: The NAND platform data to register with s3c_device_nand. - * - * This function copies the given NAND platform data, @nand and registers - * it with the s3c_device_nand. This allows @nand to be __initdata. -*/ -extern void s3c_nand_set_platdata(struct s3c2410_platform_nand *nand); - -#endif /*__MTD_NAND_S3C2410_H */ -- cgit v1.2.3 From dfb84c33079497bf27058b15780e1c7bba4c371b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Aug 2025 13:10:44 +0200 Subject: fuse: allow synchronous FUSE_INIT FUSE_INIT has always been asynchronous with mount. That means that the server processed this request after the mount syscall returned. This means that FUSE_INIT can't supply the root inode's ID, hence it currently has a hardcoded value. There are other limitations such as not being able to perform getxattr during mount, which is needed by selinux. To remove these limitations allow server to process FUSE_INIT while initializing the in-core super block for the fuse filesystem. This can only be done if the server is prepared to handle this, so add FUSE_DEV_IOC_SYNC_INIT ioctl, which a) lets the server know whether this feature is supported, returning ENOTTY othewrwise. b) lets the kernel know to perform a synchronous initialization The implementation is slightly tricky, since fuse_dev/fuse_conn are set up only during super block creation. This is solved by setting the private data of the fuse device file to a special value ((struct fuse_dev *) 1) and waiting for this to be turned into a proper fuse_dev before commecing with operations on the device file. Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 3 +- fs/fuse/dev.c | 71 +++++++++++++++++++++++++++++++++++------------ fs/fuse/dev_uring.c | 4 +-- fs/fuse/fuse_dev_i.h | 13 +++++++-- fs/fuse/fuse_i.h | 5 +++- fs/fuse/inode.c | 51 +++++++++++++++++++++++++++------- include/uapi/linux/fuse.h | 1 + 7 files changed, 114 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b39844d75a80..28c96961e85d 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -52,6 +52,7 @@ #include #include "fuse_i.h" +#include "fuse_dev_i.h" #define CUSE_CONNTBL_LEN 64 @@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = file->private_data; + struct fuse_dev *fud = __fuse_get_dev(file); struct cuse_conn *cc = fc_to_cc(fud->fc); /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f6259711ca1c..2f1619e266e1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1530,14 +1530,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file) return 0; } +struct fuse_dev *fuse_get_dev(struct file *file) +{ + struct fuse_dev *fud = __fuse_get_dev(file); + int err; + + if (likely(fud)) + return fud; + + err = wait_event_interruptible(fuse_dev_waitq, + READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT); + if (err) + return ERR_PTR(err); + + fud = __fuse_get_dev(file); + if (!fud) + return ERR_PTR(-EPERM); + + return fud; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!user_backed_iter(to)) return -EINVAL; @@ -1557,8 +1577,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); @@ -2231,7 +2251,7 @@ copy_finish: static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp); if (!fud) return -EPERM; @@ -2253,11 +2273,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_dev *fud; + struct fuse_dev *fud = __fuse_get_dev(out); size_t rem; ssize_t ret; - fud = fuse_get_dev(out); if (!fud) return -EPERM; @@ -2343,7 +2362,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) struct fuse_iqueue *fiq; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) + if (IS_ERR(fud)) return EPOLLERR; fiq = &fud->fc->iq; @@ -2490,7 +2509,7 @@ void fuse_wait_aborted(struct fuse_conn *fc) int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (fud) { struct fuse_conn *fc = fud->fc; @@ -2521,8 +2540,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, &fud->fc->iq.fasync); @@ -2532,7 +2551,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) { struct fuse_dev *fud; - if (new->private_data) + if (__fuse_get_dev(new)) return -EINVAL; fud = fuse_dev_alloc_install(fc); @@ -2563,7 +2582,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) * uses the same ioctl handler. */ if (fd_file(f)->f_op == file->f_op) - fud = fuse_get_dev(fd_file(f)); + fud = __fuse_get_dev(fd_file(f)); res = -EINVAL; if (fud) { @@ -2581,8 +2600,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file, struct fuse_dev *fud = fuse_get_dev(file); struct fuse_backing_map map; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2598,8 +2617,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) struct fuse_dev *fud = fuse_get_dev(file); int backing_id; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2610,6 +2629,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) return fuse_backing_close(fud->fc, backing_id); } +static long fuse_dev_ioctl_sync_init(struct file *file) +{ + int err = -EINVAL; + + mutex_lock(&fuse_mutex); + if (!__fuse_get_dev(file)) { + WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT); + err = 0; + } + mutex_unlock(&fuse_mutex); + return err; +} + static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2625,6 +2657,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, case FUSE_DEV_IOC_BACKING_CLOSE: return fuse_dev_ioctl_backing_close(file, argp); + case FUSE_DEV_IOC_SYNC_INIT: + return fuse_dev_ioctl_sync_init(file); + default: return -ENOTTY; } @@ -2633,7 +2668,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, #ifdef CONFIG_PROC_FS static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (!fud) return; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 249b210becb1..bef38ed78249 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1139,9 +1139,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EINVAL; fud = fuse_get_dev(cmd->file); - if (!fud) { + if (IS_ERR(fud)) { pr_info_ratelimited("No fuse device found\n"); - return -ENOTCONN; + return PTR_ERR(fud); } fc = fud->fc; diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 5a9bd771a319..6e8373f97040 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -12,6 +12,8 @@ #define FUSE_INT_REQ_BIT (1ULL << 0) #define FUSE_REQ_ID_STEP (1ULL << 1) +extern struct wait_queue_head fuse_dev_waitq; + struct fuse_arg; struct fuse_args; struct fuse_pqueue; @@ -37,15 +39,22 @@ struct fuse_copy_state { } ring; }; -static inline struct fuse_dev *fuse_get_dev(struct file *file) +#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1) +#define FUSE_DEV_PTR_MASK (~1UL) + +static inline struct fuse_dev *__fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return READ_ONCE(file->private_data); + struct fuse_dev *fud = READ_ONCE(file->private_data); + + return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK); } +struct fuse_dev *fuse_get_dev(struct file *file); + unsigned int fuse_req_hash(u64 unique); struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 486fa550c951..233c6111f768 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -904,6 +904,9 @@ struct fuse_conn { /* Is link not implemented by fs? */ unsigned int no_link:1; + /* Is synchronous FUSE_INIT allowed? */ + unsigned int sync_init:1; + /* Use io_uring for communication */ unsigned int io_uring; @@ -1318,7 +1321,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_mount *fm); +int fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9d26a5bc394d..5b7897bf7e45 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dev_i.h" #include "dev_uring_i.h" #include @@ -34,6 +35,7 @@ MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); +DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); @@ -1466,7 +1468,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_mount *fm) +static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) { struct fuse_init_args *ia; u64 flags; @@ -1525,10 +1527,30 @@ void fuse_send_init(struct fuse_mount *fm) ia->args.out_args[0].value = &ia->out; ia->args.force = true; ia->args.nocreds = true; - ia->args.end = process_init_reply; - if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fm, &ia->args, -ENOTCONN); + return ia; +} + +int fuse_send_init(struct fuse_mount *fm) +{ + struct fuse_init_args *ia = fuse_new_init(fm); + int err; + + if (fm->fc->sync_init) { + err = fuse_simple_request(fm, &ia->args); + /* Ignore size of init reply */ + if (err > 0) + err = 0; + } else { + ia->args.end = process_init_reply; + err = fuse_simple_background(fm, &ia->args, GFP_KERNEL); + if (!err) + return 0; + } + process_init_reply(fm, &ia->args, err); + if (fm->fc->conn_error) + return -ENOTCONN; + return 0; } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1867,8 +1889,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (ctx->fudptr && *ctx->fudptr) - goto err_unlock; + if (ctx->fudptr && *ctx->fudptr) { + if (*ctx->fudptr == FUSE_DEV_SYNC_INIT) + fc->sync_init = 1; + else + goto err_unlock; + } err = fuse_ctl_add_conn(fc); if (err) @@ -1876,8 +1902,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - if (ctx->fudptr) + if (ctx->fudptr) { *ctx->fudptr = fud; + wake_up_all(&fuse_dev_waitq); + } mutex_unlock(&fuse_mutex); return 0; @@ -1898,6 +1926,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_mount *fm; int err; if (!ctx->file || !ctx->rootmode_present || @@ -1918,8 +1947,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); - fuse_send_init(get_fuse_mount_super(sb)); - return 0; + + fm = get_fuse_mount_super(sb); + + return fuse_send_init(fm); } /* @@ -1980,7 +2011,7 @@ static int fuse_get_tree(struct fs_context *fsc) * Allow creating a fuse mount with an already initialized fuse * connection */ - fud = READ_ONCE(ctx->file->private_data); + fud = __fuse_get_dev(ctx->file); if (ctx->file->f_op == &fuse_dev_operations && fud) { fsc->sget_key = fud->fc; sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 94621f68a5cc..6b9fb8b08768 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1131,6 +1131,7 @@ struct fuse_backing_map { #define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \ struct fuse_backing_map) #define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t) +#define FUSE_DEV_IOC_SYNC_INIT _IO(FUSE_DEV_IOC_MAGIC, 3) struct fuse_lseek_in { uint64_t fh; -- cgit v1.2.3 From 7df87820122acd3204565109f636a1367912655a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 5 Aug 2025 15:45:08 +1000 Subject: pidns: move is-ancestor logic to helper This check will be needed in later patches, and there's no point open-coding it each time. Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250805-procfs-pidns-api-v4-1-705f984940e7@cyphar.com Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 9 +++++++++ kernel/pid_namespace.c | 22 ++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..17fdc059f8da 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -84,6 +84,9 @@ extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); +extern bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor); + #else /* !CONFIG_PID_NS */ #include @@ -118,6 +121,12 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } + +static inline bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + return false; +} #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..b7b45c2597ec 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -390,11 +390,23 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } +bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + struct pid_namespace *ns; + + if (child->level < ancestor->level) + return false; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return ns == ancestor; +} + static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = to_pid_ns(ns); + struct pid_namespace *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) @@ -408,13 +420,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns) * this maintains the property that processes and their * children can not escape their current pid namespace. */ - if (new->level < active->level) - return -EINVAL; - - ancestor = new; - while (ancestor->level > active->level) - ancestor = ancestor->parent; - if (ancestor != active) + if (!pidns_is_ancestor(new, active)) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); -- cgit v1.2.3 From 28edfaa10ca1b370b1a27fde632000d35c43402c Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Mon, 1 Sep 2025 16:15:07 +0100 Subject: ASoC: SDCA: Add quirk for incorrect function types for 3 systems Certain systems have CS42L43 DisCo that claims to conform to version 0.6.28 but uses the function types from the 1.0 spec. Add a quirk as a workaround. Closes: https://github.com/thesofproject/linux/issues/5515 Cc: stable@vger.kernel.org Signed-off-by: Maciej Strozek Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20250901151518.3197941-1-mstrozek@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca.h | 1 + sound/soc/sdca/sdca_device.c | 20 ++++++++++++++++++++ sound/soc/sdca/sdca_functions.c | 13 ++++++++----- 3 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/sound/sdca.h b/include/sound/sdca.h index 5a5d6de78d72..9c6a351c9d47 100644 --- a/include/sound/sdca.h +++ b/include/sound/sdca.h @@ -46,6 +46,7 @@ struct sdca_device_data { enum sdca_quirk { SDCA_QUIRKS_RT712_VB, + SDCA_QUIRKS_SKIP_FUNC_TYPE_PATCHING, }; #if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SND_SOC_SDCA) diff --git a/sound/soc/sdca/sdca_device.c b/sound/soc/sdca/sdca_device.c index 0244cdcdd109..4798ce2c8f0b 100644 --- a/sound/soc/sdca/sdca_device.c +++ b/sound/soc/sdca/sdca_device.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -55,11 +56,30 @@ static bool sdca_device_quirk_rt712_vb(struct sdw_slave *slave) return false; } +static bool sdca_device_quirk_skip_func_type_patching(struct sdw_slave *slave) +{ + const char *vendor, *sku; + + vendor = dmi_get_system_info(DMI_SYS_VENDOR); + sku = dmi_get_system_info(DMI_PRODUCT_SKU); + + if (vendor && sku && + !strcmp(vendor, "Dell Inc.") && + (!strcmp(sku, "0C62") || !strcmp(sku, "0C63") || !strcmp(sku, "0C6B")) && + slave->sdca_data.interface_revision == 0x061c && + slave->id.mfg_id == 0x01fa && slave->id.part_id == 0x4243) + return true; + + return false; +} + bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_quirk quirk) { switch (quirk) { case SDCA_QUIRKS_RT712_VB: return sdca_device_quirk_rt712_vb(slave); + case SDCA_QUIRKS_SKIP_FUNC_TYPE_PATCHING: + return sdca_device_quirk_skip_func_type_patching(slave); default: break; } diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c index f26f597dca9e..13f68f7b6dd6 100644 --- a/sound/soc/sdca/sdca_functions.c +++ b/sound/soc/sdca/sdca_functions.c @@ -90,6 +90,7 @@ static int find_sdca_function(struct acpi_device *adev, void *data) { struct fwnode_handle *function_node = acpi_fwnode_handle(adev); struct sdca_device_data *sdca_data = data; + struct sdw_slave *slave = container_of(sdca_data, struct sdw_slave, sdca_data); struct device *dev = &adev->dev; struct fwnode_handle *control5; /* used to identify function type */ const char *function_name; @@ -137,11 +138,13 @@ static int find_sdca_function(struct acpi_device *adev, void *data) return ret; } - ret = patch_sdca_function_type(sdca_data->interface_revision, &function_type); - if (ret < 0) { - dev_err(dev, "SDCA version %#x invalid function type %d\n", - sdca_data->interface_revision, function_type); - return ret; + if (!sdca_device_quirk_match(slave, SDCA_QUIRKS_SKIP_FUNC_TYPE_PATCHING)) { + ret = patch_sdca_function_type(sdca_data->interface_revision, &function_type); + if (ret < 0) { + dev_err(dev, "SDCA version %#x invalid function type %d\n", + sdca_data->interface_revision, function_type); + return ret; + } } function_name = get_sdca_function_name(function_type); -- cgit v1.2.3 From 72ca981dba5e98c2b1c2956016cc4be934d9fbea Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sun, 31 Aug 2025 14:52:37 +0800 Subject: firmware: arm_scmi: Fix function name typo in scmi_perf_proto_ops struct The performance protocol ops table incorrectly referenced power_scale_mw_get instead of the correct power_scale_get. Fix the typo to use the proper function. Signed-off-by: Peng Fan Message-Id: <20250831-scmi-cpufreq-v1-1-493031cf6e9b@nxp.com> Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 688466a0e816..aafaac1496b0 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -153,7 +153,7 @@ struct scmi_perf_domain_info { * for a given device * @fast_switch_rate_limit: gets the minimum time (us) required between * successive fast_switching requests - * @power_scale_mw_get: indicates if the power values provided are in milliWatts + * @power_scale_get: indicates if the power values provided are in milliWatts * or in some other (abstract) scale */ struct scmi_perf_proto_ops { -- cgit v1.2.3 From 61f132ca8c46ffee368a951e516d19d4ae767ea8 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 29 Aug 2025 13:06:04 +0800 Subject: ptp: add helpers to get the phc_index by of_node or dev Some Ethernet controllers do not have an integrated PTP timer function. Instead, the PTP timer is a separated device and provides PTP hardware clock to the Ethernet controller to use. Therefore, the Ethernet controller driver needs to obtain the PTP clock's phc_index in its ethtool_ops::get_ts_info(). Currently, most drivers implement this in the following ways. 1. The PTP device driver adds a custom API and exports it to the Ethernet controller driver. 2. The PTP device driver adds private data to its device structure. So the private data structure needs to be exposed to the Ethernet controller driver. When registering the ptp clock, ptp_clock_register() always saves the ptp_clock pointer to the private data of ptp_clock::dev. Therefore, as long as ptp_clock::dev is obtained, the phc_index can be obtained. So the following generic APIs can be added to the ptp driver to obtain the phc_index. 1. ptp_clock_index_by_dev(): Obtain the phc_index by the device pointer of the PTP device. 2.ptp_clock_index_by_of_node(): Obtain the phc_index by the of_node pointer of the PTP device. Also, we can add another API like ptp_clock_index_by_fwnode() to get the phc_index by fwnode of PTP device. However, this API is not used in this patch set, so it is better to add it when needed. Suggested-by: Vladimir Oltean Signed-off-by: Wei Fang Reviewed-by: Frank Li Link: https://patch.msgid.link/20250829050615.1247468-4-wei.fang@nxp.com Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_clock.c | 53 ++++++++++++++++++++++++++++++++++++++++ include/linux/ptp_clock_kernel.h | 22 +++++++++++++++++ 2 files changed, 75 insertions(+) (limited to 'include') diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 3e0726c6f55b..5739a57958c7 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -488,6 +489,58 @@ int ptp_clock_index(struct ptp_clock *ptp) } EXPORT_SYMBOL(ptp_clock_index); +static int ptp_clock_of_node_match(struct device *dev, const void *data) +{ + const struct device_node *parent_np = data; + + return (dev->parent && dev_of_node(dev->parent) == parent_np); +} + +int ptp_clock_index_by_of_node(struct device_node *np) +{ + struct ptp_clock *ptp; + struct device *dev; + int phc_index; + + dev = class_find_device(&ptp_class, NULL, np, + ptp_clock_of_node_match); + if (!dev) + return -1; + + ptp = dev_get_drvdata(dev); + phc_index = ptp_clock_index(ptp); + put_device(dev); + + return phc_index; +} +EXPORT_SYMBOL_GPL(ptp_clock_index_by_of_node); + +static int ptp_clock_dev_match(struct device *dev, const void *data) +{ + const struct device *parent = data; + + return dev->parent == parent; +} + +int ptp_clock_index_by_dev(struct device *parent) +{ + struct ptp_clock *ptp; + struct device *dev; + int phc_index; + + dev = class_find_device(&ptp_class, NULL, parent, + ptp_clock_dev_match); + if (!dev) + return -1; + + ptp = dev_get_drvdata(dev); + phc_index = ptp_clock_index(ptp); + put_device(dev); + + return phc_index; +} +EXPORT_SYMBOL_GPL(ptp_clock_index_by_dev); + int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 3d089bd4d5e9..7dd7951b23d5 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -360,6 +360,24 @@ extern void ptp_clock_event(struct ptp_clock *ptp, extern int ptp_clock_index(struct ptp_clock *ptp); +/** + * ptp_clock_index_by_of_node() - obtain the device index of + * a PTP clock based on the PTP device of_node + * + * @np: The device of_node pointer of the PTP device. + * Return: The PHC index on success or -1 on failure. + */ +int ptp_clock_index_by_of_node(struct device_node *np); + +/** + * ptp_clock_index_by_dev() - obtain the device index of + * a PTP clock based on the PTP device. + * + * @parent: The parent device (PTP device) pointer of the PTP clock. + * Return: The PHC index on success or -1 on failure. + */ +int ptp_clock_index_by_dev(struct device *parent); + /** * ptp_find_pin() - obtain the pin index of a given auxiliary function * @@ -425,6 +443,10 @@ static inline void ptp_clock_event(struct ptp_clock *ptp, { } static inline int ptp_clock_index(struct ptp_clock *ptp) { return -1; } +static inline int ptp_clock_index_by_of_node(struct device_node *np) +{ return -1; } +static inline int ptp_clock_index_by_dev(struct device *parent) +{ return -1; } static inline int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { return -1; } -- cgit v1.2.3 From e551fa3159e3050c26ff010c3b595b45d7eb071a Mon Sep 17 00:00:00 2001 From: Qunqin Zhao Date: Sat, 5 Jul 2025 15:20:42 +0800 Subject: mfd: Add support for Loongson Security Engine chip controller Loongson Security Engine chip supports RNG, SM2, SM3 and SM4 accelerator engines. This is the base driver for other specific engine drivers. Co-developed-by: Yinggang Gu Signed-off-by: Yinggang Gu Signed-off-by: Qunqin Zhao Reviewed-by: Huacai Chen Link: https://lore.kernel.org/r/20250705072045.1067-2-zhaoqunqin@loongson.cn Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 11 ++ drivers/mfd/Makefile | 2 + drivers/mfd/loongson-se.c | 253 ++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/loongson-se.h | 53 +++++++++ 4 files changed, 319 insertions(+) create mode 100644 drivers/mfd/loongson-se.c create mode 100644 include/linux/mfd/loongson-se.h (limited to 'include') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1..d30806ae4298 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -2428,6 +2428,17 @@ config MFD_INTEL_M10_BMC_PMCI additional drivers must be enabled in order to use the functionality of the device. +config MFD_LOONGSON_SE + tristate "Loongson Security Engine chip controller driver" + depends on LOONGARCH && ACPI + select MFD_CORE + help + The Loongson Security Engine chip supports RNG, SM2, SM3 and + SM4 accelerator engines. Each engine have its own DMA buffer + provided by the controller. The kernel cannot directly send + commands to the engine and must first send them to the controller, + which will forward them to the corresponding engine. + config MFD_QNAP_MCU tristate "QNAP microcontroller unit core driver" depends on SERIAL_DEV_BUS diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d..73df7fc8b5ff 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -295,3 +295,5 @@ obj-$(CONFIG_MFD_RSMU_I2C) += rsmu_i2c.o rsmu_core.o obj-$(CONFIG_MFD_RSMU_SPI) += rsmu_spi.o rsmu_core.o obj-$(CONFIG_MFD_UPBOARD_FPGA) += upboard-fpga.o + +obj-$(CONFIG_MFD_LOONGSON_SE) += loongson-se.o diff --git a/drivers/mfd/loongson-se.c b/drivers/mfd/loongson-se.c new file mode 100644 index 000000000000..3902ba377d69 --- /dev/null +++ b/drivers/mfd/loongson-se.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2025 Loongson Technology Corporation Limited + * + * Author: Yinggang Gu + * Author: Qunqin Zhao + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct loongson_se { + void __iomem *base; + spinlock_t dev_lock; + struct completion cmd_completion; + + void *dmam_base; + int dmam_size; + + struct mutex engine_init_lock; + struct loongson_se_engine engines[SE_ENGINE_MAX]; +}; + +struct loongson_se_controller_cmd { + u32 command_id; + u32 info[7]; +}; + +static int loongson_se_poll(struct loongson_se *se, u32 int_bit) +{ + u32 status; + int err; + + spin_lock_irq(&se->dev_lock); + + /* Notify the controller that the engine needs to be started */ + writel(int_bit, se->base + SE_L2SINT_SET); + + /* Polling until the controller has forwarded the engine command */ + err = readl_relaxed_poll_timeout_atomic(se->base + SE_L2SINT_STAT, status, + !(status & int_bit), + 1, LOONGSON_ENGINE_CMD_TIMEOUT_US); + + spin_unlock_irq(&se->dev_lock); + + return err; +} + +static int loongson_se_send_controller_cmd(struct loongson_se *se, + struct loongson_se_controller_cmd *cmd) +{ + u32 *send_cmd = (u32 *)cmd; + int err, i; + + for (i = 0; i < SE_SEND_CMD_REG_LEN; i++) + writel(send_cmd[i], se->base + SE_SEND_CMD_REG + i * 4); + + err = loongson_se_poll(se, SE_INT_CONTROLLER); + if (err) + return err; + + return wait_for_completion_interruptible(&se->cmd_completion); +} + +int loongson_se_send_engine_cmd(struct loongson_se_engine *engine) +{ + /* + * After engine initialization, the controller already knows + * where to obtain engine commands from. Now all we need to + * do is notify the controller that the engine needs to be started. + */ + int err = loongson_se_poll(engine->se, BIT(engine->id)); + + if (err) + return err; + + return wait_for_completion_interruptible(&engine->completion); +} +EXPORT_SYMBOL_GPL(loongson_se_send_engine_cmd); + +struct loongson_se_engine *loongson_se_init_engine(struct device *dev, int id) +{ + struct loongson_se *se = dev_get_drvdata(dev); + struct loongson_se_engine *engine = &se->engines[id]; + struct loongson_se_controller_cmd cmd; + + engine->se = se; + engine->id = id; + init_completion(&engine->completion); + + /* Divide DMA memory equally among all engines */ + engine->buffer_size = se->dmam_size / SE_ENGINE_MAX; + engine->buffer_off = (se->dmam_size / SE_ENGINE_MAX) * id; + engine->data_buffer = se->dmam_base + engine->buffer_off; + + /* + * There has no engine0, use its data buffer as command buffer for other + * engines. The DMA memory size is obtained from the ACPI table, which + * ensures that the data buffer size of engine0 is larger than the + * command buffer size of all engines. + */ + engine->command = se->dmam_base + id * (2 * SE_ENGINE_CMD_SIZE); + engine->command_ret = engine->command + SE_ENGINE_CMD_SIZE; + + mutex_lock(&se->engine_init_lock); + + /* Tell the controller where to find engine command */ + cmd.command_id = SE_CMD_SET_ENGINE_CMDBUF; + cmd.info[0] = id; + cmd.info[1] = engine->command - se->dmam_base; + cmd.info[2] = 2 * SE_ENGINE_CMD_SIZE; + + if (loongson_se_send_controller_cmd(se, &cmd)) + engine = NULL; + + mutex_unlock(&se->engine_init_lock); + + return engine; +} +EXPORT_SYMBOL_GPL(loongson_se_init_engine); + +static irqreturn_t se_irq_handler(int irq, void *dev_id) +{ + struct loongson_se *se = dev_id; + u32 int_status; + int id; + + spin_lock(&se->dev_lock); + + int_status = readl(se->base + SE_S2LINT_STAT); + + /* For controller */ + if (int_status & SE_INT_CONTROLLER) { + complete(&se->cmd_completion); + int_status &= ~SE_INT_CONTROLLER; + writel(SE_INT_CONTROLLER, se->base + SE_S2LINT_CL); + } + + /* For engines */ + while (int_status) { + id = __ffs(int_status); + complete(&se->engines[id].completion); + int_status &= ~BIT(id); + writel(BIT(id), se->base + SE_S2LINT_CL); + } + + spin_unlock(&se->dev_lock); + + return IRQ_HANDLED; +} + +static int loongson_se_init(struct loongson_se *se, dma_addr_t addr, int size) +{ + struct loongson_se_controller_cmd cmd; + int err; + + cmd.command_id = SE_CMD_START; + err = loongson_se_send_controller_cmd(se, &cmd); + if (err) + return err; + + cmd.command_id = SE_CMD_SET_DMA; + cmd.info[0] = lower_32_bits(addr); + cmd.info[1] = upper_32_bits(addr); + cmd.info[2] = size; + + return loongson_se_send_controller_cmd(se, &cmd); +} + +static const struct mfd_cell engines[] = { + { .name = "loongson-rng" }, + { .name = "tpm_loongson" }, +}; + +static int loongson_se_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct loongson_se *se; + int nr_irq, irq, err, i; + dma_addr_t paddr; + + se = devm_kmalloc(dev, sizeof(*se), GFP_KERNEL); + if (!se) + return -ENOMEM; + + dev_set_drvdata(dev, se); + init_completion(&se->cmd_completion); + spin_lock_init(&se->dev_lock); + mutex_init(&se->engine_init_lock); + + dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); + if (device_property_read_u32(dev, "dmam_size", &se->dmam_size)) + return -ENODEV; + + se->dmam_base = dmam_alloc_coherent(dev, se->dmam_size, &paddr, GFP_KERNEL); + if (!se->dmam_base) + return -ENOMEM; + + se->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(se->base)) + return PTR_ERR(se->base); + + writel(SE_INT_ALL, se->base + SE_S2LINT_EN); + + nr_irq = platform_irq_count(pdev); + if (nr_irq <= 0) + return -ENODEV; + + for (i = 0; i < nr_irq; i++) { + irq = platform_get_irq(pdev, i); + err = devm_request_irq(dev, irq, se_irq_handler, 0, "loongson-se", se); + if (err) + dev_err(dev, "failed to request IRQ: %d\n", irq); + } + + err = loongson_se_init(se, paddr, se->dmam_size); + if (err) + return err; + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, engines, + ARRAY_SIZE(engines), NULL, 0, NULL); +} + +static const struct acpi_device_id loongson_se_acpi_match[] = { + { "LOON0011", 0 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, loongson_se_acpi_match); + +static struct platform_driver loongson_se_driver = { + .probe = loongson_se_probe, + .driver = { + .name = "loongson-se", + .acpi_match_table = loongson_se_acpi_match, + }, +}; +module_platform_driver(loongson_se_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yinggang Gu "); +MODULE_AUTHOR("Qunqin Zhao "); +MODULE_DESCRIPTION("Loongson Security Engine chip controller driver"); diff --git a/include/linux/mfd/loongson-se.h b/include/linux/mfd/loongson-se.h new file mode 100644 index 000000000000..07afa0c2524d --- /dev/null +++ b/include/linux/mfd/loongson-se.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (C) 2025 Loongson Technology Corporation Limited */ + +#ifndef __MFD_LOONGSON_SE_H__ +#define __MFD_LOONGSON_SE_H__ + +#define LOONGSON_ENGINE_CMD_TIMEOUT_US 10000 +#define SE_SEND_CMD_REG 0x0 +#define SE_SEND_CMD_REG_LEN 0x8 +/* Controller command ID */ +#define SE_CMD_START 0x0 +#define SE_CMD_SET_DMA 0x3 +#define SE_CMD_SET_ENGINE_CMDBUF 0x4 + +#define SE_S2LINT_STAT 0x88 +#define SE_S2LINT_EN 0x8c +#define SE_S2LINT_CL 0x94 +#define SE_L2SINT_STAT 0x98 +#define SE_L2SINT_SET 0xa0 + +#define SE_INT_ALL 0xffffffff +#define SE_INT_CONTROLLER BIT(0) + +#define SE_ENGINE_MAX 16 +#define SE_ENGINE_RNG 1 +#define SE_CMD_RNG 0x100 + +#define SE_ENGINE_TPM 5 +#define SE_CMD_TPM 0x500 + +#define SE_ENGINE_CMD_SIZE 32 + +struct loongson_se_engine { + struct loongson_se *se; + int id; + + /* Command buffer */ + void *command; + void *command_ret; + + void *data_buffer; + uint buffer_size; + /* Data buffer offset to DMA base */ + uint buffer_off; + + struct completion completion; + +}; + +struct loongson_se_engine *loongson_se_init_engine(struct device *dev, int id); +int loongson_se_send_engine_cmd(struct loongson_se_engine *engine); + +#endif -- cgit v1.2.3 From a60a5abe19d6acd9d9ea4c1883745399fb5dc023 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 Aug 2025 10:15:37 +0200 Subject: netfilter: nf_tables: allow iter callbacks to sleep Quoting Sven Auhagen: we do see on occasions that we get the following error message, more so on x86 systems than on arm64: Error: Could not process rule: Cannot allocate memory delete table inet filter It is not a consistent error and does not happen all the time. We are on Kernel 6.6.80, seems to me like we have something along the lines of the nf_tables: allow clone callbacks to sleep problem using GFP_ATOMIC. As hinted at by Sven, this is because of GFP_ATOMIC allocations during set flush. When set is flushed, all elements are deactivated. This triggers a set walk and each element gets added to the transaction list. The rbtree and rhashtable sets don't allow the iter callback to sleep: rbtree walk acquires read side of an rwlock with bh disabled, rhashtable walk happens with rcu read lock held. Rbtree is simple enough to resolve: When the walk context is ITER_READ, no change is needed (the iter callback must not deactivate elements; we're not in a transaction). When the iter type is ITER_UPDATE, the rwlock isn't needed because the caller holds the transaction mutex, this prevents any and all changes to the ruleset, including add/remove of set elements. Rhashtable is slightly more complex. When the iter type is ITER_READ, no change is needed, like rbtree. For ITER_UPDATE, we hold transaction mutex which prevents elements from getting free'd, even outside of rcu read lock section. So build a temporary list of all elements while doing the rcu iteration and then call the iterator in a second pass. The disadvantage is the need to iterate twice, but this cost comes with the benefit to allow the iter callback to use GFP_KERNEL allocations in a followup patch. The new list based logic makes it necessary to catch recursive calls to the same set earlier. Such walk -> iter -> walk recursion for the same set can happen during ruleset validation in case userspace gave us a bogus (cyclic) ruleset where verdict map m jumps to chain that sooner or later also calls "vmap @m". Before the new ->in_update_walk test, the ruleset is rejected because the infinite recursion causes ctx->level to exceed the allowed maximum. But with the new logic added here, elements would get skipped: nft_rhash_walk_update would see elements that are on the walk_list of an older stack frame. As all recursive calls into same map results in -EMLINK, we can avoid this problem by using the new in_update_walk flag and reject immediately. Next patch converts the problematic GFP_ATOMIC allocations. Reported-by: Sven Auhagen Closes: https://lore.kernel.org/netfilter-devel/BY1PR18MB5874110CAFF1ED098D0BC4E7E07BA@BY1PR18MB5874.namprd18.prod.outlook.com/ Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 2 + net/netfilter/nft_set_hash.c | 100 ++++++++++++++++++++++++++++++++++++-- net/netfilter/nft_set_rbtree.c | 35 ++++++++++--- 3 files changed, 126 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 891e43a01bdc..e2128663b160 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -556,6 +556,7 @@ struct nft_set_elem_expr { * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element + * @in_update_walk: true during ->walk() in transaction phase * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal @@ -590,6 +591,7 @@ struct nft_set { u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; + bool in_update_walk; u32 use; atomic_t nelems; u32 ndeact; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 266d0c637225..ba01ce75d6de 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -30,6 +30,7 @@ struct nft_rhash { struct nft_rhash_elem { struct nft_elem_priv priv; struct rhash_head node; + struct llist_node walk_node; u32 wq_gc_seq; struct nft_set_ext ext; }; @@ -144,6 +145,7 @@ nft_rhash_update(struct nft_set *set, const u32 *key, goto err1; he = nft_elem_priv_cast(elem_priv); + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -180,6 +182,7 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set, }; struct nft_rhash_elem *prev; + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -261,12 +264,12 @@ static bool nft_rhash_delete(const struct nft_set *set, return true; } -static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he; struct rhashtable_iter hti; + struct nft_rhash_elem *he; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); @@ -295,6 +298,97 @@ cont: rhashtable_walk_exit(&hti); } +static void nft_rhash_walk_update(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he, *tmp; + struct llist_node *first_node; + struct rhashtable_iter hti; + LLIST_HEAD(walk_list); + + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + + if (set->in_update_walk) { + /* This can happen with bogus rulesets during ruleset validation + * when a verdict map causes a jump back to the same map. + * + * Without this extra check the walk_next loop below will see + * elems on the callers walk_list and skip (not validate) them. + */ + iter->err = -EMLINK; + return; + } + + /* walk happens under RCU. + * + * We create a snapshot list so ->iter callback can sleep. + * commit_mutex is held, elements can ... + * .. be added in parallel from dataplane (dynset) + * .. be marked as dead in parallel from dataplane (dynset). + * .. be queued for removal in parallel (gc timeout). + * .. not be freed: transaction mutex is held. + */ + rhashtable_walk_enter(&priv->ht, &hti); + rhashtable_walk_start(&hti); + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { + if (PTR_ERR(he) != -EAGAIN) { + iter->err = PTR_ERR(he); + break; + } + + continue; + } + + /* rhashtable resized during walk, skip */ + if (llist_on_list(&he->walk_node)) + continue; + + llist_add(&he->walk_node, &walk_list); + } + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + first_node = __llist_del_all(&walk_list); + set->in_update_walk = true; + llist_for_each_entry_safe(he, tmp, first_node, walk_node) { + if (iter->err == 0) { + iter->err = iter->fn(ctx, set, iter, &he->priv); + if (iter->err == 0) + iter->count++; + } + + /* all entries must be cleared again, else next ->walk iteration + * will skip entries. + */ + init_llist_node(&he->walk_node); + } + set->in_update_walk = false; +} + +static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + switch (iter->type) { + case NFT_ITER_UPDATE: + /* only relevant for netlink dumps which use READ type */ + WARN_ON_ONCE(iter->skip != 0); + + nft_rhash_walk_update(ctx, set, iter); + break; + case NFT_ITER_READ: + nft_rhash_walk_ro(ctx, set, iter); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } +} + static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, struct nft_set_ext *ext) { diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 938a257c069e..b311b66df3e9 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -584,15 +584,14 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, return NULL; } -static void nft_rbtree_walk(const struct nft_ctx *ctx, - struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rbtree_do_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; - read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); @@ -600,14 +599,34 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); - if (iter->err < 0) { - read_unlock_bh(&priv->lock); + if (iter->err < 0) return; - } cont: iter->count++; } - read_unlock_bh(&priv->lock); +} + +static void nft_rbtree_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + switch (iter->type) { + case NFT_ITER_UPDATE: + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + nft_rbtree_do_walk(ctx, set, iter); + break; + case NFT_ITER_READ: + read_lock_bh(&priv->lock); + nft_rbtree_do_walk(ctx, set, iter); + read_unlock_bh(&priv->lock); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } } static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, -- cgit v1.2.3 From f4f9e05904e11bbc772c031b35d0d25caa21d5e8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Aug 2025 20:43:47 +0200 Subject: netfilter: nf_reject: remove unneeded exports These functions have no external callers and can be static. Signed-off-by: Florian Westphal --- include/net/netfilter/ipv4/nf_reject.h | 8 -------- include/net/netfilter/ipv6/nf_reject.h | 10 --------- net/ipv4/netfilter/nf_reject_ipv4.c | 27 ++++++++++++++++--------- net/ipv6/netfilter/nf_reject_ipv6.c | 37 +++++++++++++++++++++++----------- 4 files changed, 42 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index c653fcb88354..09de2f2686b5 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -10,14 +10,6 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook); void nf_send_reset(struct net *net, struct sock *, struct sk_buff *oldskb, int hook); -const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *_oth, int hook); -struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int ttl); -void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - const struct tcphdr *oth); - struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index d729344ba644..94ec0b9f2838 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -9,16 +9,6 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char cod unsigned int hooknum); void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit); -void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen); - struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 0d3cb2ba6fc8..05631abe3f0d 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -12,6 +12,15 @@ #include #include +static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int ttl); +static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth); +static const struct tcphdr * +nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook); + static int nf_reject_iphdr_validate(struct sk_buff *skb) { struct iphdr *iph; @@ -136,8 +145,9 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach); -const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *_oth, int hook) +static const struct tcphdr * +nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook) { const struct tcphdr *oth; @@ -163,11 +173,10 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, return oth; } -EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get); -struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int ttl) +static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int ttl) { struct iphdr *niph, *oiph = ip_hdr(oldskb); @@ -188,10 +197,9 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, return niph; } -EXPORT_SYMBOL_GPL(nf_reject_iphdr_put); -void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - const struct tcphdr *oth) +static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth) { struct iphdr *niph = ip_hdr(nskb); struct tcphdr *tcph; @@ -218,7 +226,6 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, nskb->csum_start = (unsigned char *)tcph - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); } -EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) { diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index cb2d38e80de9..6b022449f867 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -12,6 +12,19 @@ #include #include +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit); +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen); +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook); + static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -146,9 +159,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook) +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook) { const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); u8 proto; @@ -192,11 +206,11 @@ const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, return otcph; } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit) +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit) { struct ipv6hdr *ip6h; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); @@ -216,11 +230,11 @@ struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, return ip6h; } -EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put); -void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen) +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen) { struct tcphdr *tcph; @@ -248,7 +262,6 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, csum_partial(tcph, sizeof(struct tcphdr), 0)); } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) { -- cgit v1.2.3 From 077dc4a275790b09e8a2ce80822ba8970e9dfb99 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Thu, 28 Aug 2025 14:48:31 +0200 Subject: netfilter: nft_payload: extend offset to 65535 bytes In some situations 255 bytes offset is not enough to match or manipulate the desired packet field. Increase the offset limit to 65535 or U16_MAX. In addition, the nla policy maximum value is not set anymore as it is limited to s16. Instead, the maximum value is checked during the payload expression initialization function. Tested with the nft command line tool. table ip filter { chain output { @nh,2040,8 set 0xff @nh,524280,8 set 0xff @nh,524280,8 0xff @nh,2040,8 0xff } } Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables_core.h | 2 +- net/netfilter/nft_payload.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 6c2f483d9828..7644cfe9267d 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -73,7 +73,7 @@ struct nft_ct { struct nft_payload { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 dreg; }; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 059b28ffad0e..b0214418f75a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -40,7 +40,7 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, /* add vlan header into the user buffer for if tag was removed by offloads */ static bool -nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) +nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; @@ -212,7 +212,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 }, [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), @@ -797,7 +797,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, struct nft_payload_set { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 sreg; u8 csum_type; @@ -812,7 +812,7 @@ struct nft_payload_vlan_hdr { }; static bool -nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len, +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len, int *vlan_hlen) { struct nft_payload_vlan_hdr *vlanh; @@ -940,14 +940,18 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE; struct nft_payload_set *priv = nft_expr_priv(expr); - u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE; int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); - priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); + if (err < 0) + return err; + priv->offset = offset; + if (tb[NFTA_PAYLOAD_CSUM_TYPE]) csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { @@ -1069,7 +1073,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); - err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); if (err < 0) return ERR_PTR(err); -- cgit v1.2.3 From 7a8c994cbb2db3c5335cee35fd486557f5aaf7e1 Mon Sep 17 00:00:00 2001 From: Huisong Li Date: Mon, 28 Jul 2025 15:06:12 +0800 Subject: ACPI: processor: idle: Optimize ACPI idle driver registration Currently, the ACPI idle driver is registered from within a CPU hotplug callback. Although this didn't cause any functional issues, this is questionable and confusing. And it is better to register the cpuidle driver when all of the CPUs have been brought up. So add a new function to initialize acpi_idle_driver based on the power management information of an available CPU and register cpuidle driver in acpi_processor_driver_init(). Signed-off-by: Huisong Li Link: https://patch.msgid.link/20250728070612.1260859-3-lihuisong@huawei.com [ rjw: Added missing inline modifiers ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/processor_driver.c | 3 ++ drivers/acpi/processor_idle.c | 65 ++++++++++++++++++++++++++--------------- include/acpi/processor.h | 8 +++++ 3 files changed, 53 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 65e779be64ff..bc9f58a02c1d 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -263,6 +263,8 @@ static int __init acpi_processor_driver_init(void) if (result < 0) return result; + acpi_processor_register_idle_driver(); + result = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "acpi/cpu-drv:online", acpi_soft_cpu_online, NULL); @@ -301,6 +303,7 @@ static void __exit acpi_processor_driver_exit(void) cpuhp_remove_state_nocalls(hp_online); cpuhp_remove_state_nocalls(CPUHP_ACPI_CPUDRV_DEAD); + acpi_processor_unregister_idle_driver(); driver_unregister(&acpi_processor_driver); } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index d0fc045a8d31..d8eb0b515188 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -1360,7 +1360,48 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr) return 0; } -static int acpi_processor_registered; +void acpi_processor_register_idle_driver(void) +{ + struct acpi_processor *pr; + int ret = -ENODEV; + int cpu; + + /* + * Acpi idle driver is used by all possible CPUs. + * Install the idle handler by the processor power info of one in them. + * Note that we use previously set idle handler will be used on + * platforms that only support C1. + */ + for_each_cpu(cpu, (struct cpumask *)cpu_possible_mask) { + pr = per_cpu(processors, cpu); + if (!pr) + continue; + + ret = acpi_processor_get_power_info(pr); + if (!ret) { + pr->flags.power_setup_done = 1; + acpi_processor_setup_cpuidle_states(pr); + break; + } + } + + if (ret) { + pr_debug("No ACPI power information from any CPUs.\n"); + return; + } + + ret = cpuidle_register_driver(&acpi_idle_driver); + if (ret) { + pr_debug("register %s failed.\n", acpi_idle_driver.name); + return; + } + pr_debug("%s registered with cpuidle.\n", acpi_idle_driver.name); +} + +void acpi_processor_unregister_idle_driver(void) +{ + cpuidle_unregister_driver(&acpi_idle_driver); +} int acpi_processor_power_init(struct acpi_processor *pr) { @@ -1375,22 +1416,7 @@ int acpi_processor_power_init(struct acpi_processor *pr) if (!acpi_processor_get_power_info(pr)) pr->flags.power_setup_done = 1; - /* - * Install the idle handler if processor power management is supported. - * Note that we use previously set idle handler will be used on - * platforms that only support C1. - */ if (pr->flags.power) { - /* Register acpi_idle_driver if not already registered */ - if (!acpi_processor_registered) { - acpi_processor_setup_cpuidle_states(pr); - retval = cpuidle_register_driver(&acpi_idle_driver); - if (retval) - return retval; - pr_debug("%s registered with cpuidle\n", - acpi_idle_driver.name); - } - dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; @@ -1403,14 +1429,11 @@ int acpi_processor_power_init(struct acpi_processor *pr) */ retval = cpuidle_register_device(dev); if (retval) { - if (acpi_processor_registered == 0) - cpuidle_unregister_driver(&acpi_idle_driver); per_cpu(acpi_cpuidle_device, pr->id) = NULL; kfree(dev); return retval; } - acpi_processor_registered++; } return 0; } @@ -1424,10 +1447,6 @@ int acpi_processor_power_exit(struct acpi_processor *pr) if (pr->flags.power) { cpuidle_unregister_device(dev); - acpi_processor_registered--; - if (acpi_processor_registered == 0) - cpuidle_unregister_driver(&acpi_idle_driver); - kfree(dev); } diff --git a/include/acpi/processor.h b/include/acpi/processor.h index d0eccbd920e5..360b673f05e5 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -423,6 +423,8 @@ int acpi_processor_power_init(struct acpi_processor *pr); int acpi_processor_power_exit(struct acpi_processor *pr); int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); +void acpi_processor_register_idle_driver(void); +void acpi_processor_unregister_idle_driver(void); #else static inline int acpi_processor_power_init(struct acpi_processor *pr) { @@ -443,6 +445,12 @@ static inline int acpi_processor_hotplug(struct acpi_processor *pr) { return -ENODEV; } +static inline void acpi_processor_register_idle_driver(void) +{ +} +static inline void acpi_processor_unregister_idle_driver(void) +{ +} #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ /* in processor_thermal.c */ -- cgit v1.2.3 From 23a6037ce76cb44d93cfea23aec5c7f3971227d4 Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Fri, 29 Aug 2025 17:08:37 -0700 Subject: bonding: Remove support for use_carrier Remove the implementation of use_carrier, the link monitoring method that utilizes ethtool or ioctl to determine the link state of an interface in a bond. Bonding will always behaves as if use_carrier=1, which relies on netif_carrier_ok() to determine the link state of interfaces. To avoid acquiring RTNL many times per second, bonding inspects link state under RCU, but not under RTNL. However, ethtool implementations in drivers may sleep, and therefore this strategy is unsuitable for use with calls into driver ethtool functions. The use_carrier option was introduced in 2003, to provide backwards compatibility for network device drivers that did not support the then-new netif_carrier_ok/on/off system. Device drivers are now expected to support netif_carrier_*, and the use_carrier backwards compatibility logic is no longer necessary. The option itself remains, but when queried always returns 1, and may only be set to 1. Link: https://lore.kernel.org/000000000000eb54bf061cfd666a@google.com Link: https://lore.kernel.org/20240718122017.d2e33aaac43a.I10ab9c9ded97163aef4e4de10985cd8f7de60d28@changeid Signed-off-by: Jay Vosburgh Reported-by: syzbot+b8c48ea38ca27d150063@syzkaller.appspotmail.com Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/2029487.1756512517@famine Signed-off-by: Jakub Kicinski --- Documentation/networking/bonding.rst | 79 ++++-------------------- drivers/net/bonding/bond_main.c | 113 +++-------------------------------- drivers/net/bonding/bond_netlink.c | 14 ++--- drivers/net/bonding/bond_options.c | 7 +-- drivers/net/bonding/bond_sysfs.c | 6 +- include/net/bonding.h | 1 - 6 files changed, 28 insertions(+), 192 deletions(-) (limited to 'include') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index f8f5766703d4..a2b42ae719d2 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -582,10 +582,8 @@ miimon This determines how often the link state of each slave is inspected for link failures. A value of zero disables MII link monitoring. A value of 100 is a good starting point. - The use_carrier option, below, affects how the link state is - determined. See the High Availability section for additional - information. The default value is 100 if arp_interval is not - set. + + The default value is 100 if arp_interval is not set. min_links @@ -896,25 +894,14 @@ updelay use_carrier - Specifies whether or not miimon should use MII or ETHTOOL - ioctls vs. netif_carrier_ok() to determine the link - status. The MII or ETHTOOL ioctls are less efficient and - utilize a deprecated calling sequence within the kernel. The - netif_carrier_ok() relies on the device driver to maintain its - state with netif_carrier_on/off; at this writing, most, but - not all, device drivers support this facility. - - If bonding insists that the link is up when it should not be, - it may be that your network device driver does not support - netif_carrier_on/off. The default state for netif_carrier is - "carrier on," so if a driver does not support netif_carrier, - it will appear as if the link is always up. In this case, - setting use_carrier to 0 will cause bonding to revert to the - MII / ETHTOOL ioctl method to determine the link state. - - A value of 1 enables the use of netif_carrier_ok(), a value of - 0 will use the deprecated MII / ETHTOOL ioctls. The default - value is 1. + Obsolete option that previously selected between MII / + ETHTOOL ioctls and netif_carrier_ok() to determine link + state. + + All link state checks are now done with netif_carrier_ok(). + + For backwards compatibility, this option's value may be inspected + or set. The only valid setting is 1. xmit_hash_policy @@ -2036,22 +2023,8 @@ depending upon the device driver to maintain its carrier state, by querying the device's MII registers, or by making an ethtool query to the device. -If the use_carrier module parameter is 1 (the default value), -then the MII monitor will rely on the driver for carrier state -information (via the netif_carrier subsystem). As explained in the -use_carrier parameter information, above, if the MII monitor fails to -detect carrier loss on the device (e.g., when the cable is physically -disconnected), it may be that the driver does not support -netif_carrier. - -If use_carrier is 0, then the MII monitor will first query the -device's (via ioctl) MII registers and check the link state. If that -request fails (not just that it returns carrier down), then the MII -monitor will make an ethtool ETHTOOL_GLINK request to attempt to obtain -the same information. If both methods fail (i.e., the driver either -does not support or had some error in processing both the MII register -and ethtool requests), then the MII monitor will assume the link is -up. +The MII monitor relies on the driver for carrier state information (via +the netif_carrier subsystem). 8. Potential Sources of Trouble =============================== @@ -2135,34 +2108,6 @@ This will load tg3 and e1000 modules before loading the bonding one. Full documentation on this can be found in the modprobe.d and modprobe manual pages. -8.3. Painfully Slow Or No Failed Link Detection By Miimon ---------------------------------------------------------- - -By default, bonding enables the use_carrier option, which -instructs bonding to trust the driver to maintain carrier state. - -As discussed in the options section, above, some drivers do -not support the netif_carrier_on/_off link state tracking system. -With use_carrier enabled, bonding will always see these links as up, -regardless of their actual state. - -Additionally, other drivers do support netif_carrier, but do -not maintain it in real time, e.g., only polling the link state at -some fixed interval. In this case, miimon will detect failures, but -only after some long period of time has expired. If it appears that -miimon is very slow in detecting link failures, try specifying -use_carrier=0 to see if that improves the failure detection time. If -it does, then it may be that the driver checks the carrier state at a -fixed interval, but does not cache the MII register values (so the -use_carrier=0 method of querying the registers directly works). If -use_carrier=0 does not improve the failover, then the driver may cache -the registers, or the problem may be elsewhere. - -Also, remember that miimon only checks for the device's -carrier state. It has no way to determine the state of devices on or -beyond other ports of a switch, or if a switch is refusing to pass -traffic while still maintaining carrier on. - 9. SNMP agents =============== diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 257333c88710..f25c2d2c9181 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -142,8 +142,7 @@ module_param(downdelay, int, 0); MODULE_PARM_DESC(downdelay, "Delay before considering link down, " "in milliseconds"); module_param(use_carrier, int, 0); -MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; " - "0 for off, 1 for on (default)"); +MODULE_PARM_DESC(use_carrier, "option obsolete, use_carrier cannot be disabled"); module_param(mode, charp, 0); MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, " "1 for active-backup, 2 for balance-xor, " @@ -830,77 +829,6 @@ const char *bond_slave_link_status(s8 link) } } -/* if supports MII link status reporting, check its link status. - * - * We either do MII/ETHTOOL ioctls, or check netif_carrier_ok(), - * depending upon the setting of the use_carrier parameter. - * - * Return either BMSR_LSTATUS, meaning that the link is up (or we - * can't tell and just pretend it is), or 0, meaning that the link is - * down. - * - * If reporting is non-zero, instead of faking link up, return -1 if - * both ETHTOOL and MII ioctls fail (meaning the device does not - * support them). If use_carrier is set, return whatever it says. - * It'd be nice if there was a good way to tell if a driver supports - * netif_carrier, but there really isn't. - */ -static int bond_check_dev_link(struct bonding *bond, - struct net_device *slave_dev, int reporting) -{ - const struct net_device_ops *slave_ops = slave_dev->netdev_ops; - struct mii_ioctl_data *mii; - struct ifreq ifr; - int ret; - - if (!reporting && !netif_running(slave_dev)) - return 0; - - if (bond->params.use_carrier) - return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0; - - /* Try to get link status using Ethtool first. */ - if (slave_dev->ethtool_ops->get_link) { - netdev_lock_ops(slave_dev); - ret = slave_dev->ethtool_ops->get_link(slave_dev); - netdev_unlock_ops(slave_dev); - - return ret ? BMSR_LSTATUS : 0; - } - - /* Ethtool can't be used, fallback to MII ioctls. */ - if (slave_ops->ndo_eth_ioctl) { - /* TODO: set pointer to correct ioctl on a per team member - * bases to make this more efficient. that is, once - * we determine the correct ioctl, we will always - * call it and not the others for that team - * member. - */ - - /* We cannot assume that SIOCGMIIPHY will also read a - * register; not all network drivers (e.g., e100) - * support that. - */ - - /* Yes, the mii is overlaid on the ifreq.ifr_ifru */ - strscpy_pad(ifr.ifr_name, slave_dev->name, IFNAMSIZ); - mii = if_mii(&ifr); - - if (dev_eth_ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) { - mii->reg_num = MII_BMSR; - if (dev_eth_ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0) - return mii->val_out & BMSR_LSTATUS; - } - } - - /* If reporting, report that either there's no ndo_eth_ioctl, - * or both SIOCGMIIREG and get_link failed (meaning that we - * cannot report link status). If not reporting, pretend - * we're ok. - */ - return reporting ? -1 : BMSR_LSTATUS; -} - /*----------------------------- Multicast list ------------------------------*/ /* Push the promiscuity flag down to appropriate slaves */ @@ -1966,7 +1894,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, const struct net_device_ops *slave_ops = slave_dev->netdev_ops; struct slave *new_slave = NULL, *prev_slave; struct sockaddr_storage ss; - int link_reporting; int res = 0, i; if (slave_dev->flags & IFF_MASTER && @@ -1976,12 +1903,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, return -EPERM; } - if (!bond->params.use_carrier && - slave_dev->ethtool_ops->get_link == NULL && - slave_ops->ndo_eth_ioctl == NULL) { - slave_warn(bond_dev, slave_dev, "no link monitoring support\n"); - } - /* already in-use? */ if (netdev_is_rx_handler_busy(slave_dev)) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, @@ -2195,29 +2116,10 @@ skip_mac_set: new_slave->last_tx = new_slave->last_rx; - if (bond->params.miimon && !bond->params.use_carrier) { - link_reporting = bond_check_dev_link(bond, slave_dev, 1); - - if ((link_reporting == -1) && !bond->params.arp_interval) { - /* miimon is set but a bonded network driver - * does not support ETHTOOL/MII and - * arp_interval is not set. Note: if - * use_carrier is enabled, we will never go - * here (because netif_carrier is always - * supported); thus, we don't need to change - * the messages for netif_carrier. - */ - slave_warn(bond_dev, slave_dev, "MII and ETHTOOL support not available for slave, and arp_interval/arp_ip_target module parameters not specified, thus bonding will not detect link failures! see bonding.txt for details\n"); - } else if (link_reporting == -1) { - /* unable get link status using mii/ethtool */ - slave_warn(bond_dev, slave_dev, "can't get link status from slave; the network driver associated with this interface does not support MII or ETHTOOL link status reporting, thus miimon has no effect on this interface\n"); - } - } - /* check for initial state */ new_slave->link = BOND_LINK_NOCHANGE; if (bond->params.miimon) { - if (bond_check_dev_link(bond, slave_dev, 0) == BMSR_LSTATUS) { + if (netif_carrier_ok(slave_dev)) { if (bond->params.updelay) { bond_set_slave_link_state(new_slave, BOND_LINK_BACK, @@ -2759,7 +2661,7 @@ static int bond_miimon_inspect(struct bonding *bond) bond_for_each_slave_rcu(bond, slave, iter) { bond_propose_link_state(slave, BOND_LINK_NOCHANGE); - link_state = bond_check_dev_link(bond, slave->dev, 0); + link_state = netif_carrier_ok(slave->dev); switch (slave->link) { case BOND_LINK_UP: @@ -6257,10 +6159,10 @@ static int __init bond_check_params(struct bond_params *params) downdelay = 0; } - if ((use_carrier != 0) && (use_carrier != 1)) { - pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n", - use_carrier); - use_carrier = 1; + if (use_carrier != 1) { + pr_err("Error: invalid use_carrier parameter (%d)\n", + use_carrier); + return -EINVAL; } if (num_peer_notif < 0 || num_peer_notif > 255) { @@ -6507,7 +6409,6 @@ static int __init bond_check_params(struct bond_params *params) params->updelay = updelay; params->downdelay = downdelay; params->peer_notif_delay = 0; - params->use_carrier = use_carrier; params->lacp_active = 1; params->lacp_fast = lacp_fast; params->primary[0] = 0; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 57fff2421f1b..e573b34a1bbc 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -259,13 +259,11 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[], return err; } if (data[IFLA_BOND_USE_CARRIER]) { - int use_carrier = nla_get_u8(data[IFLA_BOND_USE_CARRIER]); - - bond_opt_initval(&newval, use_carrier); - err = __bond_opt_set(bond, BOND_OPT_USE_CARRIER, &newval, - data[IFLA_BOND_USE_CARRIER], extack); - if (err) - return err; + if (nla_get_u8(data[IFLA_BOND_USE_CARRIER]) != 1) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_BOND_USE_CARRIER], + "option obsolete, use_carrier cannot be disabled"); + return -EINVAL; + } } if (data[IFLA_BOND_ARP_INTERVAL]) { int arp_interval = nla_get_u32(data[IFLA_BOND_ARP_INTERVAL]); @@ -688,7 +686,7 @@ static int bond_fill_info(struct sk_buff *skb, bond->params.peer_notif_delay * bond->params.miimon)) goto nla_put_failure; - if (nla_put_u8(skb, IFLA_BOND_USE_CARRIER, bond->params.use_carrier)) + if (nla_put_u8(skb, IFLA_BOND_USE_CARRIER, 1)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_ARP_INTERVAL, bond->params.arp_interval)) diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 3b6f815c55ff..c0a5eb8766b5 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -187,7 +187,6 @@ static const struct bond_opt_value bond_primary_reselect_tbl[] = { }; static const struct bond_opt_value bond_use_carrier_tbl[] = { - { "off", 0, 0}, { "on", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; @@ -419,7 +418,7 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { [BOND_OPT_USE_CARRIER] = { .id = BOND_OPT_USE_CARRIER, .name = "use_carrier", - .desc = "Use netif_carrier_ok (vs MII ioctls) in miimon", + .desc = "option obsolete, use_carrier cannot be disabled", .values = bond_use_carrier_tbl, .set = bond_option_use_carrier_set }, @@ -1091,10 +1090,6 @@ static int bond_option_peer_notif_delay_set(struct bonding *bond, static int bond_option_use_carrier_set(struct bonding *bond, const struct bond_opt_value *newval) { - netdev_dbg(bond->dev, "Setting use_carrier to %llu\n", - newval->value); - bond->params.use_carrier = newval->value; - return 0; } diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 1e13bb170515..9a75ad3181ab 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -467,14 +467,12 @@ static ssize_t bonding_show_primary_reselect(struct device *d, static DEVICE_ATTR(primary_reselect, 0644, bonding_show_primary_reselect, bonding_sysfs_store_option); -/* Show the use_carrier flag. */ +/* use_carrier is obsolete, but print value for compatibility */ static ssize_t bonding_show_carrier(struct device *d, struct device_attribute *attr, char *buf) { - struct bonding *bond = to_bond(d); - - return sysfs_emit(buf, "%d\n", bond->params.use_carrier); + return sysfs_emit(buf, "1\n"); } static DEVICE_ATTR(use_carrier, 0644, bonding_show_carrier, bonding_sysfs_store_option); diff --git a/include/net/bonding.h b/include/net/bonding.h index e06f0d63b2c1..37335f62f579 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -126,7 +126,6 @@ struct bond_params { int arp_interval; int arp_validate; int arp_all_targets; - int use_carrier; int fail_over_mac; int updelay; int downdelay; -- cgit v1.2.3 From 65128868bb3b0621d2d8e71f19852675a064b373 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:04 -0700 Subject: mm/memory_hotplug: Update comment for hotplug memory callback priorities Add clarification to comment for memory hotplug callback ordering as the current comment does not provide clear language on which callback happens first. Acked-by: David Hildenbrand Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- include/linux/memory.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 40eb70ccb09d..1305102688d0 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -115,8 +115,8 @@ struct notifier_block; struct mem_section; /* - * Priorities for the hotplug memory callback routines (stored in decreasing - * order in the callback chain) + * Priorities for the hotplug memory callback routines. Invoked from + * high to low. Higher priorities correspond to higher numbers. */ #define DEFAULT_CALLBACK_PRI 0 #define SLAB_CALLBACK_PRI 1 -- cgit v1.2.3 From b57fc652ca24ada3b0c888327f9944ed21559286 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:05 -0700 Subject: drivers/base/node: Add a helper function node_update_perf_attrs() Add helper function node_update_perf_attrs() to allow update of node access coordinates computed by an external agent such as CXL. The helper allows updating of coordinates after the attribute being created by HMAT. Acked-by: David Hildenbrand Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/base/node.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/node.h | 8 ++++++++ 2 files changed, 46 insertions(+) (limited to 'include') diff --git a/drivers/base/node.c b/drivers/base/node.c index 3399594136b2..db18a1c32637 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -248,6 +248,44 @@ void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, } EXPORT_SYMBOL_GPL(node_set_perf_attrs); +/** + * node_update_perf_attrs - Update the performance values for given access class + * @nid: Node identifier to be updated + * @coord: Heterogeneous memory performance coordinates + * @access: The access class for the given attributes + */ +void node_update_perf_attrs(unsigned int nid, struct access_coordinate *coord, + enum access_coordinate_class access) +{ + struct node_access_nodes *access_node; + struct node *node; + int i; + + if (WARN_ON_ONCE(!node_online(nid))) + return; + + node = node_devices[nid]; + list_for_each_entry(access_node, &node->access_list, list_node) { + if (access_node->access != access) + continue; + + access_node->coord = *coord; + for (i = 0; access_attrs[i]; i++) { + sysfs_notify(&access_node->dev.kobj, + NULL, access_attrs[i]->name); + } + break; + } + + /* When setting CPU access coordinates, update mempolicy */ + if (access != ACCESS_COORDINATE_CPU) + return; + + if (mempolicy_set_node_perf(nid, coord)) + pr_info("failed to set mempolicy attrs for node %d\n", nid); +} +EXPORT_SYMBOL_GPL(node_update_perf_attrs); + /** * struct node_cache_info - Internal tracking for memory node caches * @dev: Device represeting the cache level diff --git a/include/linux/node.h b/include/linux/node.h index 2c7529335b21..866e3323f1fd 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -85,6 +85,8 @@ struct node_cache_attrs { void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs); void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, enum access_coordinate_class access); +void node_update_perf_attrs(unsigned int nid, struct access_coordinate *coord, + enum access_coordinate_class access); #else static inline void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs) @@ -96,6 +98,12 @@ static inline void node_set_perf_attrs(unsigned int nid, enum access_coordinate_class access) { } + +static inline void node_update_perf_attrs(unsigned int nid, + struct access_coordinate *coord, + enum access_coordinate_class access) +{ +} #endif struct node { -- cgit v1.2.3 From 2e454fb8056df6da4bba7d89a57bf60e217463c0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:06 -0700 Subject: cxl, acpi/hmat: Update CXL access coordinates directly instead of through HMAT The current implementation of CXL memory hotplug notifier gets called before the HMAT memory hotplug notifier. The CXL driver calculates the access coordinates (bandwidth and latency values) for the CXL end to end path (i.e. CPU to endpoint). When the CXL region is onlined, the CXL memory hotplug notifier writes the access coordinates to the HMAT target structs. Then the HMAT memory hotplug notifier is called and it creates the access coordinates for the node sysfs attributes. During testing on an Intel platform, it was found that although the newly calculated coordinates were pushed to sysfs, the sysfs attributes for the access coordinates showed up with the wrong initiator. The system has 4 nodes (0, 1, 2, 3) where node 0 and 1 are CPU nodes and node 2 and 3 are CXL nodes. The expectation is that node 2 would show up as a target to node 0: /sys/devices/system/node/node2/access0/initiators/node0 However it was observed that node 2 showed up as a target under node 1: /sys/devices/system/node/node2/access0/initiators/node1 The original intent of the 'ext_updated' flag in HMAT handling code was to stop HMAT memory hotplug callback from clobbering the access coordinates after CXL has injected its calculated coordinates and replaced the generic target access coordinates provided by the HMAT table in the HMAT target structs. However the flag is hacky at best and blocks the updates from other CXL regions that are onlined in the same node later on. Remove the 'ext_updated' flag usage and just update the access coordinates for the nodes directly without touching HMAT target data. The hotplug memory callback ordering is changed. Instead of changing CXL, move HMAT back so there's room for the levels rather than have CXL share the same level as SLAB_CALLBACK_PRI. The change will resulting in the CXL callback to be executed after the HMAT callback. With the change, the CXL hotplug memory notifier runs after the HMAT callback. The HMAT callback will create the node sysfs attributes for access coordinates. The CXL callback will write the access coordinates to the now created node sysfs attributes directly and will not pollute the HMAT target values. A nodemask is introduced to keep track if a node has been updated and prevents further updates. Fixes: 067353a46d8c ("cxl/region: Add memory hotplug notifier for cxl region") Cc: stable@vger.kernel.org Tested-by: Marc Herbert Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-4-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/acpi/numa/hmat.c | 6 ------ drivers/cxl/core/cdat.c | 5 ----- drivers/cxl/core/core.h | 1 - drivers/cxl/core/region.c | 20 ++++++++++++-------- include/linux/memory.h | 2 +- 5 files changed, 13 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 4958301f5417..5d32490dc4ab 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -74,7 +74,6 @@ struct memory_target { struct node_cache_attrs cache_attrs; u8 gen_port_device_handle[ACPI_SRAT_DEVICE_HANDLE_SIZE]; bool registered; - bool ext_updated; /* externally updated */ }; struct memory_initiator { @@ -391,7 +390,6 @@ int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, coord->read_bandwidth, access); hmat_update_target_access(target, ACPI_HMAT_WRITE_BANDWIDTH, coord->write_bandwidth, access); - target->ext_updated = true; return 0; } @@ -773,10 +771,6 @@ static void hmat_update_target_attrs(struct memory_target *target, u32 best = 0; int i; - /* Don't update if an external agent has changed the data. */ - if (target->ext_updated) - return; - /* Don't update for generic port if there's no device handle */ if ((access == NODE_ACCESS_CLASS_GENPORT_SINK_LOCAL || access == NODE_ACCESS_CLASS_GENPORT_SINK_CPU) && diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index c0af645425f4..c891fd618cfd 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -1081,8 +1081,3 @@ int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, { return hmat_update_target_coordinates(nid, &cxlr->coord[access], access); } - -bool cxl_need_node_perf_attrs_update(int nid) -{ - return !acpi_node_backed_by_real_pxm(nid); -} diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 2669f251d677..a253d308f3c9 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -139,7 +139,6 @@ long cxl_pci_get_latency(struct pci_dev *pdev); int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, enum access_coordinate_class access); -bool cxl_need_node_perf_attrs_update(int nid); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 71cc42d05248..0ed95cbc5d5b 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -30,6 +30,12 @@ * 3. Decoder targets */ +/* + * nodemask that sets per node when the access_coordinates for the node has + * been updated by the CXL memory hotplug notifier. + */ +static nodemask_t nodemask_region_seen = NODE_MASK_NONE; + static struct cxl_region *to_cxl_region(struct device *dev); #define __ACCESS_ATTR_RO(_level, _name) { \ @@ -2442,14 +2448,8 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { if (cxlr->coord[i].read_bandwidth) { - rc = 0; - if (cxl_need_node_perf_attrs_update(nid)) - node_set_perf_attrs(nid, &cxlr->coord[i], i); - else - rc = cxl_update_hmat_access_coordinates(nid, cxlr, i); - - if (rc == 0) - cset++; + node_update_perf_attrs(nid, &cxlr->coord[i], i); + cset++; } } @@ -2487,6 +2487,10 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, if (nid != region_nid) return NOTIFY_DONE; + /* No action needed if node bit already set */ + if (node_test_and_set(nid, nodemask_region_seen)) + return NOTIFY_DONE; + if (!cxl_region_update_coordinates(cxlr, nid)) return NOTIFY_DONE; diff --git a/include/linux/memory.h b/include/linux/memory.h index 1305102688d0..0b755d1ef1ec 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -120,8 +120,8 @@ struct mem_section; */ #define DEFAULT_CALLBACK_PRI 0 #define SLAB_CALLBACK_PRI 1 -#define HMAT_CALLBACK_PRI 2 #define CXL_CALLBACK_PRI 5 +#define HMAT_CALLBACK_PRI 6 #define MM_COMPUTE_BATCH_PRI 10 #define CPUSET_CALLBACK_PRI 10 #define MEMTIER_HOTPLUG_PRI 100 -- cgit v1.2.3 From e99ecbc4c89adf551cccbbc00b5cb08c50969af6 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:07 -0700 Subject: acpi/hmat: Remove now unused hmat_update_target_coordinates() Remove deadcode since CXL no longer calls hmat_update_target_coordinates(). Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-5-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/acpi/numa/hmat.c | 28 ---------------------------- drivers/cxl/core/cdat.c | 6 ------ drivers/cxl/core/core.h | 2 -- include/linux/acpi.h | 12 ------------ 4 files changed, 48 deletions(-) (limited to 'include') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 5d32490dc4ab..5a36d57289b4 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -367,34 +367,6 @@ static void hmat_update_target_access(struct memory_target *target, } } -int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, - enum access_coordinate_class access) -{ - struct memory_target *target; - int pxm; - - if (nid == NUMA_NO_NODE) - return -EINVAL; - - pxm = node_to_pxm(nid); - guard(mutex)(&target_lock); - target = find_mem_target(pxm); - if (!target) - return -ENODEV; - - hmat_update_target_access(target, ACPI_HMAT_READ_LATENCY, - coord->read_latency, access); - hmat_update_target_access(target, ACPI_HMAT_WRITE_LATENCY, - coord->write_latency, access); - hmat_update_target_access(target, ACPI_HMAT_READ_BANDWIDTH, - coord->read_bandwidth, access); - hmat_update_target_access(target, ACPI_HMAT_WRITE_BANDWIDTH, - coord->write_bandwidth, access); - - return 0; -} -EXPORT_SYMBOL_GPL(hmat_update_target_coordinates); - static __init void hmat_add_locality(struct acpi_hmat_locality *hmat_loc) { struct memory_locality *loc; diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index c891fd618cfd..bca1ec279651 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -1075,9 +1075,3 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth; } } - -int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, - enum access_coordinate_class access) -{ - return hmat_update_target_coordinates(nid, &cxlr->coord[access], access); -} diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index a253d308f3c9..0476c3b648de 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -137,8 +137,6 @@ enum cxl_poison_trace_type { long cxl_pci_get_latency(struct pci_dev *pdev); int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); -int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, - enum access_coordinate_class access); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 1c5bb1e887cd..5ff5d99f6ead 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1595,18 +1595,6 @@ static inline void acpi_use_parent_companion(struct device *dev) ACPI_COMPANION_SET(dev, ACPI_COMPANION(dev->parent)); } -#ifdef CONFIG_ACPI_HMAT -int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, - enum access_coordinate_class access); -#else -static inline int hmat_update_target_coordinates(int nid, - struct access_coordinate *coord, - enum access_coordinate_class access) -{ - return -EOPNOTSUPP; -} -#endif - #ifdef CONFIG_ACPI_NUMA bool acpi_node_backed_by_real_pxm(int nid); #else -- cgit v1.2.3 From 5d14bbf9d1d90cb7ca3e46fe2c8a4277572eab94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Sep 2025 09:31:41 +0000 Subject: net_sched: act: remove tcfa_qstats tcfa_qstats is currently only used to hold drops and overlimits counters. tcf_action_inc_drop_qstats() and tcf_action_inc_overlimit_qstats() currently acquire a->tcfa_lock to increment these counters. Switch to two atomic_t to get lock-free accounting. Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20250901093141.2093176-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/act_api.h | 14 ++++++-------- net/sched/act_api.c | 12 ++++++++---- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 2894cfff2da3..91a24b5e0b93 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -33,7 +33,10 @@ struct tc_action { struct tcf_t tcfa_tm; struct gnet_stats_basic_sync tcfa_bstats; struct gnet_stats_basic_sync tcfa_bstats_hw; - struct gnet_stats_queue tcfa_qstats; + + atomic_t tcfa_drops; + atomic_t tcfa_overlimits; + struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; struct gnet_stats_basic_sync __percpu *cpu_bstats; @@ -53,7 +56,6 @@ struct tc_action { #define tcf_action common.tcfa_action #define tcf_tm common.tcfa_tm #define tcf_bstats common.tcfa_bstats -#define tcf_qstats common.tcfa_qstats #define tcf_rate_est common.tcfa_rate_est #define tcf_lock common.tcfa_lock @@ -241,9 +243,7 @@ static inline void tcf_action_inc_drop_qstats(struct tc_action *a) qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); return; } - spin_lock(&a->tcfa_lock); - qstats_drop_inc(&a->tcfa_qstats); - spin_unlock(&a->tcfa_lock); + atomic_inc(&a->tcfa_drops); } static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) @@ -252,9 +252,7 @@ static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); return; } - spin_lock(&a->tcfa_lock); - qstats_overlimit_inc(&a->tcfa_qstats); - spin_unlock(&a->tcfa_lock); + atomic_inc(&a->tcfa_overlimits); } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9e468e463467..ff6be5cfe2b0 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1585,7 +1585,7 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, } _bstats_update(&a->tcfa_bstats, bytes, packets); - a->tcfa_qstats.drops += drops; + atomic_add(drops, &a->tcfa_drops); if (hw) _bstats_update(&a->tcfa_bstats_hw, bytes, packets); } @@ -1594,8 +1594,9 @@ EXPORT_SYMBOL(tcf_action_update_stats); int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { - int err = 0; + struct gnet_stats_queue qstats = {0}; struct gnet_dump d; + int err = 0; if (p == NULL) goto errout; @@ -1619,14 +1620,17 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, if (err < 0) goto errout; + qstats.drops = atomic_read(&p->tcfa_drops); + qstats.overlimits = atomic_read(&p->tcfa_overlimits); + if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfa_bstats, false) < 0 || gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, - &p->tcfa_qstats, - p->tcfa_qstats.qlen) < 0) + &qstats, + qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) -- cgit v1.2.3 From 21368fcbb124d51b5d8bd8fa0a286a23c34a0888 Mon Sep 17 00:00:00 2001 From: Nicolas Frattaroli Date: Mon, 25 Aug 2025 10:28:21 +0200 Subject: bitmap: introduce hardware-specific bitfield operations Hardware of various vendors, but very notably Rockchip, often uses 32-bit registers where the upper 16-bit half of the register is a write-enable mask for the lower half. This type of hardware setup allows for more granular concurrent register write access. Over the years, many drivers have hand-rolled their own version of this macro, usually without any checks, often called something like HIWORD_UPDATE or FIELD_PREP_HIWORD, commonly with slightly different semantics between them. Clearly there is a demand for such a macro, and thus the demand should be satisfied in a common header file. As this is a convention that spans across multiple vendors, and similar conventions may also have cross-vendor adoption, it's best if it lives in a vendor-agnostic header file that can be expanded over time. Add hw_bitfield.h with two macros: FIELD_PREP_WM16, and FIELD_PREP_WM16_CONST. The latter is a version that can be used in initializers, like FIELD_PREP_CONST. Suggested-by: Yury Norov (NVIDIA) Signed-off-by: Nicolas Frattaroli Acked-by: Jakub Kicinski Acked-by: Heiko Stuebner Signed-off-by: Yury Norov (NVIDIA) --- MAINTAINERS | 1 + include/linux/hw_bitfield.h | 62 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 include/linux/hw_bitfield.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 6dcfbd11efef..4a2db6c4b8ab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4275,6 +4275,7 @@ F: include/linux/bits.h F: include/linux/cpumask.h F: include/linux/cpumask_types.h F: include/linux/find.h +F: include/linux/hw_bitfield.h F: include/linux/nodemask.h F: include/linux/nodemask_types.h F: include/uapi/linux/bits.h diff --git a/include/linux/hw_bitfield.h b/include/linux/hw_bitfield.h new file mode 100644 index 000000000000..df202e167ce4 --- /dev/null +++ b/include/linux/hw_bitfield.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2025, Collabora Ltd. + */ + +#ifndef _LINUX_HW_BITFIELD_H +#define _LINUX_HW_BITFIELD_H + +#include +#include +#include + +/** + * FIELD_PREP_WM16() - prepare a bitfield element with a mask in the upper half + * @_mask: shifted mask defining the field's length and position + * @_val: value to put in the field + * + * FIELD_PREP_WM16() masks and shifts up the value, as well as bitwise ORs the + * result with the mask shifted up by 16. + * + * This is useful for a common design of hardware registers where the upper + * 16-bit half of a 32-bit register is used as a write-enable mask. In such a + * register, a bit in the lower half is only updated if the corresponding bit + * in the upper half is high. + */ +#define FIELD_PREP_WM16(_mask, _val) \ + ({ \ + typeof(_val) __val = _val; \ + typeof(_mask) __mask = _mask; \ + __BF_FIELD_CHECK(__mask, ((u16)0U), __val, \ + "HWORD_UPDATE: "); \ + (((typeof(__mask))(__val) << __bf_shf(__mask)) & (__mask)) | \ + ((__mask) << 16); \ + }) + +/** + * FIELD_PREP_WM16_CONST() - prepare a constant bitfield element with a mask in + * the upper half + * @_mask: shifted mask defining the field's length and position + * @_val: value to put in the field + * + * FIELD_PREP_WM16_CONST() masks and shifts up the value, as well as bitwise ORs + * the result with the mask shifted up by 16. + * + * This is useful for a common design of hardware registers where the upper + * 16-bit half of a 32-bit register is used as a write-enable mask. In such a + * register, a bit in the lower half is only updated if the corresponding bit + * in the upper half is high. + * + * Unlike FIELD_PREP_WM16(), this is a constant expression and can therefore + * be used in initializers. Error checking is less comfortable for this + * version. + */ +#define FIELD_PREP_WM16_CONST(_mask, _val) \ + ( \ + FIELD_PREP_CONST(_mask, _val) | \ + (BUILD_BUG_ON_ZERO(const_true((u64)(_mask) > U16_MAX)) + \ + ((_mask) << 16)) \ + ) + + +#endif /* _LINUX_HW_BITFIELD_H */ -- cgit v1.2.3 From df3a7762ee24ba6a33d4215244e329ca300f4819 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 2 Sep 2025 10:06:56 -0600 Subject: io_uring/uring_cmd: add io_uring_cmd_tw_t type alias Introduce a function pointer type alias io_uring_cmd_tw_t for the uring_cmd task work callback. This avoids repeating the signature in several places. Also name both arguments to the callback to clarify what they represent. Signed-off-by: Caleb Sander Mateos Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20250902160657.1726828-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 13 ++++++++----- io_uring/uring_cmd.c | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 4bd3a7339243..7211157edfe9 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -11,11 +11,14 @@ /* io_uring_cmd is being issued again */ #define IORING_URING_CMD_REISSUE (1U << 31) +typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd, + unsigned issue_flags); + struct io_uring_cmd { struct file *file; const struct io_uring_sqe *sqe; /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); + io_uring_cmd_tw_t task_work_cb; u32 cmd_op; u32 flags; u8 pdu[32]; /* available inline for free use */ @@ -57,7 +60,7 @@ void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2, unsigned issue_flags); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), + io_uring_cmd_tw_t task_work_cb, unsigned flags); /* @@ -106,7 +109,7 @@ static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), + io_uring_tw_t task_work_cb, unsigned flags) { } @@ -143,13 +146,13 @@ static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd, /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) + io_uring_cmd_tw_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); } static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) + io_uring_cmd_tw_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index f5a2642bb407..d76d6d27765c 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -126,7 +126,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), + io_uring_cmd_tw_t task_work_cb, unsigned flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); -- cgit v1.2.3 From 04a3134f88a4bd03001a3093144819523cfca99e Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 2 Sep 2025 22:45:24 -0700 Subject: net/mlx5: Add PSP capabilities structures and bits Add mlx5_ifc PSP related capabilities structures and HW definitions needed for PSP support in mlx5. Link: https://lore.kernel.org/netdev/20250828162953.2707727-1-daniel.zahka@gmail.com/ Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 ++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 1 + .../mellanox/mlx5/core/steering/hws/definer.c | 2 +- include/linux/mlx5/device.h | 4 + include/linux/mlx5/mlx5_ifc.h | 95 +++++++++++++++++++++- 5 files changed, 103 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 57476487e31f..eeb4437975f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -294,6 +294,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN(dev, psp)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_PSP); + if (err) + return err; + } + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 8517d4e5d5ef..0951c7cc1b5f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1798,6 +1798,7 @@ static const int types[] = { MLX5_CAP_VDPA_EMULATION, MLX5_CAP_IPSEC, MLX5_CAP_PORT_SELECTION, + MLX5_CAP_PSP, MLX5_CAP_MACSEC, MLX5_CAP_ADV_VIRTUALIZATION, MLX5_CAP_CRYPTO, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c index c6436c3a7a83..c4bb6967f74d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c @@ -1280,7 +1280,7 @@ hws_definer_conv_misc2(struct mlx5hws_definer_conv_data *cd, struct mlx5hws_definer_fc *fc = cd->fc; struct mlx5hws_definer_fc *curr_fc; - if (HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1a0, 0x8) || + if (HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.psp_syndrome, 0x8) || HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.ipsec_next_header, 0x8) || HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1c0, 0x40) || diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 9d2467f982ad..72a83666e67f 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1248,6 +1248,7 @@ enum mlx5_cap_type { MLX5_CAP_IPSEC, MLX5_CAP_CRYPTO = 0x1a, MLX5_CAP_SHAMPO = 0x1d, + MLX5_CAP_PSP = 0x1e, MLX5_CAP_MACSEC = 0x1f, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, @@ -1487,6 +1488,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_SHAMPO(mdev, cap) \ MLX5_GET(shampo_cap, mdev->caps.hca[MLX5_CAP_SHAMPO]->cur, cap) +#define MLX5_CAP_PSP(mdev, cap)\ + MLX5_GET(psp_cap, (mdev)->caps.hca[MLX5_CAP_PSP]->cur, cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 44d497272162..e9f14a0c7f4f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -314,6 +314,8 @@ enum { MLX5_CMD_OP_CREATE_UMEM = 0xa08, MLX5_CMD_OP_DESTROY_UMEM = 0xa0a, MLX5_CMD_OP_SYNC_STEERING = 0xb00, + MLX5_CMD_OP_PSP_GEN_SPI = 0xb10, + MLX5_CMD_OP_PSP_ROTATE_KEY = 0xb11, MLX5_CMD_OP_QUERY_VHCA_STATE = 0xb0d, MLX5_CMD_OP_MODIFY_VHCA_STATE = 0xb0e, MLX5_CMD_OP_SYNC_CRYPTO = 0xb12, @@ -489,12 +491,14 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 execute_aso[0x1]; u8 reserved_at_47[0x19]; - u8 reserved_at_60[0x2]; + u8 reformat_l2_to_l3_psp_tunnel[0x1]; + u8 reformat_l3_psp_tunnel_to_l2[0x1]; u8 reformat_insert[0x1]; u8 reformat_remove[0x1]; u8 macsec_encrypt[0x1]; u8 macsec_decrypt[0x1]; - u8 reserved_at_66[0x2]; + u8 psp_encrypt[0x1]; + u8 psp_decrypt[0x1]; u8 reformat_add_macsec[0x1]; u8 reformat_remove_macsec[0x1]; u8 reparse[0x1]; @@ -703,7 +707,7 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 metadata_reg_a[0x20]; - u8 reserved_at_1a0[0x8]; + u8 psp_syndrome[0x8]; u8 macsec_syndrome[0x8]; u8 ipsec_syndrome[0x8]; u8 ipsec_next_header[0x8]; @@ -1511,6 +1515,21 @@ struct mlx5_ifc_macsec_cap_bits { u8 reserved_at_40[0x7c0]; }; +struct mlx5_ifc_psp_cap_bits { + u8 reserved_at_0[0x1]; + u8 psp_crypto_offload[0x1]; + u8 reserved_at_2[0x1]; + u8 psp_crypto_esp_aes_gcm_256_encrypt[0x1]; + u8 psp_crypto_esp_aes_gcm_128_encrypt[0x1]; + u8 psp_crypto_esp_aes_gcm_256_decrypt[0x1]; + u8 psp_crypto_esp_aes_gcm_128_decrypt[0x1]; + u8 reserved_at_7[0x4]; + u8 log_max_num_of_psp_spi[0x5]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x7e0]; +}; + enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, @@ -1876,7 +1895,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_2a0[0x7]; u8 mkey_pcie_tph[0x1]; - u8 reserved_at_2a8[0x3]; + u8 reserved_at_2a8[0x2]; + + u8 psp[0x1]; u8 shampo[0x1]; u8 reserved_at_2ac[0x4]; u8 max_wqe_sz_rq[0x10]; @@ -3803,6 +3824,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_macsec_cap_bits macsec_cap; struct mlx5_ifc_crypto_cap_bits crypto_cap; struct mlx5_ifc_ipsec_cap_bits ipsec_cap; + struct mlx5_ifc_psp_cap_bits psp_cap; u8 reserved_at_0[0x8000]; }; @@ -3832,6 +3854,7 @@ enum { enum { MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC = 0x0, MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC = 0x1, + MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_PSP = 0x2, }; struct mlx5_ifc_vlan_bits { @@ -7159,6 +7182,8 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT_OVER_UDP = 0xa, MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb, MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_UDPV6 = 0xc, + MLX5_REFORMAT_TYPE_ADD_PSP_TUNNEL = 0xd, + MLX5_REFORMAT_TYPE_DEL_PSP_TUNNEL = 0xe, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, @@ -7285,6 +7310,7 @@ enum { MLX5_ACTION_IN_FIELD_IPSEC_SYNDROME = 0x5D, MLX5_ACTION_IN_FIELD_OUT_EMD_47_32 = 0x6F, MLX5_ACTION_IN_FIELD_OUT_EMD_31_0 = 0x70, + MLX5_ACTION_IN_FIELD_PSP_SYNDROME = 0x71, }; struct mlx5_ifc_alloc_modify_header_context_out_bits { @@ -13079,6 +13105,7 @@ enum { MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_TLS = 0x1, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_IPSEC = 0x2, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_MACSEC = 0x4, + MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_PSP = 0x6, }; struct mlx5_ifc_tls_static_params_bits { @@ -13496,4 +13523,64 @@ enum mlx5e_pcie_cong_event_mod_field { MLX5_PCIE_CONG_EVENT_MOD_THRESH = BIT(2), }; +struct mlx5_ifc_psp_rotate_key_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_psp_rotate_key_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum mlx5_psp_gen_spi_in_key_size { + MLX5_PSP_GEN_SPI_IN_KEY_SIZE_128 = 0x0, + MLX5_PSP_GEN_SPI_IN_KEY_SIZE_256 = 0x1, +}; + +struct mlx5_ifc_key_spi_bits { + u8 spi[0x20]; + + u8 reserved_at_20[0x60]; + + u8 key[8][0x20]; +}; + +struct mlx5_ifc_psp_gen_spi_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 key_size[0x2]; + u8 reserved_at_62[0xe]; + u8 num_of_spi[0x10]; +}; + +struct mlx5_ifc_psp_gen_spi_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x10]; + u8 num_of_spi[0x10]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_key_spi_bits key_spi[]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From ddeb66d2cb10f03a43d97a0ff2c3869d1951c87d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:54:36 +0200 Subject: gpio: nomadik: don't print out global GPIO numbers in debugfs callbacks In order to further limit the number of references to the GPIO base number stored in struct gpio_chip, replace the global GPIO numbers in the output of debugfs callbacks by hardware offsets. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-dbg-show-base-v1-2-7f27cd7f2256@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-nomadik.c | 25 ++++++++++++------------- drivers/pinctrl/nomadik/pinctrl-nomadik.c | 2 +- include/linux/gpio/gpio-nomadik.h | 3 +-- 3 files changed, 14 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/gpio/gpio-nomadik.c b/drivers/gpio/gpio-nomadik.c index bcf4b07dd458..fde4b416faa8 100644 --- a/drivers/gpio/gpio-nomadik.c +++ b/drivers/gpio/gpio-nomadik.c @@ -20,6 +20,7 @@ */ #include #include +#include #include #include #include @@ -396,10 +397,10 @@ static int nmk_gpio_get_mode(struct nmk_gpio_chip *nmk_chip, int offset) } void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, - struct gpio_chip *chip, unsigned int offset, - unsigned int gpio) + struct gpio_chip *chip, unsigned int offset) { struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(chip); + struct gpio_desc *desc; int mode; bool is_out; bool data_out; @@ -425,15 +426,15 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, data_out = !!(readl(nmk_chip->addr + NMK_GPIO_DAT) & BIT(offset)); mode = nmk_gpio_get_mode(nmk_chip, offset); #ifdef CONFIG_PINCTRL_NOMADIK - if (mode == NMK_GPIO_ALT_C && pctldev) - mode = nmk_prcm_gpiocr_get_mode(pctldev, gpio); + if (mode == NMK_GPIO_ALT_C && pctldev) { + desc = gpio_device_get_desc(chip->gpiodev, offset); + mode = nmk_prcm_gpiocr_get_mode(pctldev, desc_to_gpio(desc)); + } #endif if (is_out) { seq_printf(s, " gpio-%-3d (%-20.20s) out %s %s", - gpio, - label ?: "(none)", - str_hi_lo(data_out), + offset, label ?: "(none)", str_hi_lo(data_out), (mode < 0) ? "unknown" : modes[mode]); } else { int irq = chip->to_irq(chip, offset); @@ -445,9 +446,7 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, }; seq_printf(s, " gpio-%-3d (%-20.20s) in %s %s", - gpio, - label ?: "(none)", - pulls[pullidx], + offset, label ?: "(none)", pulls[pullidx], (mode < 0) ? "unknown" : modes[mode]); val = nmk_gpio_get_input(chip, offset); @@ -479,10 +478,10 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, static void nmk_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) { - unsigned int i, gpio = chip->base; + unsigned int i; - for (i = 0; i < chip->ngpio; i++, gpio++) { - nmk_gpio_dbg_show_one(s, NULL, chip, i, gpio); + for (i = 0; i < chip->ngpio; i++) { + nmk_gpio_dbg_show_one(s, NULL, chip, i); seq_puts(s, "\n"); } } diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.c b/drivers/pinctrl/nomadik/pinctrl-nomadik.c index 8940e04fcf4c..db0311b14132 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -584,7 +584,7 @@ static void nmk_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, seq_printf(s, "invalid pin offset"); return; } - nmk_gpio_dbg_show_one(s, pctldev, chip, offset - chip->base, offset); + nmk_gpio_dbg_show_one(s, pctldev, chip, offset - chip->base); } static int nmk_dt_add_map_mux(struct pinctrl_map **map, unsigned int *reserved_maps, diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index b5a84864650d..7ba53b499e16 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -261,8 +261,7 @@ struct platform_device; * true. */ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, - struct gpio_chip *chip, unsigned int offset, - unsigned int gpio); + struct gpio_chip *chip, unsigned int offset); #else -- cgit v1.2.3 From 661f951e371cc134ea31c84238dbdc9a898b8403 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Aug 2025 12:02:44 +0000 Subject: sched/fair: Get rid of sched_domains_curr_level hack for tl->cpumask() Leon [1] and Vinicius [2] noted a topology_span_sane() warning during their testing starting from v6.16-rc1. Debug that followed pointed to the tl->mask() for the NODE domain being incorrectly resolved to that of the highest NUMA domain. tl->mask() for NODE is set to the sd_numa_mask() which depends on the global "sched_domains_curr_level" hack. "sched_domains_curr_level" is set to the "tl->numa_level" during tl traversal in build_sched_domains() calling sd_init() but was not reset before topology_span_sane(). Since "tl->numa_level" still reflected the old value from build_sched_domains(), topology_span_sane() for the NODE domain trips when the span of the last NUMA domain overlaps. Instead of replicating the "sched_domains_curr_level" hack, get rid of it entirely and instead, pass the entire "sched_domain_topology_level" object to tl->cpumask() function to prevent such mishap in the future. sd_numa_mask() now directly references "tl->numa_level" instead of relying on the global "sched_domains_curr_level" hack to index into sched_domains_numa_masks[]. The original warning was reproducible on the following NUMA topology reported by Leon: $ sudo numactl -H available: 5 nodes (0-4) node 0 cpus: 0 1 node 0 size: 2927 MB node 0 free: 1603 MB node 1 cpus: 2 3 node 1 size: 3023 MB node 1 free: 3008 MB node 2 cpus: 4 5 node 2 size: 3023 MB node 2 free: 3007 MB node 3 cpus: 6 7 node 3 size: 3023 MB node 3 free: 3002 MB node 4 cpus: 8 9 node 4 size: 3022 MB node 4 free: 2718 MB node distances: node 0 1 2 3 4 0: 10 39 38 37 36 1: 39 10 38 37 36 2: 38 38 10 37 36 3: 37 37 37 10 36 4: 36 36 36 36 10 The above topology can be mimicked using the following QEMU cmd that was used to reproduce the warning and test the fix: sudo qemu-system-x86_64 -enable-kvm -cpu host \ -m 20G -smp cpus=10,sockets=10 -machine q35 \ -object memory-backend-ram,size=4G,id=m0 \ -object memory-backend-ram,size=4G,id=m1 \ -object memory-backend-ram,size=4G,id=m2 \ -object memory-backend-ram,size=4G,id=m3 \ -object memory-backend-ram,size=4G,id=m4 \ -numa node,cpus=0-1,memdev=m0,nodeid=0 \ -numa node,cpus=2-3,memdev=m1,nodeid=1 \ -numa node,cpus=4-5,memdev=m2,nodeid=2 \ -numa node,cpus=6-7,memdev=m3,nodeid=3 \ -numa node,cpus=8-9,memdev=m4,nodeid=4 \ -numa dist,src=0,dst=1,val=39 \ -numa dist,src=0,dst=2,val=38 \ -numa dist,src=0,dst=3,val=37 \ -numa dist,src=0,dst=4,val=36 \ -numa dist,src=1,dst=0,val=39 \ -numa dist,src=1,dst=2,val=38 \ -numa dist,src=1,dst=3,val=37 \ -numa dist,src=1,dst=4,val=36 \ -numa dist,src=2,dst=0,val=38 \ -numa dist,src=2,dst=1,val=38 \ -numa dist,src=2,dst=3,val=37 \ -numa dist,src=2,dst=4,val=36 \ -numa dist,src=3,dst=0,val=37 \ -numa dist,src=3,dst=1,val=37 \ -numa dist,src=3,dst=2,val=37 \ -numa dist,src=3,dst=4,val=36 \ -numa dist,src=4,dst=0,val=36 \ -numa dist,src=4,dst=1,val=36 \ -numa dist,src=4,dst=2,val=36 \ -numa dist,src=4,dst=3,val=36 \ ... [ prateek: Moved common functions to include/linux/sched/topology.h, reuse the common bits for s390 and ppc, commit message ] Closes: https://lore.kernel.org/lkml/20250610110701.GA256154@unreal/ [1] Fixes: ccf74128d66c ("sched/topology: Assert non-NUMA topology masks don't (partially) overlap") # ce29a7da84cd, f55dac1dafb3 Signed-off-by: Peter Zijlstra (Intel) Reported-by: Leon Romanovsky Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Shrikanth Hegde Tested-by: Valentin Schneider # x86 Tested-by: Shrikanth Hegde # powerpc Link: https://lore.kernel.org/lkml/a3de98387abad28592e6ab591f3ff6107fe01dc1.1755893468.git.tim.c.chen@linux.intel.com/ [2] --- arch/powerpc/Kconfig | 4 ++++ arch/powerpc/include/asm/topology.h | 2 ++ arch/powerpc/kernel/smp.c | 27 +++++++++++---------------- arch/s390/kernel/topology.c | 20 +++++++------------- arch/x86/kernel/smpboot.c | 8 ++++---- include/linux/sched/topology.h | 28 +++++++++++++++++++++++++++- include/linux/topology.h | 2 +- kernel/sched/topology.c | 28 ++++++++++------------------ 8 files changed, 66 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 93402a1d9c9f..e51a595a0622 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -971,6 +971,10 @@ config SCHED_SMT when dealing with POWER5 cpus at a cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_MC + def_bool y + depends on SMP + config PPC_DENORMALISATION bool "PowerPC denormalisation exception handling" depends on PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index da15b5efe807..f19ca44512d1 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -131,6 +131,8 @@ static inline int cpu_to_coregroup_id(int cpu) #ifdef CONFIG_SMP #include +struct cpumask *cpu_coregroup_mask(int cpu); + #ifdef CONFIG_PPC64 #include diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f59e4b9cc207..68edb66c2964 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1028,19 +1028,19 @@ static int powerpc_shared_proc_flags(void) * We can't just pass cpu_l2_cache_mask() directly because * returns a non-const pointer and the compiler barfs on that. */ -static const struct cpumask *shared_cache_mask(int cpu) +static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu) { return per_cpu(cpu_l2_cache_map, cpu); } #ifdef CONFIG_SCHED_SMT -static const struct cpumask *smallcore_smt_mask(int cpu) +static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu) { return cpu_smallcore_mask(cpu); } #endif -static struct cpumask *cpu_coregroup_mask(int cpu) +struct cpumask *cpu_coregroup_mask(int cpu) { return per_cpu(cpu_coregroup_map, cpu); } @@ -1054,11 +1054,6 @@ static bool has_coregroup_support(void) return coregroup_enabled; } -static const struct cpumask *cpu_mc_mask(int cpu) -{ - return cpu_coregroup_mask(cpu); -} - static int __init init_big_cores(void) { int cpu; @@ -1448,7 +1443,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask) return false; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update l2-cache mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask); @@ -1538,7 +1533,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask) return; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update coregroup mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask); @@ -1601,7 +1596,7 @@ static void add_cpu_to_masks(int cpu) /* If chip_id is -1; limit the cpu_core_mask to within PKG */ if (chip_id == -1) - cpumask_and(mask, mask, cpu_cpu_mask(cpu)); + cpumask_and(mask, mask, cpu_node_mask(cpu)); for_each_cpu(i, mask) { if (chip_id == cpu_to_chip_id(i)) { @@ -1701,22 +1696,22 @@ static void __init build_sched_topology(void) if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); powerpc_topology[i++] = - SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); + SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT); } else { - powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); + powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT); } #endif if (shared_caches) { powerpc_topology[i++] = - SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); + SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE); } if (has_coregroup_support()) { powerpc_topology[i++] = - SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); + SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC); } - powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG); + powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG); /* There must be one trailing NULL entry left. */ BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 46569b8e47dd..1594c80e9bc4 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -509,33 +509,27 @@ int topology_cpu_init(struct cpu *cpu) return rc; } -static const struct cpumask *cpu_thread_mask(int cpu) -{ - return &cpu_topology[cpu].thread_mask; -} - - const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_mask; } -static const struct cpumask *cpu_book_mask(int cpu) +static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].book_mask; } -static const struct cpumask *cpu_drawer_mask(int cpu) +static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].drawer_mask; } static struct sched_domain_topology_level s390_topology[] = { - SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), - SDTL_INIT(cpu_book_mask, NULL, BOOK), - SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), + SDTL_INIT(tl_book_mask, NULL, BOOK), + SDTL_INIT(tl_drawer_mask, NULL, DRAWER), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 33e166f6ab12..eb289abece23 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -479,14 +479,14 @@ static int x86_cluster_flags(void) static bool x86_has_numa_in_package; static struct sched_domain_topology_level x86_topology[] = { - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), + SDTL_INIT(tl_mc_mask, x86_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), + SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG), { NULL }, }; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 5263746b63e8..a3a24e115d44 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -30,11 +30,19 @@ struct sd_flag_debug { }; extern const struct sd_flag_debug sd_flag_debug[]; +struct sched_domain_topology_level; + #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; } + +static inline const +struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_smt_mask(cpu); +} #endif #ifdef CONFIG_SCHED_CLUSTER @@ -42,6 +50,12 @@ static inline int cpu_cluster_flags(void) { return SD_CLUSTER | SD_SHARE_LLC; } + +static inline const +struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_clustergroup_mask(cpu); +} #endif #ifdef CONFIG_SCHED_MC @@ -49,8 +63,20 @@ static inline int cpu_core_flags(void) { return SD_SHARE_LLC; } + +static inline const +struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_coregroup_mask(cpu); +} #endif +static inline const +struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_node_mask(cpu); +} + #ifdef CONFIG_NUMA static inline int cpu_numa_flags(void) { @@ -172,7 +198,7 @@ bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); bool cpus_share_resources(int this_cpu, int that_cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu); typedef int (*sched_domain_flags_f)(void); struct sd_data { diff --git a/include/linux/topology.h b/include/linux/topology.h index 33b7fda97d39..6575af39fd10 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -260,7 +260,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu) #endif -static inline const struct cpumask *cpu_cpu_mask(int cpu) +static inline const struct cpumask *cpu_node_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 977e133bb8a4..18889bd97e22 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1591,7 +1591,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd) enum numa_topology_type sched_numa_topology_type; static int sched_domains_numa_levels; -static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; @@ -1632,14 +1631,7 @@ sd_init(struct sched_domain_topology_level *tl, int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... - */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); + sd_weight = cpumask_weight(tl->mask(tl, cpu)); if (tl->sd_flags) sd_flags = (*tl->sd_flags)(); @@ -1677,7 +1669,7 @@ sd_init(struct sched_domain_topology_level *tl, }; sd_span = sched_domain_span(sd); - cpumask_and(sd_span, cpu_map, tl->mask(cpu)); + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); sd_id = cpumask_first(sd_span); sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); @@ -1737,17 +1729,17 @@ sd_init(struct sched_domain_topology_level *tl, */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; @@ -1769,9 +1761,9 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl) #ifdef CONFIG_NUMA -static const struct cpumask *sd_numa_mask(int cpu) +static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) { - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; + return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)]; } static void sched_numa_warn(const char *str) @@ -2411,7 +2403,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) * breaks the linking done for an earlier span. */ for_each_cpu(cpu, cpu_map) { - const struct cpumask *tl_cpu_mask = tl->mask(cpu); + const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu); int id; /* lowest bit set in this mask is used as a unique id */ @@ -2419,7 +2411,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) if (cpumask_test_cpu(id, id_seen)) { /* First CPU has already been seen, ensure identical spans */ - if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) return false; } else { /* First CPU hasn't been seen before, ensure it's a completely new span */ -- cgit v1.2.3 From 91c614f09abf1d45aac6b475d82a36c704b527ee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Aug 2025 10:55:55 +0200 Subject: sched: Move STDL_INIT() functions out-of-line Since all these functions are address-taken in SDTL_INIT() and called indirectly, it doesn't really make sense for them to be inline. Suggested-by: Christophe Leroy Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched/topology.h | 49 ++++++------------------------------------ kernel/sched/topology.c | 45 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a3a24e115d44..bbcfdf12aa6e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -33,56 +33,21 @@ extern const struct sd_flag_debug sd_flag_debug[]; struct sched_domain_topology_level; #ifdef CONFIG_SCHED_SMT -static inline int cpu_smt_flags(void) -{ - return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; -} - -static inline const -struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_smt_mask(cpu); -} +extern int cpu_smt_flags(void); +extern const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_CLUSTER -static inline int cpu_cluster_flags(void) -{ - return SD_CLUSTER | SD_SHARE_LLC; -} - -static inline const -struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_clustergroup_mask(cpu); -} +extern int cpu_cluster_flags(void); +extern const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_MC -static inline int cpu_core_flags(void) -{ - return SD_SHARE_LLC; -} - -static inline const -struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_coregroup_mask(cpu); -} +extern int cpu_core_flags(void); +extern const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu); #endif -static inline const -struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_node_mask(cpu); -} - -#ifdef CONFIG_NUMA -static inline int cpu_numa_flags(void) -{ - return SD_NUMA; -} -#endif +extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu); extern int arch_asym_cpu_priority(int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 18889bd97e22..1104d931c015 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1724,6 +1724,47 @@ sd_init(struct sched_domain_topology_level *tl, return sd; } +#ifdef CONFIG_SCHED_SMT +int cpu_smt_flags(void) +{ + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; +} + +const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_smt_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_CLUSTER +int cpu_cluster_flags(void) +{ + return SD_CLUSTER | SD_SHARE_LLC; +} + +const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_clustergroup_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_MC +int cpu_core_flags(void) +{ + return SD_SHARE_LLC; +} + +const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_coregroup_mask(cpu); +} +#endif + +const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_node_mask(cpu); +} + /* * Topology list, bottom-up. */ @@ -1760,6 +1801,10 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl) } #ifdef CONFIG_NUMA +static int cpu_numa_flags(void) +{ + return SD_NUMA; +} static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) { -- cgit v1.2.3 From 2cd571245b43492867bf1b4252485f3e6647b643 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 29 Aug 2025 16:11:16 +0800 Subject: sched/fair: Add related data structure for task based throttle Add related data structures for this new throttle functionality. Tesed-by: K Prateek Nayak Signed-off-by: Valentin Schneider Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Valentin Schneider Tested-by: Matteo Martelli Link: https://lore.kernel.org/r/20250829081120.806-2-ziqianlu@bytedance.com --- include/linux/sched.h | 5 +++++ kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 13 +++++++++++++ kernel/sched/sched.h | 3 +++ 4 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f8188b833350..644a01bdae70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -883,6 +883,11 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; +#endif #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..feb750aae71b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4490,6 +4490,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; +#ifdef CONFIG_CFS_BANDWIDTH + init_cfs_throttle_work(p); +#endif #endif #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..8fff40fcbc42 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5748,6 +5748,18 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } +static void throttle_cfs_rq_work(struct callback_head *work) +{ +} + +void init_cfs_throttle_work(struct task_struct *p) +{ + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ + p->sched_throttle_work.next = &p->sched_throttle_work; + INIT_LIST_HEAD(&p->throttle_node); +} + static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; @@ -6472,6 +6484,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f7..a6493d255397 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -739,6 +739,7 @@ struct cfs_rq { int throttle_count; struct list_head throttled_list; struct list_head throttled_csd_list; + struct list_head throttled_limbo_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -2658,6 +2659,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_entity(struct sched_dl_entity *dl_se); +extern void init_cfs_throttle_work(struct task_struct *p); + #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 -- cgit v1.2.3 From 74e2ef72bd4b25ce21c8f309d4f5b91b5df9ff5b Mon Sep 17 00:00:00 2001 From: Gokul Sivakumar Date: Thu, 24 Jul 2025 15:41:36 +0530 Subject: wifi: brcmfmac: fix 43752 SDIO FWVID incorrectly labelled as Cypress (CYW) Cypress(Infineon) is not the vendor for this 43752 SDIO WLAN chip, and so has not officially released any firmware binary for it. It is incorrect to maintain this WLAN chip with firmware vendor ID as "CYW". So relabel the chip's firmware Vendor ID as "WCC" as suggested by the maintainer. Fixes: d2587c57ffd8 ("brcmfmac: add 43752 SDIO ids and initialization") Fixes: f74f1ec22dc2 ("wifi: brcmfmac: add support for Cypress firmware api") Signed-off-by: Gokul Sivakumar Acked-by: Arend van Spriel Link: https://patch.msgid.link/20250724101136.6691-1-gokulkumar.sivakumar@infineon.com Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 2 +- drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c | 4 ++-- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 8 ++++---- drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h | 1 - include/linux/mmc/sdio_ids.h | 2 +- 5 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index 8ab7d1e34a6e..6a3f187320fc 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -997,9 +997,9 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = { BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4356, WCC), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4359, WCC), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43751, WCC), + BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43752, WCC), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373, CYW), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43012, CYW), - BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752, CYW), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_89359, CYW), CYW_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439, CYW), { /* end: all zeroes */ } diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c index 9074ab49e806..4239f2b21e54 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c @@ -738,8 +738,8 @@ static u32 brcmf_chip_tcm_rambase(struct brcmf_chip_priv *ci) case BRCM_CC_4364_CHIP_ID: case CY_CC_4373_CHIP_ID: return 0x160000; - case CY_CC_43752_CHIP_ID: case BRCM_CC_43751_CHIP_ID: + case BRCM_CC_43752_CHIP_ID: case BRCM_CC_4377_CHIP_ID: return 0x170000; case BRCM_CC_4378_CHIP_ID: @@ -1452,7 +1452,7 @@ bool brcmf_chip_sr_capable(struct brcmf_chip *pub) return (reg & CC_SR_CTL0_ENABLE_MASK) != 0; case BRCM_CC_4359_CHIP_ID: case BRCM_CC_43751_CHIP_ID: - case CY_CC_43752_CHIP_ID: + case BRCM_CC_43752_CHIP_ID: case CY_CC_43012_CHIP_ID: addr = CORE_CC_REG(pmu->base, retention_ctl); reg = chip->ops->read32(chip->ctx, addr); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 8a0bad5119a0..8cf9d7e7c3f7 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -655,10 +655,10 @@ static const struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = { BRCMF_FW_ENTRY(BRCM_CC_4356_CHIP_ID, 0xFFFFFFFF, 4356), BRCMF_FW_ENTRY(BRCM_CC_4359_CHIP_ID, 0xFFFFFFFF, 4359), BRCMF_FW_ENTRY(BRCM_CC_43751_CHIP_ID, 0xFFFFFFFF, 43752), + BRCMF_FW_ENTRY(BRCM_CC_43752_CHIP_ID, 0xFFFFFFFF, 43752), BRCMF_FW_ENTRY(CY_CC_4373_CHIP_ID, 0xFFFFFFFF, 4373), BRCMF_FW_ENTRY(CY_CC_43012_CHIP_ID, 0xFFFFFFFF, 43012), BRCMF_FW_ENTRY(CY_CC_43439_CHIP_ID, 0xFFFFFFFF, 43439), - BRCMF_FW_ENTRY(CY_CC_43752_CHIP_ID, 0xFFFFFFFF, 43752) }; #define TXCTL_CREDITS 2 @@ -3426,8 +3426,8 @@ err: static bool brcmf_sdio_aos_no_decode(struct brcmf_sdio *bus) { if (bus->ci->chip == BRCM_CC_43751_CHIP_ID || - bus->ci->chip == CY_CC_43012_CHIP_ID || - bus->ci->chip == CY_CC_43752_CHIP_ID) + bus->ci->chip == BRCM_CC_43752_CHIP_ID || + bus->ci->chip == CY_CC_43012_CHIP_ID) return true; else return false; @@ -4278,8 +4278,8 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err, switch (sdiod->func1->device) { case SDIO_DEVICE_ID_BROADCOM_43751: + case SDIO_DEVICE_ID_BROADCOM_43752: case SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373: - case SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752: brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes\n", CY_4373_F2_WATERMARK); brcmf_sdiod_writeb(sdiod, SBSDIO_WATERMARK, diff --git a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h index b39c5c1ee18b..df3b67ba4db2 100644 --- a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h +++ b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h @@ -60,7 +60,6 @@ #define CY_CC_4373_CHIP_ID 0x4373 #define CY_CC_43012_CHIP_ID 43012 #define CY_CC_43439_CHIP_ID 43439 -#define CY_CC_43752_CHIP_ID 43752 /* USB Device IDs */ #define BRCM_USB_43143_DEVICE_ID 0xbd1e diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index fe3d6d98f8da..673cbdf43453 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -77,7 +77,7 @@ #define SDIO_DEVICE_ID_BROADCOM_43439 0xa9af #define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf #define SDIO_DEVICE_ID_BROADCOM_43751 0xaae7 -#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752 0xaae8 +#define SDIO_DEVICE_ID_BROADCOM_43752 0xaae8 #define SDIO_VENDOR_ID_CYPRESS 0x04b4 #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439 0xbd3d -- cgit v1.2.3 From 0a26e5eb78fb1627beb8e3eb172737f8492d2799 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 25 Aug 2025 15:34:23 -0500 Subject: jiffies: Remove obsolete SHIFTED_HZ comment b3c869d35b9b ("jiffies: Remove compile time assumptions about CLOCK_TICK_RATE") removed the last definition of SHIFTED_HZ but left behind comments about it. Remove the comments as well. Signed-off-by: Bjorn Helgaas Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250825203425.796034-1-helgaas@kernel.org --- include/linux/jiffies.h | 2 +- include/vdso/jiffies.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 91b20788273d..0d1927da8055 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -61,7 +61,7 @@ extern void register_refined_jiffies(long clock_tick_rate); -/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ +/* TICK_USEC is the time between ticks in usec */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ diff --git a/include/vdso/jiffies.h b/include/vdso/jiffies.h index 2f9d596c8b29..8ca04a141412 100644 --- a/include/vdso/jiffies.h +++ b/include/vdso/jiffies.h @@ -5,7 +5,7 @@ #include /* for HZ */ #include -/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ +/* TICK_NSEC is the time between ticks in nsec */ #define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ) #endif /* __VDSO_JIFFIES_H */ -- cgit v1.2.3 From 4a76a0a889cef284327f265f97edc4ff2f3e11cc Mon Sep 17 00:00:00 2001 From: WeiHao Li Date: Sun, 31 Aug 2025 18:48:51 +0800 Subject: dt-bindings: clock: rk3368: Add SCLK_MIPIDSI_24M Add a clock id for mipi dsi reference clock, mipi dsi node used it. Signed-off-by: WeiHao Li Acked-by: "Rob Herring (Arm)" Link: https://lore.kernel.org/r/20250831104855.45883-4-cn.liweihao@gmail.com Signed-off-by: Heiko Stuebner --- include/dt-bindings/clock/rk3368-cru.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/rk3368-cru.h b/include/dt-bindings/clock/rk3368-cru.h index ebae3cbf8192..b951e2906948 100644 --- a/include/dt-bindings/clock/rk3368-cru.h +++ b/include/dt-bindings/clock/rk3368-cru.h @@ -72,6 +72,7 @@ #define SCLK_SFC 126 #define SCLK_MAC 127 #define SCLK_MACREF_OUT 128 +#define SCLK_MIPIDSI_24M 129 #define SCLK_TIMER10 133 #define SCLK_TIMER11 134 #define SCLK_TIMER12 135 -- cgit v1.2.3 From a576a849d5f33356e0d8fd3eae4fbaf8869417e5 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 5 Aug 2025 15:34:43 +0200 Subject: of/irq: Convert of_msi_map_id() callers to of_msi_xlate() With the introduction of the of_msi_xlate() function, the OF layer provides an API to map a device ID and retrieve the MSI controller node the ID is mapped to with a single call. of_msi_map_id() is currently used to map a deviceID to a specific MSI controller node; of_msi_xlate() can be used for that purpose too, there is no need to keep the two functions. Convert of_msi_map_id() to of_msi_xlate() calls and update the of_msi_xlate() documentation to describe how the struct device_node pointer passed in should be set-up to either provide the MSI controller node target or receive its pointer upon mapping completion. Signed-off-by: Lorenzo Pieralisi Cc: Thomas Gleixner Cc: Rob Herring Cc: Marc Zyngier Acked-by: Thomas Gleixner Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250805133443.936955-1-lpieralisi@kernel.org Signed-off-by: Rob Herring (Arm) --- drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c | 2 +- drivers/of/irq.c | 25 +++++-------------------- drivers/pci/msi/irqdomain.c | 2 +- include/linux/of_irq.h | 6 ------ 4 files changed, 7 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c index 11549d85f23b..b5785472765a 100644 --- a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c @@ -30,7 +30,7 @@ static u32 fsl_mc_msi_domain_get_msi_id(struct irq_domain *domain, u32 out_id; of_node = irq_domain_get_of_node(domain); - out_id = of_node ? of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid) : + out_id = of_node ? of_msi_xlate(&mc_dev->dev, &of_node, mc_dev->icid) : iort_msi_map_id(&mc_dev->dev, mc_dev->icid); return out_id; diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 74aaea61de13..e7c12abd10ab 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -673,13 +673,14 @@ err: /** * of_msi_xlate - map a MSI ID and find relevant MSI controller node * @dev: device for which the mapping is to be done. - * @msi_np: Pointer to store the MSI controller node + * @msi_np: Pointer to target MSI controller node * @id_in: Device ID. * * Walk up the device hierarchy looking for devices with a "msi-map" - * property. If found, apply the mapping to @id_in. @msi_np pointed - * value must be NULL on entry, if an MSI controller is found @msi_np is - * initialized to the MSI controller node with a reference held. + * property. If found, apply the mapping to @id_in. + * If @msi_np points to a non-NULL device node pointer, only entries targeting + * that node will be matched; if it points to a NULL value, it will receive the + * device node of the first matching target phandle, with a reference held. * * Returns: The mapped MSI id. */ @@ -699,22 +700,6 @@ u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, u32 id_in) return id_out; } -/** - * of_msi_map_id - Map a MSI ID for a device. - * @dev: device for which the mapping is to be done. - * @msi_np: device node of the expected msi controller. - * @id_in: unmapped MSI ID for the device. - * - * Walk up the device hierarchy looking for devices with a "msi-map" - * property. If found, apply the mapping to @id_in. - * - * Return: The mapped MSI ID. - */ -u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in) -{ - return of_msi_xlate(dev, &msi_np, id_in); -} - /** * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain * @dev: device for which the mapping is to be done. diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 0938ef7ebabf..555c61b1fc36 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -422,7 +422,7 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev) pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); of_node = irq_domain_get_of_node(domain); - rid = of_node ? of_msi_map_id(&pdev->dev, of_node, rid) : + rid = of_node ? of_msi_xlate(&pdev->dev, &of_node, rid) : iort_msi_map_id(&pdev->dev, rid); return rid; diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index a480063c9cb1..1db8543dfc8a 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -55,7 +55,6 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 bus_token); extern void of_msi_configure(struct device *dev, const struct device_node *np); extern u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, u32 id_in); -u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in); #else static inline void of_irq_init(const struct of_device_id *matches) { @@ -105,11 +104,6 @@ static inline u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, { return id_in; } -static inline u32 of_msi_map_id(struct device *dev, - struct device_node *msi_np, u32 id_in) -{ - return id_in; -} #endif #if defined(CONFIG_OF_IRQ) || defined(CONFIG_SPARC) -- cgit v1.2.3 From 07bab7b81d8a4de604ae4175978adf37137c35d6 Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Mon, 23 Jun 2025 23:46:27 +0200 Subject: dt-bindings: clock: rp1: Add missing MIPI DSI defines Declare the positional index for the RP1 MIPI clocks. Signed-off-by: Andrea della Porta Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/c20066500908db854aa4816b40e956296bab526a.1750714412.git.andrea.porta@suse.com Signed-off-by: Florian Fainelli --- include/dt-bindings/clock/raspberrypi,rp1-clocks.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/raspberrypi,rp1-clocks.h b/include/dt-bindings/clock/raspberrypi,rp1-clocks.h index 248efb895f35..7915fb8197bf 100644 --- a/include/dt-bindings/clock/raspberrypi,rp1-clocks.h +++ b/include/dt-bindings/clock/raspberrypi,rp1-clocks.h @@ -58,4 +58,8 @@ #define RP1_PLL_VIDEO_PRI_PH 43 #define RP1_PLL_AUDIO_TERN 44 +/* MIPI clocks managed by the DSI driver */ +#define RP1_CLK_MIPI0_DSI_BYTECLOCK 45 +#define RP1_CLK_MIPI1_DSI_BYTECLOCK 46 + #endif -- cgit v1.2.3 From 54dbd2a8e974b900b18639e75f62702a4334ddc0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 3 Sep 2025 14:52:56 +0300 Subject: PCI/P2PDMA: Reduce scope of pci_has_p2pmem() pci_has_p2pmem() is not used outside of p2pdma.c, and there is no need to export it for use by modules. Signed-off-by: Leon Romanovsky Signed-off-by: Bjorn Helgaas Reviewed-by: Logan Gunthorpe Link: https://patch.msgid.link/d40f3f1decf54c9236bc38b48a6aae612a5c182f.1756900291.git.leon@kernel.org --- drivers/pci/p2pdma.c | 3 +-- include/linux/pci-p2pdma.h | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 1cb5e423eed4..78e108e47254 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many); * pci_has_p2pmem - check if a given PCI device has published any p2pmem * @pdev: PCI device to check */ -bool pci_has_p2pmem(struct pci_dev *pdev) +static bool pci_has_p2pmem(struct pci_dev *pdev) { struct pci_p2pdma *p2pdma; bool res; @@ -750,7 +750,6 @@ bool pci_has_p2pmem(struct pci_dev *pdev) return res; } -EXPORT_SYMBOL_GPL(pci_has_p2pmem); /** * pci_p2pmem_find_many - find a peer-to-peer DMA memory device compatible with diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 075c20b161d9..951f81a38f3a 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -21,7 +21,6 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, int num_clients, bool verbose); -bool pci_has_p2pmem(struct pci_dev *pdev); struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients); void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size); void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size); @@ -45,10 +44,6 @@ static inline int pci_p2pdma_distance_many(struct pci_dev *provider, { return -1; } -static inline bool pci_has_p2pmem(struct pci_dev *pdev) -{ - return false; -} static inline struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients) { -- cgit v1.2.3 From 929324913e0caabea91b50fa71e41d70b766f7dc Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 2 Sep 2025 23:11:35 +0200 Subject: net: Add rfs_needed() helper Add a helper to check if RFS is needed or not. Allows to make the code a bit cleaner and the next patch to have MPTCP use this helper to decide whether or not to iterate over the subflows. tun_flow_update() was calling sock_rps_record_flow_hash() regardless of the state of rfs_needed. This was not really a bug as sock_flow_table simply ends up being NULL and thus everything will be fine. This commit here thus also implicitly makes tun_flow_update() respect the state of rfs_needed. Suggested-by: Matthieu Baerts Signed-off-by: Christoph Paasch Acked-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250902-net-next-mptcp-misc-feat-6-18-v2-3-fa02bb3188b1@kernel.org Signed-off-by: Jakub Kicinski --- include/net/rps.h | 85 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/net/rps.h b/include/net/rps.h index 9917dce42ca4..f1794cd2e7fb 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -85,11 +85,8 @@ static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, WRITE_ONCE(table->ents[index], val); } -#endif /* CONFIG_RPS */ - -static inline void sock_rps_record_flow_hash(__u32 hash) +static inline void _sock_rps_record_flow_hash(__u32 hash) { -#ifdef CONFIG_RPS struct rps_sock_flow_table *sock_flow_table; if (!hash) @@ -99,42 +96,33 @@ static inline void sock_rps_record_flow_hash(__u32 hash) if (sock_flow_table) rps_record_sock_flow(sock_flow_table, hash); rcu_read_unlock(); -#endif } -static inline void sock_rps_record_flow(const struct sock *sk) +static inline void _sock_rps_record_flow(const struct sock *sk) { -#ifdef CONFIG_RPS - if (static_branch_unlikely(&rfs_needed)) { - /* Reading sk->sk_rxhash might incur an expensive cache line - * miss. - * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) { + /* This READ_ONCE() is paired with the WRITE_ONCE() + * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). */ - if (sk->sk_state == TCP_ESTABLISHED) { - /* This READ_ONCE() is paired with the WRITE_ONCE() - * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). - */ - sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); - } + _sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); } -#endif } -static inline void sock_rps_delete_flow(const struct sock *sk) +static inline void _sock_rps_delete_flow(const struct sock *sk) { -#ifdef CONFIG_RPS struct rps_sock_flow_table *table; u32 hash, index; - if (!static_branch_unlikely(&rfs_needed)) - return; - hash = READ_ONCE(sk->sk_rxhash); if (!hash) return; @@ -147,6 +135,45 @@ static inline void sock_rps_delete_flow(const struct sock *sk) WRITE_ONCE(table->ents[index], RPS_NO_CPU); } rcu_read_unlock(); +} +#endif /* CONFIG_RPS */ + +static inline bool rfs_is_needed(void) +{ +#ifdef CONFIG_RPS + return static_branch_unlikely(&rfs_needed); +#else + return false; +#endif +} + +static inline void sock_rps_record_flow_hash(__u32 hash) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_record_flow_hash(hash); +#endif +} + +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_record_flow(sk); +#endif +} + +static inline void sock_rps_delete_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_delete_flow(sk); #endif } -- cgit v1.2.3 From 781c118c3ece2b546c6c41f617cccb36cb9534f1 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Fri, 29 Aug 2025 13:28:03 +0530 Subject: dt-bindings: clock: Add DISPCC and reset controller for GLYMUR SoC Add the device tree bindings for the display clock controller which are required on Qualcomm Glymur SoC. Signed-off-by: Taniya Das Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250829-glymur-disp-clock-controllers-v1-1-0ce6fabd837c@oss.qualcomm.com [bjorn: Dropped unnecessary include in DT example] Signed-off-by: Bjorn Andersson --- .../bindings/clock/qcom,glymur-dispcc.yaml | 98 ++++++++++++++++++ include/dt-bindings/clock/qcom,glymur-dispcc.h | 114 +++++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/qcom,glymur-dispcc.yaml create mode 100644 include/dt-bindings/clock/qcom,glymur-dispcc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,glymur-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,glymur-dispcc.yaml new file mode 100644 index 000000000000..45f027c70e03 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/qcom,glymur-dispcc.yaml @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/qcom,glymur-dispcc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm Display Clock & Reset Controller on GLYMUR + +maintainers: + - Taniya Das + +description: | + Qualcomm display clock control module which supports the clocks, resets and + power domains for the MDSS instances on GLYMUR SoC. + + See also: + include/dt-bindings/clock/qcom,dispcc-glymur.h + +properties: + compatible: + enum: + - qcom,glymur-dispcc + + clocks: + items: + - description: Board CXO clock + - description: Board sleep clock + - description: DisplayPort 0 link clock + - description: DisplayPort 0 VCO div clock + - description: DisplayPort 1 link clock + - description: DisplayPort 1 VCO div clock + - description: DisplayPort 2 link clock + - description: DisplayPort 2 VCO div clock + - description: DisplayPort 3 link clock + - description: DisplayPort 3 VCO div clock + - description: DSI 0 PLL byte clock + - description: DSI 0 PLL DSI clock + - description: DSI 1 PLL byte clock + - description: DSI 1 PLL DSI clock + - description: Standalone PHY 0 PLL link clock + - description: Standalone PHY 0 VCO div clock + - description: Standalone PHY 1 PLL link clock + - description: Standalone PHY 1 VCO div clock + + power-domains: + description: + A phandle and PM domain specifier for the MMCX power domain. + maxItems: 1 + + required-opps: + description: + A phandle to an OPP node describing required MMCX performance point. + maxItems: 1 + +required: + - compatible + - clocks + - power-domains + - '#power-domain-cells' + +allOf: + - $ref: qcom,gcc.yaml# + +unevaluatedProperties: false + +examples: + - | + #include + #include + + clock-controller@af00000 { + compatible = "qcom,glymur-dispcc"; + reg = <0x0af00000 0x20000>; + clocks = <&rpmhcc RPMH_CXO_CLK>, + <&sleep_clk>, + <&mdss_dp_phy0 0>, + <&mdss_dp_phy0 1>, + <&mdss_dp_phy1 0>, + <&mdss_dp_phy1 1>, + <&mdss_dp_phy2 0>, + <&mdss_dp_phy2 1>, + <&mdss_dp_phy3 0>, + <&mdss_dp_phy3 1>, + <&mdss_dsi0_phy 0>, + <&mdss_dsi0_phy 1>, + <&mdss_dsi1_phy 0>, + <&mdss_dsi1_phy 1>, + <&mdss_phy0_link 0>, + <&mdss_phy0_vco_div 0>, + <&mdss_phy1_link 1>, + <&mdss_phy1_vco_div 1>; + power-domains = <&rpmhpd RPMHPD_MMCX>; + required-opps = <&rpmhpd_opp_low_svs>; + #clock-cells = <1>; + #reset-cells = <1>; + #power-domain-cells = <1>; + }; +... diff --git a/include/dt-bindings/clock/qcom,glymur-dispcc.h b/include/dt-bindings/clock/qcom,glymur-dispcc.h new file mode 100644 index 000000000000..a845d76defe2 --- /dev/null +++ b/include/dt-bindings/clock/qcom,glymur-dispcc.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2025, Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_DISP_CC_GLYMUR_H +#define _DT_BINDINGS_CLK_QCOM_DISP_CC_GLYMUR_H + +/* DISP_CC clocks */ +#define DISP_CC_ESYNC0_CLK 0 +#define DISP_CC_ESYNC0_CLK_SRC 1 +#define DISP_CC_ESYNC1_CLK 2 +#define DISP_CC_ESYNC1_CLK_SRC 3 +#define DISP_CC_MDSS_ACCU_SHIFT_CLK 4 +#define DISP_CC_MDSS_AHB1_CLK 5 +#define DISP_CC_MDSS_AHB_CLK 6 +#define DISP_CC_MDSS_AHB_CLK_SRC 7 +#define DISP_CC_MDSS_BYTE0_CLK 8 +#define DISP_CC_MDSS_BYTE0_CLK_SRC 9 +#define DISP_CC_MDSS_BYTE0_DIV_CLK_SRC 10 +#define DISP_CC_MDSS_BYTE0_INTF_CLK 11 +#define DISP_CC_MDSS_BYTE1_CLK 12 +#define DISP_CC_MDSS_BYTE1_CLK_SRC 13 +#define DISP_CC_MDSS_BYTE1_DIV_CLK_SRC 14 +#define DISP_CC_MDSS_BYTE1_INTF_CLK 15 +#define DISP_CC_MDSS_DPTX0_AUX_CLK 16 +#define DISP_CC_MDSS_DPTX0_AUX_CLK_SRC 17 +#define DISP_CC_MDSS_DPTX0_LINK_CLK 18 +#define DISP_CC_MDSS_DPTX0_LINK_CLK_SRC 19 +#define DISP_CC_MDSS_DPTX0_LINK_DIV_CLK_SRC 20 +#define DISP_CC_MDSS_DPTX0_LINK_DPIN_CLK 21 +#define DISP_CC_MDSS_DPTX0_LINK_DPIN_DIV_CLK_SRC 22 +#define DISP_CC_MDSS_DPTX0_LINK_INTF_CLK 23 +#define DISP_CC_MDSS_DPTX0_PIXEL0_CLK 24 +#define DISP_CC_MDSS_DPTX0_PIXEL0_CLK_SRC 25 +#define DISP_CC_MDSS_DPTX0_PIXEL1_CLK 26 +#define DISP_CC_MDSS_DPTX0_PIXEL1_CLK_SRC 27 +#define DISP_CC_MDSS_DPTX0_USB_ROUTER_LINK_INTF_CLK 28 +#define DISP_CC_MDSS_DPTX1_AUX_CLK 29 +#define DISP_CC_MDSS_DPTX1_AUX_CLK_SRC 30 +#define DISP_CC_MDSS_DPTX1_LINK_CLK 31 +#define DISP_CC_MDSS_DPTX1_LINK_CLK_SRC 32 +#define DISP_CC_MDSS_DPTX1_LINK_DIV_CLK_SRC 33 +#define DISP_CC_MDSS_DPTX1_LINK_DPIN_CLK 34 +#define DISP_CC_MDSS_DPTX1_LINK_DPIN_DIV_CLK_SRC 35 +#define DISP_CC_MDSS_DPTX1_LINK_INTF_CLK 36 +#define DISP_CC_MDSS_DPTX1_PIXEL0_CLK 37 +#define DISP_CC_MDSS_DPTX1_PIXEL0_CLK_SRC 38 +#define DISP_CC_MDSS_DPTX1_PIXEL1_CLK 39 +#define DISP_CC_MDSS_DPTX1_PIXEL1_CLK_SRC 40 +#define DISP_CC_MDSS_DPTX1_USB_ROUTER_LINK_INTF_CLK 41 +#define DISP_CC_MDSS_DPTX2_AUX_CLK 42 +#define DISP_CC_MDSS_DPTX2_AUX_CLK_SRC 43 +#define DISP_CC_MDSS_DPTX2_LINK_CLK 44 +#define DISP_CC_MDSS_DPTX2_LINK_CLK_SRC 45 +#define DISP_CC_MDSS_DPTX2_LINK_DIV_CLK_SRC 46 +#define DISP_CC_MDSS_DPTX2_LINK_DPIN_CLK 47 +#define DISP_CC_MDSS_DPTX2_LINK_DPIN_DIV_CLK_SRC 48 +#define DISP_CC_MDSS_DPTX2_LINK_INTF_CLK 49 +#define DISP_CC_MDSS_DPTX2_PIXEL0_CLK 50 +#define DISP_CC_MDSS_DPTX2_PIXEL0_CLK_SRC 51 +#define DISP_CC_MDSS_DPTX2_PIXEL1_CLK 52 +#define DISP_CC_MDSS_DPTX2_PIXEL1_CLK_SRC 53 +#define DISP_CC_MDSS_DPTX2_USB_ROUTER_LINK_INTF_CLK 54 +#define DISP_CC_MDSS_DPTX3_AUX_CLK 55 +#define DISP_CC_MDSS_DPTX3_AUX_CLK_SRC 56 +#define DISP_CC_MDSS_DPTX3_LINK_CLK 57 +#define DISP_CC_MDSS_DPTX3_LINK_CLK_SRC 58 +#define DISP_CC_MDSS_DPTX3_LINK_DIV_CLK_SRC 59 +#define DISP_CC_MDSS_DPTX3_LINK_DPIN_CLK 60 +#define DISP_CC_MDSS_DPTX3_LINK_DPIN_DIV_CLK_SRC 61 +#define DISP_CC_MDSS_DPTX3_LINK_INTF_CLK 62 +#define DISP_CC_MDSS_DPTX3_PIXEL0_CLK 63 +#define DISP_CC_MDSS_DPTX3_PIXEL0_CLK_SRC 64 +#define DISP_CC_MDSS_ESC0_CLK 65 +#define DISP_CC_MDSS_ESC0_CLK_SRC 66 +#define DISP_CC_MDSS_ESC1_CLK 67 +#define DISP_CC_MDSS_ESC1_CLK_SRC 68 +#define DISP_CC_MDSS_MDP1_CLK 69 +#define DISP_CC_MDSS_MDP_CLK 70 +#define DISP_CC_MDSS_MDP_CLK_SRC 71 +#define DISP_CC_MDSS_MDP_LUT1_CLK 72 +#define DISP_CC_MDSS_MDP_LUT_CLK 73 +#define DISP_CC_MDSS_NON_GDSC_AHB_CLK 74 +#define DISP_CC_MDSS_PCLK0_CLK 75 +#define DISP_CC_MDSS_PCLK0_CLK_SRC 76 +#define DISP_CC_MDSS_PCLK1_CLK 77 +#define DISP_CC_MDSS_PCLK1_CLK_SRC 78 +#define DISP_CC_MDSS_PCLK2_CLK 79 +#define DISP_CC_MDSS_PCLK2_CLK_SRC 80 +#define DISP_CC_MDSS_RSCC_AHB_CLK 81 +#define DISP_CC_MDSS_RSCC_VSYNC_CLK 82 +#define DISP_CC_MDSS_VSYNC1_CLK 83 +#define DISP_CC_MDSS_VSYNC_CLK 84 +#define DISP_CC_MDSS_VSYNC_CLK_SRC 85 +#define DISP_CC_OSC_CLK 86 +#define DISP_CC_OSC_CLK_SRC 87 +#define DISP_CC_PLL0 88 +#define DISP_CC_PLL1 89 +#define DISP_CC_SLEEP_CLK 90 +#define DISP_CC_SLEEP_CLK_SRC 91 +#define DISP_CC_XO_CLK 92 +#define DISP_CC_XO_CLK_SRC 93 + +/* DISP_CC power domains */ +#define DISP_CC_MDSS_CORE_GDSC 0 +#define DISP_CC_MDSS_CORE_INT2_GDSC 1 + +/* DISP_CC resets */ +#define DISP_CC_MDSS_CORE_BCR 0 +#define DISP_CC_MDSS_CORE_INT2_BCR 1 +#define DISP_CC_MDSS_RSCC_BCR 2 + +#endif -- cgit v1.2.3 From 017bda80fd0ddd24ea8cf932c3bd970491e0abc1 Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Tue, 2 Sep 2025 15:46:37 +0000 Subject: genetlink: fix typo in comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this context "not that ..." should properly be "note that ...". Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250902154640.759815-4-ast@fiberby.net Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index a03d56765832..7b84f2cef8b1 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -62,7 +62,7 @@ struct genl_info; * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family * @split_ops: the split do/dump form of operation definition - * @n_split_ops: number of entries in @split_ops, not that with split do/dump + * @n_split_ops: number of entries in @split_ops, note that with split do/dump * ops the number of entries is not the same as number of commands * @sock_priv_size: the size of per-socket private memory * @sock_priv_init: the per-socket private memory initializer -- cgit v1.2.3 From ae5b84788e5a7876a67f64761b7265529cb5a39a Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Mon, 25 Aug 2025 23:49:09 +0530 Subject: dt-bindings: clock: qcom: Document the Glymur SoC TCSR Clock Controller The Glymur SoC TCSR block provides CLKREF clocks for EDP, PCIe and USB. Add this to the TCSR clock controller binding together with identifiers for the clocks. Reviewed-by: Krzysztof Kozlowski Reviewed-by: Bjorn Andersson Signed-off-by: Taniya Das Link: https://lore.kernel.org/r/20250825-glymur-clock-controller-v5-v5-2-01b8c8681bcd@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- .../bindings/clock/qcom,sm8550-tcsr.yaml | 3 +++ include/dt-bindings/clock/qcom,glymur-tcsr.h | 24 ++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,glymur-tcsr.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml index 2ed7d59722fc..2c992b3437f2 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml @@ -8,12 +8,14 @@ title: Qualcomm TCSR Clock Controller on SM8550 maintainers: - Bjorn Andersson + - Taniya Das description: | Qualcomm TCSR clock control module provides the clocks, resets and power domains on SM8550 See also: + - include/dt-bindings/clock/qcom,glymur-tcsr.h - include/dt-bindings/clock/qcom,sm8550-tcsr.h - include/dt-bindings/clock/qcom,sm8650-tcsr.h - include/dt-bindings/clock/qcom,sm8750-tcsr.h @@ -22,6 +24,7 @@ properties: compatible: items: - enum: + - qcom,glymur-tcsr - qcom,milos-tcsr - qcom,sar2130p-tcsr - qcom,sm8550-tcsr diff --git a/include/dt-bindings/clock/qcom,glymur-tcsr.h b/include/dt-bindings/clock/qcom,glymur-tcsr.h new file mode 100644 index 000000000000..72614226b113 --- /dev/null +++ b/include/dt-bindings/clock/qcom,glymur-tcsr.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_TCSR_CC_GLYMUR_H +#define _DT_BINDINGS_CLK_QCOM_TCSR_CC_GLYMUR_H + +/* TCSR_CC clocks */ +#define TCSR_EDP_CLKREF_EN 0 +#define TCSR_PCIE_1_CLKREF_EN 1 +#define TCSR_PCIE_2_CLKREF_EN 2 +#define TCSR_PCIE_3_CLKREF_EN 3 +#define TCSR_PCIE_4_CLKREF_EN 4 +#define TCSR_USB2_1_CLKREF_EN 5 +#define TCSR_USB2_2_CLKREF_EN 6 +#define TCSR_USB2_3_CLKREF_EN 7 +#define TCSR_USB2_4_CLKREF_EN 8 +#define TCSR_USB3_0_CLKREF_EN 9 +#define TCSR_USB3_1_CLKREF_EN 10 +#define TCSR_USB4_1_CLKREF_EN 11 +#define TCSR_USB4_2_CLKREF_EN 12 + +#endif -- cgit v1.2.3 From ee2d967030fee156ceb2de80ef63ddeb80d60779 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Mon, 25 Aug 2025 23:49:13 +0530 Subject: dt-bindings: clock: qcom: document the Glymur Global Clock Controller Add device tree bindings for global clock controller on Glymur SoC. Signed-off-by: Taniya Das Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250825-glymur-clock-controller-v5-v5-6-01b8c8681bcd@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- .../devicetree/bindings/clock/qcom,glymur-gcc.yaml | 121 +++++ include/dt-bindings/clock/qcom,glymur-gcc.h | 578 +++++++++++++++++++++ 2 files changed, 699 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/qcom,glymur-gcc.yaml create mode 100644 include/dt-bindings/clock/qcom,glymur-gcc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,glymur-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,glymur-gcc.yaml new file mode 100644 index 000000000000..b05b0e6c4483 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/qcom,glymur-gcc.yaml @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/qcom,glymur-gcc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm Global Clock & Reset Controller on Glymur SoC + +maintainers: + - Taniya Das + +description: | + Qualcomm global clock control module provides the clocks, resets and power + domains on Glymur SoC. + + See also: include/dt-bindings/clock/qcom,glymur-gcc.h + +properties: + compatible: + const: qcom,glymur-gcc + + clocks: + items: + - description: Board XO source + - description: Board XO_A source + - description: Sleep clock source + - description: USB 0 Phy DP0 GMUX clock source + - description: USB 0 Phy DP1 GMUX clock source + - description: USB 0 Phy PCIE PIPEGMUX clock source + - description: USB 0 Phy PIPEGMUX clock source + - description: USB 0 Phy SYS PCIE PIPEGMUX clock source + - description: USB 1 Phy DP0 GMUX 2 clock source + - description: USB 1 Phy DP1 GMUX 2 clock source + - description: USB 1 Phy PCIE PIPEGMUX clock source + - description: USB 1 Phy PIPEGMUX clock source + - description: USB 1 Phy SYS PCIE PIPEGMUX clock source + - description: USB 2 Phy DP0 GMUX 2 clock source + - description: USB 2 Phy DP1 GMUX 2 clock source + - description: USB 2 Phy PCIE PIPEGMUX clock source + - description: USB 2 Phy PIPEGMUX clock source + - description: USB 2 Phy SYS PCIE PIPEGMUX clock source + - description: PCIe 3a pipe clock + - description: PCIe 3b pipe clock + - description: PCIe 4 pipe clock + - description: PCIe 5 pipe clock + - description: PCIe 6 pipe clock + - description: QUSB4 0 PHY RX 0 clock source + - description: QUSB4 0 PHY RX 1 clock source + - description: QUSB4 1 PHY RX 0 clock source + - description: QUSB4 1 PHY RX 1 clock source + - description: QUSB4 2 PHY RX 0 clock source + - description: QUSB4 2 PHY RX 1 clock source + - description: UFS PHY RX Symbol 0 clock source + - description: UFS PHY RX Symbol 1 clock source + - description: UFS PHY TX Symbol 0 clock source + - description: USB3 PHY 0 pipe clock source + - description: USB3 PHY 1 pipe clock source + - description: USB3 PHY 2 pipe clock source + - description: USB3 UNI PHY pipe 0 clock source + - description: USB3 UNI PHY pipe 1 clock source + - description: USB4 PHY 0 pcie pipe clock source + - description: USB4 PHY 0 Max pipe clock source + - description: USB4 PHY 1 pcie pipe clock source + - description: USB4 PHY 1 Max pipe clock source + - description: USB4 PHY 2 pcie pipe clock source + - description: USB4 PHY 2 Max pipe clock source + +required: + - compatible + - clocks + - '#power-domain-cells' + +allOf: + - $ref: qcom,gcc.yaml# + +unevaluatedProperties: false + +examples: + - | + #include + clock-controller@100000 { + compatible = "qcom,glymur-gcc"; + reg = <0x100000 0x1f9000>; + clocks = <&rpmhcc RPMH_CXO_CLK>, + <&rpmhcc RPMH_CXO_CLK_A>, + <&sleep_clk>, + <&usb_0_phy_dp0_gmux>, + <&usb_0_phy_dp1_gmux>, + <&usb_0_phy_pcie_pipegmux>, + <&usb_0_phy_pipegmux>, + <&usb_0_phy_sys_pcie_pipegmux>, + <&usb_1_phy_dp0_gmux_2>, + <&usb_1_phy_dp1_gmux_2>, + <&usb_1_phy_pcie_pipegmux>, + <&usb_1_phy_pipegmux>, + <&usb_1_phy_sys_pcie_pipegmux>, + <&usb_2_phy_dp0_gmux 2>, + <&usb_2_phy_dp1_gmux 2>, + <&usb_2_phy_pcie_pipegmux>, + <&usb_2_phy_pipegmux>, + <&usb_2_phy_sys_pcie_pipegmux>, + <&pcie_3a_pipe>, <&pcie_3b_pipe>, + <&pcie_4_pipe>, <&pcie_5_pipe>, + <&pcie_6_pipe>, + <&qusb4_0_phy_rx_0>, <&qusb4_0_phy_rx_1>, + <&qusb4_1_phy_rx_0>, <&qusb4_1_phy_rx_1>, + <&qusb4_2_phy_rx_0>, <&qusb4_2_phy_rx_1>, + <&ufs_phy_rx_symbol_0>, <&ufs_phy_rx_symbol_1>, + <&ufs_phy_tx_symbol_0>, + <&usb3_phy_0_pipe>, <&usb3_phy_1_pipe>, + <&usb3_phy_2_pipe>, + <&usb3_uni_phy_pipe_0>, <&usb3_uni_phy_pipe_1>, + <&usb4_phy_0_pcie_pipe>, <&usb4_phy_0_max_pipe>, + <&usb4_phy_1_pcie_pipe>, <&usb4_phy_1_max_pipe>, + <&usb4_phy_2_pcie_pipe>, <&usb4_phy_2_max_pipe>; + #clock-cells = <1>; + #reset-cells = <1>; + #power-domain-cells = <1>; + }; + +... diff --git a/include/dt-bindings/clock/qcom,glymur-gcc.h b/include/dt-bindings/clock/qcom,glymur-gcc.h new file mode 100644 index 000000000000..10c12b8c51c3 --- /dev/null +++ b/include/dt-bindings/clock/qcom,glymur-gcc.h @@ -0,0 +1,578 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_GCC_GLYMUR_H +#define _DT_BINDINGS_CLK_QCOM_GCC_GLYMUR_H + +/* GCC clocks */ +#define GCC_GPLL0 0 +#define GCC_GPLL0_OUT_EVEN 1 +#define GCC_GPLL1 2 +#define GCC_GPLL14 3 +#define GCC_GPLL14_OUT_EVEN 4 +#define GCC_GPLL4 5 +#define GCC_GPLL5 6 +#define GCC_GPLL7 7 +#define GCC_GPLL8 8 +#define GCC_GPLL9 9 +#define GCC_AGGRE_NOC_PCIE_3A_WEST_SF_AXI_CLK 10 +#define GCC_AGGRE_NOC_PCIE_3B_WEST_SF_AXI_CLK 11 +#define GCC_AGGRE_NOC_PCIE_4_WEST_SF_AXI_CLK 12 +#define GCC_AGGRE_NOC_PCIE_5_EAST_SF_AXI_CLK 13 +#define GCC_AGGRE_NOC_PCIE_6_WEST_SF_AXI_CLK 14 +#define GCC_AGGRE_UFS_PHY_AXI_CLK 15 +#define GCC_AGGRE_UFS_PHY_AXI_HW_CTL_CLK 16 +#define GCC_AGGRE_USB2_PRIM_AXI_CLK 17 +#define GCC_AGGRE_USB3_MP_AXI_CLK 18 +#define GCC_AGGRE_USB3_PRIM_AXI_CLK 19 +#define GCC_AGGRE_USB3_SEC_AXI_CLK 20 +#define GCC_AGGRE_USB3_TERT_AXI_CLK 21 +#define GCC_AGGRE_USB4_0_AXI_CLK 22 +#define GCC_AGGRE_USB4_1_AXI_CLK 23 +#define GCC_AGGRE_USB4_2_AXI_CLK 24 +#define GCC_AV1E_AHB_CLK 25 +#define GCC_AV1E_AXI_CLK 26 +#define GCC_AV1E_XO_CLK 27 +#define GCC_BOOT_ROM_AHB_CLK 28 +#define GCC_CAMERA_AHB_CLK 29 +#define GCC_CAMERA_HF_AXI_CLK 30 +#define GCC_CAMERA_SF_AXI_CLK 31 +#define GCC_CAMERA_XO_CLK 32 +#define GCC_CFG_NOC_PCIE_ANOC_AHB_CLK 33 +#define GCC_CFG_NOC_PCIE_ANOC_SOUTH_AHB_CLK 34 +#define GCC_CFG_NOC_USB2_PRIM_AXI_CLK 35 +#define GCC_CFG_NOC_USB3_MP_AXI_CLK 36 +#define GCC_CFG_NOC_USB3_PRIM_AXI_CLK 37 +#define GCC_CFG_NOC_USB3_SEC_AXI_CLK 38 +#define GCC_CFG_NOC_USB3_TERT_AXI_CLK 39 +#define GCC_CFG_NOC_USB_ANOC_AHB_CLK 40 +#define GCC_CFG_NOC_USB_ANOC_SOUTH_AHB_CLK 41 +#define GCC_DISP_AHB_CLK 42 +#define GCC_DISP_HF_AXI_CLK 43 +#define GCC_EVA_AHB_CLK 44 +#define GCC_EVA_AXI0_CLK 45 +#define GCC_EVA_AXI0C_CLK 46 +#define GCC_EVA_XO_CLK 47 +#define GCC_GP1_CLK 48 +#define GCC_GP1_CLK_SRC 49 +#define GCC_GP2_CLK 50 +#define GCC_GP2_CLK_SRC 51 +#define GCC_GP3_CLK 52 +#define GCC_GP3_CLK_SRC 53 +#define GCC_GPU_CFG_AHB_CLK 54 +#define GCC_GPU_GEMNOC_GFX_CLK 55 +#define GCC_GPU_GPLL0_CLK_SRC 56 +#define GCC_GPU_GPLL0_DIV_CLK_SRC 57 +#define GCC_PCIE_0_AUX_CLK 58 +#define GCC_PCIE_0_AUX_CLK_SRC 59 +#define GCC_PCIE_0_CFG_AHB_CLK 60 +#define GCC_PCIE_0_MSTR_AXI_CLK 61 +#define GCC_PCIE_0_PHY_RCHNG_CLK 62 +#define GCC_PCIE_0_PHY_RCHNG_CLK_SRC 63 +#define GCC_PCIE_0_PIPE_CLK 64 +#define GCC_PCIE_0_SLV_AXI_CLK 65 +#define GCC_PCIE_0_SLV_Q2A_AXI_CLK 66 +#define GCC_PCIE_1_AUX_CLK 67 +#define GCC_PCIE_1_AUX_CLK_SRC 68 +#define GCC_PCIE_1_CFG_AHB_CLK 69 +#define GCC_PCIE_1_MSTR_AXI_CLK 70 +#define GCC_PCIE_1_PHY_RCHNG_CLK 71 +#define GCC_PCIE_1_PHY_RCHNG_CLK_SRC 72 +#define GCC_PCIE_1_PIPE_CLK 73 +#define GCC_PCIE_1_SLV_AXI_CLK 74 +#define GCC_PCIE_1_SLV_Q2A_AXI_CLK 75 +#define GCC_PCIE_2_AUX_CLK 76 +#define GCC_PCIE_2_AUX_CLK_SRC 77 +#define GCC_PCIE_2_CFG_AHB_CLK 78 +#define GCC_PCIE_2_MSTR_AXI_CLK 79 +#define GCC_PCIE_2_PHY_RCHNG_CLK 80 +#define GCC_PCIE_2_PHY_RCHNG_CLK_SRC 81 +#define GCC_PCIE_2_PIPE_CLK 82 +#define GCC_PCIE_2_SLV_AXI_CLK 83 +#define GCC_PCIE_2_SLV_Q2A_AXI_CLK 84 +#define GCC_PCIE_3A_AUX_CLK 85 +#define GCC_PCIE_3A_AUX_CLK_SRC 86 +#define GCC_PCIE_3A_CFG_AHB_CLK 87 +#define GCC_PCIE_3A_MSTR_AXI_CLK 88 +#define GCC_PCIE_3A_PHY_RCHNG_CLK 89 +#define GCC_PCIE_3A_PHY_RCHNG_CLK_SRC 90 +#define GCC_PCIE_3A_PIPE_CLK 91 +#define GCC_PCIE_3A_PIPE_CLK_SRC 92 +#define GCC_PCIE_3A_SLV_AXI_CLK 93 +#define GCC_PCIE_3A_SLV_Q2A_AXI_CLK 94 +#define GCC_PCIE_3B_AUX_CLK 95 +#define GCC_PCIE_3B_AUX_CLK_SRC 96 +#define GCC_PCIE_3B_CFG_AHB_CLK 97 +#define GCC_PCIE_3B_MSTR_AXI_CLK 98 +#define GCC_PCIE_3B_PHY_RCHNG_CLK 99 +#define GCC_PCIE_3B_PHY_RCHNG_CLK_SRC 100 +#define GCC_PCIE_3B_PIPE_CLK 101 +#define GCC_PCIE_3B_PIPE_CLK_SRC 102 +#define GCC_PCIE_3B_PIPE_DIV2_CLK 103 +#define GCC_PCIE_3B_PIPE_DIV_CLK_SRC 104 +#define GCC_PCIE_3B_SLV_AXI_CLK 105 +#define GCC_PCIE_3B_SLV_Q2A_AXI_CLK 106 +#define GCC_PCIE_4_AUX_CLK 107 +#define GCC_PCIE_4_AUX_CLK_SRC 108 +#define GCC_PCIE_4_CFG_AHB_CLK 109 +#define GCC_PCIE_4_MSTR_AXI_CLK 110 +#define GCC_PCIE_4_PHY_RCHNG_CLK 111 +#define GCC_PCIE_4_PHY_RCHNG_CLK_SRC 112 +#define GCC_PCIE_4_PIPE_CLK 113 +#define GCC_PCIE_4_PIPE_CLK_SRC 114 +#define GCC_PCIE_4_PIPE_DIV2_CLK 115 +#define GCC_PCIE_4_PIPE_DIV_CLK_SRC 116 +#define GCC_PCIE_4_SLV_AXI_CLK 117 +#define GCC_PCIE_4_SLV_Q2A_AXI_CLK 118 +#define GCC_PCIE_5_AUX_CLK 119 +#define GCC_PCIE_5_AUX_CLK_SRC 120 +#define GCC_PCIE_5_CFG_AHB_CLK 121 +#define GCC_PCIE_5_MSTR_AXI_CLK 122 +#define GCC_PCIE_5_PHY_RCHNG_CLK 123 +#define GCC_PCIE_5_PHY_RCHNG_CLK_SRC 124 +#define GCC_PCIE_5_PIPE_CLK 125 +#define GCC_PCIE_5_PIPE_CLK_SRC 126 +#define GCC_PCIE_5_PIPE_DIV2_CLK 127 +#define GCC_PCIE_5_PIPE_DIV_CLK_SRC 128 +#define GCC_PCIE_5_SLV_AXI_CLK 129 +#define GCC_PCIE_5_SLV_Q2A_AXI_CLK 130 +#define GCC_PCIE_6_AUX_CLK 131 +#define GCC_PCIE_6_AUX_CLK_SRC 132 +#define GCC_PCIE_6_CFG_AHB_CLK 133 +#define GCC_PCIE_6_MSTR_AXI_CLK 134 +#define GCC_PCIE_6_PHY_RCHNG_CLK 135 +#define GCC_PCIE_6_PHY_RCHNG_CLK_SRC 136 +#define GCC_PCIE_6_PIPE_CLK 137 +#define GCC_PCIE_6_PIPE_CLK_SRC 138 +#define GCC_PCIE_6_PIPE_DIV2_CLK 139 +#define GCC_PCIE_6_PIPE_DIV_CLK_SRC 140 +#define GCC_PCIE_6_SLV_AXI_CLK 141 +#define GCC_PCIE_6_SLV_Q2A_AXI_CLK 142 +#define GCC_PCIE_NOC_PWRCTL_CLK 143 +#define GCC_PCIE_NOC_QOSGEN_EXTREF_CLK 144 +#define GCC_PCIE_NOC_SF_CENTER_CLK 145 +#define GCC_PCIE_NOC_SLAVE_SF_EAST_CLK 146 +#define GCC_PCIE_NOC_SLAVE_SF_WEST_CLK 147 +#define GCC_PCIE_NOC_TSCTR_CLK 148 +#define GCC_PCIE_PHY_3A_AUX_CLK 149 +#define GCC_PCIE_PHY_3A_AUX_CLK_SRC 150 +#define GCC_PCIE_PHY_3B_AUX_CLK 151 +#define GCC_PCIE_PHY_3B_AUX_CLK_SRC 152 +#define GCC_PCIE_PHY_4_AUX_CLK 153 +#define GCC_PCIE_PHY_4_AUX_CLK_SRC 154 +#define GCC_PCIE_PHY_5_AUX_CLK 155 +#define GCC_PCIE_PHY_5_AUX_CLK_SRC 156 +#define GCC_PCIE_PHY_6_AUX_CLK 157 +#define GCC_PCIE_PHY_6_AUX_CLK_SRC 158 +#define GCC_PCIE_RSCC_CFG_AHB_CLK 159 +#define GCC_PCIE_RSCC_XO_CLK 160 +#define GCC_PDM2_CLK 161 +#define GCC_PDM2_CLK_SRC 162 +#define GCC_PDM_AHB_CLK 163 +#define GCC_PDM_XO4_CLK 164 +#define GCC_QMIP_AV1E_AHB_CLK 165 +#define GCC_QMIP_CAMERA_CMD_AHB_CLK 166 +#define GCC_QMIP_CAMERA_NRT_AHB_CLK 167 +#define GCC_QMIP_CAMERA_RT_AHB_CLK 168 +#define GCC_QMIP_GPU_AHB_CLK 169 +#define GCC_QMIP_PCIE_3A_AHB_CLK 170 +#define GCC_QMIP_PCIE_3B_AHB_CLK 171 +#define GCC_QMIP_PCIE_4_AHB_CLK 172 +#define GCC_QMIP_PCIE_5_AHB_CLK 173 +#define GCC_QMIP_PCIE_6_AHB_CLK 174 +#define GCC_QMIP_VIDEO_CV_CPU_AHB_CLK 175 +#define GCC_QMIP_VIDEO_CVP_AHB_CLK 176 +#define GCC_QMIP_VIDEO_V_CPU_AHB_CLK 177 +#define GCC_QMIP_VIDEO_VCODEC1_AHB_CLK 178 +#define GCC_QMIP_VIDEO_VCODEC_AHB_CLK 179 +#define GCC_QUPV3_OOB_CORE_2X_CLK 180 +#define GCC_QUPV3_OOB_CORE_CLK 181 +#define GCC_QUPV3_OOB_M_AHB_CLK 182 +#define GCC_QUPV3_OOB_QSPI_S0_CLK 183 +#define GCC_QUPV3_OOB_QSPI_S0_CLK_SRC 184 +#define GCC_QUPV3_OOB_QSPI_S1_CLK 185 +#define GCC_QUPV3_OOB_QSPI_S1_CLK_SRC 186 +#define GCC_QUPV3_OOB_S0_CLK 187 +#define GCC_QUPV3_OOB_S0_CLK_SRC 188 +#define GCC_QUPV3_OOB_S1_CLK 189 +#define GCC_QUPV3_OOB_S1_CLK_SRC 190 +#define GCC_QUPV3_OOB_S_AHB_CLK 191 +#define GCC_QUPV3_OOB_TCXO_CLK 192 +#define GCC_QUPV3_WRAP0_CORE_2X_CLK 193 +#define GCC_QUPV3_WRAP0_CORE_CLK 194 +#define GCC_QUPV3_WRAP0_QSPI_S2_CLK 195 +#define GCC_QUPV3_WRAP0_QSPI_S2_CLK_SRC 196 +#define GCC_QUPV3_WRAP0_QSPI_S3_CLK 197 +#define GCC_QUPV3_WRAP0_QSPI_S3_CLK_SRC 198 +#define GCC_QUPV3_WRAP0_QSPI_S6_CLK 199 +#define GCC_QUPV3_WRAP0_QSPI_S6_CLK_SRC 200 +#define GCC_QUPV3_WRAP0_S0_CLK 201 +#define GCC_QUPV3_WRAP0_S0_CLK_SRC 202 +#define GCC_QUPV3_WRAP0_S1_CLK 203 +#define GCC_QUPV3_WRAP0_S1_CLK_SRC 204 +#define GCC_QUPV3_WRAP0_S2_CLK 205 +#define GCC_QUPV3_WRAP0_S2_CLK_SRC 206 +#define GCC_QUPV3_WRAP0_S3_CLK 207 +#define GCC_QUPV3_WRAP0_S3_CLK_SRC 208 +#define GCC_QUPV3_WRAP0_S4_CLK 209 +#define GCC_QUPV3_WRAP0_S4_CLK_SRC 210 +#define GCC_QUPV3_WRAP0_S5_CLK 211 +#define GCC_QUPV3_WRAP0_S5_CLK_SRC 212 +#define GCC_QUPV3_WRAP0_S6_CLK 213 +#define GCC_QUPV3_WRAP0_S6_CLK_SRC 214 +#define GCC_QUPV3_WRAP0_S7_CLK 215 +#define GCC_QUPV3_WRAP0_S7_CLK_SRC 216 +#define GCC_QUPV3_WRAP1_CORE_2X_CLK 217 +#define GCC_QUPV3_WRAP1_CORE_CLK 218 +#define GCC_QUPV3_WRAP1_QSPI_S2_CLK 219 +#define GCC_QUPV3_WRAP1_QSPI_S2_CLK_SRC 220 +#define GCC_QUPV3_WRAP1_QSPI_S3_CLK 221 +#define GCC_QUPV3_WRAP1_QSPI_S3_CLK_SRC 222 +#define GCC_QUPV3_WRAP1_QSPI_S6_CLK 223 +#define GCC_QUPV3_WRAP1_QSPI_S6_CLK_SRC 224 +#define GCC_QUPV3_WRAP1_S0_CLK 225 +#define GCC_QUPV3_WRAP1_S0_CLK_SRC 226 +#define GCC_QUPV3_WRAP1_S1_CLK 227 +#define GCC_QUPV3_WRAP1_S1_CLK_SRC 228 +#define GCC_QUPV3_WRAP1_S2_CLK 229 +#define GCC_QUPV3_WRAP1_S2_CLK_SRC 230 +#define GCC_QUPV3_WRAP1_S3_CLK 231 +#define GCC_QUPV3_WRAP1_S3_CLK_SRC 232 +#define GCC_QUPV3_WRAP1_S4_CLK 233 +#define GCC_QUPV3_WRAP1_S4_CLK_SRC 234 +#define GCC_QUPV3_WRAP1_S5_CLK 235 +#define GCC_QUPV3_WRAP1_S5_CLK_SRC 236 +#define GCC_QUPV3_WRAP1_S6_CLK 237 +#define GCC_QUPV3_WRAP1_S6_CLK_SRC 238 +#define GCC_QUPV3_WRAP1_S7_CLK 239 +#define GCC_QUPV3_WRAP1_S7_CLK_SRC 240 +#define GCC_QUPV3_WRAP2_CORE_2X_CLK 241 +#define GCC_QUPV3_WRAP2_CORE_CLK 242 +#define GCC_QUPV3_WRAP2_QSPI_S2_CLK 243 +#define GCC_QUPV3_WRAP2_QSPI_S2_CLK_SRC 244 +#define GCC_QUPV3_WRAP2_QSPI_S3_CLK 245 +#define GCC_QUPV3_WRAP2_QSPI_S3_CLK_SRC 246 +#define GCC_QUPV3_WRAP2_QSPI_S6_CLK 247 +#define GCC_QUPV3_WRAP2_QSPI_S6_CLK_SRC 248 +#define GCC_QUPV3_WRAP2_S0_CLK 249 +#define GCC_QUPV3_WRAP2_S0_CLK_SRC 250 +#define GCC_QUPV3_WRAP2_S1_CLK 251 +#define GCC_QUPV3_WRAP2_S1_CLK_SRC 252 +#define GCC_QUPV3_WRAP2_S2_CLK 253 +#define GCC_QUPV3_WRAP2_S2_CLK_SRC 254 +#define GCC_QUPV3_WRAP2_S3_CLK 255 +#define GCC_QUPV3_WRAP2_S3_CLK_SRC 256 +#define GCC_QUPV3_WRAP2_S4_CLK 257 +#define GCC_QUPV3_WRAP2_S4_CLK_SRC 258 +#define GCC_QUPV3_WRAP2_S5_CLK 259 +#define GCC_QUPV3_WRAP2_S5_CLK_SRC 260 +#define GCC_QUPV3_WRAP2_S6_CLK 261 +#define GCC_QUPV3_WRAP2_S6_CLK_SRC 262 +#define GCC_QUPV3_WRAP2_S7_CLK 263 +#define GCC_QUPV3_WRAP2_S7_CLK_SRC 264 +#define GCC_QUPV3_WRAP_0_M_AHB_CLK 265 +#define GCC_QUPV3_WRAP_0_S_AHB_CLK 266 +#define GCC_QUPV3_WRAP_1_M_AHB_CLK 267 +#define GCC_QUPV3_WRAP_1_S_AHB_CLK 268 +#define GCC_QUPV3_WRAP_2_M_AHB_CLK 269 +#define GCC_QUPV3_WRAP_2_S_AHB_CLK 270 +#define GCC_SDCC2_AHB_CLK 271 +#define GCC_SDCC2_APPS_CLK 272 +#define GCC_SDCC2_APPS_CLK_SRC 273 +#define GCC_SDCC4_AHB_CLK 274 +#define GCC_SDCC4_APPS_CLK 275 +#define GCC_SDCC4_APPS_CLK_SRC 276 +#define GCC_UFS_PHY_AHB_CLK 277 +#define GCC_UFS_PHY_AXI_CLK 278 +#define GCC_UFS_PHY_AXI_CLK_SRC 279 +#define GCC_UFS_PHY_AXI_HW_CTL_CLK 280 +#define GCC_UFS_PHY_ICE_CORE_CLK 281 +#define GCC_UFS_PHY_ICE_CORE_CLK_SRC 282 +#define GCC_UFS_PHY_ICE_CORE_HW_CTL_CLK 283 +#define GCC_UFS_PHY_PHY_AUX_CLK 284 +#define GCC_UFS_PHY_PHY_AUX_CLK_SRC 285 +#define GCC_UFS_PHY_PHY_AUX_HW_CTL_CLK 286 +#define GCC_UFS_PHY_RX_SYMBOL_0_CLK 287 +#define GCC_UFS_PHY_RX_SYMBOL_0_CLK_SRC 288 +#define GCC_UFS_PHY_RX_SYMBOL_1_CLK 289 +#define GCC_UFS_PHY_RX_SYMBOL_1_CLK_SRC 290 +#define GCC_UFS_PHY_TX_SYMBOL_0_CLK 291 +#define GCC_UFS_PHY_TX_SYMBOL_0_CLK_SRC 292 +#define GCC_UFS_PHY_UNIPRO_CORE_CLK 293 +#define GCC_UFS_PHY_UNIPRO_CORE_CLK_SRC 294 +#define GCC_UFS_PHY_UNIPRO_CORE_HW_CTL_CLK 295 +#define GCC_USB20_MASTER_CLK 296 +#define GCC_USB20_MASTER_CLK_SRC 297 +#define GCC_USB20_MOCK_UTMI_CLK 298 +#define GCC_USB20_MOCK_UTMI_CLK_SRC 299 +#define GCC_USB20_MOCK_UTMI_POSTDIV_CLK_SRC 300 +#define GCC_USB20_SLEEP_CLK 301 +#define GCC_USB30_MP_MASTER_CLK 302 +#define GCC_USB30_MP_MASTER_CLK_SRC 303 +#define GCC_USB30_MP_MOCK_UTMI_CLK 304 +#define GCC_USB30_MP_MOCK_UTMI_CLK_SRC 305 +#define GCC_USB30_MP_MOCK_UTMI_POSTDIV_CLK_SRC 306 +#define GCC_USB30_MP_SLEEP_CLK 307 +#define GCC_USB30_PRIM_MASTER_CLK 308 +#define GCC_USB30_PRIM_MASTER_CLK_SRC 309 +#define GCC_USB30_PRIM_MOCK_UTMI_CLK 310 +#define GCC_USB30_PRIM_MOCK_UTMI_CLK_SRC 311 +#define GCC_USB30_PRIM_MOCK_UTMI_POSTDIV_CLK_SRC 312 +#define GCC_USB30_PRIM_SLEEP_CLK 313 +#define GCC_USB30_SEC_MASTER_CLK 314 +#define GCC_USB30_SEC_MASTER_CLK_SRC 315 +#define GCC_USB30_SEC_MOCK_UTMI_CLK 316 +#define GCC_USB30_SEC_MOCK_UTMI_CLK_SRC 317 +#define GCC_USB30_SEC_MOCK_UTMI_POSTDIV_CLK_SRC 318 +#define GCC_USB30_SEC_SLEEP_CLK 319 +#define GCC_USB30_TERT_MASTER_CLK 320 +#define GCC_USB30_TERT_MASTER_CLK_SRC 321 +#define GCC_USB30_TERT_MOCK_UTMI_CLK 322 +#define GCC_USB30_TERT_MOCK_UTMI_CLK_SRC 323 +#define GCC_USB30_TERT_MOCK_UTMI_POSTDIV_CLK_SRC 324 +#define GCC_USB30_TERT_SLEEP_CLK 325 +#define GCC_USB34_PRIM_PHY_PIPE_CLK_SRC 326 +#define GCC_USB34_SEC_PHY_PIPE_CLK_SRC 327 +#define GCC_USB34_TERT_PHY_PIPE_CLK_SRC 328 +#define GCC_USB3_MP_PHY_AUX_CLK 329 +#define GCC_USB3_MP_PHY_AUX_CLK_SRC 330 +#define GCC_USB3_MP_PHY_COM_AUX_CLK 331 +#define GCC_USB3_MP_PHY_PIPE_0_CLK 332 +#define GCC_USB3_MP_PHY_PIPE_0_CLK_SRC 333 +#define GCC_USB3_MP_PHY_PIPE_1_CLK 334 +#define GCC_USB3_MP_PHY_PIPE_1_CLK_SRC 335 +#define GCC_USB3_PRIM_PHY_AUX_CLK 336 +#define GCC_USB3_PRIM_PHY_AUX_CLK_SRC 337 +#define GCC_USB3_PRIM_PHY_COM_AUX_CLK 338 +#define GCC_USB3_PRIM_PHY_PIPE_CLK 339 +#define GCC_USB3_PRIM_PHY_PIPE_CLK_SRC 340 +#define GCC_USB3_SEC_PHY_AUX_CLK 341 +#define GCC_USB3_SEC_PHY_AUX_CLK_SRC 342 +#define GCC_USB3_SEC_PHY_COM_AUX_CLK 343 +#define GCC_USB3_SEC_PHY_PIPE_CLK 344 +#define GCC_USB3_SEC_PHY_PIPE_CLK_SRC 345 +#define GCC_USB3_TERT_PHY_AUX_CLK 346 +#define GCC_USB3_TERT_PHY_AUX_CLK_SRC 347 +#define GCC_USB3_TERT_PHY_COM_AUX_CLK 348 +#define GCC_USB3_TERT_PHY_PIPE_CLK 349 +#define GCC_USB3_TERT_PHY_PIPE_CLK_SRC 350 +#define GCC_USB4_0_CFG_AHB_CLK 351 +#define GCC_USB4_0_DP0_CLK 352 +#define GCC_USB4_0_DP1_CLK 353 +#define GCC_USB4_0_MASTER_CLK 354 +#define GCC_USB4_0_MASTER_CLK_SRC 355 +#define GCC_USB4_0_PHY_DP0_CLK_SRC 356 +#define GCC_USB4_0_PHY_DP0_GMUX_CLK_SRC 357 +#define GCC_USB4_0_PHY_DP1_CLK_SRC 358 +#define GCC_USB4_0_PHY_DP1_GMUX_CLK_SRC 359 +#define GCC_USB4_0_PHY_P2RR2P_PIPE_CLK 360 +#define GCC_USB4_0_PHY_P2RR2P_PIPE_CLK_SRC 361 +#define GCC_USB4_0_PHY_PCIE_PIPE_CLK 362 +#define GCC_USB4_0_PHY_PCIE_PIPE_CLK_SRC 363 +#define GCC_USB4_0_PHY_PCIE_PIPE_MUX_CLK_SRC 364 +#define GCC_USB4_0_PHY_PCIE_PIPEGMUX_CLK_SRC 365 +#define GCC_USB4_0_PHY_PIPEGMUX_CLK_SRC 366 +#define GCC_USB4_0_PHY_RX0_CLK 367 +#define GCC_USB4_0_PHY_RX0_CLK_SRC 368 +#define GCC_USB4_0_PHY_RX1_CLK 369 +#define GCC_USB4_0_PHY_RX1_CLK_SRC 370 +#define GCC_USB4_0_PHY_SYS_CLK_SRC 371 +#define GCC_USB4_0_PHY_SYS_PIPEGMUX_CLK_SRC 372 +#define GCC_USB4_0_PHY_USB_PIPE_CLK 373 +#define GCC_USB4_0_SB_IF_CLK 374 +#define GCC_USB4_0_SB_IF_CLK_SRC 375 +#define GCC_USB4_0_SYS_CLK 376 +#define GCC_USB4_0_TMU_CLK 377 +#define GCC_USB4_0_TMU_CLK_SRC 378 +#define GCC_USB4_0_UC_HRR_CLK 379 +#define GCC_USB4_1_CFG_AHB_CLK 380 +#define GCC_USB4_1_DP0_CLK 381 +#define GCC_USB4_1_DP1_CLK 382 +#define GCC_USB4_1_MASTER_CLK 383 +#define GCC_USB4_1_MASTER_CLK_SRC 384 +#define GCC_USB4_1_PHY_DP0_CLK_SRC 385 +#define GCC_USB4_1_PHY_DP0_GMUX_2_CLK_SRC 386 +#define GCC_USB4_1_PHY_DP1_CLK_SRC 387 +#define GCC_USB4_1_PHY_DP1_GMUX_2_CLK_SRC 388 +#define GCC_USB4_1_PHY_P2RR2P_PIPE_CLK 389 +#define GCC_USB4_1_PHY_P2RR2P_PIPE_CLK_SRC 390 +#define GCC_USB4_1_PHY_PCIE_PIPE_CLK 391 +#define GCC_USB4_1_PHY_PCIE_PIPE_CLK_SRC 392 +#define GCC_USB4_1_PHY_PCIE_PIPE_MUX_CLK_SRC 393 +#define GCC_USB4_1_PHY_PCIE_PIPEGMUX_CLK_SRC 394 +#define GCC_USB4_1_PHY_PIPEGMUX_CLK_SRC 395 +#define GCC_USB4_1_PHY_PLL_PIPE_CLK_SRC 396 +#define GCC_USB4_1_PHY_RX0_CLK 397 +#define GCC_USB4_1_PHY_RX0_CLK_SRC 398 +#define GCC_USB4_1_PHY_RX1_CLK 399 +#define GCC_USB4_1_PHY_RX1_CLK_SRC 400 +#define GCC_USB4_1_PHY_SYS_CLK_SRC 401 +#define GCC_USB4_1_PHY_SYS_PIPEGMUX_CLK_SRC 402 +#define GCC_USB4_1_PHY_USB_PIPE_CLK 403 +#define GCC_USB4_1_SB_IF_CLK 404 +#define GCC_USB4_1_SB_IF_CLK_SRC 405 +#define GCC_USB4_1_SYS_CLK 406 +#define GCC_USB4_1_TMU_CLK 407 +#define GCC_USB4_1_TMU_CLK_SRC 408 +#define GCC_USB4_1_UC_HRR_CLK 409 +#define GCC_USB4_2_CFG_AHB_CLK 410 +#define GCC_USB4_2_DP0_CLK 411 +#define GCC_USB4_2_DP1_CLK 412 +#define GCC_USB4_2_MASTER_CLK 413 +#define GCC_USB4_2_MASTER_CLK_SRC 414 +#define GCC_USB4_2_PHY_DP0_CLK_SRC 415 +#define GCC_USB4_2_PHY_DP0_GMUX_CLK_SRC 416 +#define GCC_USB4_2_PHY_DP1_CLK_SRC 417 +#define GCC_USB4_2_PHY_DP1_GMUX_CLK_SRC 418 +#define GCC_USB4_2_PHY_P2RR2P_PIPE_CLK 419 +#define GCC_USB4_2_PHY_P2RR2P_PIPE_CLK_SRC 420 +#define GCC_USB4_2_PHY_PCIE_PIPE_CLK 421 +#define GCC_USB4_2_PHY_PCIE_PIPE_CLK_SRC 422 +#define GCC_USB4_2_PHY_PCIE_PIPE_MUX_CLK_SRC 423 +#define GCC_USB4_2_PHY_PCIE_PIPEGMUX_CLK_SRC 424 +#define GCC_USB4_2_PHY_PIPEGMUX_CLK_SRC 425 +#define GCC_USB4_2_PHY_RX0_CLK 426 +#define GCC_USB4_2_PHY_RX0_CLK_SRC 427 +#define GCC_USB4_2_PHY_RX1_CLK 428 +#define GCC_USB4_2_PHY_RX1_CLK_SRC 429 +#define GCC_USB4_2_PHY_SYS_CLK_SRC 430 +#define GCC_USB4_2_PHY_SYS_PIPEGMUX_CLK_SRC 431 +#define GCC_USB4_2_PHY_USB_PIPE_CLK 432 +#define GCC_USB4_2_SB_IF_CLK 433 +#define GCC_USB4_2_SB_IF_CLK_SRC 434 +#define GCC_USB4_2_SYS_CLK 435 +#define GCC_USB4_2_TMU_CLK 436 +#define GCC_USB4_2_TMU_CLK_SRC 437 +#define GCC_USB4_2_UC_HRR_CLK 438 +#define GCC_VIDEO_AHB_CLK 439 +#define GCC_VIDEO_AXI0_CLK 440 +#define GCC_VIDEO_AXI0C_CLK 441 +#define GCC_VIDEO_AXI1_CLK 442 +#define GCC_VIDEO_XO_CLK 443 + +/* GCC power domains */ +#define GCC_PCIE_0_TUNNEL_GDSC 0 +#define GCC_PCIE_1_TUNNEL_GDSC 1 +#define GCC_PCIE_2_TUNNEL_GDSC 2 +#define GCC_PCIE_3A_GDSC 3 +#define GCC_PCIE_3A_PHY_GDSC 4 +#define GCC_PCIE_3B_GDSC 5 +#define GCC_PCIE_3B_PHY_GDSC 6 +#define GCC_PCIE_4_GDSC 7 +#define GCC_PCIE_4_PHY_GDSC 8 +#define GCC_PCIE_5_GDSC 9 +#define GCC_PCIE_5_PHY_GDSC 10 +#define GCC_PCIE_6_GDSC 11 +#define GCC_PCIE_6_PHY_GDSC 12 +#define GCC_UFS_PHY_GDSC 13 +#define GCC_USB20_PRIM_GDSC 14 +#define GCC_USB30_MP_GDSC 15 +#define GCC_USB30_PRIM_GDSC 16 +#define GCC_USB30_SEC_GDSC 17 +#define GCC_USB30_TERT_GDSC 18 +#define GCC_USB3_MP_SS0_PHY_GDSC 19 +#define GCC_USB3_MP_SS1_PHY_GDSC 20 +#define GCC_USB4_0_GDSC 21 +#define GCC_USB4_1_GDSC 22 +#define GCC_USB4_2_GDSC 23 +#define GCC_USB_0_PHY_GDSC 24 +#define GCC_USB_1_PHY_GDSC 25 +#define GCC_USB_2_PHY_GDSC 26 + +/* GCC resets */ +#define GCC_AV1E_BCR 0 +#define GCC_CAMERA_BCR 1 +#define GCC_DISPLAY_BCR 2 +#define GCC_EVA_BCR 3 +#define GCC_GPU_BCR 4 +#define GCC_PCIE_0_LINK_DOWN_BCR 5 +#define GCC_PCIE_0_NOCSR_COM_PHY_BCR 6 +#define GCC_PCIE_0_PHY_BCR 7 +#define GCC_PCIE_0_PHY_NOCSR_COM_PHY_BCR 8 +#define GCC_PCIE_0_TUNNEL_BCR 9 +#define GCC_PCIE_1_LINK_DOWN_BCR 10 +#define GCC_PCIE_1_NOCSR_COM_PHY_BCR 11 +#define GCC_PCIE_1_PHY_BCR 12 +#define GCC_PCIE_1_PHY_NOCSR_COM_PHY_BCR 13 +#define GCC_PCIE_1_TUNNEL_BCR 14 +#define GCC_PCIE_2_LINK_DOWN_BCR 15 +#define GCC_PCIE_2_NOCSR_COM_PHY_BCR 16 +#define GCC_PCIE_2_PHY_BCR 17 +#define GCC_PCIE_2_PHY_NOCSR_COM_PHY_BCR 18 +#define GCC_PCIE_2_TUNNEL_BCR 19 +#define GCC_PCIE_3A_BCR 20 +#define GCC_PCIE_3A_LINK_DOWN_BCR 21 +#define GCC_PCIE_3A_NOCSR_COM_PHY_BCR 22 +#define GCC_PCIE_3A_PHY_BCR 23 +#define GCC_PCIE_3A_PHY_NOCSR_COM_PHY_BCR 24 +#define GCC_PCIE_3B_BCR 25 +#define GCC_PCIE_3B_LINK_DOWN_BCR 26 +#define GCC_PCIE_3B_NOCSR_COM_PHY_BCR 27 +#define GCC_PCIE_3B_PHY_BCR 28 +#define GCC_PCIE_3B_PHY_NOCSR_COM_PHY_BCR 29 +#define GCC_PCIE_4_BCR 30 +#define GCC_PCIE_4_LINK_DOWN_BCR 31 +#define GCC_PCIE_4_NOCSR_COM_PHY_BCR 32 +#define GCC_PCIE_4_PHY_BCR 33 +#define GCC_PCIE_4_PHY_NOCSR_COM_PHY_BCR 34 +#define GCC_PCIE_5_BCR 35 +#define GCC_PCIE_5_LINK_DOWN_BCR 36 +#define GCC_PCIE_5_NOCSR_COM_PHY_BCR 37 +#define GCC_PCIE_5_PHY_BCR 38 +#define GCC_PCIE_5_PHY_NOCSR_COM_PHY_BCR 39 +#define GCC_PCIE_6_BCR 40 +#define GCC_PCIE_6_LINK_DOWN_BCR 41 +#define GCC_PCIE_6_NOCSR_COM_PHY_BCR 42 +#define GCC_PCIE_6_PHY_BCR 43 +#define GCC_PCIE_6_PHY_NOCSR_COM_PHY_BCR 44 +#define GCC_PCIE_NOC_BCR 45 +#define GCC_PCIE_PHY_BCR 46 +#define GCC_PCIE_PHY_CFG_AHB_BCR 47 +#define GCC_PCIE_PHY_COM_BCR 48 +#define GCC_PCIE_RSCC_BCR 49 +#define GCC_PDM_BCR 50 +#define GCC_QUPV3_WRAPPER_0_BCR 51 +#define GCC_QUPV3_WRAPPER_1_BCR 52 +#define GCC_QUPV3_WRAPPER_2_BCR 53 +#define GCC_QUPV3_WRAPPER_OOB_BCR 54 +#define GCC_QUSB2PHY_HS0_MP_BCR 55 +#define GCC_QUSB2PHY_HS1_MP_BCR 56 +#define GCC_QUSB2PHY_PRIM_BCR 57 +#define GCC_QUSB2PHY_SEC_BCR 58 +#define GCC_QUSB2PHY_TERT_BCR 59 +#define GCC_QUSB2PHY_USB20_HS_BCR 60 +#define GCC_SDCC2_BCR 61 +#define GCC_SDCC4_BCR 62 +#define GCC_TCSR_PCIE_BCR 63 +#define GCC_UFS_PHY_BCR 64 +#define GCC_USB20_PRIM_BCR 65 +#define GCC_USB30_MP_BCR 66 +#define GCC_USB30_PRIM_BCR 67 +#define GCC_USB30_SEC_BCR 68 +#define GCC_USB30_TERT_BCR 69 +#define GCC_USB3_MP_SS0_PHY_BCR 70 +#define GCC_USB3_MP_SS1_PHY_BCR 71 +#define GCC_USB3_PHY_PRIM_BCR 72 +#define GCC_USB3_PHY_SEC_BCR 73 +#define GCC_USB3_PHY_TERT_BCR 74 +#define GCC_USB3_UNIPHY_MP0_BCR 75 +#define GCC_USB3_UNIPHY_MP1_BCR 76 +#define GCC_USB3PHY_PHY_PRIM_BCR 77 +#define GCC_USB3PHY_PHY_SEC_BCR 78 +#define GCC_USB3PHY_PHY_TERT_BCR 79 +#define GCC_USB3UNIPHY_PHY_MP0_BCR 80 +#define GCC_USB3UNIPHY_PHY_MP1_BCR 81 +#define GCC_USB4_0_BCR 82 +#define GCC_USB4_0_DP0_PHY_PRIM_BCR 83 +#define GCC_USB4_1_BCR 84 +#define GCC_USB4_2_BCR 85 +#define GCC_USB_0_PHY_BCR 86 +#define GCC_USB_1_PHY_BCR 87 +#define GCC_USB_2_PHY_BCR 88 +#define GCC_VIDEO_AXI0_CLK_ARES 89 +#define GCC_VIDEO_AXI1_CLK_ARES 90 +#define GCC_VIDEO_BCR 91 + +#endif -- cgit v1.2.3 From dd386b0d5e61556927189cd7b59a628d22cb6851 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 1 Sep 2025 19:26:07 -0600 Subject: io_uring/uring_cmd: correct io_uring_cmd_done() ret type io_uring_cmd_done() takes the result code for the CQE as a ssize_t ret argument. However, the CQE res field is a s32 value, as is the argument to io_req_set_res(). To clarify that only s32 values can be faithfully represented without truncation, change io_uring_cmd_done()'s ret argument type to s32. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250902012609.1513123-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 4 ++-- io_uring/uring_cmd.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 7211157edfe9..c4d7874016bb 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -56,7 +56,7 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, * Note: the caller should never hard code @issue_flags and is only allowed * to pass the mask provided by the core io_uring code. */ -void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2, +void io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, unsigned issue_flags); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, @@ -104,7 +104,7 @@ static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, { return -EOPNOTSUPP; } -static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 ret2, unsigned issue_flags) { } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index d76d6d27765c..5562e8491c5b 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -151,7 +151,7 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, unsigned issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); -- cgit v1.2.3 From 9f8608fce90fbcd2a98ceefad0bc762423927629 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 1 Sep 2025 19:33:27 -0600 Subject: io_uring/cmd: remove unused io_uring_cmd_iopoll_done() io_uring_cmd_iopoll_done()'s only caller was removed in commit 9ce6c9875f3e ("nvme: always punt polled uring_cmd end_io work to task_work"). So remove the unused function too. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250902013328.1517686-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c4d7874016bb..50dd6a53cb5e 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -133,17 +133,6 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, } #endif -/* - * Polled completions must ensure they are coming from a poll queue, and - * hence are completed inside the usual poll handling loops. - */ -static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd, - ssize_t ret, ssize_t res2) -{ - lockdep_assert(in_task()); - io_uring_cmd_done(ioucmd, ret, res2, 0); -} - /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, io_uring_cmd_tw_t task_work_cb) -- cgit v1.2.3 From 21f82062d0f241e55dd59eb630e8710862cc90b4 Mon Sep 17 00:00:00 2001 From: Juraj Šarinay Date: Tue, 2 Sep 2025 13:36:28 +0200 Subject: net: nfc: nci: Increase NCI_DATA_TIMEOUT to 3000 ms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An exchange with a NFC target must complete within NCI_DATA_TIMEOUT. A delay of 700 ms is not sufficient for cryptographic operations on smart cards. CardOS 6.0 may need up to 1.3 seconds to perform 256-bit ECDH or 3072-bit RSA. To prevent brute-force attacks, passports and similar documents introduce even longer delays into access control protocols (BAC/PACE). The timeout should be higher, but not too much. The expiration allows us to detect that a NFC target has disappeared. Signed-off-by: Juraj Šarinay Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250902113630.62393-1-juraj@sarinay.com Signed-off-by: Jakub Kicinski --- include/net/nfc/nci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index e180bdf2f82b..664d5058e66e 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -52,7 +52,7 @@ enum nci_state { #define NCI_RF_DISC_SELECT_TIMEOUT 5000 #define NCI_RF_DEACTIVATE_TIMEOUT 30000 #define NCI_CMD_TIMEOUT 5000 -#define NCI_DATA_TIMEOUT 700 +#define NCI_DATA_TIMEOUT 3000 struct nci_dev; -- cgit v1.2.3 From 8c94db0ae97c72c253a615f990bd466b456e94f6 Mon Sep 17 00:00:00 2001 From: Svetlana Parfenova Date: Mon, 1 Sep 2025 20:53:50 +0700 Subject: binfmt_elf: preserve original ELF e_flags for core dumps Some architectures, such as RISC-V, use the ELF e_flags field to encode ABI-specific information (e.g., ISA extensions, fpu support). Debuggers like GDB rely on these flags in core dumps to correctly interpret optional register sets. If the flags are missing or incorrect, GDB may warn and ignore valid data, for example: warning: Unexpected size of section '.reg2/213' in core file. This can prevent access to fpu or other architecture-specific registers even when they were dumped. Save the e_flags field during ELF binary loading (in load_elf_binary()) into the mm_struct, and later retrieve it during core dump generation (in fill_note_info()). Kconfig option CONFIG_ARCH_HAS_ELF_CORE_EFLAGS is introduced for architectures that require this behaviour. Signed-off-by: Svetlana Parfenova Link: https://lore.kernel.org/r/20250901135350.619485-1-svetlana.parfenova@syntacore.com Signed-off-by: Kees Cook --- arch/riscv/Kconfig | 1 + fs/Kconfig.binfmt | 9 +++++++++ fs/binfmt_elf.c | 40 ++++++++++++++++++++++++++++++++++------ include/linux/mm_types.h | 5 +++++ 4 files changed, 49 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659e..b06a758abb63 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -28,6 +28,7 @@ config RISCV select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX + select ARCH_HAS_ELF_CORE_EFLAGS select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index bd2f530e5740..1949e25c7741 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -184,4 +184,13 @@ config EXEC_KUNIT_TEST This builds the exec KUnit tests, which tests boundary conditions of various aspects of the exec internals. +config ARCH_HAS_ELF_CORE_EFLAGS + bool + depends on BINFMT_ELF && ELF_CORE + default n + help + Select this option if the architecture makes use of the e_flags + field in the ELF header to store ABI or other architecture-specific + information that should be preserved in core dumps. + endmenu diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4aacf9c9cc2d..e4653bb99946 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -103,6 +103,21 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) +static inline void elf_coredump_set_mm_eflags(struct mm_struct *mm, u32 flags) +{ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + mm->saved_e_flags = flags; +#endif +} + +static inline u32 elf_coredump_get_mm_eflags(struct mm_struct *mm, u32 flags) +{ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + flags = mm->saved_e_flags; +#endif + return flags; +} + /* * We need to explicitly zero any trailing portion of the page that follows * p_filesz when it ends before the page ends (e.g. bss), otherwise this @@ -1290,6 +1305,8 @@ out_free_interp: mm->end_data = end_data; mm->start_stack = bprm->p; + elf_coredump_set_mm_eflags(mm, elf_ex->e_flags); + /** * DOC: "brk" handling * @@ -1804,6 +1821,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; + u16 machine; + u32 flags; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) @@ -1831,17 +1850,26 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; } - /* - * Initialize the ELF file header. - */ - fill_elf_header(elf, phdrs, - view->e_machine, view->e_flags); + machine = view->e_machine; + flags = view->e_flags; #else view = NULL; info->thread_notes = 2; - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); + machine = ELF_ARCH; + flags = ELF_CORE_EFLAGS; #endif + /* + * Override ELF e_flags with value taken from process, + * if arch needs that. + */ + flags = elf_coredump_get_mm_eflags(dump_task->mm, flags); + + /* + * Initialize the ELF file header. + */ + fill_elf_header(elf, phdrs, machine, flags); + /* * Allocate a structure for each thread. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db93..04a2857f12f2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1102,6 +1102,11 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + /* the ABI-related flags from the ELF header. Used for core dump */ + unsigned long saved_e_flags; +#endif + struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; -- cgit v1.2.3 From 3ceb08838b576b20108d7facf6baa3dbf792afe9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Sep 2025 14:12:10 -0700 Subject: net: add helper to pre-check if PP for an Rx queue will be unreadable mlx5 pokes into the rxq state to check if the queue has a memory provider, and therefore whether it may produce unreadable mem. Add a helper for doing this in the page pool API. fbnic will want a similar thing (tho, for a slightly different reason). Reviewed-by: Mina Almasry Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250901211214.1027927-11-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 9 +-------- include/net/netdev_queues.h | 2 ++ include/net/page_pool/helpers.h | 12 ++++++++++++ net/core/netdev_rx_queue.c | 9 +++++++++ 4 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3970d0ddbcdc..714cce595692 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -780,13 +780,6 @@ static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq) bitmap_free(rq->mpwqe.shampo->bitmap); } -static bool mlx5_rq_needs_separate_hd_pool(struct mlx5e_rq *rq) -{ - struct netdev_rx_queue *rxq = __netif_get_rx_queue(rq->netdev, rq->ix); - - return !!rxq->mp_params.mp_ops; -} - static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_rq_param *rqp, @@ -825,7 +818,7 @@ static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, hd_pool_size = (rq->mpwqe.shampo->hd_per_wqe * wq_size) / MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; - if (mlx5_rq_needs_separate_hd_pool(rq)) { + if (netif_rxq_has_unreadable_mp(rq->netdev, rq->ix)) { /* Separate page pool for shampo headers */ struct page_pool_params pp_params = { }; diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index b9d02bc65c97..cd00e0406cf4 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -151,6 +151,8 @@ struct netdev_queue_mgmt_ops { int idx); }; +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); + /** * DOC: Lockless queue stopping / waking helpers. * diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index aa3719f28216..3247026e096a 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -505,6 +505,18 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) page_pool_update_nid(pool, new_nid); } +/** + * page_pool_is_unreadable() - will allocated buffers be unreadable for the CPU + * @pool: queried page pool + * + * Check if page pool will return buffers which are unreadable to the CPU / + * kernel. This will only be the case if user space bound a memory provider (mp) + * which returns unreadable memory to the queue served by the page pool. + * If %PP_FLAG_ALLOW_UNREADABLE_NETMEM was set but there is no mp bound + * this helper will return false. See also netif_rxq_has_unreadable_mp(). + * + * Return: true if memory allocated by the page pool may be unreadable + */ static inline bool page_pool_is_unreadable(struct page_pool *pool) { return !!pool->mp_ops; diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 3bf1151d8061..c7d9341b7630 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -9,6 +9,15 @@ #include "page_pool_priv.h" +/* See also page_pool_is_unreadable() */ +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); + + return !!rxq->mp_params.mp_ops; +} +EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); + int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); -- cgit v1.2.3 From 34837c444cd42236b2b43ce871f30d83776a3431 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Sun, 24 Aug 2025 20:07:34 +0200 Subject: media: uapi: v4l2-controls: Cleanup codec definitions Move some fields closer to where they are used, add missing tabs and remove an extra newline. Signed-off-by: Paul Kocialkowski Reviewed-by: Nicolas Dufresne Signed-off-by: Nicolas Dufresne Signed-off-by: Hans Verkuil --- include/uapi/linux/v4l2-controls.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 7aef88465d04..2d30107e047e 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1537,15 +1537,6 @@ struct v4l2_ctrl_h264_pred_weights { struct v4l2_h264_weight_factors weight_factors[2]; }; -#define V4L2_H264_SLICE_TYPE_P 0 -#define V4L2_H264_SLICE_TYPE_B 1 -#define V4L2_H264_SLICE_TYPE_I 2 -#define V4L2_H264_SLICE_TYPE_SP 3 -#define V4L2_H264_SLICE_TYPE_SI 4 - -#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x01 -#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x02 - #define V4L2_H264_TOP_FIELD_REF 0x1 #define V4L2_H264_BOTTOM_FIELD_REF 0x2 #define V4L2_H264_FRAME_REF 0x3 @@ -1566,8 +1557,17 @@ struct v4l2_h264_reference { * Maximum DPB size, as specified by section 'A.3.1 Level limits * common to the Baseline, Main, and Extended profiles'. */ -#define V4L2_H264_NUM_DPB_ENTRIES 16 -#define V4L2_H264_REF_LIST_LEN (2 * V4L2_H264_NUM_DPB_ENTRIES) +#define V4L2_H264_NUM_DPB_ENTRIES 16 +#define V4L2_H264_REF_LIST_LEN (2 * V4L2_H264_NUM_DPB_ENTRIES) + +#define V4L2_H264_SLICE_TYPE_P 0 +#define V4L2_H264_SLICE_TYPE_B 1 +#define V4L2_H264_SLICE_TYPE_I 2 +#define V4L2_H264_SLICE_TYPE_SP 3 +#define V4L2_H264_SLICE_TYPE_SI 4 + +#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x01 +#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x02 #define V4L2_CID_STATELESS_H264_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 6) /** @@ -1707,7 +1707,6 @@ struct v4l2_ctrl_h264_decode_params { __u32 flags; }; - /* Stateless FWHT control, used by the vicodec driver */ /* Current FWHT version */ -- cgit v1.2.3 From ee63609454838ea2b108f96f74a287be72d281ee Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Fri, 25 Jul 2025 23:22:19 +1000 Subject: wifi: mac80211: support block bitmap S1G TIM encoding An S1G TIM PVB is encoded differently compared to a non-s1g TIM PVB. As the AP dictates which encoding mode it uses, here we only implement block bitmap encoding. This is the default encoding mode used by all current vendor implementations. Additionally, S1G has a maximum AID count of 8192, however we are limiting the current implementation to 1600. This has no resemblence to the standard and is purely an implementation detail. The reason for this is due to the TIM elements maximum length of 255. This allows for, at most, 25 encoded blocks for a PVB encoded with block bitmap. Support for the maximum of 8192 AIDs will require an implementation of page slicing to be added to mac80211. As a result, we perform extra validation on both the STA and AP side when receiving an AID as an S1G interface. Add support for block bitmap encoding for an S1G AP and limit the maximum AID count to 1600 for the current mac80211 implementations. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250725132221.258217-2-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +- net/mac80211/cfg.c | 10 ++- net/mac80211/ieee80211_i.h | 8 +++ net/mac80211/mlme.c | 19 ++--- net/mac80211/tx.c | 166 +++++++++++++++++++++++++++++++++---------- 5 files changed, 156 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d1a14f2892d9..a4bc0c2729f6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2283,7 +2283,8 @@ enum nl80211_commands { * @NL80211_ATTR_PEER_AID: Association ID for the peer TDLS station (u16). * This is similar to @NL80211_ATTR_STA_AID but with a difference of being * allowed to be used with the first @NL80211_CMD_SET_STATION command to - * update a TDLS peer STA entry. + * update a TDLS peer STA entry. For S1G interfaces, this is limited to + * 1600 for the current mac80211 implementation. * * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information. * diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 2ed07fa121ab..4603350989c8 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2171,10 +2171,16 @@ static int sta_apply_parameters(struct ieee80211_local *local, /* * cfg80211 validates this (1-2007) and allows setting the AID - * only when creating a new station entry + * only when creating a new station entry. For S1G APs, the current + * implementation supports a maximum of 1600 AIDs. */ - if (params->aid) + if (params->aid) { + if (sdata->vif.cfg.s1g && + params->aid > IEEE80211_MAX_SUPPORTED_S1G_AID) + return -EINVAL; + sta->sta.aid = params->aid; + } /* * Some of the following updates would be racy if called on an diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8afa2404eaa8..07f5fb11569b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -86,6 +86,14 @@ extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS]; #define IEEE80211_MAX_NAN_INSTANCE_ID 255 +/* + * Current mac80211 implementation supports a maximum of 1600 AIDS + * for S1G interfaces. With regards to an S1G TIM, this covers 25 blocks + * as each block is 64 AIDs. + */ +#define IEEE80211_MAX_SUPPORTED_S1G_AID 1600 +#define IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS 25 + enum ieee80211_status_data { IEEE80211_STATUS_TYPE_MASK = 0x00f, IEEE80211_STATUS_TYPE_INVALID = 0, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 353e89973d1e..f5d09ded9827 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6351,6 +6351,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, }; u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; + u16 max_aid = IEEE80211_MAX_AID; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -6377,10 +6378,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); - if (assoc_data->s1g) + if (assoc_data->s1g) { elem_start = mgmt->u.s1g_assoc_resp.variable; - else + max_aid = IEEE80211_MAX_SUPPORTED_S1G_AID; + } else { elem_start = mgmt->u.assoc_resp.variable; + } /* * Note: this may not be perfect, AP might misbehave - if @@ -6404,16 +6407,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (elems->aid_resp) aid = le16_to_cpu(elems->aid_resp->aid); - else if (assoc_data->s1g) - aid = 0; /* TODO */ else aid = le16_to_cpu(mgmt->u.assoc_resp.aid); /* - * The 5 MSB of the AID field are reserved - * (802.11-2016 9.4.1.8 AID field) + * The 5 MSB of the AID field are reserved for a non-S1G STA. For + * an S1G STA the 3 MSBs are reserved. + * (802.11-2016 9.4.1.8 AID field). */ - aid &= 0x7ff; + aid &= assoc_data->s1g ? 0x1fff : 0x7ff; sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", @@ -6450,7 +6452,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (aid == 0 || aid > IEEE80211_MAX_AID) { + if (aid == 0 || aid > max_aid) { sdata_info(sdata, "invalid AID value %d (out of range), turn off PS\n", aid); @@ -6488,6 +6490,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, } sdata->vif.cfg.aid = aid; + sdata->vif.cfg.s1g = assoc_data->s1g; if (!ieee80211_assoc_success(sdata, mgmt, elems, elem_start, elem_len)) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 00671ae45b2f..0ece8d89e094 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4882,15 +4882,114 @@ void ieee80211_tx_pending(struct tasklet_struct *t) /* functions for drivers to get certain frames */ +static void ieee80211_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int i, n1 = 0, n2; + + /* + * Find largest even number N1 so that bits numbered 1 through + * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits + * (N2 + 1) x 8 through 2007 are 0. + */ + for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { + if (ps->tim[i]) { + n1 = i & 0xfe; + break; + } + } + n2 = n1; + for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { + if (ps->tim[i]) { + n2 = i; + break; + } + } + + /* Bitmap control */ + skb_put_u8(skb, n1 | mcast_traffic); + /* Part Virt Bitmap */ + skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); +} + +/* + * mac80211 currently supports encoding using block bitmap mode, non + * inversed. The current implementation supports up to 1600 AIDs. + * + * Block bitmap encoding breaks down the AID bitmap into blocks of 64 + * AIDs. Each block contains between 0 and 8 subblocks. Each subblock + * describes 8 AIDs and the presence of a subblock is determined by + * the block bitmap. + */ +static void ieee80211_s1g_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int blk; + + /* + * Emit a bitmap control block with a page slice number of 31 and a + * page index of 0 which indicates as per IEEE80211-2024 9.4.2.5.1 + * that the entire page (2048 bits) indicated by the page index + * is encoded in the partial virtual bitmap. + */ + skb_put_u8(skb, mcast_traffic | (31 << 1)); + + /* Emit an encoded block for each non-zero sub-block */ + for (blk = 0; blk < IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS; blk++) { + u8 blk_bmap = 0; + int sblk; + + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + /* + * If the current subblock is non-zero, increase the + * number of subblocks to emit for the current block. + */ + if (ps->tim[sblk_idx]) + blk_bmap |= BIT(sblk); + } + + /* If the current block contains no non-zero sublocks */ + if (!blk_bmap) + continue; + + /* + * Emit a block control byte for the current encoded block + * with an encoding mode of block bitmap (0x0), not inverse + * (0x0) and the current block offset (5 bits) + */ + skb_put_u8(skb, blk << 3); + + /* + * Emit the block bitmap for the current encoded block which + * contains the present subblocks. + */ + skb_put_u8(skb, blk_bmap); + + /* Emit the present subblocks */ + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + if (!(blk_bmap & BIT(sblk))) + continue; + + skb_put_u8(skb, ps->tim[sblk_idx]); + } + } +} + static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ps_data *ps, struct sk_buff *skb, bool is_template) { - u8 *pos, *tim; - int aid0 = 0; - int i, have_bits = 0, n1, n2; + struct element *tim; + bool mcast_traffic = false, have_bits = false; struct ieee80211_bss_conf *link_conf = link->conf; + bool s1g = ieee80211_get_link_sband(link)->band == NL80211_BAND_S1GHZ; /* Generate bitmap for TIM only if there are any STAs in power save * mode. */ @@ -4898,7 +4997,8 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, /* in the hope that this is faster than * checking byte-for-byte */ have_bits = !bitmap_empty((unsigned long *)ps->tim, - IEEE80211_MAX_AID+1); + IEEE80211_MAX_AID + 1); + if (!is_template) { if (ps->dtim_count == 0) ps->dtim_count = link_conf->dtim_period - 1; @@ -4906,51 +5006,39 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, ps->dtim_count--; } - tim = pos = skb_put(skb, 5); - *pos++ = WLAN_EID_TIM; - *pos++ = 3; - *pos++ = ps->dtim_count; - *pos++ = link_conf->dtim_period; + /* Length is set after parsing the AID bitmap */ + tim = skb_put(skb, sizeof(struct element)); + tim->id = WLAN_EID_TIM; + skb_put_u8(skb, ps->dtim_count); + skb_put_u8(skb, link_conf->dtim_period); if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf)) - aid0 = 1; + mcast_traffic = true; - ps->dtim_bc_mc = aid0 == 1; + ps->dtim_bc_mc = mcast_traffic; if (have_bits) { - /* Find largest even number N1 so that bits numbered 1 through - * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits - * (N2 + 1) x 8 through 2007 are 0. */ - n1 = 0; - for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { - if (ps->tim[i]) { - n1 = i & 0xfe; - break; - } - } - n2 = n1; - for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { - if (ps->tim[i]) { - n2 = i; - break; - } - } - - /* Bitmap control */ - *pos++ = n1 | aid0; - /* Part Virt Bitmap */ - skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); - - tim[1] = n2 - n1 + 4; + if (s1g) + ieee80211_s1g_beacon_add_tim_pvb(ps, skb, + mcast_traffic); + else + ieee80211_beacon_add_tim_pvb(ps, skb, mcast_traffic); } else { - *pos++ = aid0; /* Bitmap control */ - - if (ieee80211_get_link_sband(link)->band != NL80211_BAND_S1GHZ) { - tim[1] = 4; + /* + * If there is no buffered unicast traffic for an S1G + * interface, we can exclude the bitmap control. This is in + * contrast to other phy types as they do include the bitmap + * control and pvb even when there is no buffered traffic. + */ + if (!s1g) { + /* Bitmap control */ + skb_put_u8(skb, mcast_traffic); /* Part Virt Bitmap */ skb_put_u8(skb, 0); } } + + tim->datalen = skb_tail_pointer(skb) - tim->data; } static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From e0c47c6229c25b54440fe1f84a0ff533942290b1 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Fri, 25 Jul 2025 23:22:20 +1000 Subject: wifi: mac80211: support parsing S1G TIM PVB An S1G TIM PVB has 3 mandatory encoding modes, that being block bitmap, single AID and OBL alongside the ability for each encoding mode to be inverted. Introduce the ability to parse the 3 encoding formats. The implementation specification for the encoding formats can be found in IEEE80211-2024 9.4.2.5. Signed-off-by: Arien Judge Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250725132221.258217-3-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/carl9170/rx.c | 2 +- drivers/net/wireless/intersil/p54/txrx.c | 2 +- drivers/net/wireless/ralink/rt2x00/rt2x00dev.c | 2 +- drivers/net/wireless/realtek/rtlwifi/ps.c | 2 +- include/linux/ieee80211.h | 265 ++++++++++++++++++++++++- net/mac80211/mesh_ps.c | 2 +- net/mac80211/mlme.c | 3 +- 7 files changed, 263 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/carl9170/rx.c b/drivers/net/wireless/ath/carl9170/rx.c index 908c4c8b7f82..6833430130f4 100644 --- a/drivers/net/wireless/ath/carl9170/rx.c +++ b/drivers/net/wireless/ath/carl9170/rx.c @@ -555,7 +555,7 @@ static void carl9170_ps_beacon(struct ar9170 *ar, void *data, unsigned int len) /* Check whenever the PHY can be turned off again. */ /* 1. What about buffered unicast traffic for our AID? */ - cam = ieee80211_check_tim(tim_ie, tim_len, ar->common.curaid); + cam = ieee80211_check_tim(tim_ie, tim_len, ar->common.curaid, false); /* 2. Maybe the AP wants to send multicast/broadcast data? */ cam |= !!(tim_ie->bitmap_ctrl & 0x01); diff --git a/drivers/net/wireless/intersil/p54/txrx.c b/drivers/net/wireless/intersil/p54/txrx.c index 2deb1bb54f24..1294a1d6528e 100644 --- a/drivers/net/wireless/intersil/p54/txrx.c +++ b/drivers/net/wireless/intersil/p54/txrx.c @@ -317,7 +317,7 @@ static void p54_pspoll_workaround(struct p54_common *priv, struct sk_buff *skb) tim_len = tim[1]; tim_ie = (struct ieee80211_tim_ie *) &tim[2]; - new_psm = ieee80211_check_tim(tim_ie, tim_len, priv->aid); + new_psm = ieee80211_check_tim(tim_ie, tim_len, priv->aid, false); if (new_psm != priv->powersave_override) { priv->powersave_override = new_psm; p54_set_ps(priv); diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c index 7db29e90eb4f..f8a6f9c968a1 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c @@ -679,7 +679,7 @@ static void rt2x00lib_rxdone_check_ps(struct rt2x00_dev *rt2x00dev, /* Check whenever the PHY can be turned off again. */ /* 1. What about buffered unicast traffic for our AID? */ - cam = ieee80211_check_tim(tim_ie, tim_len, rt2x00dev->aid); + cam = ieee80211_check_tim(tim_ie, tim_len, rt2x00dev->aid, false); /* 2. Maybe the AP wants to send multicast/broadcast data? */ cam |= (tim_ie->bitmap_ctrl & 0x01); diff --git a/drivers/net/wireless/realtek/rtlwifi/ps.c b/drivers/net/wireless/realtek/rtlwifi/ps.c index 6241e4fed4f6..bcab12c3b4c1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/ps.c +++ b/drivers/net/wireless/realtek/rtlwifi/ps.c @@ -519,7 +519,7 @@ void rtl_swlps_beacon(struct ieee80211_hw *hw, void *data, unsigned int len) /* 1. What about buffered unicast traffic for our AID? */ u_buffed = ieee80211_check_tim(tim_ie, tim_len, - rtlpriv->mac80211.assoc_id); + rtlpriv->mac80211.assoc_id, false); /* 2. Maybe the AP wants to send multicast/broadcast data? */ m_buffed = tim_ie->bitmap_ctrl & 0x01; diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e5a2096e022e..d350263f23f3 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -220,6 +220,12 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_AID_S1G 8191 #define IEEE80211_MAX_TIM_LEN 251 #define IEEE80211_MAX_MESH_PEERINGS 63 + +/* S1G encoding types */ +#define IEEE80211_S1G_TIM_ENC_MODE_BLOCK 0 +#define IEEE80211_S1G_TIM_ENC_MODE_SINGLE 1 +#define IEEE80211_S1G_TIM_ENC_MODE_OLB 2 + /* Maximum size for the MA-UNITDATA primitive, 802.11 standard section 6.2.1.1.2. @@ -4757,15 +4763,8 @@ static inline unsigned long ieee80211_tu_to_usec(unsigned long tu) return 1024 * tu; } -/** - * ieee80211_check_tim - check if AID bit is set in TIM - * @tim: the TIM IE - * @tim_len: length of the TIM IE - * @aid: the AID to look for - * Return: whether or not traffic is indicated in the TIM for the given AID - */ -static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, - u8 tim_len, u16 aid) +static inline bool __ieee80211_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid) { u8 mask; u8 index, indexn1, indexn2; @@ -4788,6 +4787,254 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, return !!(tim->virtual_map[index] & mask); } +struct s1g_tim_aid { + u16 aid; + u8 target_blk; /* Target block index */ + u8 target_subblk; /* Target subblock index */ + u8 target_subblk_bit; /* Target subblock bit */ +}; + +struct s1g_tim_enc_block { + u8 enc_mode; + bool inverse; + const u8 *ptr; + u8 len; + + /* + * For an OLB encoded block that spans multiple blocks, this + * is the offset into the span described by that encoded block. + */ + u8 olb_blk_offset; +}; + +/* + * Helper routines to quickly extract the length of an encoded block. Validation + * is also performed to ensure the length extracted lies within the TIM. + */ + +static inline int ieee80211_s1g_len_bitmap(const u8 *ptr, const u8 *end) +{ + u8 blkmap; + u8 n_subblks; + + if (ptr >= end) + return -EINVAL; + + blkmap = *ptr; + n_subblks = hweight8(blkmap); + + if (ptr + 1 + n_subblks > end) + return -EINVAL; + + return 1 + n_subblks; +} + +static inline int ieee80211_s1g_len_single(const u8 *ptr, const u8 *end) +{ + return (ptr + 1 > end) ? -EINVAL : 1; +} + +static inline int ieee80211_s1g_len_olb(const u8 *ptr, const u8 *end) +{ + if (ptr >= end) + return -EINVAL; + + return (ptr + 1 + *ptr > end) ? -EINVAL : 1 + *ptr; +} + +/* + * Enumerate all encoded blocks until we find the encoded block that describes + * our target AID. OLB is a special case as a single encoded block can describe + * multiple blocks as a single encoded block. + */ +static inline int ieee80211_s1g_find_target_block(struct s1g_tim_enc_block *enc, + const struct s1g_tim_aid *aid, + const u8 *ptr, const u8 *end) +{ + /* need at least block-control octet */ + while (ptr + 1 <= end) { + u8 ctrl = *ptr++; + u8 mode = ctrl & 0x03; + bool contains, inverse = ctrl & BIT(2); + u8 span, blk_off = ctrl >> 3; + int len; + + switch (mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: + len = ieee80211_s1g_len_bitmap(ptr, end); + contains = blk_off == aid->target_blk; + break; + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: + len = ieee80211_s1g_len_single(ptr, end); + contains = blk_off == aid->target_blk; + break; + case IEEE80211_S1G_TIM_ENC_MODE_OLB: + len = ieee80211_s1g_len_olb(ptr, end); + /* + * An OLB encoded block can describe more then one + * block, meaning an encoded OLB block can span more + * then a single block. + */ + if (len > 0) { + /* Minus one for the length octet */ + span = DIV_ROUND_UP(len - 1, 8); + /* + * Check if our target block lies within the + * block span described by this encoded block. + */ + contains = (aid->target_blk >= blk_off) && + (aid->target_blk < blk_off + span); + } + break; + default: + return -EOPNOTSUPP; + } + + if (len < 0) + return len; + + if (contains) { + enc->enc_mode = mode; + enc->inverse = inverse; + enc->ptr = ptr; + enc->len = (u8)len; + enc->olb_blk_offset = blk_off; + return 0; + } + + ptr += len; + } + + return -ENOENT; +} + +static inline bool ieee80211_s1g_parse_bitmap(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + const u8 *ptr = enc->ptr; + u8 blkmap = *ptr++; + + /* + * If our block bitmap does not contain a set bit that corresponds + * to our AID, it could mean a variety of things depending on if + * the encoding mode is inverted or not. + * + * 1. If inverted, it means the entire subblock is present and hence + * our AID has been set. + * 2. If not inverted, it means our subblock is not present and hence + * it is all zero meaning our AID is not set. + */ + if (!(blkmap & BIT(aid->target_subblk))) + return enc->inverse; + + /* + * Increment ptr by the number of set subblocks that appear before our + * target subblock. If our target subblock is 0, do nothing as ptr + * already points to our target subblock. + */ + if (aid->target_subblk) + ptr += hweight8(blkmap & GENMASK(aid->target_subblk - 1, 0)); + + return !!(*ptr & BIT(aid->target_subblk_bit)) ^ enc->inverse; +} + +static inline bool ieee80211_s1g_parse_single(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + /* + * Single AID mode describes, as the name suggests, a single AID + * within the block described by the encoded block. The octet + * contains the 6 LSBs of the AID described in the block. The other + * 2 bits are reserved. When inversed, every single AID described + * by the current block have buffered traffic except for the AID + * described in the single AID octet. + */ + return ((*enc->ptr & 0x3f) == (aid->aid & 0x3f)) ^ enc->inverse; +} + +static inline bool ieee80211_s1g_parse_olb(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + const u8 *ptr = enc->ptr; + u8 blk_len = *ptr++; + /* + * Given an OLB encoded block that describes multiple blocks, + * calculate the offset into the span. Then calculate the + * subblock location normally. + */ + u16 span_offset = aid->target_blk - enc->olb_blk_offset; + u16 subblk_idx = span_offset * 8 + aid->target_subblk; + + if (subblk_idx >= blk_len) + return enc->inverse; + + return !!(ptr[subblk_idx] & BIT(aid->target_subblk_bit)) ^ enc->inverse; +} + +/* + * An S1G PVB has 3 non optional encoding types, each that can be inverted. + * An S1G PVB is constructed with zero or more encoded block subfields. Each + * encoded block represents a single "block" of AIDs (64), and each encoded + * block can contain one of the 3 encoding types alongside a single bit for + * whether the bits should be inverted. + * + * As the standard makes no guarantee about the ordering of encoded blocks, + * we must parse every encoded block in the worst case scenario given an + * AID that lies within the last block. + */ +static inline bool ieee80211_s1g_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid) +{ + int err; + struct s1g_tim_aid target_aid; + struct s1g_tim_enc_block enc_blk; + + if (tim_len < 3) + return false; + + target_aid.aid = aid; + target_aid.target_blk = (aid >> 6) & 0x1f; + target_aid.target_subblk = (aid >> 3) & 0x7; + target_aid.target_subblk_bit = aid & 0x7; + + /* + * Find our AIDs target encoded block and fill &enc_blk with the + * encoded blocks information. If no entry is found or an error + * occurs return false. + */ + err = ieee80211_s1g_find_target_block(&enc_blk, &target_aid, + tim->virtual_map, + (const u8 *)tim + tim_len + 2); + if (err) + return false; + + switch (enc_blk.enc_mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: + return ieee80211_s1g_parse_bitmap(&enc_blk, &target_aid); + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: + return ieee80211_s1g_parse_single(&enc_blk, &target_aid); + case IEEE80211_S1G_TIM_ENC_MODE_OLB: + return ieee80211_s1g_parse_olb(&enc_blk, &target_aid); + default: + return false; + } +} + +/** + * ieee80211_check_tim - check if AID bit is set in TIM + * @tim: the TIM IE + * @tim_len: length of the TIM IE + * @aid: the AID to look for + * @s1g: whether the TIM is from an S1G PPDU + * Return: whether or not traffic is indicated in the TIM for the given AID + */ +static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid, bool s1g) +{ + return s1g ? ieee80211_s1g_check_tim(tim, tim_len, aid) : + __ieee80211_check_tim(tim, tim_len, aid); +} + /** * ieee80211_get_tdls_action - get TDLS action code * @skb: the skb containing the frame, length will not be checked diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 20e022a03933..ebab1f0a0138 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -586,7 +586,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len, - sta->mesh->aid); + sta->mesh->aid, false); if (has_buffered) mps_dbg(sta->sdata, "%pM indicates buffered frames\n", diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f5d09ded9827..9568cc95a7ff 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7438,7 +7438,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, ncrc = elems->crc; if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && - ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid)) { + ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid, + vif_cfg->s1g)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; -- cgit v1.2.3 From 5f9d5fd8e08968e66d0212f782fc24d76e52800f Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Tue, 12 Aug 2025 12:53:28 +0530 Subject: wifi: cfg80211: fix return value in cfg80211_get_radio_idx_by_chan() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a valid radio index is not found, the function returns -ENOENT. If the channel argument itself is invalid, it returns -EINVAL. However, since the caller only checks for < 0, the distinction between these error codes is not utilized much. Also, handling these two distinct error codes throughout the codebase adds complexity, as both cases must be addressed separately. A subsequent change aims to simplify this by using a single error code for all invalid cases, making error handling more consistent and streamlined. To support this change, update the return value to -EINVAL when a valid radio index is not found. This is still appropriate because, even if the channel argument is structurally valid, the absence of a corresponding radio index implies that the argument is effectively invalid—otherwise, a valid index would have been found. Signed-off-by: Aditya Kumar Singh Link: https://patch.msgid.link/20250812-fix_scan_ap_flag_requirement_during_mlo-v4-1-383ffb6da213@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- net/wireless/util.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 406626ff6cc8..cb1c36be2749 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -9548,7 +9548,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, * @wiphy: the wiphy * @chan: channel for which the supported radio index is required * - * Return: radio index on success or a negative error code + * Return: radio index on success or -EINVAL otherwise */ int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, const struct ieee80211_channel *chan); diff --git a/net/wireless/util.c b/net/wireless/util.c index 240c68baa3d1..d12d49134c88 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2584,7 +2584,7 @@ int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, } } - return -ENOENT; + return -EINVAL; } EXPORT_SYMBOL(cfg80211_get_radio_idx_by_chan); -- cgit v1.2.3 From d0bf06158c39e7129524dd8b43b82aed84d68faa Mon Sep 17 00:00:00 2001 From: Muna Sinada Date: Fri, 15 Aug 2025 14:30:11 -0700 Subject: wifi: nl80211: Add EHT fixed Tx rate support Add new attributes to support EHT MCS/NSS Tx rates and EHT GI/LTF. Parse EHT fixed MCS/NSS Tx rates and EHT GI/LTF values passed by the userspace, validate and add as part of cfg80211_bitrate_mask. MCS mask is constructed by new function, eht_build_mcs_mask(). Max NSS supported for MCS rates of 7, 9, 11 and 13 is utilized to set MCS bitmask for each NSS. MCS rates 14, and 15 if supported, are set only for NSS = 0. Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Signed-off-by: Muna Sinada Link: https://patch.msgid.link/20250815213011.2704803-1-muna.sinada@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 + include/uapi/linux/nl80211.h | 41 +++++++- net/wireless/nl80211.c | 229 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 266 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cb1c36be2749..7d881aa7e48b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -841,9 +841,12 @@ struct cfg80211_bitrate_mask { u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; u16 he_mcs[NL80211_HE_NSS_MAX]; + u16 eht_mcs[NL80211_EHT_NSS_MAX]; enum nl80211_txrate_gi gi; enum nl80211_he_gi he_gi; + enum nl80211_eht_gi eht_gi; enum nl80211_he_ltf he_ltf; + enum nl80211_eht_ltf eht_ltf; } control[NUM_NL80211_BANDS]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a4bc0c2729f6..4f08264bbc8e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1943,8 +1943,9 @@ enum nl80211_commands { * The driver must also specify support for this with the extended * features NL80211_EXT_FEATURE_BEACON_RATE_LEGACY, * NL80211_EXT_FEATURE_BEACON_RATE_HT, - * NL80211_EXT_FEATURE_BEACON_RATE_VHT and - * NL80211_EXT_FEATURE_BEACON_RATE_HE. + * NL80211_EXT_FEATURE_BEACON_RATE_VHT, + * NL80211_EXT_FEATURE_BEACON_RATE_HE and + * NL80211_EXT_FEATURE_BEACON_RATE_EHT. * * @NL80211_ATTR_FRAME_MATCH: A binary attribute which typically must contain * at least one byte, currently used with @NL80211_CMD_REGISTER_FRAME. @@ -3736,6 +3737,22 @@ enum nl80211_eht_gi { NL80211_RATE_INFO_EHT_GI_3_2, }; +/** + * enum nl80211_eht_ltf - EHT long training field + * @NL80211_RATE_INFO_EHT_1XLTF: 3.2 usec + * @NL80211_RATE_INFO_EHT_2XLTF: 6.4 usec + * @NL80211_RATE_INFO_EHT_4XLTF: 12.8 usec + * @NL80211_RATE_INFO_EHT_6XLTF: 19.2 usec + * @NL80211_RATE_INFO_EHT_8XLTF: 25.6 usec + */ +enum nl80211_eht_ltf { + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_2XLTF, + NL80211_RATE_INFO_EHT_4XLTF, + NL80211_RATE_INFO_EHT_6XLTF, + NL80211_RATE_INFO_EHT_8XLTF, +}; + /** * enum nl80211_eht_ru_alloc - EHT RU allocation values * @NL80211_RATE_INFO_EHT_RU_ALLOC_26: 26-tone RU allocation @@ -5482,6 +5499,10 @@ enum nl80211_key_attributes { * see &struct nl80211_txrate_he * @NL80211_TXRATE_HE_GI: configure HE GI, 0.8us, 1.6us and 3.2us. * @NL80211_TXRATE_HE_LTF: configure HE LTF, 1XLTF, 2XLTF and 4XLTF. + * @NL80211_TXRATE_EHT: EHT rates allowed for TX rate selection, + * see &struct nl80211_txrate_eht + * @NL80211_TXRATE_EHT_GI: configure EHT GI, (u8, see &enum nl80211_eht_gi) + * @NL80211_TXRATE_EHT_LTF: configure EHT LTF, (u8, see &enum nl80211_eht_ltf) * @__NL80211_TXRATE_AFTER_LAST: internal * @NL80211_TXRATE_MAX: highest TX rate attribute */ @@ -5494,6 +5515,9 @@ enum nl80211_tx_rate_attributes { NL80211_TXRATE_HE, NL80211_TXRATE_HE_GI, NL80211_TXRATE_HE_LTF, + NL80211_TXRATE_EHT, + NL80211_TXRATE_EHT_GI, + NL80211_TXRATE_EHT_LTF, /* keep last */ __NL80211_TXRATE_AFTER_LAST, @@ -5526,6 +5550,15 @@ enum nl80211_txrate_gi { NL80211_TXRATE_FORCE_LGI, }; +#define NL80211_EHT_NSS_MAX 16 +/** + * struct nl80211_txrate_eht - EHT MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.) + */ +struct nl80211_txrate_eht { + __u16 mcs[NL80211_EHT_NSS_MAX]; +}; + /** * enum nl80211_band - Frequency band * @NL80211_BAND_2GHZ: 2.4 GHz ISM band @@ -6650,6 +6683,9 @@ enum nl80211_feature_flags { * (signaling and payload protected) A-MSDUs and this shall be advertised * in the RSNXE. * + * @NL80211_EXT_FEATURE_BEACON_RATE_EHT: Driver supports beacon rate + * configuration (AP/mesh) with EHT rates. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6725,6 +6761,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_OWE_OFFLOAD_AP, NL80211_EXT_FEATURE_DFS_CONCURRENT, NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT, + NL80211_EXT_FEATURE_BEACON_RATE_EHT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893..364d83fb9992 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -411,6 +411,14 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8, NL80211_RATE_INFO_HE_1XLTF, NL80211_RATE_INFO_HE_4XLTF), + [NL80211_TXRATE_EHT] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_eht)), + [NL80211_TXRATE_EHT_GI] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_3_2), + [NL80211_TXRATE_EHT_LTF] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_8XLTF), + }; static const struct nla_policy @@ -5393,6 +5401,164 @@ static bool he_set_mcs_mask(struct genl_info *info, return true; } +static void eht_build_mcs_mask(struct genl_info *info, + const struct ieee80211_sta_eht_cap *eht_cap, + u8 mcs_nss_len, u16 *mcs_mask) +{ + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u8 nss, mcs_7 = 0, mcs_9 = 0, mcs_11 = 0, mcs_13 = 0; + unsigned int link_id = nl80211_link_id(info->attrs); + + if (mcs_nss_len == 4) { + const struct ieee80211_eht_mcs_nss_supp_20mhz_only *mcs = + &eht_cap->eht_mcs_nss_supp.only_20mhz; + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs7_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + + } else { + const struct ieee80211_eht_mcs_nss_supp_bw *mcs; + enum nl80211_chan_width width; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + width = wdev->u.ibss.chandef.width; + break; + case NL80211_IFTYPE_MESH_POINT: + width = wdev->u.mesh.chandef.width; + break; + case NL80211_IFTYPE_OCB: + width = wdev->u.ocb.chandef.width; + break; + default: + if (wdev->valid_links) + width = wdev->links[link_id].ap.chandef.width; + else + width = wdev->u.ap.preset_chandef.width; + break; + } + + switch (width) { + case NL80211_CHAN_WIDTH_320: + mcs = &eht_cap->eht_mcs_nss_supp.bw._320; + break; + case NL80211_CHAN_WIDTH_160: + mcs = &eht_cap->eht_mcs_nss_supp.bw._160; + break; + default: + mcs = &eht_cap->eht_mcs_nss_supp.bw._80; + break; + } + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + } + + /* Enable MCS 14 for NSS 0 */ + if (eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP) + mcs_mask[0] |= 0x4000; + + /* Enable MCS 15 for NSS 0 */ + mcs_mask[0] |= 0x8000; + + for (nss = 0; nss < NL80211_EHT_NSS_MAX; nss++) { + if (!mcs_7) + continue; + mcs_mask[nss] |= 0x00FF; + mcs_7--; + + if (!mcs_9) + continue; + mcs_mask[nss] |= 0x0300; + mcs_9--; + + if (!mcs_11) + continue; + mcs_mask[nss] |= 0x0C00; + mcs_11--; + + if (!mcs_13) + continue; + mcs_mask[nss] |= 0x3000; + mcs_13--; + } +} + +static bool eht_set_mcs_mask(struct genl_info *info, struct wireless_dev *wdev, + struct ieee80211_supported_band *sband, + struct nl80211_txrate_eht *txrate, + u16 mcs[NL80211_EHT_NSS_MAX]) +{ + const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u16 tx_mcs_mask[NL80211_EHT_NSS_MAX] = { 0 }; + u8 i, mcs_nss_len; + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + return false; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + return false; + + /* Checks for MCS 14 */ + if (txrate->mcs[0] & 0x4000) { + if (sband->band != NL80211_BAND_6GHZ) + return false; + + if (!(eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP)) + return false; + } + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + if (mcs_nss_len == 3) { + /* Supported iftypes for setting non-20 MHZ only EHT MCS */ + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: + break; + default: + return false; + } + } + + /* Build eht_mcs_mask from EHT and HE capabilities */ + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, tx_mcs_mask); + + memset(mcs, 0, sizeof(u16) * NL80211_EHT_NSS_MAX); + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, @@ -5413,6 +5579,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, /* Default to all rates enabled */ for (i = 0; i < NUM_NL80211_BANDS; i++) { const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u8 mcs_nss_len; if (!default_all_enabled) break; @@ -5441,6 +5609,21 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[i].he_gi = 0xFF; mask->control[i].he_ltf = 0xFF; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + continue; + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, + mask->control[i].eht_mcs); + + mask->control[i].eht_gi = 0xFF; + mask->control[i].eht_ltf = 0xFF; } /* if no rates are given set it back to the defaults */ @@ -5512,13 +5695,27 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].he_ltf = nla_get_u8(tb[NL80211_TXRATE_HE_LTF]); + if (tb[NL80211_TXRATE_EHT] && + !eht_set_mcs_mask(info, wdev, sband, + nla_data(tb[NL80211_TXRATE_EHT]), + mask->control[band].eht_mcs)) + return -EINVAL; + + if (tb[NL80211_TXRATE_EHT_GI]) + mask->control[band].eht_gi = + nla_get_u8(tb[NL80211_TXRATE_EHT_GI]); + if (tb[NL80211_TXRATE_EHT_LTF]) + mask->control[band].eht_ltf = + nla_get_u8(tb[NL80211_TXRATE_EHT_LTF]); + if (mask->control[band].legacy == 0) { - /* don't allow empty legacy rates if HT, VHT or HE + /* don't allow empty legacy rates if HT, VHT, HE or EHT * are not even supported. */ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || rdev->wiphy.bands[band]->vht_cap.vht_supported || - ieee80211_get_he_iftype_cap(sband, wdev->iftype))) + ieee80211_get_he_iftype_cap(sband, wdev->iftype) || + ieee80211_get_eht_iftype_cap(sband, wdev->iftype))) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) @@ -5533,6 +5730,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].he_mcs[i]) goto out; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) + if (mask->control[band].eht_mcs[i]) + goto out; + /* legacy and mcs rates may not be both empty */ return -EINVAL; } @@ -5546,7 +5747,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, enum nl80211_band band, struct cfg80211_bitrate_mask *beacon_rate) { - u32 count_ht, count_vht, count_he, i; + u32 count_ht, count_vht, count_he, count_eht, i; u32 rate = beacon_rate->control[band].legacy; /* Allow only one rate */ @@ -5592,8 +5793,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return -EINVAL; } - if ((count_ht && count_vht && count_he) || - (!rate && !count_ht && !count_vht && !count_he)) + count_eht = 0; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if (hweight16(beacon_rate->control[band].eht_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].eht_mcs[i]) { + count_eht++; + if (count_eht > 1) + return -EINVAL; + } + if (count_eht && rate) + return -EINVAL; + } + + if ((count_ht && count_vht && count_he && count_eht) || + (!rate && !count_ht && !count_vht && !count_he && !count_eht)) return -EINVAL; if (rate && @@ -5613,6 +5827,11 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_BEACON_RATE_HE)) return -EINVAL; + if (count_eht && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_EHT)) + return -EINVAL; + return 0; } -- cgit v1.2.3 From 24185534915b5d926ded098336f47bdcca333aec Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 17 Aug 2025 21:04:32 +0200 Subject: wifi: nl80211: allow drivers to support subset of NL80211_CMD_SET_BSS The so-called fullmac devices rely on firmware functionality and/or API to change BSS parameters. Today there are limited drivers supporting the nl80211 primitive, but they only handle a subset of the bss parameters passed if any. The mac80211 driver does handle all parameters and stores their configured values. Some of the BSS parameters were already conditional by wiphy->features. For these the wiphy->bss_param_support and wiphy->features fields are silently aligned in wiphy_register(). Maybe better to issue a warning instead when they are misaligned. Signed-off-by: Arend van Spriel Link: https://patch.msgid.link/20250817190435.1495094-2-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 29 +++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/core.c | 9 +++++++++ net/wireless/nl80211.c | 39 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 79 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7d881aa7e48b..4072a67c9cc9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2459,6 +2459,29 @@ struct mpath_info { int generation; }; +/** + * enum wiphy_bss_param_flags - bit positions for supported bss parameters. + * + * @WIPHY_BSS_PARAM_CTS_PROT: support changing CTS protection. + * @WIPHY_BSS_PARAM_SHORT_PREAMBLE: support changing short preamble usage. + * @WIPHY_BSS_PARAM_SHORT_SLOT_TIME: support changing short slot time usage. + * @WIPHY_BSS_PARAM_BASIC_RATES: support reconfiguring basic rates. + * @WIPHY_BSS_PARAM_AP_ISOLATE: support changing AP isolation. + * @WIPHY_BSS_PARAM_HT_OPMODE: support changing HT operating mode. + * @WIPHY_BSS_PARAM_P2P_CTWINDOW: support reconfiguring ctwindow. + * @WIPHY_BSS_PARAM_P2P_OPPPS: support changing P2P opportunistic power-save. + */ +enum wiphy_bss_param_flags { + WIPHY_BSS_PARAM_CTS_PROT = BIT(0), + WIPHY_BSS_PARAM_SHORT_PREAMBLE = BIT(1), + WIPHY_BSS_PARAM_SHORT_SLOT_TIME = BIT(2), + WIPHY_BSS_PARAM_BASIC_RATES = BIT(3), + WIPHY_BSS_PARAM_AP_ISOLATE = BIT(4), + WIPHY_BSS_PARAM_HT_OPMODE = BIT(5), + WIPHY_BSS_PARAM_P2P_CTWINDOW = BIT(6), + WIPHY_BSS_PARAM_P2P_OPPPS = BIT(7), +}; + /** * struct bss_parameters - BSS parameters * @@ -5785,6 +5808,11 @@ struct wiphy_radio { * and probe responses. This value should be set if the driver * wishes to limit the number of csa counters. Default (0) means * infinite. + * @bss_param_support: bitmask indicating which bss_parameters as defined in + * &struct bss_parameters the driver can actually handle in the + * .change_bss() callback. The bit positions are defined in &enum + * wiphy_bss_param_flags. + * * @bss_select_support: bitmask indicating the BSS selection criteria supported * by the driver in the .connect() callback. The bit position maps to the * attribute indices defined in &enum nl80211_bss_select_attr. @@ -5970,6 +5998,7 @@ struct wiphy { u8 max_num_csa_counters; + u32 bss_param_support; u32 bss_select_support; u8 nan_supported_bands; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4f08264bbc8e..6c07100fc01f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2930,6 +2930,9 @@ enum nl80211_commands { * required alongside this attribute. Refer to * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. * + * @NL80211_ATTR_BSS_PARAM: nested attribute used with %NL80211_CMD_GET_WIPHY + * which indicates which BSS parameters can be modified. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3491,6 +3494,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_LONG_BEACON_PERIOD, NL80211_ATTR_S1G_SHORT_BEACON, + NL80211_ATTR_BSS_PARAM, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/core.c b/net/wireless/core.c index a7e2931ffb2e..797f9f2004a6 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1018,6 +1018,15 @@ int wiphy_register(struct wiphy *wiphy) rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_CTWIN; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_CTWINDOW; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_OPPPS; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_OPPPS; + rtnl_lock(); wiphy_lock(&rdev->wiphy); res = device_add(&rdev->wiphy.dev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 364d83fb9992..153644a04072 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3027,6 +3027,40 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.ext_features)) goto nla_put_failure; + if (rdev->wiphy.bss_param_support) { + struct nlattr *nested; + u32 parsup = rdev->wiphy.bss_param_support; + + nested = nla_nest_start(msg, NL80211_ATTR_BSS_PARAM); + if (!nested) + goto nla_put_failure; + + if ((parsup & WIPHY_BSS_PARAM_CTS_PROT) && + nla_put_flag(msg, NL80211_ATTR_BSS_CTS_PROT)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_PREAMBLE) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_PREAMBLE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_SLOT_TIME) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_SLOT_TIME)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_BASIC_RATES) && + nla_put_flag(msg, NL80211_ATTR_BSS_BASIC_RATES)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_AP_ISOLATE) && + nla_put_flag(msg, NL80211_ATTR_AP_ISOLATE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_HT_OPMODE) && + nla_put_flag(msg, NL80211_ATTR_BSS_HT_OPMODE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_CTWINDOW) && + nla_put_flag(msg, NL80211_ATTR_P2P_CTWINDOW)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_OPPPS) && + nla_put_flag(msg, NL80211_ATTR_P2P_OPPPS)) + goto nla_put_failure; + nla_nest_end(msg, nested); + } if (rdev->wiphy.bss_select_support) { struct nlattr *nested; u32 bss_select_support = rdev->wiphy.bss_select_support; @@ -9048,6 +9082,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; + u32 bss_param_support = rdev->wiphy.bss_param_support; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -9087,7 +9122,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); if (params.p2p_ctwindow != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; } @@ -9099,7 +9134,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } -- cgit v1.2.3 From 4f652a390db4246c5d3c51bf25d03ed0e4178fdc Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 17 Aug 2025 21:04:34 +0200 Subject: wifi: nl80211: strict checking attributes for NL80211_CMD_SET_BSS Assure user-space only modifies attributes for NL80211_CMD_SET_BSS that are supported by the driver. This stricter checking is only done when user-space commits to it by including NL80211_ATTR_BSS_PARAM. Signed-off-by: Arend van Spriel Link: https://patch.msgid.link/20250817190435.1495094-4-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 ++++- net/wireless/nl80211.c | 52 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6c07100fc01f..aed0b4c5d5e8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2931,7 +2931,10 @@ enum nl80211_commands { * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. * * @NL80211_ATTR_BSS_PARAM: nested attribute used with %NL80211_CMD_GET_WIPHY - * which indicates which BSS parameters can be modified. + * which indicates which BSS parameters can be modified. The attribute can + * also be used as flag attribute by user-space in %NL80211_CMD_SET_BSS to + * indicate that it wants strict checking on the BSS parameters to be + * modified. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 153644a04072..99e2aadc65f7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -879,6 +879,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2), [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), + [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -9083,6 +9084,8 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; u32 bss_param_support = rdev->wiphy.bss_param_support; + u32 changed = 0; + bool strict; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -9095,26 +9098,54 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = -1; params.p2p_opp_ps = -1; - if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) + strict = nla_get_flag(info->attrs[NL80211_ATTR_BSS_PARAM]); + if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_CTS_PROT)) + return -EINVAL; params.use_cts_prot = nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]); - if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) + changed |= WIPHY_BSS_PARAM_CTS_PROT; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_PREAMBLE)) + return -EINVAL; params.use_short_preamble = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]); - if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) + changed |= WIPHY_BSS_PARAM_SHORT_PREAMBLE; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_SLOT_TIME)) + return -EINVAL; params.use_short_slot_time = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]); + changed |= WIPHY_BSS_PARAM_SHORT_SLOT_TIME; + } if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_BASIC_RATES)) + return -EINVAL; params.basic_rates = nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); params.basic_rates_len = nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + changed |= WIPHY_BSS_PARAM_BASIC_RATES; } - if (info->attrs[NL80211_ATTR_AP_ISOLATE]) - params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); - if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) + if (info->attrs[NL80211_ATTR_AP_ISOLATE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_AP_ISOLATE)) + return -EINVAL; + params.ap_isolate = + !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); + changed |= WIPHY_BSS_PARAM_AP_ISOLATE; + } + if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_HT_OPMODE)) + return -EINVAL; params.ht_opmode = nla_get_u16(info->attrs[NL80211_ATTR_BSS_HT_OPMODE]); + changed |= WIPHY_BSS_PARAM_HT_OPMODE; + } if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) { if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) @@ -9124,6 +9155,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (params.p2p_ctwindow != 0 && !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; + changed |= WIPHY_BSS_PARAM_P2P_CTWINDOW; } if (info->attrs[NL80211_ATTR_P2P_OPPPS]) { @@ -9132,9 +9164,11 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EINVAL; tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); + if (tmp && !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) + return -EINVAL; params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) + !(rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } @@ -9145,6 +9179,10 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; + changed &= rdev->wiphy.bss_param_support; + if (!changed) + return 0; + return rdev_change_bss(rdev, dev, ¶ms); } -- cgit v1.2.3 From 7d298d25ce81251068bb4ea1d92813ec764a9fec Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 26 Aug 2025 08:17:06 +0200 Subject: vdso: Move ENABLE_COMPAT_VDSO from core to arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ENABLE_COMAPT_VDSO symbol is only used by arm64 and only for the time-related functionality. There should be no new users, so it doesn't need to be in the generic vDSO code. Move the logic into arm64 architecture-specific code and replace the explicit define by the standard '#ifdef __aarch64__'. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Catalin Marinas Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-3-d9b65750e49f@linutronix.de --- arch/arm64/include/asm/vdso/compat_barrier.h | 7 +++---- arch/arm64/include/asm/vdso/compat_gettimeofday.h | 6 +++--- arch/arm64/include/asm/vdso/gettimeofday.h | 8 ++++++++ arch/arm64/kernel/vdso32/Makefile | 1 - include/vdso/datapage.h | 4 ---- 5 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/vdso/compat_barrier.h b/arch/arm64/include/asm/vdso/compat_barrier.h index 3ac35f4a667c..6d75e03d3827 100644 --- a/arch/arm64/include/asm/vdso/compat_barrier.h +++ b/arch/arm64/include/asm/vdso/compat_barrier.h @@ -7,11 +7,10 @@ #ifndef __ASSEMBLY__ /* - * Warning: This code is meant to be used with - * ENABLE_COMPAT_VDSO only. + * Warning: This code is meant to be used from the compat vDSO only. */ -#ifndef ENABLE_COMPAT_VDSO -#error This header is meant to be used with ENABLE_COMPAT_VDSO only +#ifdef __arch64__ +#error This header is meant to be used with from the compat vDSO only #endif #ifdef dmb diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index d60ea7a72a9c..7d1a116549b1 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -2,8 +2,8 @@ /* * Copyright (C) 2018 ARM Limited */ -#ifndef __ASM_VDSO_GETTIMEOFDAY_H -#define __ASM_VDSO_GETTIMEOFDAY_H +#ifndef __ASM_VDSO_COMPAT_GETTIMEOFDAY_H +#define __ASM_VDSO_COMPAT_GETTIMEOFDAY_H #ifndef __ASSEMBLY__ @@ -163,4 +163,4 @@ static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) #endif /* !__ASSEMBLY__ */ -#endif /* __ASM_VDSO_GETTIMEOFDAY_H */ +#endif /* __ASM_VDSO_COMPAT_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index da1ab8759592..c59e84105b43 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -5,6 +5,8 @@ #ifndef __ASM_VDSO_GETTIMEOFDAY_H #define __ASM_VDSO_GETTIMEOFDAY_H +#ifdef __aarch64__ + #ifndef __ASSEMBLY__ #include @@ -96,4 +98,10 @@ static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data( #endif /* !__ASSEMBLY__ */ +#else /* !__aarch64__ */ + +#include "compat_gettimeofday.h" + +#endif /* __aarch64__ */ + #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index f2dfdc7dc818..230fdc26796a 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -59,7 +59,6 @@ VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING VDSO_CAFLAGS += -march=armv8-a VDSO_CFLAGS := $(VDSO_CAFLAGS) -VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 # KBUILD_CFLAGS from top-level Makefile VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 02533038640e..0b1982f15de4 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -196,11 +196,7 @@ enum vdso_pages { * - clock_gettime_fallback(): fallback for clock_gettime. * - clock_getres_fallback(): fallback for clock_getres. */ -#ifdef ENABLE_COMPAT_VDSO -#include -#else #include -#endif /* ENABLE_COMPAT_VDSO */ #else /* !__ASSEMBLY__ */ -- cgit v1.2.3 From 7b338f6d4e3d6baa057e3505592a86f6410d68ed Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 26 Aug 2025 08:17:12 +0200 Subject: vdso: Drop Kconfig GENERIC_VDSO_DATA_STORE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All users of the generic vDSO library also use the generic vDSO datastore. Remove the now unnecessary Kconfig symbol. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Catalin Marinas Link: https://lore.kernel.org/all/20250826-vdso-cleanups-v1-9-d9b65750e49f@linutronix.de --- arch/Kconfig | 2 +- arch/arm/mm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/loongarch/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/x86/Kconfig | 1 - include/asm-generic/vdso/vsyscall.h | 4 ---- include/vdso/datapage.h | 5 +---- lib/vdso/Kconfig | 5 ----- lib/vdso/Makefile | 2 +- lib/vdso/gettimeofday.c | 2 -- 14 files changed, 3 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index d1b4ffd6e085..f6ca7f303172 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1609,7 +1609,7 @@ config HAVE_SPARSE_SYSCALL_NR related optimizations for a given architecture. config ARCH_HAS_VDSO_ARCH_DATA - depends on GENERIC_VDSO_DATA_STORE + depends on HAVE_GENERIC_VDSO bool config ARCH_HAS_VDSO_TIME_DATA diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 2347988cf641..7b27ee9482b3 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -927,7 +927,6 @@ config VDSO select HAVE_GENERIC_VDSO select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE help Place in the process address space an ELF shared object providing fast implementations of gettimeofday and diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5c61b19ea9c8..b0f007b396c8 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -162,7 +162,6 @@ config ARM64 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select HARDIRQS_SW_RESEND select HAS_IOPORT diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f0abc38c40ac..d15b201d55f9 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -108,7 +108,6 @@ config LOONGARCH select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GPIOLIB select HAS_IOPORT diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index caf508f6e9ec..f7e6bbd755e0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -51,7 +51,6 @@ config MIPS select GENERIC_SMP_IDLE_THREAD select GENERIC_IDLE_POLL_SETUP select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GUP_GET_PXX_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT select HAS_IOPORT if !NO_IOPORT_MAP || ISA select HAVE_ARCH_COMPILER_H diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 93402a1d9c9f..78c82af95561 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -207,7 +207,6 @@ config PPC select GENERIC_PCI_IOMAP if PCI select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select HAS_IOPORT if PCI select HAVE_ARCH_AUDITSYSCALL diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e4ac0e833ecf..f6cf9180ccf2 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -121,7 +121,6 @@ config RISCV select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL if GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE if HAVE_GENERIC_VDSO select GENERIC_VDSO_TIME_NS if GENERIC_GETTIMEOFDAY select HARDIRQS_SW_RESEND select HAS_IOPORT if MMU diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf680c26a33c..696d2243d64b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -167,7 +167,6 @@ config S390 select GENERIC_GETTIMEOFDAY select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4f120070a51b..1e74b2a356e4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -181,7 +181,6 @@ config X86 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GENERIC_VDSO_OVERFLOW_PROTECT select GUP_GET_PXX_LOW_HIGH if X86_PAE diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h index 7fc0b560007d..5c6d9799f4e7 100644 --- a/include/asm-generic/vdso/vsyscall.h +++ b/include/asm-generic/vdso/vsyscall.h @@ -4,8 +4,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE - #ifndef __arch_get_vdso_u_time_data static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { @@ -20,8 +18,6 @@ static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(vo } #endif -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ - #ifndef __arch_update_vdso_clock static __always_inline void __arch_update_vdso_clock(struct vdso_clock *vc) { diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 0b1982f15de4..23c39b96190f 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -31,7 +31,7 @@ struct arch_vdso_time_data {}; #if defined(CONFIG_ARCH_HAS_VDSO_ARCH_DATA) #include -#elif defined(CONFIG_GENERIC_VDSO_DATA_STORE) +#else struct vdso_arch_data { /* Needed for the generic code, never actually used at runtime */ char __unused; @@ -164,7 +164,6 @@ struct vdso_rng_data { * With the hidden visibility, the compiler simply generates a PC-relative * relocation, and this is what we need. */ -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE extern struct vdso_time_data vdso_u_time_data __attribute__((visibility("hidden"))); extern struct vdso_rng_data vdso_u_rng_data __attribute__((visibility("hidden"))); extern struct vdso_arch_data vdso_u_arch_data __attribute__((visibility("hidden"))); @@ -185,8 +184,6 @@ enum vdso_pages { VDSO_NR_PAGES }; -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ - /* * The generic vDSO implementation requires that gettimeofday.h * provides: diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 2594dd7185be..48ffb0f6fa41 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -31,8 +31,3 @@ config VDSO_GETRANDOM bool help Selected by architectures that support vDSO getrandom(). - -config GENERIC_VDSO_DATA_STORE - bool - help - Selected by architectures that use the generic vDSO data store. diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile index aedd40aaa950..405f743253d7 100644 --- a/lib/vdso/Makefile +++ b/lib/vdso/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_GENERIC_VDSO_DATA_STORE) += datastore.o +obj-$(CONFIG_HAVE_GENERIC_VDSO) += datastore.o diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 1e2a40b8d2c6..95df0153f05a 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -108,13 +108,11 @@ bool vdso_get_timestamp(const struct vdso_time_data *vd, const struct vdso_clock return true; } -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE static __always_inline const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) { return (void *)vd + PAGE_SIZE; } -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ static __always_inline bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, -- cgit v1.2.3 From 0dcfb6fcdd085bbfcdfdcf64a7d4a75c63c108af Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 4 Sep 2025 08:19:53 +0100 Subject: dt-bindings: clock: renesas,r9a09g077/87: Add Ethernet clock IDs Add clock definitions for Ethernet (ETCLK A-E) to both R9A09G077 and R9A09G087 SoCs. These definitions are required for describing Ethernet devices in DT. Signed-off-by: Lad Prabhakar Acked-by: Conor Dooley Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20250904071954.3176806-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h | 5 +++++ include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h index 0c2ce81a8744..2a805e06487b 100644 --- a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h +++ b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h @@ -26,5 +26,10 @@ #define R9A09G077_CLK_PCLKL 14 #define R9A09G077_SDHI_CLKHS 15 #define R9A09G077_USB_CLK 16 +#define R9A09G077_ETCLKA 17 +#define R9A09G077_ETCLKB 18 +#define R9A09G077_ETCLKC 19 +#define R9A09G077_ETCLKD 20 +#define R9A09G077_ETCLKE 21 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G077_CPG_H__ */ diff --git a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h index 70ee883f2386..09da0ad33be6 100644 --- a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h +++ b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h @@ -26,5 +26,10 @@ #define R9A09G087_CLK_PCLKL 14 #define R9A09G087_SDHI_CLKHS 15 #define R9A09G087_USB_CLK 16 +#define R9A09G087_ETCLKA 17 +#define R9A09G087_ETCLKB 18 +#define R9A09G087_ETCLKC 19 +#define R9A09G087_ETCLKD 20 +#define R9A09G087_ETCLKE 21 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G087_CPG_H__ */ -- cgit v1.2.3 From 9428fff44f0c5823fde733b64a344d7a6eda3873 Mon Sep 17 00:00:00 2001 From: "hongyu.chen1" Date: Fri, 22 Aug 2025 13:39:55 +0800 Subject: dt-bindings: power: add Amlogic S6 S7 S7D power domains Add devicetree binding document and related header file for Amlogic S6 S7 S7D secure power domains. Signed-off-by: hongyu.chen1 Signed-off-by: Xianwei Zhao Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250822-pm-s6-s7-s7d-v1-1-82e3f3aff327@amlogic.com Signed-off-by: Ulf Hansson --- .../bindings/power/amlogic,meson-sec-pwrc.yaml | 3 +++ include/dt-bindings/power/amlogic,s6-pwrc.h | 29 ++++++++++++++++++++++ include/dt-bindings/power/amlogic,s7-pwrc.h | 20 +++++++++++++++ include/dt-bindings/power/amlogic,s7d-pwrc.h | 27 ++++++++++++++++++++ 4 files changed, 79 insertions(+) create mode 100644 include/dt-bindings/power/amlogic,s6-pwrc.h create mode 100644 include/dt-bindings/power/amlogic,s7-pwrc.h create mode 100644 include/dt-bindings/power/amlogic,s7d-pwrc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml b/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml index 15d74138baa3..12b71688dd34 100644 --- a/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml +++ b/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml @@ -24,6 +24,9 @@ properties: - amlogic,a5-pwrc - amlogic,c3-pwrc - amlogic,t7-pwrc + - amlogic,s6-pwrc + - amlogic,s7-pwrc + - amlogic,s7d-pwrc "#power-domain-cells": const: 1 diff --git a/include/dt-bindings/power/amlogic,s6-pwrc.h b/include/dt-bindings/power/amlogic,s6-pwrc.h new file mode 100644 index 000000000000..2c005864ae73 --- /dev/null +++ b/include/dt-bindings/power/amlogic,s6-pwrc.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Amlogic, Inc. All rights reserved + */ +#ifndef _DT_BINDINGS_AMLOGIC_S6_POWER_H +#define _DT_BINDINGS_AMLOGIC_S6_POWER_H + +#define PWRC_S6_DSPA_ID 0 +#define PWRC_S6_DOS_HEVC_ID 1 +#define PWRC_S6_DOS_VDEC_ID 2 +#define PWRC_S6_VPU_HDMI_ID 3 +#define PWRC_S6_U2DRD_ID 4 +#define PWRC_S6_U3DRD_ID 5 +#define PWRC_S6_SD_EMMC_C_ID 6 +#define PWRC_S6_GE2D_ID 7 +#define PWRC_S6_AMFC_ID 8 +#define PWRC_S6_VC9000E_ID 9 +#define PWRC_S6_DEWARP_ID 10 +#define PWRC_S6_VICP_ID 11 +#define PWRC_S6_SD_EMMC_A_ID 12 +#define PWRC_S6_SD_EMMC_B_ID 13 +#define PWRC_S6_ETH_ID 14 +#define PWRC_S6_PCIE_ID 15 +#define PWRC_S6_NNA_4T_ID 16 +#define PWRC_S6_AUDIO_ID 17 +#define PWRC_S6_AUCPU_ID 18 +#define PWRC_S6_ADAPT_ID 19 + +#endif diff --git a/include/dt-bindings/power/amlogic,s7-pwrc.h b/include/dt-bindings/power/amlogic,s7-pwrc.h new file mode 100644 index 000000000000..3f21d095f784 --- /dev/null +++ b/include/dt-bindings/power/amlogic,s7-pwrc.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Amlogic, Inc. All rights reserved + */ +#ifndef _DT_BINDINGS_AMLOGIC_S7_POWER_H +#define _DT_BINDINGS_AMLOGIC_S7_POWER_H + +#define PWRC_S7_DOS_HEVC_ID 0 +#define PWRC_S7_DOS_VDEC_ID 1 +#define PWRC_S7_VPU_HDMI_ID 2 +#define PWRC_S7_USB_COMB_ID 3 +#define PWRC_S7_SD_EMMC_C_ID 4 +#define PWRC_S7_GE2D_ID 5 +#define PWRC_S7_SD_EMMC_A_ID 6 +#define PWRC_S7_SD_EMMC_B_ID 7 +#define PWRC_S7_ETH_ID 8 +#define PWRC_S7_AUCPU_ID 9 +#define PWRC_S7_AUDIO_ID 10 + +#endif diff --git a/include/dt-bindings/power/amlogic,s7d-pwrc.h b/include/dt-bindings/power/amlogic,s7d-pwrc.h new file mode 100644 index 000000000000..c6998553670a --- /dev/null +++ b/include/dt-bindings/power/amlogic,s7d-pwrc.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Amlogic, Inc. All rights reserved + */ +#ifndef _DT_BINDINGS_AMLOGIC_S7D_POWER_H +#define _DT_BINDINGS_AMLOGIC_S7D_POWER_H + +#define PWRC_S7D_DOS_HCODEC_ID 0 +#define PWRC_S7D_DOS_HEVC_ID 1 +#define PWRC_S7D_DOS_VDEC_ID 2 +#define PWRC_S7D_VPU_HDMI_ID 3 +#define PWRC_S7D_USB_U2DRD_ID 4 +#define PWRC_S7D_USB_U2H_ID 5 +#define PWRC_S7D_SSD_EMMC_C_ID 6 +#define PWRC_S7D_GE2D_ID 7 +#define PWRC_S7D_AMFC_ID 8 +#define PWRC_S7D_EMMC_A_ID 9 +#define PWRC_S7D_EMMC_B_ID 10 +#define PWRC_S7D_ETH_ID 11 +#define PWRC_S7D_AUCPU_ID 12 +#define PWRC_S7D_AUDIO_ID 13 +#define PWRC_S7D_SRAMA_ID 14 +#define PWRC_S7D_DMC0_ID 15 +#define PWRC_S7D_DMC1_ID 16 +#define PWRC_S7D_DDR_ID 17 + +#endif -- cgit v1.2.3 From ec630c2c8ce215dd365b8c3644f004f645714a0f Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 20 Aug 2025 17:37:17 +0100 Subject: ASoC: SDCA: Reorder members of hide struct to remove holes Remove some padding holes in the sdca_entity_hide struct by reordering the members. Signed-off-by: Charles Keepax Message-ID: <20250820163717.1095846-4-ckeepax@opensource.cirrus.com> Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index 06ec126cdcc3..ea68856e4c8c 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -1063,27 +1063,30 @@ struct sdca_entity_ge { /** * struct sdca_entity_hide - information specific to HIDE Entities * @hid: HID device structure - * @hidtx_ids: HIDTx Report ID * @num_hidtx_ids: number of HIDTx Report ID - * @hidrx_ids: HIDRx Report ID * @num_hidrx_ids: number of HIDRx Report ID - * @hide_reside_function_num: indicating which Audio Function Numbers within this Device - * @max_delay: the maximum time in microseconds allowed for the Device to change the ownership from Device to Host - * @af_number_list: which Audio Function Numbers within this Device are sending/receiving the messages in this HIDE - * @hid_desc: HID descriptor for the HIDE Entity + * @hidtx_ids: HIDTx Report ID + * @hidrx_ids: HIDRx Report ID + * @af_number_list: which Audio Function Numbers within this Device are + * sending/receiving the messages in this HIDE + * @hide_reside_function_num: indicating which Audio Function Numbers + * within this Device + * @max_delay: the maximum time in microseconds allowed for the Device + * to change the ownership from Device to Host * @hid_report_desc: HID Report Descriptor for the HIDE Entity + * @hid_desc: HID descriptor for the HIDE Entity */ struct sdca_entity_hide { struct hid_device *hid; unsigned int *hidtx_ids; - int num_hidtx_ids; unsigned int *hidrx_ids; + int num_hidtx_ids; int num_hidrx_ids; + unsigned int af_number_list[SDCA_MAX_FUNCTION_COUNT]; unsigned int hide_reside_function_num; unsigned int max_delay; - unsigned int af_number_list[SDCA_MAX_FUNCTION_COUNT]; - struct hid_descriptor hid_desc; unsigned char *hid_report_desc; + struct hid_descriptor hid_desc; }; /** -- cgit v1.2.3 From 4d32c1f66a768dd281edadbd244dce1c63c0899d Mon Sep 17 00:00:00 2001 From: Barnabás Czémán Date: Wed, 3 Sep 2025 23:08:21 +0200 Subject: dt-bindings: clock: qcom: Add MSM8937 Global Clock Controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add device tree bindings for the global clock controller on Qualcomm MSM8937 platform. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Barnabás Czémán Link: https://lore.kernel.org/r/20250903-msm8937-v9-1-a097c91c5801@mainlining.org Signed-off-by: Bjorn Andersson --- .../devicetree/bindings/clock/qcom,gcc-msm8953.yaml | 11 ++++++++--- include/dt-bindings/clock/qcom,gcc-msm8917.h | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8953.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8953.yaml index fe1f5f3ed992..f2e37f439d28 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8953.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8953.yaml @@ -9,16 +9,21 @@ title: Qualcomm Global Clock & Reset Controller on MSM8953 maintainers: - Adam Skladowski - Sireesh Kodali + - Barnabas Czeman description: | Qualcomm global clock control module provides the clocks, resets and power - domains on MSM8953. + domains on MSM8937 or MSM8953. - See also: include/dt-bindings/clock/qcom,gcc-msm8953.h + See also:: + include/dt-bindings/clock/qcom,gcc-msm8917.h + include/dt-bindings/clock/qcom,gcc-msm8953.h properties: compatible: - const: qcom,gcc-msm8953 + enum: + - qcom,gcc-msm8937 + - qcom,gcc-msm8953 clocks: items: diff --git a/include/dt-bindings/clock/qcom,gcc-msm8917.h b/include/dt-bindings/clock/qcom,gcc-msm8917.h index 4b421e7414b5..4e3897b3669d 100644 --- a/include/dt-bindings/clock/qcom,gcc-msm8917.h +++ b/include/dt-bindings/clock/qcom,gcc-msm8917.h @@ -170,6 +170,23 @@ #define VFE1_CLK_SRC 163 #define VSYNC_CLK_SRC 164 #define GPLL0_SLEEP_CLK_SRC 165 +/* Addtional MSM8937-specific clocks */ +#define MSM8937_BLSP1_QUP1_I2C_APPS_CLK_SRC 166 +#define MSM8937_BLSP1_QUP1_SPI_APPS_CLK_SRC 167 +#define MSM8937_BLSP2_QUP4_I2C_APPS_CLK_SRC 168 +#define MSM8937_BLSP2_QUP4_SPI_APPS_CLK_SRC 169 +#define MSM8937_BYTE1_CLK_SRC 170 +#define MSM8937_ESC1_CLK_SRC 171 +#define MSM8937_PCLK1_CLK_SRC 172 +#define MSM8937_GCC_BLSP1_QUP1_I2C_APPS_CLK 173 +#define MSM8937_GCC_BLSP1_QUP1_SPI_APPS_CLK 174 +#define MSM8937_GCC_BLSP2_QUP4_I2C_APPS_CLK 175 +#define MSM8937_GCC_BLSP2_QUP4_SPI_APPS_CLK 176 +#define MSM8937_GCC_MDSS_BYTE1_CLK 177 +#define MSM8937_GCC_MDSS_ESC1_CLK 178 +#define MSM8937_GCC_MDSS_PCLK1_CLK 179 +#define MSM8937_GCC_OXILI_AON_CLK 180 +#define MSM8937_GCC_OXILI_TIMER_CLK 181 /* GCC block resets */ #define GCC_CAMSS_MICRO_BCR 0 @@ -187,5 +204,7 @@ #define VENUS_GDSC 5 #define VFE0_GDSC 6 #define VFE1_GDSC 7 +/* Additional MSM8937-specific GDSCs */ +#define MSM8937_OXILI_CX_GDSC 8 #endif -- cgit v1.2.3 From eb9bc162775cabfc4cf2b37cb0d3c2c2bf4c4b54 Mon Sep 17 00:00:00 2001 From: Denzeel Oliva Date: Sun, 31 Aug 2025 12:13:14 +0000 Subject: dt-bindings: clock: exynos990: Add LHS_ACEL clock ID for HSI0 block Add the missing LHS_ACEL clock ID for the HSI0 block. This clock is required for proper USB operation, as without it, USB connections fail with errors like device descriptor read timeouts and address response issues. Signed-off-by: Denzeel Oliva Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250831-usb-v2-1-00b9c0559733@gmail.com Signed-off-by: Krzysztof Kozlowski --- include/dt-bindings/clock/samsung,exynos990.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/samsung,exynos990.h b/include/dt-bindings/clock/samsung,exynos990.h index c5c79e078f2f..c60f15503d5b 100644 --- a/include/dt-bindings/clock/samsung,exynos990.h +++ b/include/dt-bindings/clock/samsung,exynos990.h @@ -236,6 +236,7 @@ #define CLK_GOUT_HSI0_VGEN_LITE_HSI0_CLK 20 #define CLK_GOUT_HSI0_CMU_HSI0_PCLK 21 #define CLK_GOUT_HSI0_XIU_D_HSI0_ACLK 22 +#define CLK_GOUT_HSI0_LHS_ACEL_D_HSI0_CLK 23 /* CMU_PERIS */ #define CLK_MOUT_PERIS_BUS_USER 1 -- cgit v1.2.3 From c8ab5e888bb6721e6e084881e6e24ef2678832c3 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 1 Sep 2025 09:44:52 +0200 Subject: PCI/AER: Print TLP Log for errors introduced since PCIe r1.1 When reporting an error, the AER driver prints the TLP Header / Prefix Log only for errors enumerated in the AER_LOG_TLP_MASKS macro. The macro was never amended since its introduction in 2006 with commit 6c2b374d7485 ("PCI-Express AER implemetation: AER core and aerdriver"). At the time, PCIe r1.1 was the latest spec revision. Amend the macro with errors defined since then to avoid omitting the TLP Header / Prefix Log for newer errors. The order of the errors in AER_LOG_TLP_MASKS follows PCIe r1.1 sec 6.2.7 rather than 7.10.2, because only the former documents for which errors a TLP Header / Prefix is logged. Retain this order. The section number is still 6.2.7 in today's PCIe r7.0. For Completion Timeouts, the TLP Header / Prefix is only logged if the Completion Timeout Prefix / Header Log Capable bit is set in the AER Capabilities and Control register. Introduce a tlp_header_logged() helper to check whether the TLP Header / Prefix Log is populated and use it in the two places which currently match against AER_LOG_TLP_MASKS directly. For Uncorrectable Internal Errors, logging of the TLP Header / Prefix is optional per PCIe r7.0 sec 6.2.7. If needed, drivers could indicate through a flag whether devices are capable and tlp_header_logged() could then check that flag. pcitools introduced macros for newer errors with commit 144b0911cc0b ("ls-ecaps: extend decode support for more fields for AER CE and UE status"): https://git.kernel.org/pub/scm/utils/pciutils/pciutils.git/commit/?id=144b0911cc0b Unfortunately some of those macros are overly long: PCI_ERR_UNC_POISONED_TLP_EGRESS PCI_ERR_UNC_DMWR_REQ_EGRESS_BLOCKED PCI_ERR_UNC_IDE_CHECK PCI_ERR_UNC_MISR_IDE_TLP PCI_ERR_UNC_PCRC_CHECK PCI_ERR_UNC_TLP_XLAT_EGRESS_BLOCKED This seems unsuitable for , so shorten to: PCI_ERR_UNC_POISON_BLK PCI_ERR_UNC_DMWR_BLK PCI_ERR_UNC_IDE_CHECK PCI_ERR_UNC_MISR_IDE PCI_ERR_UNC_PCRC_CHECK PCI_ERR_UNC_XLAT_BLK Note that some of the existing macros in do not match exactly with pcitools (e.g. PCI_ERR_UNC_SDES versus PCI_ERR_UNC_SURPDN), so it does not seem mandatory for them to be identical. Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/5f707caf1260bd8f15012bb032f7da9a9b898aba.1756712066.git.lukas@wunner.de --- drivers/pci/pcie/aer.c | 30 +++++++++++++++++++++++++++--- include/uapi/linux/pci_regs.h | 7 +++++++ 2 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 15ed541d2fbe..62c74b5f99ae 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -96,11 +96,21 @@ struct aer_info { }; #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ + PCI_ERR_UNC_POISON_BLK | \ PCI_ERR_UNC_ECRC| \ PCI_ERR_UNC_UNSUP| \ PCI_ERR_UNC_COMP_ABORT| \ PCI_ERR_UNC_UNX_COMP| \ - PCI_ERR_UNC_MALF_TLP) + PCI_ERR_UNC_ACSV | \ + PCI_ERR_UNC_MCBTLP | \ + PCI_ERR_UNC_ATOMEG | \ + PCI_ERR_UNC_DMWR_BLK | \ + PCI_ERR_UNC_XLAT_BLK | \ + PCI_ERR_UNC_TLPPRE | \ + PCI_ERR_UNC_MALF_TLP | \ + PCI_ERR_UNC_IDE_CHECK | \ + PCI_ERR_UNC_MISR_IDE | \ + PCI_ERR_UNC_PCRC_CHECK) #define SYSTEM_ERROR_INTR_ON_MESG_MASK (PCI_EXP_RTCTL_SECEE| \ PCI_EXP_RTCTL_SENFEE| \ @@ -796,6 +806,20 @@ static int aer_ratelimit(struct pci_dev *dev, unsigned int severity) } } +static bool tlp_header_logged(u32 status, u32 capctl) +{ + /* Errors for which a header is always logged (PCIe r7.0 sec 6.2.7) */ + if (status & AER_LOG_TLP_MASKS) + return true; + + /* Completion Timeout header is only logged on capable devices */ + if (status & PCI_ERR_UNC_COMP_TIME && + capctl & PCI_ERR_CAP_COMP_TIME_LOG) + return true; + + return false; +} + static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { const char **strings; @@ -910,7 +934,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, status = aer->uncor_status; mask = aer->uncor_mask; info.level = KERN_ERR; - tlp_header_valid = status & AER_LOG_TLP_MASKS; + tlp_header_valid = tlp_header_logged(status, aer->cap_control); } info.status = status; @@ -1401,7 +1425,7 @@ int aer_get_device_error_info(struct aer_err_info *info, int i) pci_read_config_dword(dev, aer + PCI_ERR_CAP, &aercc); info->first_error = PCI_ERR_CAP_FEP(aercc); - if (info->status & AER_LOG_TLP_MASKS) { + if (tlp_header_logged(info->status, aercc)) { info->tlp_header_valid = 1; pcie_read_tlp_log(dev, aer + PCI_ERR_HEADER_LOG, aer + PCI_ERR_PREFIX_LOG, diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f5b17745de60..ae1f52e8d515 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -776,6 +776,12 @@ #define PCI_ERR_UNC_MCBTLP 0x00800000 /* MC blocked TLP */ #define PCI_ERR_UNC_ATOMEG 0x01000000 /* Atomic egress blocked */ #define PCI_ERR_UNC_TLPPRE 0x02000000 /* TLP prefix blocked */ +#define PCI_ERR_UNC_POISON_BLK 0x04000000 /* Poisoned TLP Egress Blocked */ +#define PCI_ERR_UNC_DMWR_BLK 0x08000000 /* DMWr Request Egress Blocked */ +#define PCI_ERR_UNC_IDE_CHECK 0x10000000 /* IDE Check Failed */ +#define PCI_ERR_UNC_MISR_IDE 0x20000000 /* Misrouted IDE TLP */ +#define PCI_ERR_UNC_PCRC_CHECK 0x40000000 /* PCRC Check Failed */ +#define PCI_ERR_UNC_XLAT_BLK 0x80000000 /* TLP Translation Egress Blocked */ #define PCI_ERR_UNCOR_MASK 0x08 /* Uncorrectable Error Mask */ /* Same bits as above */ #define PCI_ERR_UNCOR_SEVER 0x0c /* Uncorrectable Error Severity */ @@ -798,6 +804,7 @@ #define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */ #define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ #define PCI_ERR_CAP_PREFIX_LOG_PRESENT 0x00000800 /* TLP Prefix Log Present */ +#define PCI_ERR_CAP_COMP_TIME_LOG 0x00001000 /* Completion Timeout Prefix/Header Log Capable */ #define PCI_ERR_CAP_TLP_LOG_FLIT 0x00040000 /* TLP was logged in Flit Mode */ #define PCI_ERR_CAP_TLP_LOG_SIZE 0x00f80000 /* Logged TLP Size (only in Flit mode) */ #define PCI_ERR_HEADER_LOG 0x1c /* Header Log Register (16 bytes) */ -- cgit v1.2.3 From d8b269e009bbc471cb2735b5f737839495efce3b Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Thu, 4 Sep 2025 15:45:05 +0800 Subject: cgroup: Remove unused cgroup_subsys::post_attach cgroup_subsys::post_attach callback was introduced in commit 5cf1cacb49ae ("cgroup, cpuset: replace cpuset_post_attach_flush() with cgroup_subsys->post_attach callback") and only cpuset would use this callback to wait for the mm migration to complete at the end of __cgroup_procs_write(). Since the previous patch defer the flush operation until returning to userspace, no one use this callback now. Remove this callback from cgroup_subsys. Signed-off-by: Chuyi Zhou Acked-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 - kernel/cgroup/cgroup.c | 4 ---- 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 539c64eeef38..92ed6d18266d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -763,7 +763,6 @@ struct cgroup_subsys { int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); - void (*post_attach)(void); int (*can_fork)(struct task_struct *task, struct css_set *cset); void (*cancel_fork)(struct task_struct *task, struct css_set *cset); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b38d7a847ed4..0dc6ad71f175 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3033,10 +3033,6 @@ void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked put_task_struct(task); cgroup_attach_unlock(threadgroup_locked); - - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) -- cgit v1.2.3 From b28f9eba12a4967eff6e8a1c0512f86f1ac7fa68 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 28 Jun 2025 11:37:30 -0400 Subject: change the calling conventions for vfs_parse_fs_string() Absolute majority of callers are passing the 4th argument equal to strlen() of the 3rd one. Drop the v_size argument, add vfs_parse_fs_qstr() for the cases that want independent length. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- Documentation/filesystems/mount_api.rst | 10 +++++++++- Documentation/filesystems/porting.rst | 12 ++++++++++++ drivers/gpu/drm/i915/gem/i915_gemfs.c | 9 ++------- drivers/gpu/drm/v3d/v3d_gemfs.c | 9 ++------- fs/afs/mntpt.c | 3 ++- fs/fs_context.c | 17 +++++++---------- fs/namespace.c | 8 +++----- fs/nfs/fs_context.c | 3 +-- fs/nfs/namespace.c | 3 ++- fs/smb/client/fs_context.c | 4 +--- include/linux/fs_context.h | 9 +++++++-- kernel/trace/trace.c | 3 +-- 12 files changed, 49 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst index e149b89118c8..c99ab1f7fea4 100644 --- a/Documentation/filesystems/mount_api.rst +++ b/Documentation/filesystems/mount_api.rst @@ -504,10 +504,18 @@ returned. clear the pointer, but then becomes responsible for disposing of the object. + * :: + + int vfs_parse_fs_qstr(struct fs_context *fc, const char *key, + const struct qstr *value); + + A wrapper around vfs_parse_fs_param() that copies the value string it is + passed. + * :: int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size); + const char *value); A wrapper around vfs_parse_fs_param() that copies the value string it is passed. diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 85f590254f07..ab48ab3f6eb2 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1285,3 +1285,15 @@ rather than a VMA, as the VMA at this stage is not yet valid. The vm_area_desc provides the minimum required information for a filesystem to initialise state upon memory mapping of a file-backed region, and output parameters for the file system to set this state. + +--- + +**mandatory** + +Calling conventions for vfs_parse_fs_string() have changed; it does *not* +take length anymore (value ? strlen(value) : 0 is used). If you want +a different length, use + + vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len)) + +instead. diff --git a/drivers/gpu/drm/i915/gem/i915_gemfs.c b/drivers/gpu/drm/i915/gem/i915_gemfs.c index a09e2eb47175..8f13ec4ff0d0 100644 --- a/drivers/gpu/drm/i915/gem/i915_gemfs.c +++ b/drivers/gpu/drm/i915/gem/i915_gemfs.c @@ -11,11 +11,6 @@ #include "i915_gemfs.h" #include "i915_utils.h" -static int add_param(struct fs_context *fc, const char *key, const char *val) -{ - return vfs_parse_fs_string(fc, key, val, strlen(val)); -} - void i915_gemfs_init(struct drm_i915_private *i915) { struct file_system_type *type; @@ -48,9 +43,9 @@ void i915_gemfs_init(struct drm_i915_private *i915) fc = fs_context_for_mount(type, SB_KERNMOUNT); if (IS_ERR(fc)) goto err; - ret = add_param(fc, "source", "tmpfs"); + ret = vfs_parse_fs_string(fc, "source", "tmpfs"); if (!ret) - ret = add_param(fc, "huge", "within_size"); + ret = vfs_parse_fs_string(fc, "huge", "within_size"); if (!ret) gemfs = fc_mount_longterm(fc); put_fs_context(fc); diff --git a/drivers/gpu/drm/v3d/v3d_gemfs.c b/drivers/gpu/drm/v3d/v3d_gemfs.c index 8ec6ed82b3d9..c1a30166c099 100644 --- a/drivers/gpu/drm/v3d/v3d_gemfs.c +++ b/drivers/gpu/drm/v3d/v3d_gemfs.c @@ -7,11 +7,6 @@ #include "v3d_drv.h" -static int add_param(struct fs_context *fc, const char *key, const char *val) -{ - return vfs_parse_fs_string(fc, key, val, strlen(val)); -} - void v3d_gemfs_init(struct v3d_dev *v3d) { struct file_system_type *type; @@ -38,9 +33,9 @@ void v3d_gemfs_init(struct v3d_dev *v3d) fc = fs_context_for_mount(type, SB_KERNMOUNT); if (IS_ERR(fc)) goto err; - ret = add_param(fc, "source", "tmpfs"); + ret = vfs_parse_fs_string(fc, "source", "tmpfs"); if (!ret) - ret = add_param(fc, "huge", "within_size"); + ret = vfs_parse_fs_string(fc, "huge", "within_size"); if (!ret) gemfs = fc_mount_longterm(fc); put_fs_context(fc); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 9434a5399f2b..1ad048e6e164 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -137,7 +137,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ret = -EINVAL; if (content[size - 1] == '.') - ret = vfs_parse_fs_string(fc, "source", content, size - 1); + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(content, size - 1)); do_delayed_call(&cleanup); if (ret < 0) return ret; diff --git a/fs/fs_context.c b/fs/fs_context.c index 666e61753aed..93b7ebf8d927 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -161,25 +161,24 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) EXPORT_SYMBOL(vfs_parse_fs_param); /** - * vfs_parse_fs_string - Convenience function to just parse a string. + * vfs_parse_fs_qstr - Convenience function to just parse a string. * @fc: Filesystem context. * @key: Parameter name. * @value: Default value. - * @v_size: Maximum number of bytes in the value. */ -int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size) +int vfs_parse_fs_qstr(struct fs_context *fc, const char *key, + const struct qstr *value) { int ret; struct fs_parameter param = { .key = key, .type = fs_value_is_flag, - .size = v_size, + .size = value ? value->len : 0, }; if (value) { - param.string = kmemdup_nul(value, v_size, GFP_KERNEL); + param.string = kmemdup_nul(value->name, value->len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; @@ -189,7 +188,7 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key, kfree(param.string); return ret; } -EXPORT_SYMBOL(vfs_parse_fs_string); +EXPORT_SYMBOL(vfs_parse_fs_qstr); /** * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data @@ -218,16 +217,14 @@ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, while ((key = sep(&options)) != NULL) { if (*key) { - size_t v_len = 0; char *value = strchr(key, '='); if (value) { if (unlikely(value == key)) continue; *value++ = 0; - v_len = strlen(value); } - ret = vfs_parse_fs_string(fc, key, value, v_len); + ret = vfs_parse_fs_string(fc, key, value); if (ret < 0) break; } diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d338..88636124c8fe 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1281,8 +1281,7 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type, return ERR_CAST(fc); if (name) - ret = vfs_parse_fs_string(fc, "source", - name, strlen(name)); + ret = vfs_parse_fs_string(fc, "source", name); if (!ret) ret = parse_monolithic_mount_data(fc, data); if (!ret) @@ -3793,10 +3792,9 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, fc->oldapi = true; if (subtype) - err = vfs_parse_fs_string(fc, "subtype", - subtype, strlen(subtype)); + err = vfs_parse_fs_string(fc, "subtype", subtype); if (!err && name) - err = vfs_parse_fs_string(fc, "source", name, strlen(name)); + err = vfs_parse_fs_string(fc, "source", name); if (!err) err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 9e94d18448ff..b4679b7161b0 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1269,8 +1269,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, int ret; data->context[NFS_MAX_CONTEXT_LEN] = '\0'; - ret = vfs_parse_fs_string(fc, "context", - data->context, strlen(data->context)); + ret = vfs_parse_fs_string(fc, "context", data->context); if (ret < 0) return ret; #else diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 7f1ec9c67ff2..5735c0448b4c 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -290,7 +290,8 @@ int nfs_do_submount(struct fs_context *fc) nfs_errorf(fc, "NFS: Couldn't determine submount pathname"); ret = PTR_ERR(p); } else { - ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p); + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(p, buffer + 4096 - p)); if (!ret) ret = vfs_get_tree(fc); } diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 072383899e81..306b76e97783 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -773,16 +773,14 @@ static int smb3_fs_context_parse_monolithic(struct fs_context *fc, } - len = 0; value = strchr(key, '='); if (value) { if (value == key) continue; *value++ = 0; - len = strlen(value); } - ret = vfs_parse_fs_string(fc, key, value, len); + ret = vfs_parse_fs_string(fc, key, value); if (ret < 0) break; } diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 7773eb870039..97b514a79a49 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -134,8 +134,13 @@ extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_ty extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param); -extern int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size); +extern int vfs_parse_fs_qstr(struct fs_context *fc, const char *key, + const struct qstr *value); +static inline int vfs_parse_fs_string(struct fs_context *fc, const char *key, + const char *value) +{ + return vfs_parse_fs_qstr(fc, key, value ? &QSTR(value) : NULL); +} int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, char *(*sep)(char **)); extern int generic_parse_monolithic(struct fs_context *fc, void *data); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4283ed4e8f59..15375b45fd74 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10201,8 +10201,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); - ret = vfs_parse_fs_string(fc, "source", - "tracefs", strlen("tracefs")); + ret = vfs_parse_fs_string(fc, "source", "tracefs"); if (!ret) mnt = fc_mount(fc); else -- cgit v1.2.3 From 038c7dc66e2744e5df57163b8f957745ae10d23e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Sep 2025 20:46:40 -0700 Subject: compiler_types.h: Move __nocfi out of compiler-specific header Prepare for GCC KCFI support and move the __nocfi attribute from compiler-clang.h to compiler_types.h. This was already gated by CONFIG_CFI_CLANG, so this remains safe for non-KCFI GCC builds. Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250904034656.3670313-1-kees@kernel.org --- include/linux/compiler-clang.h | 5 ----- include/linux/compiler_types.h | 4 +++- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index fa4ffe037bc7..7a4568e421dc 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -96,11 +96,6 @@ # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif -#if __has_feature(kcfi) -/* Disable CFI checking inside a function. */ -#define __nocfi __attribute__((__no_sanitize__("kcfi"))) -#endif - /* * Turn individual warnings and errors on and off locally, depending * on version. diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 16755431fc11..a910f9fa5341 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -432,7 +432,9 @@ struct ftrace_likely_data { # define __noscs #endif -#ifndef __nocfi +#if defined(CONFIG_CFI_CLANG) +# define __nocfi __attribute__((__no_sanitize__("kcfi"))) +#else # define __nocfi #endif -- cgit v1.2.3 From 0b815825b1b0bd6762ca028e9b6631b002efb7ca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Sep 2025 20:46:45 -0700 Subject: x86/cfi: Remove __noinitretpoline and __noretpoline Commit 66f793099a63 ("x86/retpoline: Avoid retpolines for built-in __init functions") disabled retpolines in __init sections (__noinitretpoline) as a precaution against potential issues with retpolines in early boot, but it has not been a problem in practice (i.e. see Clang below). Commit 87358710c1fb ("x86/retpoline: Support retpoline builds with Clang") narrowed this to only GCC, as Clang doesn't have per-function control over retpoline emission. As such, Clang has been booting with retpolines in __init since retpoline support was introduced. Clang KCFI has been instrumenting __init since CFI was introduced. With the introduction of KCFI for GCC, KCFI instrumentation with retpolines disabled means that objtool does not construct .retpoline_sites section entries for the non-retpoline KCFI calls. At boot, the KCFI rehashing code, via __apply_fineibt(), misses all __init KCFI calls (since they are not retpolines), resulting in immediate hash mismatches: all preambles are rehashed (via .cfi_sites) and none of the __init call sites are rehashed. Remove __noinitretpoline since it provides no meaningful utility and creates problems with CFI. Additionally remove __noretpoline since it is now unused. Alternatively, cfi_rand_callers() could walk the .kcfi_traps section which is exactly the list of KCFI instrumentation sites. But it seems better to have as few differences in common instruction sequences between compilers as possible, so better to remove the special handling of retpolines in __init for GCC. Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250904034656.3670313-6-kees@kernel.org --- include/linux/compiler-gcc.h | 4 ---- include/linux/init.h | 8 -------- 2 files changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5d07c469b571..5de824a0b3d7 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -35,10 +35,6 @@ (typeof(ptr)) (__ptr + (off)); \ }) -#ifdef CONFIG_MITIGATION_RETPOLINE -#define __noretpoline __attribute__((__indirect_branch__("keep"))) -#endif - #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) #define __latent_entropy __attribute__((latent_entropy)) #endif diff --git a/include/linux/init.h b/include/linux/init.h index a60d32d227ee..17c1bc712e23 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -7,13 +7,6 @@ #include #include -/* Built-in __init functions needn't be compiled with retpoline */ -#if defined(__noretpoline) && !defined(MODULE) -#define __noinitretpoline __noretpoline -#else -#define __noinitretpoline -#endif - /* These macros are used to mark some functions or * initialized data (doesn't apply to uninitialized data) * as `initialization' functions. The kernel can take this @@ -50,7 +43,6 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ #define __init __section(".init.text") __cold __latent_entropy \ - __noinitretpoline \ __no_kstack_erase #define __initdata __section(".init.data") #define __initconst __section(".init.rodata") -- cgit v1.2.3 From 54728bd535fb3899ad51489dc1e05eb5bb53cb95 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 1 Sep 2025 15:27:43 +0200 Subject: bpf: Return an error pointer for skb metadata when CONFIG_NET=n Kernel Test Robot reported a compiler warning - a null pointer may be passed to memmove in __bpf_dynptr_{read,write} when building without networking support. The warning is correct from a static analysis standpoint, but not actually reachable. Without CONFIG_NET, creating dynptrs to skb metadata is impossible since the constructor kfunc is missing. Silence the false-postive diagnostic message by returning an error pointer from bpf_skb_meta_pointer stub when CONFIG_NET=n. Fixes: 6877cd392bae ("bpf: Enable read/write access to skb metadata through a dynptr") Closes: https://lore.kernel.org/oe-kbuild-all/202508212031.ir9b3B6Q-lkp@intel.com/ Reported-by: kernel test robot Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250901-dynptr-skb-meta-no-net-v2-1-ce607fcb6091@cloudflare.com --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9ed21b65e2e9..af6d9354662c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1822,7 +1822,7 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NET */ -- cgit v1.2.3 From 0a0fdb98d16e334e259352893462030f15fb887f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 26 Aug 2025 17:08:19 +0200 Subject: fuse: remove FUSE_NOTIFY_CODE_MAX from Constants that change value from version to version have no place in an interface definition. Hopefully this won't break anything. Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 6b9fb8b08768..30bf0846547f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -680,7 +680,6 @@ enum fuse_notify_code { FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_RESEND = 7, FUSE_NOTIFY_INC_EPOCH = 8, - FUSE_NOTIFY_CODE_MAX, }; /* The read buffer is required to be at least 8k, but may be much larger */ -- cgit v1.2.3 From 3f29d59e92a96d843c2ff10ebfed92ac26878658 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 2 Sep 2025 10:22:06 +0200 Subject: fuse: add prune notification Some fuse servers need to prune their caches, which can only be done if the kernel's own dentry/inode caches are pruned first to avoid dangling references. Add FUSE_NOTIFY_PRUNE, which takes an array of node ID's to try and get rid of. Inodes with active references are skipped. A similar functionality is already provided by FUSE_NOTIFY_INVAL_ENTRY with the FUSE_EXPIRE_ONLY flag. Differences in the interface are FUSE_NOTIFY_INVAL_ENTRY: - can only prune one dentry - dentry is determined by parent ID and name - if inode has multiple aliases (cached hard links), then they would have to be invalidated individually to be able to get rid of the inode FUSE_NOTIFY_PRUNE: - can prune multiple inodes - inodes determined by their node ID - aliases are taken care of automatically Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/inode.c | 11 +++++++++++ include/uapi/linux/fuse.h | 8 ++++++++ 4 files changed, 64 insertions(+) (limited to 'include') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1258acee9704..4229b38546bb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2034,6 +2034,42 @@ static int fuse_notify_inc_epoch(struct fuse_conn *fc) return 0; } +static int fuse_notify_prune(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_prune_out outarg; + const unsigned int batch = 512; + u64 *nodeids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL); + unsigned int num, i; + int err; + + if (!nodeids) + return -ENOMEM; + + if (size < sizeof(outarg)) + return -EINVAL; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + return err; + + if (size - sizeof(outarg) != outarg.count * sizeof(u64)) + return -EINVAL; + + for (; outarg.count; outarg.count -= num) { + num = min(batch, outarg.count); + err = fuse_copy_one(cs, nodeids, num * sizeof(u64)); + if (err) + return err; + + scoped_guard(rwsem_read, &fc->killsb) { + for (i = 0; i < num; i++) + fuse_try_prune_one_inode(fc, nodeids[i]); + } + } + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -2065,6 +2101,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_INC_EPOCH: return fuse_notify_inc_epoch(fc); + case FUSE_NOTIFY_PRUNE: + return fuse_notify_prune(fc, size, cs); + default: return -EINVAL; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 233c6111f768..fb6604120b53 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1413,6 +1413,12 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name, u32 flags); +/* + * Try to prune this inode. If neither the inode itself nor dentries associated + * with this inode have any external reference, then the inode can be freed. + */ +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid); + int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5b7897bf7e45..a4d361b34d06 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -585,6 +585,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, return 0; } +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid) +{ + struct inode *inode; + + inode = fuse_ilookup(fc, nodeid, NULL); + if (!inode) + return; + d_prune_aliases(inode); + iput(inode); +} + bool fuse_lock_inode(struct inode *inode) { bool locked = false; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 30bf0846547f..c13e1f9a2f12 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -239,6 +239,7 @@ * 7.45 * - add FUSE_COPY_FILE_RANGE_64 * - add struct fuse_copy_file_range_out + * - add FUSE_NOTIFY_PRUNE */ #ifndef _LINUX_FUSE_H @@ -680,6 +681,7 @@ enum fuse_notify_code { FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_RESEND = 7, FUSE_NOTIFY_INC_EPOCH = 8, + FUSE_NOTIFY_PRUNE = 9, }; /* The read buffer is required to be at least 8k, but may be much larger */ @@ -1118,6 +1120,12 @@ struct fuse_notify_retrieve_in { uint64_t dummy4; }; +struct fuse_notify_prune_out { + uint32_t count; + uint32_t padding; + uint64_t spare; +}; + struct fuse_backing_map { int32_t fd; uint32_t flags; -- cgit v1.2.3 From 74a0e72f03ffd01b5d88b411f02d9b9861fdb99e Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Thu, 21 Aug 2025 12:15:59 +0200 Subject: iommu/io-pgtable-dart: Add 4-level page table support DARTs on t602x SoCs are of the t8110 variant but have an IAS of 42, which means optional support for an extra page table level. Refactor the PTE management to support an arbitrary level count, and then calculate how many levels we need for any given configuration. Signed-off-by: Hector Martin Signed-off-by: Janne Grunau Reviewed-by: Sven Peter Reviewed-by: Neal Gompa Link: https://lore.kernel.org/r/20250821-apple-dart-4levels-v2-2-e39af79daa37@jannau.net Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-dart.c | 139 +++++++++++++++++++++++++--------------- include/linux/io-pgtable.h | 1 + 2 files changed, 87 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c index 679bda104797..9a63c80a2786 100644 --- a/drivers/iommu/io-pgtable-dart.c +++ b/drivers/iommu/io-pgtable-dart.c @@ -27,8 +27,9 @@ #define DART1_MAX_ADDR_BITS 36 -#define DART_MAX_TABLES 4 -#define DART_LEVELS 2 +#define DART_MAX_TABLE_BITS 2 +#define DART_MAX_TABLES BIT(DART_MAX_TABLE_BITS) +#define DART_MAX_LEVELS 4 /* Includes TTBR level */ /* Struct accessors */ #define io_pgtable_to_data(x) \ @@ -68,6 +69,7 @@ struct dart_io_pgtable { struct io_pgtable iop; + int levels; int tbl_bits; int bits_per_level; @@ -156,44 +158,45 @@ static dart_iopte dart_install_table(dart_iopte *table, return old; } -static int dart_get_table(struct dart_io_pgtable *data, unsigned long iova) +static int dart_get_index(struct dart_io_pgtable *data, unsigned long iova, int level) { - return (iova >> (3 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & - ((1 << data->tbl_bits) - 1); + return (iova >> (level * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & + ((1 << data->bits_per_level) - 1); } -static int dart_get_l1_index(struct dart_io_pgtable *data, unsigned long iova) -{ - - return (iova >> (2 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & - ((1 << data->bits_per_level) - 1); -} - -static int dart_get_l2_index(struct dart_io_pgtable *data, unsigned long iova) +static int dart_get_last_index(struct dart_io_pgtable *data, unsigned long iova) { return (iova >> (data->bits_per_level + ilog2(sizeof(dart_iopte)))) & ((1 << data->bits_per_level) - 1); } -static dart_iopte *dart_get_l2(struct dart_io_pgtable *data, unsigned long iova) +static dart_iopte *dart_get_last(struct dart_io_pgtable *data, unsigned long iova) { dart_iopte pte, *ptep; - int tbl = dart_get_table(data, iova); + int level = data->levels; + int tbl = dart_get_index(data, iova, level); + + if (tbl > (1 << data->tbl_bits)) + return NULL; ptep = data->pgd[tbl]; if (!ptep) return NULL; - ptep += dart_get_l1_index(data, iova); - pte = READ_ONCE(*ptep); + while (--level > 1) { + ptep += dart_get_index(data, iova, level); + pte = READ_ONCE(*ptep); - /* Valid entry? */ - if (!pte) - return NULL; + /* Valid entry? */ + if (!pte) + return NULL; - /* Deref to get level 2 table */ - return iopte_deref(pte, data); + /* Deref to get next level table */ + ptep = iopte_deref(pte, data); + } + + return ptep; } static dart_iopte dart_prot_to_pte(struct dart_io_pgtable *data, @@ -230,6 +233,7 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, int ret = 0, tbl, num_entries, max_entries, map_idx_start; dart_iopte pte, *cptep, *ptep; dart_iopte prot; + int level = data->levels; if (WARN_ON(pgsize != cfg->pgsize_bitmap)) return -EINVAL; @@ -240,31 +244,36 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) return -EINVAL; - tbl = dart_get_table(data, iova); + tbl = dart_get_index(data, iova, level); + + if (tbl > (1 << data->tbl_bits)) + return -ENOMEM; ptep = data->pgd[tbl]; - ptep += dart_get_l1_index(data, iova); - pte = READ_ONCE(*ptep); + while (--level > 1) { + ptep += dart_get_index(data, iova, level); + pte = READ_ONCE(*ptep); - /* no L2 table present */ - if (!pte) { - cptep = iommu_alloc_pages_sz(gfp, tblsz); - if (!cptep) - return -ENOMEM; + /* no table present */ + if (!pte) { + cptep = iommu_alloc_pages_sz(gfp, tblsz); + if (!cptep) + return -ENOMEM; - pte = dart_install_table(cptep, ptep, 0, data); - if (pte) - iommu_free_pages(cptep); + pte = dart_install_table(cptep, ptep, 0, data); + if (pte) + iommu_free_pages(cptep); - /* L2 table is present (now) */ - pte = READ_ONCE(*ptep); - } + /* L2 table is present (now) */ + pte = READ_ONCE(*ptep); + } - ptep = iopte_deref(pte, data); + ptep = iopte_deref(pte, data); + } /* install a leaf entries into L2 table */ prot = dart_prot_to_pte(data, iommu_prot); - map_idx_start = dart_get_l2_index(data, iova); + map_idx_start = dart_get_last_index(data, iova); max_entries = DART_PTES_PER_TABLE(data) - map_idx_start; num_entries = min_t(int, pgcount, max_entries); ptep += map_idx_start; @@ -293,13 +302,13 @@ static size_t dart_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(pgsize != cfg->pgsize_bitmap || !pgcount)) return 0; - ptep = dart_get_l2(data, iova); + ptep = dart_get_last(data, iova); /* Valid L2 IOPTE pointer? */ if (WARN_ON(!ptep)) return 0; - unmap_idx_start = dart_get_l2_index(data, iova); + unmap_idx_start = dart_get_last_index(data, iova); ptep += unmap_idx_start; max_entries = DART_PTES_PER_TABLE(data) - unmap_idx_start; @@ -330,13 +339,13 @@ static phys_addr_t dart_iova_to_phys(struct io_pgtable_ops *ops, struct dart_io_pgtable *data = io_pgtable_ops_to_data(ops); dart_iopte pte, *ptep; - ptep = dart_get_l2(data, iova); + ptep = dart_get_last(data, iova); /* Valid L2 IOPTE pointer? */ if (!ptep) return 0; - ptep += dart_get_l2_index(data, iova); + ptep += dart_get_last_index(data, iova); pte = READ_ONCE(*ptep); /* Found translation */ @@ -353,21 +362,37 @@ static struct dart_io_pgtable * dart_alloc_pgtable(struct io_pgtable_cfg *cfg) { struct dart_io_pgtable *data; - int tbl_bits, bits_per_level, va_bits, pg_shift; + int levels, max_tbl_bits, tbl_bits, bits_per_level, va_bits, pg_shift; + + /* + * Old 4K page DARTs can use up to 4 top-level tables. + * Newer ones only ever use a maximum of 1. + */ + if (cfg->pgsize_bitmap == SZ_4K) + max_tbl_bits = DART_MAX_TABLE_BITS; + else + max_tbl_bits = 0; pg_shift = __ffs(cfg->pgsize_bitmap); bits_per_level = pg_shift - ilog2(sizeof(dart_iopte)); va_bits = cfg->ias - pg_shift; - tbl_bits = max_t(int, 0, va_bits - (bits_per_level * DART_LEVELS)); - if ((1 << tbl_bits) > DART_MAX_TABLES) + levels = max_t(int, 2, (va_bits - max_tbl_bits + bits_per_level - 1) / bits_per_level); + + if (levels > (DART_MAX_LEVELS - 1)) + return NULL; + + tbl_bits = max_t(int, 0, va_bits - (bits_per_level * levels)); + + if (tbl_bits > max_tbl_bits) return NULL; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return NULL; + data->levels = levels + 1; /* Table level counts as one level */ data->tbl_bits = tbl_bits; data->bits_per_level = bits_per_level; @@ -403,6 +428,7 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) return NULL; cfg->apple_dart_cfg.n_ttbrs = 1 << data->tbl_bits; + cfg->apple_dart_cfg.n_levels = data->levels; for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i) { data->pgd[i] = @@ -422,24 +448,31 @@ out_free_data: return NULL; } -static void apple_dart_free_pgtable(struct io_pgtable *iop) +static void apple_dart_free_pgtables(struct dart_io_pgtable *data, dart_iopte *ptep, int level) { - struct dart_io_pgtable *data = io_pgtable_to_data(iop); - dart_iopte *ptep, *end; - int i; + dart_iopte *end; + dart_iopte *start = ptep; - for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) { - ptep = data->pgd[i]; + if (level > 1) { end = (void *)ptep + DART_GRANULE(data); while (ptep != end) { dart_iopte pte = *ptep++; if (pte) - iommu_free_pages(iopte_deref(pte, data)); + apple_dart_free_pgtables(data, iopte_deref(pte, data), level - 1); } - iommu_free_pages(data->pgd[i]); } + iommu_free_pages(start); +} + +static void apple_dart_free_pgtable(struct io_pgtable *iop) +{ + struct dart_io_pgtable *data = io_pgtable_to_data(iop); + int i; + + for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) + apple_dart_free_pgtables(data, data->pgd[i], data->levels - 1); kfree(data); } diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 138fbd89b1e6..8a823c6f2b4a 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -180,6 +180,7 @@ struct io_pgtable_cfg { struct { u64 ttbr[4]; u32 n_ttbrs; + u32 n_levels; } apple_dart_cfg; struct { -- cgit v1.2.3 From 8f77295525825086cb43675cd1a4f3716b119d7f Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Mon, 18 Aug 2025 10:28:05 +0530 Subject: ACPI: RISC-V: Add support for RIMT RISC-V IO Mapping Table (RIMT) is a static ACPI table to communicate IOMMU information to the OS. The spec is available at [1]. The changes at high level are, a) Initialize data structures required for IOMMU/device configuration using the data from RIMT. Provide APIs required for device configuration. b) Provide an API for IOMMU drivers to register the fwnode with RIMT data structures. This API will create a fwnode for PCIe IOMMU. [1] - https://github.com/riscv-non-isa/riscv-acpi-rimt Signed-off-by: Sunil V L Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250818045807.763922-2-sunilvl@ventanamicro.com Signed-off-by: Joerg Roedel --- MAINTAINERS | 1 + arch/riscv/Kconfig | 1 + drivers/acpi/Kconfig | 4 + drivers/acpi/riscv/Kconfig | 7 + drivers/acpi/riscv/Makefile | 1 + drivers/acpi/riscv/init.c | 2 + drivers/acpi/riscv/init.h | 1 + drivers/acpi/riscv/rimt.c | 520 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi_rimt.h | 28 +++ 9 files changed, 565 insertions(+) create mode 100644 drivers/acpi/riscv/Kconfig create mode 100644 drivers/acpi/riscv/rimt.c create mode 100644 include/linux/acpi_rimt.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa4..af562bb23bef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -347,6 +347,7 @@ L: linux-acpi@vger.kernel.org L: linux-riscv@lists.infradead.org S: Maintained F: drivers/acpi/riscv/ +F: include/linux/acpi_rimt.h ACPI PCC(Platform Communication Channel) MAILBOX DRIVER M: Sudeep Holla diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659e..7563f43ec026 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -16,6 +16,7 @@ config RISCV select ACPI_MCFG if (ACPI && PCI) select ACPI_PPTT if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI + select ACPI_RIMT if ACPI select ACPI_SPCR_TABLE if ACPI select ARCH_DMA_DEFAULT_COHERENT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index b594780a57d7..2cdbd08b30e4 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -547,6 +547,10 @@ if ARM64 source "drivers/acpi/arm64/Kconfig" endif +if RISCV +source "drivers/acpi/riscv/Kconfig" +endif + config ACPI_PPTT bool diff --git a/drivers/acpi/riscv/Kconfig b/drivers/acpi/riscv/Kconfig new file mode 100644 index 000000000000..046296a18d00 --- /dev/null +++ b/drivers/acpi/riscv/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# ACPI Configuration for RISC-V +# + +config ACPI_RIMT + bool diff --git a/drivers/acpi/riscv/Makefile b/drivers/acpi/riscv/Makefile index a96fdf1e2cb8..1284a076fa88 100644 --- a/drivers/acpi/riscv/Makefile +++ b/drivers/acpi/riscv/Makefile @@ -2,3 +2,4 @@ obj-y += rhct.o init.o irq.o obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o +obj-$(CONFIG_ACPI_RIMT) += rimt.o diff --git a/drivers/acpi/riscv/init.c b/drivers/acpi/riscv/init.c index 673e4d5dd752..7c00f7995e86 100644 --- a/drivers/acpi/riscv/init.c +++ b/drivers/acpi/riscv/init.c @@ -10,4 +10,6 @@ void __init acpi_arch_init(void) { riscv_acpi_init_gsi_mapping(); + if (IS_ENABLED(CONFIG_ACPI_RIMT)) + riscv_acpi_rimt_init(); } diff --git a/drivers/acpi/riscv/init.h b/drivers/acpi/riscv/init.h index 0b9a07e4031f..1680aa2aaf23 100644 --- a/drivers/acpi/riscv/init.h +++ b/drivers/acpi/riscv/init.h @@ -2,3 +2,4 @@ #include void __init riscv_acpi_init_gsi_mapping(void); +void __init riscv_acpi_rimt_init(void); diff --git a/drivers/acpi/riscv/rimt.c b/drivers/acpi/riscv/rimt.c new file mode 100644 index 000000000000..683fcfe35c31 --- /dev/null +++ b/drivers/acpi/riscv/rimt.c @@ -0,0 +1,520 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024-2025, Ventana Micro Systems Inc + * Author: Sunil V L + * + */ + +#define pr_fmt(fmt) "ACPI: RIMT: " fmt + +#include +#include +#include +#include +#include +#include +#include "init.h" + +struct rimt_fwnode { + struct list_head list; + struct acpi_rimt_node *rimt_node; + struct fwnode_handle *fwnode; +}; + +static LIST_HEAD(rimt_fwnode_list); +static DEFINE_SPINLOCK(rimt_fwnode_lock); + +#define RIMT_TYPE_MASK(type) (1 << (type)) +#define RIMT_IOMMU_TYPE BIT(0) + +/* Root pointer to the mapped RIMT table */ +static struct acpi_table_header *rimt_table; + +/** + * rimt_set_fwnode() - Create rimt_fwnode and use it to register + * iommu data in the rimt_fwnode_list + * + * @rimt_node: RIMT table node associated with the IOMMU + * @fwnode: fwnode associated with the RIMT node + * + * Returns: 0 on success + * <0 on failure + */ +static int rimt_set_fwnode(struct acpi_rimt_node *rimt_node, + struct fwnode_handle *fwnode) +{ + struct rimt_fwnode *np; + + np = kzalloc(sizeof(*np), GFP_ATOMIC); + + if (WARN_ON(!np)) + return -ENOMEM; + + INIT_LIST_HEAD(&np->list); + np->rimt_node = rimt_node; + np->fwnode = fwnode; + + spin_lock(&rimt_fwnode_lock); + list_add_tail(&np->list, &rimt_fwnode_list); + spin_unlock(&rimt_fwnode_lock); + + return 0; +} + +/** + * rimt_get_fwnode() - Retrieve fwnode associated with an RIMT node + * + * @node: RIMT table node to be looked-up + * + * Returns: fwnode_handle pointer on success, NULL on failure + */ +static struct fwnode_handle *rimt_get_fwnode(struct acpi_rimt_node *node) +{ + struct fwnode_handle *fwnode = NULL; + struct rimt_fwnode *curr; + + spin_lock(&rimt_fwnode_lock); + list_for_each_entry(curr, &rimt_fwnode_list, list) { + if (curr->rimt_node == node) { + fwnode = curr->fwnode; + break; + } + } + spin_unlock(&rimt_fwnode_lock); + + return fwnode; +} + +static acpi_status rimt_match_node_callback(struct acpi_rimt_node *node, + void *context) +{ + acpi_status status = AE_NOT_FOUND; + struct device *dev = context; + + if (node->type == ACPI_RIMT_NODE_TYPE_IOMMU) { + struct acpi_rimt_iommu *iommu_node = (struct acpi_rimt_iommu *)&node->node_data; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev; + u16 bdf; + + pdev = to_pci_dev(dev); + bdf = PCI_DEVID(pdev->bus->number, pdev->devfn); + if ((pci_domain_nr(pdev->bus) == iommu_node->pcie_segment_number) && + bdf == iommu_node->pcie_bdf) { + status = AE_OK; + } else { + status = AE_NOT_FOUND; + } + } else { + struct platform_device *pdev = to_platform_device(dev); + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (res && res->start == iommu_node->base_address) + status = AE_OK; + else + status = AE_NOT_FOUND; + } + } else if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + struct acpi_rimt_pcie_rc *pci_rc; + struct pci_bus *bus; + + bus = to_pci_bus(dev); + pci_rc = (struct acpi_rimt_pcie_rc *)node->node_data; + + /* + * It is assumed that PCI segment numbers maps one-to-one + * with root complexes. Each segment number can represent only + * one root complex. + */ + status = pci_rc->pcie_segment_number == pci_domain_nr(bus) ? + AE_OK : AE_NOT_FOUND; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_rimt_platform_device *ncomp; + struct device *plat_dev = dev; + struct acpi_device *adev; + + /* + * Walk the device tree to find a device with an + * ACPI companion; there is no point in scanning + * RIMT for a device matching a platform device if + * the device does not have an ACPI companion to + * start with. + */ + do { + adev = ACPI_COMPANION(plat_dev); + if (adev) + break; + + plat_dev = plat_dev->parent; + } while (plat_dev); + + if (!adev) + return status; + + status = acpi_get_name(adev->handle, ACPI_FULL_PATHNAME, &buf); + if (ACPI_FAILURE(status)) { + dev_warn(plat_dev, "Can't get device full path name\n"); + return status; + } + + ncomp = (struct acpi_rimt_platform_device *)node->node_data; + status = !strcmp(ncomp->device_name, buf.pointer) ? + AE_OK : AE_NOT_FOUND; + acpi_os_free(buf.pointer); + } + + return status; +} + +static struct acpi_rimt_node *rimt_scan_node(enum acpi_rimt_node_type type, + void *context) +{ + struct acpi_rimt_node *rimt_node, *rimt_end; + struct acpi_table_rimt *rimt; + int i; + + if (!rimt_table) + return NULL; + + /* Get the first RIMT node */ + rimt = (struct acpi_table_rimt *)rimt_table; + rimt_node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt, + rimt->node_offset); + rimt_end = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, + rimt_table->length); + + for (i = 0; i < rimt->num_nodes; i++) { + if (WARN_TAINT(rimt_node >= rimt_end, TAINT_FIRMWARE_WORKAROUND, + "RIMT node pointer overflows, bad table!\n")) + return NULL; + + if (rimt_node->type == type && + ACPI_SUCCESS(rimt_match_node_callback(rimt_node, context))) + return rimt_node; + + rimt_node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_node, + rimt_node->length); + } + + return NULL; +} + +static bool rimt_pcie_rc_supports_ats(struct acpi_rimt_node *node) +{ + struct acpi_rimt_pcie_rc *pci_rc; + + pci_rc = (struct acpi_rimt_pcie_rc *)node->node_data; + return pci_rc->flags & ACPI_RIMT_PCIE_ATS_SUPPORTED; +} + +static int rimt_iommu_xlate(struct device *dev, struct acpi_rimt_node *node, u32 deviceid) +{ + struct fwnode_handle *rimt_fwnode; + + if (!node) + return -ENODEV; + + rimt_fwnode = rimt_get_fwnode(node); + + /* + * The IOMMU drivers may not be probed yet. + * Defer the IOMMU configuration + */ + if (!rimt_fwnode) + return -EPROBE_DEFER; + + return acpi_iommu_fwspec_init(dev, deviceid, rimt_fwnode); +} + +struct rimt_pci_alias_info { + struct device *dev; + struct acpi_rimt_node *node; + const struct iommu_ops *ops; +}; + +static int rimt_id_map(struct acpi_rimt_id_mapping *map, u8 type, u32 rid_in, u32 *rid_out) +{ + if (rid_in < map->source_id_base || + (rid_in > map->source_id_base + map->num_ids)) + return -ENXIO; + + *rid_out = map->dest_id_base + (rid_in - map->source_id_base); + return 0; +} + +static struct acpi_rimt_node *rimt_node_get_id(struct acpi_rimt_node *node, + u32 *id_out, int index) +{ + struct acpi_rimt_platform_device *plat_node; + u32 id_mapping_offset, num_id_mapping; + struct acpi_rimt_pcie_rc *pci_node; + struct acpi_rimt_id_mapping *map; + struct acpi_rimt_node *parent; + + if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + pci_node = (struct acpi_rimt_pcie_rc *)&node->node_data; + id_mapping_offset = pci_node->id_mapping_offset; + num_id_mapping = pci_node->num_id_mappings; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + plat_node = (struct acpi_rimt_platform_device *)&node->node_data; + id_mapping_offset = plat_node->id_mapping_offset; + num_id_mapping = plat_node->num_id_mappings; + } else { + return NULL; + } + + if (!id_mapping_offset || !num_id_mapping || index >= num_id_mapping) + return NULL; + + map = ACPI_ADD_PTR(struct acpi_rimt_id_mapping, node, + id_mapping_offset + index * sizeof(*map)); + + /* Firmware bug! */ + if (!map->dest_offset) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n", + node, node->type); + return NULL; + } + + parent = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, map->dest_offset); + + if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE || + node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + *id_out = map->dest_id_base; + return parent; + } + + return NULL; +} + +/* + * RISC-V supports IOMMU as a PCI device or a platform device. + * When it is a platform device, there should be a namespace device as + * well along with RIMT. To create the link between RIMT information and + * the platform device, the IOMMU driver should register itself with the + * RIMT module. This is true for PCI based IOMMU as well. + */ +int rimt_iommu_register(struct device *dev) +{ + struct fwnode_handle *rimt_fwnode; + struct acpi_rimt_node *node; + + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_IOMMU, dev); + if (!node) { + pr_err("Could not find IOMMU node in RIMT\n"); + return -ENODEV; + } + + if (dev_is_pci(dev)) { + rimt_fwnode = acpi_alloc_fwnode_static(); + if (!rimt_fwnode) + return -ENOMEM; + + rimt_fwnode->dev = dev; + if (!dev->fwnode) + dev->fwnode = rimt_fwnode; + + rimt_set_fwnode(node, rimt_fwnode); + } else { + rimt_set_fwnode(node, dev->fwnode); + } + + return 0; +} + +#ifdef CONFIG_IOMMU_API + +static struct acpi_rimt_node *rimt_node_map_id(struct acpi_rimt_node *node, + u32 id_in, u32 *id_out, + u8 type_mask) +{ + struct acpi_rimt_platform_device *plat_node; + u32 id_mapping_offset, num_id_mapping; + struct acpi_rimt_pcie_rc *pci_node; + u32 id = id_in; + + /* Parse the ID mapping tree to find specified node type */ + while (node) { + struct acpi_rimt_id_mapping *map; + int i, rc = 0; + u32 map_id = id; + + if (RIMT_TYPE_MASK(node->type) & type_mask) { + if (id_out) + *id_out = id; + return node; + } + + if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + pci_node = (struct acpi_rimt_pcie_rc *)&node->node_data; + id_mapping_offset = pci_node->id_mapping_offset; + num_id_mapping = pci_node->num_id_mappings; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + plat_node = (struct acpi_rimt_platform_device *)&node->node_data; + id_mapping_offset = plat_node->id_mapping_offset; + num_id_mapping = plat_node->num_id_mappings; + } else { + goto fail_map; + } + + if (!id_mapping_offset || !num_id_mapping) + goto fail_map; + + map = ACPI_ADD_PTR(struct acpi_rimt_id_mapping, node, + id_mapping_offset); + + /* Firmware bug! */ + if (!map->dest_offset) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n", + node, node->type); + goto fail_map; + } + + /* Do the ID translation */ + for (i = 0; i < num_id_mapping; i++, map++) { + rc = rimt_id_map(map, node->type, map_id, &id); + if (!rc) + break; + } + + if (i == num_id_mapping) + goto fail_map; + + node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, + rc ? 0 : map->dest_offset); + } + +fail_map: + /* Map input ID to output ID unchanged on mapping failure */ + if (id_out) + *id_out = id_in; + + return NULL; +} + +static struct acpi_rimt_node *rimt_node_map_platform_id(struct acpi_rimt_node *node, u32 *id_out, + u8 type_mask, int index) +{ + struct acpi_rimt_node *parent; + u32 id; + + parent = rimt_node_get_id(node, &id, index); + if (!parent) + return NULL; + + if (!(RIMT_TYPE_MASK(parent->type) & type_mask)) + parent = rimt_node_map_id(parent, id, id_out, type_mask); + else + if (id_out) + *id_out = id; + + return parent; +} + +static int rimt_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) +{ + struct rimt_pci_alias_info *info = data; + struct acpi_rimt_node *parent; + u32 deviceid; + + parent = rimt_node_map_id(info->node, alias, &deviceid, RIMT_IOMMU_TYPE); + return rimt_iommu_xlate(info->dev, parent, deviceid); +} + +static int rimt_plat_iommu_map(struct device *dev, struct acpi_rimt_node *node) +{ + struct acpi_rimt_node *parent; + int err = -ENODEV, i = 0; + u32 deviceid = 0; + + do { + parent = rimt_node_map_platform_id(node, &deviceid, + RIMT_IOMMU_TYPE, + i++); + + if (parent) + err = rimt_iommu_xlate(dev, parent, deviceid); + } while (parent && !err); + + return err; +} + +static int rimt_plat_iommu_map_id(struct device *dev, + struct acpi_rimt_node *node, + const u32 *in_id) +{ + struct acpi_rimt_node *parent; + u32 deviceid; + + parent = rimt_node_map_id(node, *in_id, &deviceid, RIMT_IOMMU_TYPE); + if (parent) + return rimt_iommu_xlate(dev, parent, deviceid); + + return -ENODEV; +} + +/** + * rimt_iommu_configure_id - Set-up IOMMU configuration for a device. + * + * @dev: device to configure + * @id_in: optional input id const value pointer + * + * Returns: 0 on success, <0 on failure + */ +int rimt_iommu_configure_id(struct device *dev, const u32 *id_in) +{ + struct acpi_rimt_node *node; + int err = -ENODEV; + + if (dev_is_pci(dev)) { + struct iommu_fwspec *fwspec; + struct pci_bus *bus = to_pci_dev(dev)->bus; + struct rimt_pci_alias_info info = { .dev = dev }; + + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX, &bus->dev); + if (!node) + return -ENODEV; + + info.node = node; + err = pci_for_each_dma_alias(to_pci_dev(dev), + rimt_pci_iommu_init, &info); + + fwspec = dev_iommu_fwspec_get(dev); + if (fwspec && rimt_pcie_rc_supports_ats(node)) + fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; + } else { + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_PLAT_DEVICE, dev); + if (!node) + return -ENODEV; + + err = id_in ? rimt_plat_iommu_map_id(dev, node, id_in) : + rimt_plat_iommu_map(dev, node); + } + + return err; +} + +#endif + +void __init riscv_acpi_rimt_init(void) +{ + acpi_status status; + + /* rimt_table will be used at runtime after the rimt init, + * so we don't need to call acpi_put_table() to release + * the RIMT table mapping. + */ + status = acpi_get_table(ACPI_SIG_RIMT, 0, &rimt_table); + if (ACPI_FAILURE(status)) { + if (status != AE_NOT_FOUND) { + const char *msg = acpi_format_exception(status); + + pr_err("Failed to get table, %s\n", msg); + } + + return; + } +} diff --git a/include/linux/acpi_rimt.h b/include/linux/acpi_rimt.h new file mode 100644 index 000000000000..fad3adc4d899 --- /dev/null +++ b/include/linux/acpi_rimt.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024-2025, Ventana Micro Systems Inc. + * Author: Sunil V L + */ + +#ifndef _ACPI_RIMT_H +#define _ACPI_RIMT_H + +#ifdef CONFIG_ACPI_RIMT +int rimt_iommu_register(struct device *dev); +#else +static inline int rimt_iommu_register(struct device *dev) +{ + return -ENODEV; +} +#endif + +#if defined(CONFIG_IOMMU_API) && defined(CONFIG_ACPI_RIMT) +int rimt_iommu_configure_id(struct device *dev, const u32 *id_in); +#else +static inline int rimt_iommu_configure_id(struct device *dev, const u32 *id_in) +{ + return -ENODEV; +} +#endif + +#endif /* _ACPI_RIMT_H */ -- cgit v1.2.3 From c593b9d6c446510684da400833f9d632651942f0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 3 Sep 2025 11:23:33 -0400 Subject: filelock: add FL_RECLAIM to show_fl_flags() macro Show the FL_RECLAIM flag symbolically in tracepoints. Fixes: bb0a55bb7148 ("nfs: don't allow reexport reclaims") Signed-off-by: Jeff Layton Link: https://lore.kernel.org/20250903-filelock-v1-1-f2926902962d@kernel.org Signed-off-by: Christian Brauner --- include/trace/events/filelock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index b8d1e00a7982..2dfeb158e848 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -27,7 +27,8 @@ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ - { FL_OFDLCK, "FL_OFDLCK" }) + { FL_OFDLCK, "FL_OFDLCK" }, \ + { FL_RECLAIM, "FL_RECLAIM"}) #define show_fl_type(val) \ __print_symbolic(val, \ -- cgit v1.2.3 From f6cfa602d2ba7e5ca9dc65ec4141521aca80bda2 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:13:23 +0200 Subject: workqueue: replace use of system_unbound_wq with system_dfl_wq Currently if a user enqueue a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistentcy cannot be addressed without refactoring the API. system_unbound_wq should be the default workqueue so as not to enforce locality constraints for random work whenever it's not required. Adding system_dfl_wq to encourage its use when unbound work should be used. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new unbound wq: whether the user still use the old wq a warn will be printed along with a wq redirect to the new one. The old system_unbound_wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 ++-- kernel/workqueue.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 45d5dd470ff6..af860e8f8481 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -783,8 +783,8 @@ extern void __warn_flushing_systemwide_wq(void) _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ - (__builtin_constant_p(_wq == system_unbound_wq) && \ - _wq == system_unbound_wq) || \ + (__builtin_constant_p(_wq == system_dfl_wq) && \ + _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ _wq == system_freezable_wq) || \ (__builtin_constant_p(_wq == system_power_efficient_wq) && \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 59faf857ee4f..2888f4399acd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2932,7 +2932,7 @@ static void idle_worker_timeout(struct timer_list *t) raw_spin_unlock_irq(&pool->lock); if (do_cull) - queue_work(system_unbound_wq, &pool->idle_cull_work); + queue_work(system_dfl_wq, &pool->idle_cull_work); } /** -- cgit v1.2.3 From a2be943b46b4a7478ea8ddf9bb8e5251c59fceb7 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:13:24 +0200 Subject: workqueue: replace use of system_wq with system_percpu_wq Currently if a user enqueue a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistentcy cannot be addressed without refactoring the API. system_wq is a per-CPU worqueue, yet nothing in its name tells about that CPU affinity constraint, which is very often not required by users. Make it clear by adding a system_percpu_wq. queue_work() / queue_delayed_work() mod_delayed_work() will now use the new per-cpu wq: whether the user still stick on the old name a warn will be printed along a wq redirect to the new one. This patch add the new system_percpu_wq except for mm, fs and net subsystem, whom are handled in separated patches. The old wq will be kept for a few release cylces. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 22 +++++++++++----------- kernel/workqueue.c | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index af860e8f8481..b6834b7aee4b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -434,10 +434,10 @@ enum wq_consts { * short queue flush time. Don't queue works which can run for too * long. * - * system_highpri_wq is similar to system_wq but for work items which + * system_highpri_wq is similar to system_percpu_wq but for work items which * require WQ_HIGHPRI. * - * system_long_wq is similar to system_wq but may host long running + * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * * system_dfl_wq is unbound workqueue. Workers are not bound to @@ -445,13 +445,13 @@ enum wq_consts { * executed immediately as long as max_active limit is not reached and * resources are available. * - * system_freezable_wq is equivalent to system_wq except that it's + * system_freezable_wq is equivalent to system_percpu_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. - * system_power_efficient_wq is identical to system_wq if + * system_power_efficient_wq is identical to system_percpu_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. * * system_bh[_highpri]_wq are convenience interface to softirq. BH work items @@ -708,7 +708,7 @@ static inline bool mod_delayed_work(struct workqueue_struct *wq, */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, system_wq, work); + return queue_work_on(cpu, system_percpu_wq, work); } /** @@ -727,7 +727,7 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work) */ static inline bool schedule_work(struct work_struct *work) { - return queue_work(system_wq, work); + return queue_work(system_percpu_wq, work); } /** @@ -770,15 +770,15 @@ extern void __warn_flushing_systemwide_wq(void) #define flush_scheduled_work() \ ({ \ __warn_flushing_systemwide_wq(); \ - __flush_workqueue(system_wq); \ + __flush_workqueue(system_percpu_wq); \ }) #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ \ - if ((__builtin_constant_p(_wq == system_wq) && \ - _wq == system_wq) || \ + if ((__builtin_constant_p(_wq == system_percpu_wq) && \ + _wq == system_percpu_wq) || \ (__builtin_constant_p(_wq == system_highpri_wq) && \ _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ @@ -807,7 +807,7 @@ extern void __warn_flushing_systemwide_wq(void) static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, system_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_percpu_wq, dwork, delay); } /** @@ -821,7 +821,7 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(system_wq, dwork, delay); + return queue_delayed_work(system_percpu_wq, dwork, delay); } #ifndef CONFIG_SMP diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2888f4399acd..90db8cf015c2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7668,7 +7668,7 @@ static int wq_watchdog_param_set_thresh(const char *val, if (ret) return ret; - if (system_wq) + if (system_percpu_wq) wq_watchdog_set_thresh(thresh); else wq_watchdog_thresh = thresh; -- cgit v1.2.3 From 97248d05b70edc674f2f2fa835fed33172686b1d Mon Sep 17 00:00:00 2001 From: Zihuan Zhang Date: Tue, 2 Sep 2025 15:33:23 +0800 Subject: cpufreq: Drop redundant freq_table parameter Since commit e0b3165ba521 ("cpufreq: add 'freq_table' in struct cpufreq_policy"), freq_table has been stored in struct cpufreq_policy instead of being maintained separately. However, several helpers in freq_table.c still take both policy and freq_table as parameters, even though policy->freq_table can always be used. This leads to redundant function arguments and increases the chance of inconsistencies. This patch removes the unnecessary freq_table argument from these functions and updates their callers to only pass policy. This makes the code simpler, more consistent, and avoids duplication. Signed-off-by: Zihuan Zhang Acked-by: Viresh Kumar Link: https://patch.msgid.link/20250902073323.48330-1-zhangzihuan@kylinos.cn Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 2 +- drivers/cpufreq/freq_table.c | 14 ++++++-------- drivers/cpufreq/sh-cpufreq.c | 6 ++---- drivers/cpufreq/virtual-cpufreq.c | 2 +- include/linux/cpufreq.h | 7 +++---- 5 files changed, 13 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a615c98d80ca..5fcc99f768d2 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2793,7 +2793,7 @@ int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) if (!policy->freq_table) return -ENXIO; - ret = cpufreq_frequency_table_cpuinfo(policy, policy->freq_table); + ret = cpufreq_frequency_table_cpuinfo(policy); if (ret) { pr_err("%s: Policy frequency update failed\n", __func__); return ret; diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index 35de513af6c9..d5111ee56e38 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -28,10 +28,9 @@ static bool policy_has_boost_freq(struct cpufreq_policy *policy) return false; } -int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, - struct cpufreq_frequency_table *table) +int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy) { - struct cpufreq_frequency_table *pos; + struct cpufreq_frequency_table *pos, *table = policy->freq_table; unsigned int min_freq = ~0; unsigned int max_freq = 0; unsigned int freq; @@ -65,10 +64,9 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, return 0; } -int cpufreq_frequency_table_verify(struct cpufreq_policy_data *policy, - struct cpufreq_frequency_table *table) +int cpufreq_frequency_table_verify(struct cpufreq_policy_data *policy) { - struct cpufreq_frequency_table *pos; + struct cpufreq_frequency_table *pos, *table = policy->freq_table; unsigned int freq, prev_smaller = 0; bool found = false; @@ -110,7 +108,7 @@ int cpufreq_generic_frequency_table_verify(struct cpufreq_policy_data *policy) if (!policy->freq_table) return -ENODEV; - return cpufreq_frequency_table_verify(policy, policy->freq_table); + return cpufreq_frequency_table_verify(policy); } EXPORT_SYMBOL_GPL(cpufreq_generic_frequency_table_verify); @@ -354,7 +352,7 @@ int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy) return 0; } - ret = cpufreq_frequency_table_cpuinfo(policy, policy->freq_table); + ret = cpufreq_frequency_table_cpuinfo(policy); if (ret) return ret; diff --git a/drivers/cpufreq/sh-cpufreq.c b/drivers/cpufreq/sh-cpufreq.c index 9c0b01e00508..642ddb9ea217 100644 --- a/drivers/cpufreq/sh-cpufreq.c +++ b/drivers/cpufreq/sh-cpufreq.c @@ -89,11 +89,9 @@ static int sh_cpufreq_target(struct cpufreq_policy *policy, static int sh_cpufreq_verify(struct cpufreq_policy_data *policy) { struct clk *cpuclk = &per_cpu(sh_cpuclk, policy->cpu); - struct cpufreq_frequency_table *freq_table; - freq_table = cpuclk->nr_freqs ? cpuclk->freq_table : NULL; - if (freq_table) - return cpufreq_frequency_table_verify(policy, freq_table); + if (policy->freq_table) + return cpufreq_frequency_table_verify(policy); cpufreq_verify_within_cpu_limits(policy); diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c index 7dd1b0c263c7..6ffa16d239b2 100644 --- a/drivers/cpufreq/virtual-cpufreq.c +++ b/drivers/cpufreq/virtual-cpufreq.c @@ -250,7 +250,7 @@ static int virt_cpufreq_offline(struct cpufreq_policy *policy) static int virt_cpufreq_verify_policy(struct cpufreq_policy_data *policy) { if (policy->freq_table) - return cpufreq_frequency_table_verify(policy, policy->freq_table); + return cpufreq_frequency_table_verify(policy); cpufreq_verify_within_cpu_limits(policy); return 0; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 95f3807c8c55..40966512ea18 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -780,11 +780,10 @@ struct cpufreq_frequency_table { else -int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, - struct cpufreq_frequency_table *table); +int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy); + +int cpufreq_frequency_table_verify(struct cpufreq_policy_data *policy); -int cpufreq_frequency_table_verify(struct cpufreq_policy_data *policy, - struct cpufreq_frequency_table *table); int cpufreq_generic_frequency_table_verify(struct cpufreq_policy_data *policy); int cpufreq_table_index_unsorted(struct cpufreq_policy *policy, -- cgit v1.2.3 From 9d68320b2bca876278856fdc1e8684a7494dd069 Mon Sep 17 00:00:00 2001 From: Huisong Li Date: Fri, 5 Sep 2025 16:18:58 +0800 Subject: ACPI: processor: idle: Fix function defined but not used warning If CONFIG_ACPI_PROCESSOR_IDLE=n, acpi_processor_register_idle_driver() and acpi_processor_unregister_idle_driver() are never used and the empty stubs of them are not needed. Moreover, they cause the compiler to complain [1], so remove them. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508300519.tZQHY6HA-lkp@intel.com/ [1] Fixes: 7a8c994cbb2d ("ACPI: processor: idle: Optimize ACPI idle driver registration") Signed-off-by: Huisong Li Link: https://patch.msgid.link/20250905081900.663869-2-lihuisong@huawei.com [ rjw: Changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- include/acpi/processor.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 360b673f05e5..ff864c1cee3a 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -445,12 +445,6 @@ static inline int acpi_processor_hotplug(struct acpi_processor *pr) { return -ENODEV; } -static inline void acpi_processor_register_idle_driver(void) -{ -} -static inline void acpi_processor_unregister_idle_driver(void) -{ -} #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ /* in processor_thermal.c */ -- cgit v1.2.3 From 437054b1bbe11be87ab0a522b8ccbae3f785c642 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 15 Aug 2025 12:41:10 +0200 Subject: vdso: Add struct __kernel_old_timeval forward declaration to gettime.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prototype of __vdso_gettimeofday() uses this struct. However gettime.h's own includes do not provide a definition for it. Add a forward declaration, similar to other used structs. Fixes: 42874e4eb35b ("arch: vdso: consolidate gettime prototypes") Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250815-vdso-sparc64-generic-2-v2-1-b5ff80672347@linutronix.de --- include/vdso/gettime.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/vdso/gettime.h b/include/vdso/gettime.h index c50d152e7b3e..9ac161866653 100644 --- a/include/vdso/gettime.h +++ b/include/vdso/gettime.h @@ -5,6 +5,7 @@ #include struct __kernel_timespec; +struct __kernel_old_timeval; struct timezone; #if !defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) -- cgit v1.2.3 From 1544344563376b2a2ae2af5af1db00d6410c18e0 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 29 Aug 2025 15:28:44 +0800 Subject: rhashtable: Use __always_inline instead of inline Sometimes, the compiler is not clever enough to inline the rhashtable_lookup() for us, even if the "obj_cmpfn" and "key_len" in params is const. This can introduce more overhead. Therefore, use __always_inline for the rhashtable. Signed-off-by: Menglong Dong Signed-off-by: Herbert Xu --- include/linux/rhashtable.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 6c85b28ea30b..e740157f3cd7 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -122,7 +122,7 @@ static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, return hash & (tbl->size - 1); } -static inline unsigned int rht_key_get_hash(struct rhashtable *ht, +static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht, const void *key, const struct rhashtable_params params, unsigned int hash_rnd) { @@ -152,7 +152,7 @@ static inline unsigned int rht_key_get_hash(struct rhashtable *ht, return hash; } -static inline unsigned int rht_key_hashfn( +static __always_inline unsigned int rht_key_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const void *key, const struct rhashtable_params params) { @@ -161,7 +161,7 @@ static inline unsigned int rht_key_hashfn( return rht_bucket_index(tbl, hash); } -static inline unsigned int rht_head_hashfn( +static __always_inline unsigned int rht_head_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he, const struct rhashtable_params params) { @@ -586,7 +586,7 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, } /* Internal function, do not use. */ -static inline struct rhash_head *__rhashtable_lookup( +static __always_inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -639,7 +639,7 @@ restart: * * Returns the first entry on which the compare function returned true. */ -static inline void *rhashtable_lookup( +static __always_inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -662,7 +662,7 @@ static inline void *rhashtable_lookup( * * Returns the first entry on which the compare function returned true. */ -static inline void *rhashtable_lookup_fast( +static __always_inline void *rhashtable_lookup_fast( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -689,7 +689,7 @@ static inline void *rhashtable_lookup_fast( * * Returns the list of entries that match the given key. */ -static inline struct rhlist_head *rhltable_lookup( +static __always_inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { @@ -702,7 +702,7 @@ static inline struct rhlist_head *rhltable_lookup( * function returns the existing element already in hashes if there is a clash, * otherwise it returns an error via ERR_PTR(). */ -static inline void *__rhashtable_insert_fast( +static __always_inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { @@ -825,7 +825,7 @@ out_unlock: * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ -static inline int rhashtable_insert_fast( +static __always_inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { @@ -854,7 +854,7 @@ static inline int rhashtable_insert_fast( * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ -static inline int rhltable_insert_key( +static __always_inline int rhltable_insert_key( struct rhltable *hlt, const void *key, struct rhlist_head *list, const struct rhashtable_params params) { @@ -877,7 +877,7 @@ static inline int rhltable_insert_key( * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ -static inline int rhltable_insert( +static __always_inline int rhltable_insert( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { @@ -902,7 +902,7 @@ static inline int rhltable_insert( * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ -static inline int rhashtable_lookup_insert_fast( +static __always_inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { @@ -929,7 +929,7 @@ static inline int rhashtable_lookup_insert_fast( * object if it exists, NULL if it did not and the insertion was successful, * and an ERR_PTR otherwise. */ -static inline void *rhashtable_lookup_get_insert_fast( +static __always_inline void *rhashtable_lookup_get_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { @@ -956,7 +956,7 @@ static inline void *rhashtable_lookup_get_insert_fast( * * Returns zero on success. */ -static inline int rhashtable_lookup_insert_key( +static __always_inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { @@ -982,7 +982,7 @@ static inline int rhashtable_lookup_insert_key( * object if it exists, NULL if it does not and the insertion was successful, * and an ERR_PTR otherwise. */ -static inline void *rhashtable_lookup_get_insert_key( +static __always_inline void *rhashtable_lookup_get_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { @@ -992,7 +992,7 @@ static inline void *rhashtable_lookup_get_insert_key( } /* Internal function, please use rhashtable_remove_fast() instead */ -static inline int __rhashtable_remove_fast_one( +static __always_inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) @@ -1074,7 +1074,7 @@ unlocked: } /* Internal function, please use rhashtable_remove_fast() instead */ -static inline int __rhashtable_remove_fast( +static __always_inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { @@ -1115,7 +1115,7 @@ static inline int __rhashtable_remove_fast( * * Returns zero on success, -ENOENT if the entry could not be found. */ -static inline int rhashtable_remove_fast( +static __always_inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { @@ -1137,7 +1137,7 @@ static inline int rhashtable_remove_fast( * * Returns zero on success, -ENOENT if the entry could not be found. */ -static inline int rhltable_remove( +static __always_inline int rhltable_remove( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { @@ -1145,7 +1145,7 @@ static inline int rhltable_remove( } /* Internal function, please use rhashtable_replace_fast() instead */ -static inline int __rhashtable_replace_fast( +static __always_inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) @@ -1208,7 +1208,7 @@ unlocked: * Returns zero on success, -ENOENT if the entry could not be found, * -EINVAL if hash is not the same for the old and new objects. */ -static inline int rhashtable_replace_fast( +static __always_inline int rhashtable_replace_fast( struct rhashtable *ht, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) -- cgit v1.2.3 From 886d6981208263b55a1eb8b39c5d00db1544b9bb Mon Sep 17 00:00:00 2001 From: Zhushuai Yin Date: Sat, 30 Aug 2025 18:27:57 +0800 Subject: crypto: hisilicon/zip - add hashjoin, gather, and UDMA data move features The new version of the hisilicon zip driver supports the hash join and gather features, as well as the data move feature (UDMA), including data copying and memory initialization functions.These features are registered to the uacce subsystem. Signed-off-by: Zhushuai Yin Signed-off-by: Chenghai Huang Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 29 ++++++++++++++++++++++------- drivers/crypto/hisilicon/zip/dae_main.c | 11 +++++++++-- include/linux/hisi_acc_qm.h | 1 + include/uapi/misc/uacce/hisi_qm.h | 1 + 4 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 102aff9ea19a..98099117bb38 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -2742,6 +2742,27 @@ static void qm_remove_uacce(struct hisi_qm *qm) } } +static void qm_uacce_api_ver_init(struct hisi_qm *qm) +{ + struct uacce_device *uacce = qm->uacce; + + switch (qm->ver) { + case QM_HW_V1: + uacce->api_ver = HISI_QM_API_VER_BASE; + break; + case QM_HW_V2: + uacce->api_ver = HISI_QM_API_VER2_BASE; + break; + case QM_HW_V3: + case QM_HW_V4: + uacce->api_ver = HISI_QM_API_VER3_BASE; + break; + default: + uacce->api_ver = HISI_QM_API_VER5_BASE; + break; + } +} + static int qm_alloc_uacce(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; @@ -2775,13 +2796,6 @@ static int qm_alloc_uacce(struct hisi_qm *qm) uacce->is_vf = pdev->is_virtfn; uacce->priv = qm; - if (qm->ver == QM_HW_V1) - uacce->api_ver = HISI_QM_API_VER_BASE; - else if (qm->ver == QM_HW_V2) - uacce->api_ver = HISI_QM_API_VER2_BASE; - else - uacce->api_ver = HISI_QM_API_VER3_BASE; - if (qm->ver == QM_HW_V1) mmio_page_nr = QM_DOORBELL_PAGE_NR; else if (!test_bit(QM_SUPPORT_DB_ISOLATION, &qm->caps)) @@ -2801,6 +2815,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm) uacce->qf_pg_num[UACCE_QFRT_DUS] = dus_page_nr; qm->uacce = uacce; + qm_uacce_api_ver_init(qm); INIT_LIST_HEAD(&qm->isolate_data.qm_hw_errs); mutex_init(&qm->isolate_data.isolate_lock); diff --git a/drivers/crypto/hisilicon/zip/dae_main.c b/drivers/crypto/hisilicon/zip/dae_main.c index 6f22e4c36e49..4c5481c77436 100644 --- a/drivers/crypto/hisilicon/zip/dae_main.c +++ b/drivers/crypto/hisilicon/zip/dae_main.c @@ -15,6 +15,7 @@ #define DAE_REG_RD_TMOUT_US USEC_PER_SEC #define DAE_ALG_NAME "hashagg" +#define DAE_V5_ALG_NAME "hashagg\nudma\nhashjoin\ngather" /* error */ #define DAE_AXI_CFG_OFFSET 0x331000 @@ -82,6 +83,7 @@ int hisi_dae_set_user_domain(struct hisi_qm *qm) int hisi_dae_set_alg(struct hisi_qm *qm) { + const char *alg_name; size_t len; if (!dae_is_support(qm)) @@ -90,9 +92,14 @@ int hisi_dae_set_alg(struct hisi_qm *qm) if (!qm->uacce) return 0; + if (qm->ver >= QM_HW_V5) + alg_name = DAE_V5_ALG_NAME; + else + alg_name = DAE_ALG_NAME; + len = strlen(qm->uacce->algs); /* A line break may be required */ - if (len + strlen(DAE_ALG_NAME) + 1 >= QM_DEV_ALG_MAX_LEN) { + if (len + strlen(alg_name) + 1 >= QM_DEV_ALG_MAX_LEN) { pci_err(qm->pdev, "algorithm name is too long!\n"); return -EINVAL; } @@ -100,7 +107,7 @@ int hisi_dae_set_alg(struct hisi_qm *qm) if (len) strcat((char *)qm->uacce->algs, "\n"); - strcat((char *)qm->uacce->algs, DAE_ALG_NAME); + strcat((char *)qm->uacce->algs, alg_name); return 0; } diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 0c4c84b8c3be..f2254ddc327c 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -125,6 +125,7 @@ enum qm_hw_ver { QM_HW_V2 = 0x21, QM_HW_V3 = 0x30, QM_HW_V4 = 0x50, + QM_HW_V5 = 0x51, }; enum qm_fun_type { diff --git a/include/uapi/misc/uacce/hisi_qm.h b/include/uapi/misc/uacce/hisi_qm.h index 3e66dbc2f323..10504b48eabf 100644 --- a/include/uapi/misc/uacce/hisi_qm.h +++ b/include/uapi/misc/uacce/hisi_qm.h @@ -31,6 +31,7 @@ struct hisi_qp_info { #define HISI_QM_API_VER_BASE "hisi_qm_v1" #define HISI_QM_API_VER2_BASE "hisi_qm_v2" #define HISI_QM_API_VER3_BASE "hisi_qm_v3" +#define HISI_QM_API_VER5_BASE "hisi_qm_v5" /* UACCE_CMD_QM_SET_QP_CTX: Set qp algorithm type */ #define UACCE_CMD_QM_SET_QP_CTX _IOWR('H', 10, struct hisi_qp_ctx) -- cgit v1.2.3 From c2ce2453413d429e302659abc5ace634e873f6f5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Aug 2025 12:59:24 +0200 Subject: driver core/PM: Set power.no_callbacks along with power.no_pm Devices with power.no_pm set are not expected to need any power management at all, so modify device_set_pm_not_required() to set power.no_callbacks for them too in case runtime PM will be enabled for any of them (which in principle may be done for convenience if such a device participates in a dependency chain). Since device_set_pm_not_required() must be called before device_add() or it would not have any effect, it can update power.no_callbacks without locking, unlike pm_runtime_no_callbacks() that can be called after registering the target device. Signed-off-by: Rafael J. Wysocki Cc: stable Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/1950054.tdWV9SEqCh@rafael.j.wysocki Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 0470d19da7f2..b031ff71a5bd 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -851,6 +851,9 @@ static inline bool device_pm_not_required(struct device *dev) static inline void device_set_pm_not_required(struct device *dev) { dev->power.no_pm = true; +#ifdef CONFIG_PM + dev->power.no_callbacks = true; +#endif } static inline void dev_pm_syscore_device(struct device *dev, bool val) -- cgit v1.2.3 From 20f988320d2718ef28b1f0635acc88c12a216d29 Mon Sep 17 00:00:00 2001 From: "Rai, Amardeep" Date: Wed, 20 Aug 2025 17:38:19 +0300 Subject: usb: core: Add a function to get USB version independent periodic payload Add usb_endpoint_max_periodic_payload() to obtain maximum payload bytes in a service interval for isochronous and interrupt endpoints in a USB version independent way. Signed-off-by: Rai, Amardeep Signed-off-by: Mathias Nyman Co-developed-by: Sakari Ailus Signed-off-by: Sakari Ailus Reviewed-by: Hans de Goede Acked-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250820143824.551777-5-sakari.ailus@linux.intel.com --- drivers/usb/core/usb.c | 29 +++++++++++++++++++++++++++++ include/linux/usb.h | 3 +++ 2 files changed, 32 insertions(+) (limited to 'include') diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index fca7735fc660..ca9ff6ad8e73 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -1110,6 +1110,35 @@ void usb_free_noncoherent(struct usb_device *dev, size_t size, } EXPORT_SYMBOL_GPL(usb_free_noncoherent); +/** + * usb_endpoint_max_periodic_payload - Get maximum payload bytes per service + * interval + * @udev: The USB device + * @ep: The endpoint + * + * Returns: the maximum number of bytes isochronous or interrupt endpoint @ep + * can transfer during a service interval, or 0 for other endpoints. + */ +u32 usb_endpoint_max_periodic_payload(struct usb_device *udev, + const struct usb_host_endpoint *ep) +{ + if (!usb_endpoint_xfer_isoc(&ep->desc) && + !usb_endpoint_xfer_int(&ep->desc)) + return 0; + + switch (udev->speed) { + case USB_SPEED_SUPER_PLUS: + if (USB_SS_SSP_ISOC_COMP(ep->ss_ep_comp.bmAttributes)) + return le32_to_cpu(ep->ssp_isoc_ep_comp.dwBytesPerInterval); + fallthrough; + case USB_SPEED_SUPER: + return le16_to_cpu(ep->ss_ep_comp.wBytesPerInterval); + default: + return usb_endpoint_maxp(&ep->desc) * usb_endpoint_maxp_mult(&ep->desc); + } +} +EXPORT_SYMBOL_GPL(usb_endpoint_max_periodic_payload); + /* * Notifications of device and interface registration */ diff --git a/include/linux/usb.h b/include/linux/usb.h index 9d662c6abb4d..e9cf2786d8bd 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -2039,6 +2039,9 @@ static inline u16 usb_maxpacket(struct usb_device *udev, int pipe) return usb_endpoint_maxp(&ep->desc); } +u32 usb_endpoint_max_periodic_payload(struct usb_device *udev, + const struct usb_host_endpoint *ep); + /* translate USB error codes to codes user space understands */ static inline int usb_translate_errors(int error_code) { -- cgit v1.2.3 From d6725169a9bbcb5bd1dd14b2891b874614c59f52 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 20 Aug 2025 17:38:21 +0300 Subject: usb: core: Introduce usb_endpoint_is_hs_isoc_double() Introduce usb_endpoint_is_hs_isoc_double() tell whether an endpoint conforms to USB 2.0 Isochronous Double IN Bandwidth ECN. Signed-off-by: Sakari Ailus Acked-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250820143824.551777-7-sakari.ailus@linux.intel.com --- drivers/usb/core/usb.c | 19 +++++++++++++++++++ include/linux/usb.h | 3 +++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index ca9ff6ad8e73..939dc4aafb89 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -1139,6 +1139,25 @@ u32 usb_endpoint_max_periodic_payload(struct usb_device *udev, } EXPORT_SYMBOL_GPL(usb_endpoint_max_periodic_payload); +/** + * usb_endpoint_is_hs_isoc_double - Tell whether an endpoint uses USB 2 + * Isochronous Double IN Bandwidth + * @udev: The USB device + * @ep: The endpoint + * + * Returns: true if an endpoint @ep conforms to USB 2 Isochronous Double IN + * Bandwidth ECN, false otherwise. + */ +bool usb_endpoint_is_hs_isoc_double(struct usb_device *udev, + const struct usb_host_endpoint *ep) +{ + return ep->eusb2_isoc_ep_comp.bDescriptorType && + le16_to_cpu(udev->descriptor.bcdUSB) == 0x220 && + usb_endpoint_is_isoc_in(&ep->desc) && + !le16_to_cpu(ep->desc.wMaxPacketSize); +} +EXPORT_SYMBOL_GPL(usb_endpoint_is_hs_isoc_double); + /* * Notifications of device and interface registration */ diff --git a/include/linux/usb.h b/include/linux/usb.h index e9cf2786d8bd..70ef00c42d22 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -2042,6 +2042,9 @@ static inline u16 usb_maxpacket(struct usb_device *udev, int pipe) u32 usb_endpoint_max_periodic_payload(struct usb_device *udev, const struct usb_host_endpoint *ep); +bool usb_endpoint_is_hs_isoc_double(struct usb_device *udev, + const struct usb_host_endpoint *ep); + /* translate USB error codes to codes user space understands */ static inline int usb_translate_errors(int error_code) { -- cgit v1.2.3 From 23743ba64709a9c137c1b928f8b8e00d846af9cc Mon Sep 17 00:00:00 2001 From: Calixte Pernot Date: Mon, 25 Aug 2025 14:56:09 +0200 Subject: vt: add support for smput/rmput escape codes Support "\e[?1049h" and "\e[?1049l" escape codes. This patch allows programs to enter and leave alternate screens. This feature is widely available in graphical terminal emulators and mostly used by fullscreen terminal-based user interfaces such as text editors. Most editors such as vim and nano assume this escape code in not supported and will not try to print the escape sequence if TERM=linux. To try out this patch, run `TERM=xterm-256color vim` inside a VT. Signed-off-by: Calixte Pernot Link: https://lore.kernel.org/r/20250825125607.2478-3-calixte.pernot@grenoble-inp.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 58 ++++++++++++++++++++++++++++++++++++++++++ include/linux/console_struct.h | 3 +++ 2 files changed, 61 insertions(+) (limited to 'include') diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 869261141535..3f80e98e5c51 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -141,6 +141,7 @@ static const struct consw *con_driver_map[MAX_NR_CONSOLES]; static int con_open(struct tty_struct *, struct file *); static void vc_init(struct vc_data *vc, int do_clear); static void gotoxy(struct vc_data *vc, int new_x, int new_y); +static void restore_cur(struct vc_data *vc); static void save_cur(struct vc_data *vc); static void reset_terminal(struct vc_data *vc, int do_clear); static void con_flush_chars(struct tty_struct *tty); @@ -1341,6 +1342,10 @@ struct vc_data *vc_deallocate(unsigned int currcons) kfree(vc->vc_screenbuf); vc_cons[currcons].d = NULL; } + if (vc->vc_saved_screen != NULL) { + kfree(vc->vc_saved_screen); + vc->vc_saved_screen = NULL; + } return vc; } @@ -1875,6 +1880,45 @@ static int get_bracketed_paste(struct tty_struct *tty) return vc->vc_bracketed_paste; } +/* console_lock is held */ +static void enter_alt_screen(struct vc_data *vc) +{ + unsigned int size = vc->vc_rows * vc->vc_cols * 2; + + if (vc->vc_saved_screen != NULL) + return; /* Already inside an alt-screen */ + vc->vc_saved_screen = kmemdup((u16 *)vc->vc_origin, size, GFP_KERNEL); + if (vc->vc_saved_screen == NULL) + return; + vc->vc_saved_rows = vc->vc_rows; + vc->vc_saved_cols = vc->vc_cols; + save_cur(vc); + /* clear entire screen */ + csi_J(vc, CSI_J_FULL); +} + +/* console_lock is held */ +static void leave_alt_screen(struct vc_data *vc) +{ + unsigned int rows = min(vc->vc_saved_rows, vc->vc_rows); + unsigned int cols = min(vc->vc_saved_cols, vc->vc_cols); + u16 *src, *dest; + + if (vc->vc_saved_screen == NULL) + return; /* Not inside an alt-screen */ + for (unsigned int r = 0; r < rows; r++) { + src = vc->vc_saved_screen + r * vc->vc_saved_cols; + dest = ((u16 *)vc->vc_origin) + r * vc->vc_cols; + memcpy(dest, src, 2 * cols); + } + restore_cur(vc); + /* Update the entire screen */ + if (con_should_update(vc)) + do_update_region(vc, vc->vc_origin, vc->vc_screenbuf_size / 2); + kfree(vc->vc_saved_screen); + vc->vc_saved_screen = NULL; +} + enum { CSI_DEC_hl_CURSOR_KEYS = 1, /* CKM: cursor keys send ^[Ox/^[[x */ CSI_DEC_hl_132_COLUMNS = 3, /* COLM: 80/132 mode switch */ @@ -1885,6 +1929,7 @@ enum { CSI_DEC_hl_MOUSE_X10 = 9, CSI_DEC_hl_SHOW_CURSOR = 25, /* TCEM */ CSI_DEC_hl_MOUSE_VT200 = 1000, + CSI_DEC_hl_ALT_SCREEN = 1049, CSI_DEC_hl_BRACKETED_PASTE = 2004, }; @@ -1941,6 +1986,12 @@ static void csi_DEC_hl(struct vc_data *vc, bool on_off) case CSI_DEC_hl_BRACKETED_PASTE: vc->vc_bracketed_paste = on_off; break; + case CSI_DEC_hl_ALT_SCREEN: + if (on_off) + enter_alt_screen(vc); + else + leave_alt_screen(vc); + break; } } @@ -2179,6 +2230,13 @@ static void reset_terminal(struct vc_data *vc, int do_clear) vc->vc_deccm = global_cursor_default; vc->vc_decim = 0; + if (vc->vc_saved_screen != NULL) { + kfree(vc->vc_saved_screen); + vc->vc_saved_screen = NULL; + vc->vc_saved_rows = 0; + vc->vc_saved_cols = 0; + } + vt_reset_keyboard(vc->vc_num); vc->vc_cursor_type = cur_default; diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 59b4fec5f254..13b35637bd5a 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -159,6 +159,9 @@ struct vc_data { struct uni_pagedict *uni_pagedict; struct uni_pagedict **uni_pagedict_loc; /* [!] Location of uni_pagedict variable for this console */ u32 **vc_uni_lines; /* unicode screen content */ + u16 *vc_saved_screen; + unsigned int vc_saved_cols; + unsigned int vc_saved_rows; /* additional information is in vt_kern.h */ }; -- cgit v1.2.3 From afc4e4a5f122183b38095daba2264123cc86d8ab Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:18 -0700 Subject: lib/crypto: tests: Migrate Curve25519 self-test to KUnit Move the Curve25519 test from an ad-hoc self-test to a KUnit test. Generally keep the same test logic for now, just translated to KUnit. There's one exception, which is that I dropped the incomplete test of curve25519_generic(). The approach I'm taking to cover the different implementations with the KUnit tests is to just rely on booting kernels in QEMU with different '-cpu' options, rather than try to make the tests (incompletely) test multiple implementations on one CPU. This way, both the test and the library API are simpler. This commit makes the file lib/crypto/curve25519.c no longer needed, as its only purpose was to call the self-test. However, keep it for now, since a later commit will add code to it again. Temporarily omit the default value of CRYPTO_SELFTESTS that the other lib/crypto/ KUnit tests have. It would cause a recursive kconfig dependency, since the Curve25519 code is still entangled with CRYPTO. A later commit will fix that. Link: https://lore.kernel.org/r/20250906213523.84915-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/curve25519.h | 2 - lib/crypto/Makefile | 1 - lib/crypto/curve25519-selftest.c | 1321 ---------------------------------- lib/crypto/curve25519.c | 3 - lib/crypto/tests/Kconfig | 9 + lib/crypto/tests/Makefile | 1 + lib/crypto/tests/curve25519_kunit.c | 1332 +++++++++++++++++++++++++++++++++++ 7 files changed, 1342 insertions(+), 1327 deletions(-) delete mode 100644 lib/crypto/curve25519-selftest.c create mode 100644 lib/crypto/tests/curve25519_kunit.c (limited to 'include') diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h index ece6a9b5fafc..4e6dc840b159 100644 --- a/include/crypto/curve25519.h +++ b/include/crypto/curve25519.h @@ -28,8 +28,6 @@ void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE]); -bool curve25519_selftest(void); - static inline bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index ad27c5bf99e1..6c3be971ace0 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -87,7 +87,6 @@ endif obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o libcurve25519-y += curve25519.o -libcurve25519-$(CONFIG_CRYPTO_SELFTESTS) += curve25519-selftest.o obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o libdes-y := des.o diff --git a/lib/crypto/curve25519-selftest.c b/lib/crypto/curve25519-selftest.c deleted file mode 100644 index c85e85381e78..000000000000 --- a/lib/crypto/curve25519-selftest.c +++ /dev/null @@ -1,1321 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - */ - -#include - -struct curve25519_test_vector { - u8 private[CURVE25519_KEY_SIZE]; - u8 public[CURVE25519_KEY_SIZE]; - u8 result[CURVE25519_KEY_SIZE]; - bool valid; -}; -static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = { - { - .private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, - 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45, - 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, - 0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a }, - .public = { 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4, - 0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37, - 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d, - 0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f }, - .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, - 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, - 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, - 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, - .valid = true - }, - { - .private = { 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, - 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6, - 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, - 0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb }, - .public = { 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54, - 0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a, - 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4, - 0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a }, - .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, - 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, - 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, - 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, - .valid = true - }, - { - .private = { 1 }, - .public = { 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .result = { 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64, - 0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d, - 0x0b, 0x95, 0x48, 0xdc, 0x0c, 0xd8, 0x19, 0x98, - 0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f }, - .valid = true - }, - { - .private = { 1 }, - .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f, - 0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d, - 0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x08, 0xed, 0xe3, - 0x0b, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 }, - .valid = true - }, - { - .private = { 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, - 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, - 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, - 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 }, - .public = { 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, - 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, - 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, - 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, - .result = { 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, - 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, - 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, - 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 }, - .valid = true - }, - { - .private = { 1, 2, 3, 4 }, - .public = { 0 }, - .result = { 0 }, - .valid = false - }, - { - .private = { 2, 4, 6, 8 }, - .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, - 0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, - 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd, - 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8 }, - .result = { 0 }, - .valid = false - }, - { - .private = { 0xff, 0xff, 0xff, 0xff, 0x0a, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x0a, 0x00, 0xfb, 0x9f }, - .result = { 0x77, 0x52, 0xb6, 0x18, 0xc1, 0x2d, 0x48, 0xd2, - 0xc6, 0x93, 0x46, 0x83, 0x81, 0x7c, 0xc6, 0x57, - 0xf3, 0x31, 0x03, 0x19, 0x49, 0x48, 0x20, 0x05, - 0x42, 0x2b, 0x4e, 0xae, 0x8d, 0x1d, 0x43, 0x23 }, - .valid = true - }, - { - .private = { 0x8e, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .public = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x06 }, - .result = { 0x5a, 0xdf, 0xaa, 0x25, 0x86, 0x8e, 0x32, 0x3d, - 0xae, 0x49, 0x62, 0xc1, 0x01, 0x5c, 0xb3, 0x12, - 0xe1, 0xc5, 0xc7, 0x9e, 0x95, 0x3f, 0x03, 0x99, - 0xb0, 0xba, 0x16, 0x22, 0xf3, 0xb6, 0xf7, 0x0c }, - .valid = true - }, - /* wycheproof - normal case */ - { - .private = { 0x48, 0x52, 0x83, 0x4d, 0x9d, 0x6b, 0x77, 0xda, - 0xde, 0xab, 0xaa, 0xf2, 0xe1, 0x1d, 0xca, 0x66, - 0xd1, 0x9f, 0xe7, 0x49, 0x93, 0xa7, 0xbe, 0xc3, - 0x6c, 0x6e, 0x16, 0xa0, 0x98, 0x3f, 0xea, 0xba }, - .public = { 0x9c, 0x64, 0x7d, 0x9a, 0xe5, 0x89, 0xb9, 0xf5, - 0x8f, 0xdc, 0x3c, 0xa4, 0x94, 0x7e, 0xfb, 0xc9, - 0x15, 0xc4, 0xb2, 0xe0, 0x8e, 0x74, 0x4a, 0x0e, - 0xdf, 0x46, 0x9d, 0xac, 0x59, 0xc8, 0xf8, 0x5a }, - .result = { 0x87, 0xb7, 0xf2, 0x12, 0xb6, 0x27, 0xf7, 0xa5, - 0x4c, 0xa5, 0xe0, 0xbc, 0xda, 0xdd, 0xd5, 0x38, - 0x9d, 0x9d, 0xe6, 0x15, 0x6c, 0xdb, 0xcf, 0x8e, - 0xbe, 0x14, 0xff, 0xbc, 0xfb, 0x43, 0x65, 0x51 }, - .valid = true - }, - /* wycheproof - public key on twist */ - { - .private = { 0x58, 0x8c, 0x06, 0x1a, 0x50, 0x80, 0x4a, 0xc4, - 0x88, 0xad, 0x77, 0x4a, 0xc7, 0x16, 0xc3, 0xf5, - 0xba, 0x71, 0x4b, 0x27, 0x12, 0xe0, 0x48, 0x49, - 0x13, 0x79, 0xa5, 0x00, 0x21, 0x19, 0x98, 0xa8 }, - .public = { 0x63, 0xaa, 0x40, 0xc6, 0xe3, 0x83, 0x46, 0xc5, - 0xca, 0xf2, 0x3a, 0x6d, 0xf0, 0xa5, 0xe6, 0xc8, - 0x08, 0x89, 0xa0, 0x86, 0x47, 0xe5, 0x51, 0xb3, - 0x56, 0x34, 0x49, 0xbe, 0xfc, 0xfc, 0x97, 0x33 }, - .result = { 0xb1, 0xa7, 0x07, 0x51, 0x94, 0x95, 0xff, 0xff, - 0xb2, 0x98, 0xff, 0x94, 0x17, 0x16, 0xb0, 0x6d, - 0xfa, 0xb8, 0x7c, 0xf8, 0xd9, 0x11, 0x23, 0xfe, - 0x2b, 0xe9, 0xa2, 0x33, 0xdd, 0xa2, 0x22, 0x12 }, - .valid = true - }, - /* wycheproof - public key on twist */ - { - .private = { 0xb0, 0x5b, 0xfd, 0x32, 0xe5, 0x53, 0x25, 0xd9, - 0xfd, 0x64, 0x8c, 0xb3, 0x02, 0x84, 0x80, 0x39, - 0x00, 0x0b, 0x39, 0x0e, 0x44, 0xd5, 0x21, 0xe5, - 0x8a, 0xab, 0x3b, 0x29, 0xa6, 0x96, 0x0b, 0xa8 }, - .public = { 0x0f, 0x83, 0xc3, 0x6f, 0xde, 0xd9, 0xd3, 0x2f, - 0xad, 0xf4, 0xef, 0xa3, 0xae, 0x93, 0xa9, 0x0b, - 0xb5, 0xcf, 0xa6, 0x68, 0x93, 0xbc, 0x41, 0x2c, - 0x43, 0xfa, 0x72, 0x87, 0xdb, 0xb9, 0x97, 0x79 }, - .result = { 0x67, 0xdd, 0x4a, 0x6e, 0x16, 0x55, 0x33, 0x53, - 0x4c, 0x0e, 0x3f, 0x17, 0x2e, 0x4a, 0xb8, 0x57, - 0x6b, 0xca, 0x92, 0x3a, 0x5f, 0x07, 0xb2, 0xc0, - 0x69, 0xb4, 0xc3, 0x10, 0xff, 0x2e, 0x93, 0x5b }, - .valid = true - }, - /* wycheproof - public key on twist */ - { - .private = { 0x70, 0xe3, 0x4b, 0xcb, 0xe1, 0xf4, 0x7f, 0xbc, - 0x0f, 0xdd, 0xfd, 0x7c, 0x1e, 0x1a, 0xa5, 0x3d, - 0x57, 0xbf, 0xe0, 0xf6, 0x6d, 0x24, 0x30, 0x67, - 0xb4, 0x24, 0xbb, 0x62, 0x10, 0xbe, 0xd1, 0x9c }, - .public = { 0x0b, 0x82, 0x11, 0xa2, 0xb6, 0x04, 0x90, 0x97, - 0xf6, 0x87, 0x1c, 0x6c, 0x05, 0x2d, 0x3c, 0x5f, - 0xc1, 0xba, 0x17, 0xda, 0x9e, 0x32, 0xae, 0x45, - 0x84, 0x03, 0xb0, 0x5b, 0xb2, 0x83, 0x09, 0x2a }, - .result = { 0x4a, 0x06, 0x38, 0xcf, 0xaa, 0x9e, 0xf1, 0x93, - 0x3b, 0x47, 0xf8, 0x93, 0x92, 0x96, 0xa6, 0xb2, - 0x5b, 0xe5, 0x41, 0xef, 0x7f, 0x70, 0xe8, 0x44, - 0xc0, 0xbc, 0xc0, 0x0b, 0x13, 0x4d, 0xe6, 0x4a }, - .valid = true - }, - /* wycheproof - public key on twist */ - { - .private = { 0x68, 0xc1, 0xf3, 0xa6, 0x53, 0xa4, 0xcd, 0xb1, - 0xd3, 0x7b, 0xba, 0x94, 0x73, 0x8f, 0x8b, 0x95, - 0x7a, 0x57, 0xbe, 0xb2, 0x4d, 0x64, 0x6e, 0x99, - 0x4d, 0xc2, 0x9a, 0x27, 0x6a, 0xad, 0x45, 0x8d }, - .public = { 0x34, 0x3a, 0xc2, 0x0a, 0x3b, 0x9c, 0x6a, 0x27, - 0xb1, 0x00, 0x81, 0x76, 0x50, 0x9a, 0xd3, 0x07, - 0x35, 0x85, 0x6e, 0xc1, 0xc8, 0xd8, 0xfc, 0xae, - 0x13, 0x91, 0x2d, 0x08, 0xd1, 0x52, 0xf4, 0x6c }, - .result = { 0x39, 0x94, 0x91, 0xfc, 0xe8, 0xdf, 0xab, 0x73, - 0xb4, 0xf9, 0xf6, 0x11, 0xde, 0x8e, 0xa0, 0xb2, - 0x7b, 0x28, 0xf8, 0x59, 0x94, 0x25, 0x0b, 0x0f, - 0x47, 0x5d, 0x58, 0x5d, 0x04, 0x2a, 0xc2, 0x07 }, - .valid = true - }, - /* wycheproof - public key on twist */ - { - .private = { 0xd8, 0x77, 0xb2, 0x6d, 0x06, 0xdf, 0xf9, 0xd9, - 0xf7, 0xfd, 0x4c, 0x5b, 0x37, 0x69, 0xf8, 0xcd, - 0xd5, 0xb3, 0x05, 0x16, 0xa5, 0xab, 0x80, 0x6b, - 0xe3, 0x24, 0xff, 0x3e, 0xb6, 0x9e, 0xa0, 0xb2 }, - .public = { 0xfa, 0x69, 0x5f, 0xc7, 0xbe, 0x8d, 0x1b, 0xe5, - 0xbf, 0x70, 0x48, 0x98, 0xf3, 0x88, 0xc4, 0x52, - 0xba, 0xfd, 0xd3, 0xb8, 0xea, 0xe8, 0x05, 0xf8, - 0x68, 0x1a, 0x8d, 0x15, 0xc2, 0xd4, 0xe1, 0x42 }, - .result = { 0x2c, 0x4f, 0xe1, 0x1d, 0x49, 0x0a, 0x53, 0x86, - 0x17, 0x76, 0xb1, 0x3b, 0x43, 0x54, 0xab, 0xd4, - 0xcf, 0x5a, 0x97, 0x69, 0x9d, 0xb6, 0xe6, 0xc6, - 0x8c, 0x16, 0x26, 0xd0, 0x76, 0x62, 0xf7, 0x58 }, - .valid = true - }, - /* wycheproof - public key = 0 */ - { - .private = { 0x20, 0x74, 0x94, 0x03, 0x8f, 0x2b, 0xb8, 0x11, - 0xd4, 0x78, 0x05, 0xbc, 0xdf, 0x04, 0xa2, 0xac, - 0x58, 0x5a, 0xda, 0x7f, 0x2f, 0x23, 0x38, 0x9b, - 0xfd, 0x46, 0x58, 0xf9, 0xdd, 0xd4, 0xde, 0xbc }, - .public = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key = 1 */ - { - .private = { 0x20, 0x2e, 0x89, 0x72, 0xb6, 0x1c, 0x7e, 0x61, - 0x93, 0x0e, 0xb9, 0x45, 0x0b, 0x50, 0x70, 0xea, - 0xe1, 0xc6, 0x70, 0x47, 0x56, 0x85, 0x54, 0x1f, - 0x04, 0x76, 0x21, 0x7e, 0x48, 0x18, 0xcf, 0xab }, - .public = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - edge case on twist */ - { - .private = { 0x38, 0xdd, 0xe9, 0xf3, 0xe7, 0xb7, 0x99, 0x04, - 0x5f, 0x9a, 0xc3, 0x79, 0x3d, 0x4a, 0x92, 0x77, - 0xda, 0xde, 0xad, 0xc4, 0x1b, 0xec, 0x02, 0x90, - 0xf8, 0x1f, 0x74, 0x4f, 0x73, 0x77, 0x5f, 0x84 }, - .public = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .result = { 0x9a, 0x2c, 0xfe, 0x84, 0xff, 0x9c, 0x4a, 0x97, - 0x39, 0x62, 0x5c, 0xae, 0x4a, 0x3b, 0x82, 0xa9, - 0x06, 0x87, 0x7a, 0x44, 0x19, 0x46, 0xf8, 0xd7, - 0xb3, 0xd7, 0x95, 0xfe, 0x8f, 0x5d, 0x16, 0x39 }, - .valid = true - }, - /* wycheproof - edge case on twist */ - { - .private = { 0x98, 0x57, 0xa9, 0x14, 0xe3, 0xc2, 0x90, 0x36, - 0xfd, 0x9a, 0x44, 0x2b, 0xa5, 0x26, 0xb5, 0xcd, - 0xcd, 0xf2, 0x82, 0x16, 0x15, 0x3e, 0x63, 0x6c, - 0x10, 0x67, 0x7a, 0xca, 0xb6, 0xbd, 0x6a, 0xa5 }, - .public = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .result = { 0x4d, 0xa4, 0xe0, 0xaa, 0x07, 0x2c, 0x23, 0x2e, - 0xe2, 0xf0, 0xfa, 0x4e, 0x51, 0x9a, 0xe5, 0x0b, - 0x52, 0xc1, 0xed, 0xd0, 0x8a, 0x53, 0x4d, 0x4e, - 0xf3, 0x46, 0xc2, 0xe1, 0x06, 0xd2, 0x1d, 0x60 }, - .valid = true - }, - /* wycheproof - edge case on twist */ - { - .private = { 0x48, 0xe2, 0x13, 0x0d, 0x72, 0x33, 0x05, 0xed, - 0x05, 0xe6, 0xe5, 0x89, 0x4d, 0x39, 0x8a, 0x5e, - 0x33, 0x36, 0x7a, 0x8c, 0x6a, 0xac, 0x8f, 0xcd, - 0xf0, 0xa8, 0x8e, 0x4b, 0x42, 0x82, 0x0d, 0xb7 }, - .public = { 0xff, 0xff, 0xff, 0x03, 0x00, 0x00, 0xf8, 0xff, - 0xff, 0x1f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 0x00, - 0x00, 0xf0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00 }, - .result = { 0x9e, 0xd1, 0x0c, 0x53, 0x74, 0x7f, 0x64, 0x7f, - 0x82, 0xf4, 0x51, 0x25, 0xd3, 0xde, 0x15, 0xa1, - 0xe6, 0xb8, 0x24, 0x49, 0x6a, 0xb4, 0x04, 0x10, - 0xff, 0xcc, 0x3c, 0xfe, 0x95, 0x76, 0x0f, 0x3b }, - .valid = true - }, - /* wycheproof - edge case on twist */ - { - .private = { 0x28, 0xf4, 0x10, 0x11, 0x69, 0x18, 0x51, 0xb3, - 0xa6, 0x2b, 0x64, 0x15, 0x53, 0xb3, 0x0d, 0x0d, - 0xfd, 0xdc, 0xb8, 0xff, 0xfc, 0xf5, 0x37, 0x00, - 0xa7, 0xbe, 0x2f, 0x6a, 0x87, 0x2e, 0x9f, 0xb0 }, - .public = { 0x00, 0x00, 0x00, 0xfc, 0xff, 0xff, 0x07, 0x00, - 0x00, 0xe0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0xf8, 0xff, - 0xff, 0x0f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0x7f }, - .result = { 0xcf, 0x72, 0xb4, 0xaa, 0x6a, 0xa1, 0xc9, 0xf8, - 0x94, 0xf4, 0x16, 0x5b, 0x86, 0x10, 0x9a, 0xa4, - 0x68, 0x51, 0x76, 0x48, 0xe1, 0xf0, 0xcc, 0x70, - 0xe1, 0xab, 0x08, 0x46, 0x01, 0x76, 0x50, 0x6b }, - .valid = true - }, - /* wycheproof - edge case on twist */ - { - .private = { 0x18, 0xa9, 0x3b, 0x64, 0x99, 0xb9, 0xf6, 0xb3, - 0x22, 0x5c, 0xa0, 0x2f, 0xef, 0x41, 0x0e, 0x0a, - 0xde, 0xc2, 0x35, 0x32, 0x32, 0x1d, 0x2d, 0x8e, - 0xf1, 0xa6, 0xd6, 0x02, 0xa8, 0xc6, 0x5b, 0x83 }, - .public = { 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x7f }, - .result = { 0x5d, 0x50, 0xb6, 0x28, 0x36, 0xbb, 0x69, 0x57, - 0x94, 0x10, 0x38, 0x6c, 0xf7, 0xbb, 0x81, 0x1c, - 0x14, 0xbf, 0x85, 0xb1, 0xc7, 0xb1, 0x7e, 0x59, - 0x24, 0xc7, 0xff, 0xea, 0x91, 0xef, 0x9e, 0x12 }, - .valid = true - }, - /* wycheproof - edge case on twist */ - { - .private = { 0xc0, 0x1d, 0x13, 0x05, 0xa1, 0x33, 0x8a, 0x1f, - 0xca, 0xc2, 0xba, 0x7e, 0x2e, 0x03, 0x2b, 0x42, - 0x7e, 0x0b, 0x04, 0x90, 0x31, 0x65, 0xac, 0xa9, - 0x57, 0xd8, 0xd0, 0x55, 0x3d, 0x87, 0x17, 0xb0 }, - .public = { 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .result = { 0x19, 0x23, 0x0e, 0xb1, 0x48, 0xd5, 0xd6, 0x7c, - 0x3c, 0x22, 0xab, 0x1d, 0xae, 0xff, 0x80, 0xa5, - 0x7e, 0xae, 0x42, 0x65, 0xce, 0x28, 0x72, 0x65, - 0x7b, 0x2c, 0x80, 0x99, 0xfc, 0x69, 0x8e, 0x50 }, - .valid = true - }, - /* wycheproof - edge case for public key */ - { - .private = { 0x38, 0x6f, 0x7f, 0x16, 0xc5, 0x07, 0x31, 0xd6, - 0x4f, 0x82, 0xe6, 0xa1, 0x70, 0xb1, 0x42, 0xa4, - 0xe3, 0x4f, 0x31, 0xfd, 0x77, 0x68, 0xfc, 0xb8, - 0x90, 0x29, 0x25, 0xe7, 0xd1, 0xe2, 0x1a, 0xbe }, - .public = { 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .result = { 0x0f, 0xca, 0xb5, 0xd8, 0x42, 0xa0, 0x78, 0xd7, - 0xa7, 0x1f, 0xc5, 0x9b, 0x57, 0xbf, 0xb4, 0xca, - 0x0b, 0xe6, 0x87, 0x3b, 0x49, 0xdc, 0xdb, 0x9f, - 0x44, 0xe1, 0x4a, 0xe8, 0xfb, 0xdf, 0xa5, 0x42 }, - .valid = true - }, - /* wycheproof - edge case for public key */ - { - .private = { 0xe0, 0x23, 0xa2, 0x89, 0xbd, 0x5e, 0x90, 0xfa, - 0x28, 0x04, 0xdd, 0xc0, 0x19, 0xa0, 0x5e, 0xf3, - 0xe7, 0x9d, 0x43, 0x4b, 0xb6, 0xea, 0x2f, 0x52, - 0x2e, 0xcb, 0x64, 0x3a, 0x75, 0x29, 0x6e, 0x95 }, - .public = { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, - .result = { 0x54, 0xce, 0x8f, 0x22, 0x75, 0xc0, 0x77, 0xe3, - 0xb1, 0x30, 0x6a, 0x39, 0x39, 0xc5, 0xe0, 0x3e, - 0xef, 0x6b, 0xbb, 0x88, 0x06, 0x05, 0x44, 0x75, - 0x8d, 0x9f, 0xef, 0x59, 0xb0, 0xbc, 0x3e, 0x4f }, - .valid = true - }, - /* wycheproof - edge case for public key */ - { - .private = { 0x68, 0xf0, 0x10, 0xd6, 0x2e, 0xe8, 0xd9, 0x26, - 0x05, 0x3a, 0x36, 0x1c, 0x3a, 0x75, 0xc6, 0xea, - 0x4e, 0xbd, 0xc8, 0x60, 0x6a, 0xb2, 0x85, 0x00, - 0x3a, 0x6f, 0x8f, 0x40, 0x76, 0xb0, 0x1e, 0x83 }, - .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 }, - .result = { 0xf1, 0x36, 0x77, 0x5c, 0x5b, 0xeb, 0x0a, 0xf8, - 0x11, 0x0a, 0xf1, 0x0b, 0x20, 0x37, 0x23, 0x32, - 0x04, 0x3c, 0xab, 0x75, 0x24, 0x19, 0x67, 0x87, - 0x75, 0xa2, 0x23, 0xdf, 0x57, 0xc9, 0xd3, 0x0d }, - .valid = true - }, - /* wycheproof - edge case for public key */ - { - .private = { 0x58, 0xeb, 0xcb, 0x35, 0xb0, 0xf8, 0x84, 0x5c, - 0xaf, 0x1e, 0xc6, 0x30, 0xf9, 0x65, 0x76, 0xb6, - 0x2c, 0x4b, 0x7b, 0x6c, 0x36, 0xb2, 0x9d, 0xeb, - 0x2c, 0xb0, 0x08, 0x46, 0x51, 0x75, 0x5c, 0x96 }, - .public = { 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xfb, 0xff, - 0xff, 0xdf, 0xff, 0xff, 0xdf, 0xff, 0xff, 0xff, - 0xfe, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xf7, 0xff, - 0xff, 0xf7, 0xff, 0xff, 0xbf, 0xff, 0xff, 0x3f }, - .result = { 0xbf, 0x9a, 0xff, 0xd0, 0x6b, 0x84, 0x40, 0x85, - 0x58, 0x64, 0x60, 0x96, 0x2e, 0xf2, 0x14, 0x6f, - 0xf3, 0xd4, 0x53, 0x3d, 0x94, 0x44, 0xaa, 0xb0, - 0x06, 0xeb, 0x88, 0xcc, 0x30, 0x54, 0x40, 0x7d }, - .valid = true - }, - /* wycheproof - edge case for public key */ - { - .private = { 0x18, 0x8c, 0x4b, 0xc5, 0xb9, 0xc4, 0x4b, 0x38, - 0xbb, 0x65, 0x8b, 0x9b, 0x2a, 0xe8, 0x2d, 0x5b, - 0x01, 0x01, 0x5e, 0x09, 0x31, 0x84, 0xb1, 0x7c, - 0xb7, 0x86, 0x35, 0x03, 0xa7, 0x83, 0xe1, 0xbb }, - .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .result = { 0xd4, 0x80, 0xde, 0x04, 0xf6, 0x99, 0xcb, 0x3b, - 0xe0, 0x68, 0x4a, 0x9c, 0xc2, 0xe3, 0x12, 0x81, - 0xea, 0x0b, 0xc5, 0xa9, 0xdc, 0xc1, 0x57, 0xd3, - 0xd2, 0x01, 0x58, 0xd4, 0x6c, 0xa5, 0x24, 0x6d }, - .valid = true - }, - /* wycheproof - edge case for public key */ - { - .private = { 0xe0, 0x6c, 0x11, 0xbb, 0x2e, 0x13, 0xce, 0x3d, - 0xc7, 0x67, 0x3f, 0x67, 0xf5, 0x48, 0x22, 0x42, - 0x90, 0x94, 0x23, 0xa9, 0xae, 0x95, 0xee, 0x98, - 0x6a, 0x98, 0x8d, 0x98, 0xfa, 0xee, 0x23, 0xa2 }, - .public = { 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, - 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, - 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, - 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f }, - .result = { 0x4c, 0x44, 0x01, 0xcc, 0xe6, 0xb5, 0x1e, 0x4c, - 0xb1, 0x8f, 0x27, 0x90, 0x24, 0x6c, 0x9b, 0xf9, - 0x14, 0xdb, 0x66, 0x77, 0x50, 0xa1, 0xcb, 0x89, - 0x06, 0x90, 0x92, 0xaf, 0x07, 0x29, 0x22, 0x76 }, - .valid = true - }, - /* wycheproof - edge case for public key */ - { - .private = { 0xc0, 0x65, 0x8c, 0x46, 0xdd, 0xe1, 0x81, 0x29, - 0x29, 0x38, 0x77, 0x53, 0x5b, 0x11, 0x62, 0xb6, - 0xf9, 0xf5, 0x41, 0x4a, 0x23, 0xcf, 0x4d, 0x2c, - 0xbc, 0x14, 0x0a, 0x4d, 0x99, 0xda, 0x2b, 0x8f }, - .public = { 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .result = { 0x57, 0x8b, 0xa8, 0xcc, 0x2d, 0xbd, 0xc5, 0x75, - 0xaf, 0xcf, 0x9d, 0xf2, 0xb3, 0xee, 0x61, 0x89, - 0xf5, 0x33, 0x7d, 0x68, 0x54, 0xc7, 0x9b, 0x4c, - 0xe1, 0x65, 0xea, 0x12, 0x29, 0x3b, 0x3a, 0x0f }, - .valid = true - }, - /* wycheproof - public key with low order */ - { - .private = { 0x10, 0x25, 0x5c, 0x92, 0x30, 0xa9, 0x7a, 0x30, - 0xa4, 0x58, 0xca, 0x28, 0x4a, 0x62, 0x96, 0x69, - 0x29, 0x3a, 0x31, 0x89, 0x0c, 0xda, 0x9d, 0x14, - 0x7f, 0xeb, 0xc7, 0xd1, 0xe2, 0x2d, 0x6b, 0xb1 }, - .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, - 0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, - 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd, - 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8, 0x00 }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0x78, 0xf1, 0xe8, 0xed, 0xf1, 0x44, 0x81, 0xb3, - 0x89, 0x44, 0x8d, 0xac, 0x8f, 0x59, 0xc7, 0x0b, - 0x03, 0x8e, 0x7c, 0xf9, 0x2e, 0xf2, 0xc7, 0xef, - 0xf5, 0x7a, 0x72, 0x46, 0x6e, 0x11, 0x52, 0x96 }, - .public = { 0x5f, 0x9c, 0x95, 0xbc, 0xa3, 0x50, 0x8c, 0x24, - 0xb1, 0xd0, 0xb1, 0x55, 0x9c, 0x83, 0xef, 0x5b, - 0x04, 0x44, 0x5c, 0xc4, 0x58, 0x1c, 0x8e, 0x86, - 0xd8, 0x22, 0x4e, 0xdd, 0xd0, 0x9f, 0x11, 0x57 }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0xa0, 0xa0, 0x5a, 0x3e, 0x8f, 0x9f, 0x44, 0x20, - 0x4d, 0x5f, 0x80, 0x59, 0xa9, 0x4a, 0xc7, 0xdf, - 0xc3, 0x9a, 0x49, 0xac, 0x01, 0x6d, 0xd7, 0x43, - 0xdb, 0xfa, 0x43, 0xc5, 0xd6, 0x71, 0xfd, 0x88 }, - .public = { 0xec, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0xd0, 0xdb, 0xb3, 0xed, 0x19, 0x06, 0x66, 0x3f, - 0x15, 0x42, 0x0a, 0xf3, 0x1f, 0x4e, 0xaf, 0x65, - 0x09, 0xd9, 0xa9, 0x94, 0x97, 0x23, 0x50, 0x06, - 0x05, 0xad, 0x7c, 0x1c, 0x6e, 0x74, 0x50, 0xa9 }, - .public = { 0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0xc0, 0xb1, 0xd0, 0xeb, 0x22, 0xb2, 0x44, 0xfe, - 0x32, 0x91, 0x14, 0x00, 0x72, 0xcd, 0xd9, 0xd9, - 0x89, 0xb5, 0xf0, 0xec, 0xd9, 0x6c, 0x10, 0x0f, - 0xeb, 0x5b, 0xca, 0x24, 0x1c, 0x1d, 0x9f, 0x8f }, - .public = { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0x48, 0x0b, 0xf4, 0x5f, 0x59, 0x49, 0x42, 0xa8, - 0xbc, 0x0f, 0x33, 0x53, 0xc6, 0xe8, 0xb8, 0x85, - 0x3d, 0x77, 0xf3, 0x51, 0xf1, 0xc2, 0xca, 0x6c, - 0x2d, 0x1a, 0xbf, 0x8a, 0x00, 0xb4, 0x22, 0x9c }, - .public = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0x30, 0xf9, 0x93, 0xfc, 0xf8, 0x51, 0x4f, 0xc8, - 0x9b, 0xd8, 0xdb, 0x14, 0xcd, 0x43, 0xba, 0x0d, - 0x4b, 0x25, 0x30, 0xe7, 0x3c, 0x42, 0x76, 0xa0, - 0x5e, 0x1b, 0x14, 0x5d, 0x42, 0x0c, 0xed, 0xb4 }, - .public = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0xc0, 0x49, 0x74, 0xb7, 0x58, 0x38, 0x0e, 0x2a, - 0x5b, 0x5d, 0xf6, 0xeb, 0x09, 0xbb, 0x2f, 0x6b, - 0x34, 0x34, 0xf9, 0x82, 0x72, 0x2a, 0x8e, 0x67, - 0x6d, 0x3d, 0xa2, 0x51, 0xd1, 0xb3, 0xde, 0x83 }, - .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, - 0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, - 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd, - 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8, 0x80 }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0x50, 0x2a, 0x31, 0x37, 0x3d, 0xb3, 0x24, 0x46, - 0x84, 0x2f, 0xe5, 0xad, 0xd3, 0xe0, 0x24, 0x02, - 0x2e, 0xa5, 0x4f, 0x27, 0x41, 0x82, 0xaf, 0xc3, - 0xd9, 0xf1, 0xbb, 0x3d, 0x39, 0x53, 0x4e, 0xb5 }, - .public = { 0x5f, 0x9c, 0x95, 0xbc, 0xa3, 0x50, 0x8c, 0x24, - 0xb1, 0xd0, 0xb1, 0x55, 0x9c, 0x83, 0xef, 0x5b, - 0x04, 0x44, 0x5c, 0xc4, 0x58, 0x1c, 0x8e, 0x86, - 0xd8, 0x22, 0x4e, 0xdd, 0xd0, 0x9f, 0x11, 0xd7 }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0x90, 0xfa, 0x64, 0x17, 0xb0, 0xe3, 0x70, 0x30, - 0xfd, 0x6e, 0x43, 0xef, 0xf2, 0xab, 0xae, 0xf1, - 0x4c, 0x67, 0x93, 0x11, 0x7a, 0x03, 0x9c, 0xf6, - 0x21, 0x31, 0x8b, 0xa9, 0x0f, 0x4e, 0x98, 0xbe }, - .public = { 0xec, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0x78, 0xad, 0x3f, 0x26, 0x02, 0x7f, 0x1c, 0x9f, - 0xdd, 0x97, 0x5a, 0x16, 0x13, 0xb9, 0x47, 0x77, - 0x9b, 0xad, 0x2c, 0xf2, 0xb7, 0x41, 0xad, 0xe0, - 0x18, 0x40, 0x88, 0x5a, 0x30, 0xbb, 0x97, 0x9c }, - .public = { 0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key with low order */ - { - .private = { 0x98, 0xe2, 0x3d, 0xe7, 0xb1, 0xe0, 0x92, 0x6e, - 0xd9, 0xc8, 0x7e, 0x7b, 0x14, 0xba, 0xf5, 0x5f, - 0x49, 0x7a, 0x1d, 0x70, 0x96, 0xf9, 0x39, 0x77, - 0x68, 0x0e, 0x44, 0xdc, 0x1c, 0x7b, 0x7b, 0x8b }, - .public = { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = false - }, - /* wycheproof - public key >= p */ - { - .private = { 0xf0, 0x1e, 0x48, 0xda, 0xfa, 0xc9, 0xd7, 0xbc, - 0xf5, 0x89, 0xcb, 0xc3, 0x82, 0xc8, 0x78, 0xd1, - 0x8b, 0xda, 0x35, 0x50, 0x58, 0x9f, 0xfb, 0x5d, - 0x50, 0xb5, 0x23, 0xbe, 0xbe, 0x32, 0x9d, 0xae }, - .public = { 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .result = { 0xbd, 0x36, 0xa0, 0x79, 0x0e, 0xb8, 0x83, 0x09, - 0x8c, 0x98, 0x8b, 0x21, 0x78, 0x67, 0x73, 0xde, - 0x0b, 0x3a, 0x4d, 0xf1, 0x62, 0x28, 0x2c, 0xf1, - 0x10, 0xde, 0x18, 0xdd, 0x48, 0x4c, 0xe7, 0x4b }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0x28, 0x87, 0x96, 0xbc, 0x5a, 0xff, 0x4b, 0x81, - 0xa3, 0x75, 0x01, 0x75, 0x7b, 0xc0, 0x75, 0x3a, - 0x3c, 0x21, 0x96, 0x47, 0x90, 0xd3, 0x86, 0x99, - 0x30, 0x8d, 0xeb, 0xc1, 0x7a, 0x6e, 0xaf, 0x8d }, - .public = { 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .result = { 0xb4, 0xe0, 0xdd, 0x76, 0xda, 0x7b, 0x07, 0x17, - 0x28, 0xb6, 0x1f, 0x85, 0x67, 0x71, 0xaa, 0x35, - 0x6e, 0x57, 0xed, 0xa7, 0x8a, 0x5b, 0x16, 0x55, - 0xcc, 0x38, 0x20, 0xfb, 0x5f, 0x85, 0x4c, 0x5c }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0x98, 0xdf, 0x84, 0x5f, 0x66, 0x51, 0xbf, 0x11, - 0x38, 0x22, 0x1f, 0x11, 0x90, 0x41, 0xf7, 0x2b, - 0x6d, 0xbc, 0x3c, 0x4a, 0xce, 0x71, 0x43, 0xd9, - 0x9f, 0xd5, 0x5a, 0xd8, 0x67, 0x48, 0x0d, 0xa8 }, - .public = { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .result = { 0x6f, 0xdf, 0x6c, 0x37, 0x61, 0x1d, 0xbd, 0x53, - 0x04, 0xdc, 0x0f, 0x2e, 0xb7, 0xc9, 0x51, 0x7e, - 0xb3, 0xc5, 0x0e, 0x12, 0xfd, 0x05, 0x0a, 0xc6, - 0xde, 0xc2, 0x70, 0x71, 0xd4, 0xbf, 0xc0, 0x34 }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0xf0, 0x94, 0x98, 0xe4, 0x6f, 0x02, 0xf8, 0x78, - 0x82, 0x9e, 0x78, 0xb8, 0x03, 0xd3, 0x16, 0xa2, - 0xed, 0x69, 0x5d, 0x04, 0x98, 0xa0, 0x8a, 0xbd, - 0xf8, 0x27, 0x69, 0x30, 0xe2, 0x4e, 0xdc, 0xb0 }, - .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .result = { 0x4c, 0x8f, 0xc4, 0xb1, 0xc6, 0xab, 0x88, 0xfb, - 0x21, 0xf1, 0x8f, 0x6d, 0x4c, 0x81, 0x02, 0x40, - 0xd4, 0xe9, 0x46, 0x51, 0xba, 0x44, 0xf7, 0xa2, - 0xc8, 0x63, 0xce, 0xc7, 0xdc, 0x56, 0x60, 0x2d }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0x18, 0x13, 0xc1, 0x0a, 0x5c, 0x7f, 0x21, 0xf9, - 0x6e, 0x17, 0xf2, 0x88, 0xc0, 0xcc, 0x37, 0x60, - 0x7c, 0x04, 0xc5, 0xf5, 0xae, 0xa2, 0xdb, 0x13, - 0x4f, 0x9e, 0x2f, 0xfc, 0x66, 0xbd, 0x9d, 0xb8 }, - .public = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .result = { 0x1c, 0xd0, 0xb2, 0x82, 0x67, 0xdc, 0x54, 0x1c, - 0x64, 0x2d, 0x6d, 0x7d, 0xca, 0x44, 0xa8, 0xb3, - 0x8a, 0x63, 0x73, 0x6e, 0xef, 0x5c, 0x4e, 0x65, - 0x01, 0xff, 0xbb, 0xb1, 0x78, 0x0c, 0x03, 0x3c }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0x78, 0x57, 0xfb, 0x80, 0x86, 0x53, 0x64, 0x5a, - 0x0b, 0xeb, 0x13, 0x8a, 0x64, 0xf5, 0xf4, 0xd7, - 0x33, 0xa4, 0x5e, 0xa8, 0x4c, 0x3c, 0xda, 0x11, - 0xa9, 0xc0, 0x6f, 0x7e, 0x71, 0x39, 0x14, 0x9e }, - .public = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .result = { 0x87, 0x55, 0xbe, 0x01, 0xc6, 0x0a, 0x7e, 0x82, - 0x5c, 0xff, 0x3e, 0x0e, 0x78, 0xcb, 0x3a, 0xa4, - 0x33, 0x38, 0x61, 0x51, 0x6a, 0xa5, 0x9b, 0x1c, - 0x51, 0xa8, 0xb2, 0xa5, 0x43, 0xdf, 0xa8, 0x22 }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0xe0, 0x3a, 0xa8, 0x42, 0xe2, 0xab, 0xc5, 0x6e, - 0x81, 0xe8, 0x7b, 0x8b, 0x9f, 0x41, 0x7b, 0x2a, - 0x1e, 0x59, 0x13, 0xc7, 0x23, 0xee, 0xd2, 0x8d, - 0x75, 0x2f, 0x8d, 0x47, 0xa5, 0x9f, 0x49, 0x8f }, - .public = { 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .result = { 0x54, 0xc9, 0xa1, 0xed, 0x95, 0xe5, 0x46, 0xd2, - 0x78, 0x22, 0xa3, 0x60, 0x93, 0x1d, 0xda, 0x60, - 0xa1, 0xdf, 0x04, 0x9d, 0xa6, 0xf9, 0x04, 0x25, - 0x3c, 0x06, 0x12, 0xbb, 0xdc, 0x08, 0x74, 0x76 }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0xf8, 0xf7, 0x07, 0xb7, 0x99, 0x9b, 0x18, 0xcb, - 0x0d, 0x6b, 0x96, 0x12, 0x4f, 0x20, 0x45, 0x97, - 0x2c, 0xa2, 0x74, 0xbf, 0xc1, 0x54, 0xad, 0x0c, - 0x87, 0x03, 0x8c, 0x24, 0xc6, 0xd0, 0xd4, 0xb2 }, - .public = { 0xda, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0xcc, 0x1f, 0x40, 0xd7, 0x43, 0xcd, 0xc2, 0x23, - 0x0e, 0x10, 0x43, 0xda, 0xba, 0x8b, 0x75, 0xe8, - 0x10, 0xf1, 0xfb, 0xab, 0x7f, 0x25, 0x52, 0x69, - 0xbd, 0x9e, 0xbb, 0x29, 0xe6, 0xbf, 0x49, 0x4f }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0xa0, 0x34, 0xf6, 0x84, 0xfa, 0x63, 0x1e, 0x1a, - 0x34, 0x81, 0x18, 0xc1, 0xce, 0x4c, 0x98, 0x23, - 0x1f, 0x2d, 0x9e, 0xec, 0x9b, 0xa5, 0x36, 0x5b, - 0x4a, 0x05, 0xd6, 0x9a, 0x78, 0x5b, 0x07, 0x96 }, - .public = { 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0x54, 0x99, 0x8e, 0xe4, 0x3a, 0x5b, 0x00, 0x7b, - 0xf4, 0x99, 0xf0, 0x78, 0xe7, 0x36, 0x52, 0x44, - 0x00, 0xa8, 0xb5, 0xc7, 0xe9, 0xb9, 0xb4, 0x37, - 0x71, 0x74, 0x8c, 0x7c, 0xdf, 0x88, 0x04, 0x12 }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0x30, 0xb6, 0xc6, 0xa0, 0xf2, 0xff, 0xa6, 0x80, - 0x76, 0x8f, 0x99, 0x2b, 0xa8, 0x9e, 0x15, 0x2d, - 0x5b, 0xc9, 0x89, 0x3d, 0x38, 0xc9, 0x11, 0x9b, - 0xe4, 0xf7, 0x67, 0xbf, 0xab, 0x6e, 0x0c, 0xa5 }, - .public = { 0xdc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0xea, 0xd9, 0xb3, 0x8e, 0xfd, 0xd7, 0x23, 0x63, - 0x79, 0x34, 0xe5, 0x5a, 0xb7, 0x17, 0xa7, 0xae, - 0x09, 0xeb, 0x86, 0xa2, 0x1d, 0xc3, 0x6a, 0x3f, - 0xee, 0xb8, 0x8b, 0x75, 0x9e, 0x39, 0x1e, 0x09 }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0x90, 0x1b, 0x9d, 0xcf, 0x88, 0x1e, 0x01, 0xe0, - 0x27, 0x57, 0x50, 0x35, 0xd4, 0x0b, 0x43, 0xbd, - 0xc1, 0xc5, 0x24, 0x2e, 0x03, 0x08, 0x47, 0x49, - 0x5b, 0x0c, 0x72, 0x86, 0x46, 0x9b, 0x65, 0x91 }, - .public = { 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0x60, 0x2f, 0xf4, 0x07, 0x89, 0xb5, 0x4b, 0x41, - 0x80, 0x59, 0x15, 0xfe, 0x2a, 0x62, 0x21, 0xf0, - 0x7a, 0x50, 0xff, 0xc2, 0xc3, 0xfc, 0x94, 0xcf, - 0x61, 0xf1, 0x3d, 0x79, 0x04, 0xe8, 0x8e, 0x0e }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0x80, 0x46, 0x67, 0x7c, 0x28, 0xfd, 0x82, 0xc9, - 0xa1, 0xbd, 0xb7, 0x1a, 0x1a, 0x1a, 0x34, 0xfa, - 0xba, 0x12, 0x25, 0xe2, 0x50, 0x7f, 0xe3, 0xf5, - 0x4d, 0x10, 0xbd, 0x5b, 0x0d, 0x86, 0x5f, 0x8e }, - .public = { 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0xe0, 0x0a, 0xe8, 0xb1, 0x43, 0x47, 0x12, 0x47, - 0xba, 0x24, 0xf1, 0x2c, 0x88, 0x55, 0x36, 0xc3, - 0xcb, 0x98, 0x1b, 0x58, 0xe1, 0xe5, 0x6b, 0x2b, - 0xaf, 0x35, 0xc1, 0x2a, 0xe1, 0xf7, 0x9c, 0x26 }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0x60, 0x2f, 0x7e, 0x2f, 0x68, 0xa8, 0x46, 0xb8, - 0x2c, 0xc2, 0x69, 0xb1, 0xd4, 0x8e, 0x93, 0x98, - 0x86, 0xae, 0x54, 0xfd, 0x63, 0x6c, 0x1f, 0xe0, - 0x74, 0xd7, 0x10, 0x12, 0x7d, 0x47, 0x24, 0x91 }, - .public = { 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0x98, 0xcb, 0x9b, 0x50, 0xdd, 0x3f, 0xc2, 0xb0, - 0xd4, 0xf2, 0xd2, 0xbf, 0x7c, 0x5c, 0xfd, 0xd1, - 0x0c, 0x8f, 0xcd, 0x31, 0xfc, 0x40, 0xaf, 0x1a, - 0xd4, 0x4f, 0x47, 0xc1, 0x31, 0x37, 0x63, 0x62 }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0x60, 0x88, 0x7b, 0x3d, 0xc7, 0x24, 0x43, 0x02, - 0x6e, 0xbe, 0xdb, 0xbb, 0xb7, 0x06, 0x65, 0xf4, - 0x2b, 0x87, 0xad, 0xd1, 0x44, 0x0e, 0x77, 0x68, - 0xfb, 0xd7, 0xe8, 0xe2, 0xce, 0x5f, 0x63, 0x9d }, - .public = { 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0x38, 0xd6, 0x30, 0x4c, 0x4a, 0x7e, 0x6d, 0x9f, - 0x79, 0x59, 0x33, 0x4f, 0xb5, 0x24, 0x5b, 0xd2, - 0xc7, 0x54, 0x52, 0x5d, 0x4c, 0x91, 0xdb, 0x95, - 0x02, 0x06, 0x92, 0x62, 0x34, 0xc1, 0xf6, 0x33 }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0x78, 0xd3, 0x1d, 0xfa, 0x85, 0x44, 0x97, 0xd7, - 0x2d, 0x8d, 0xef, 0x8a, 0x1b, 0x7f, 0xb0, 0x06, - 0xce, 0xc2, 0xd8, 0xc4, 0x92, 0x46, 0x47, 0xc9, - 0x38, 0x14, 0xae, 0x56, 0xfa, 0xed, 0xa4, 0x95 }, - .public = { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0x78, 0x6c, 0xd5, 0x49, 0x96, 0xf0, 0x14, 0xa5, - 0xa0, 0x31, 0xec, 0x14, 0xdb, 0x81, 0x2e, 0xd0, - 0x83, 0x55, 0x06, 0x1f, 0xdb, 0x5d, 0xe6, 0x80, - 0xa8, 0x00, 0xac, 0x52, 0x1f, 0x31, 0x8e, 0x23 }, - .valid = true - }, - /* wycheproof - public key >= p */ - { - .private = { 0xc0, 0x4c, 0x5b, 0xae, 0xfa, 0x83, 0x02, 0xdd, - 0xde, 0xd6, 0xa4, 0xbb, 0x95, 0x77, 0x61, 0xb4, - 0xeb, 0x97, 0xae, 0xfa, 0x4f, 0xc3, 0xb8, 0x04, - 0x30, 0x85, 0xf9, 0x6a, 0x56, 0x59, 0xb3, 0xa5 }, - .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0x29, 0xae, 0x8b, 0xc7, 0x3e, 0x9b, 0x10, 0xa0, - 0x8b, 0x4f, 0x68, 0x1c, 0x43, 0xc3, 0xe0, 0xac, - 0x1a, 0x17, 0x1d, 0x31, 0xb3, 0x8f, 0x1a, 0x48, - 0xef, 0xba, 0x29, 0xae, 0x63, 0x9e, 0xa1, 0x34 }, - .valid = true - }, - /* wycheproof - RFC 7748 */ - { - .private = { 0xa0, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, - 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, - 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, - 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0x44 }, - .public = { 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, - 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, - 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, - 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, - .result = { 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, - 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, - 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, - 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 }, - .valid = true - }, - /* wycheproof - RFC 7748 */ - { - .private = { 0x48, 0x66, 0xe9, 0xd4, 0xd1, 0xb4, 0x67, 0x3c, - 0x5a, 0xd2, 0x26, 0x91, 0x95, 0x7d, 0x6a, 0xf5, - 0xc1, 0x1b, 0x64, 0x21, 0xe0, 0xea, 0x01, 0xd4, - 0x2c, 0xa4, 0x16, 0x9e, 0x79, 0x18, 0xba, 0x4d }, - .public = { 0xe5, 0x21, 0x0f, 0x12, 0x78, 0x68, 0x11, 0xd3, - 0xf4, 0xb7, 0x95, 0x9d, 0x05, 0x38, 0xae, 0x2c, - 0x31, 0xdb, 0xe7, 0x10, 0x6f, 0xc0, 0x3c, 0x3e, - 0xfc, 0x4c, 0xd5, 0x49, 0xc7, 0x15, 0xa4, 0x13 }, - .result = { 0x95, 0xcb, 0xde, 0x94, 0x76, 0xe8, 0x90, 0x7d, - 0x7a, 0xad, 0xe4, 0x5c, 0xb4, 0xb8, 0x73, 0xf8, - 0x8b, 0x59, 0x5a, 0x68, 0x79, 0x9f, 0xa1, 0x52, - 0xe6, 0xf8, 0xf7, 0x64, 0x7a, 0xac, 0x79, 0x57 }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0x0a, 0xb4, 0xe7, 0x63, 0x80, 0xd8, 0x4d, 0xde, - 0x4f, 0x68, 0x33, 0xc5, 0x8f, 0x2a, 0x9f, 0xb8, - 0xf8, 0x3b, 0xb0, 0x16, 0x9b, 0x17, 0x2b, 0xe4, - 0xb6, 0xe0, 0x59, 0x28, 0x87, 0x74, 0x1a, 0x36 }, - .result = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0x89, 0xe1, 0x0d, 0x57, 0x01, 0xb4, 0x33, 0x7d, - 0x2d, 0x03, 0x21, 0x81, 0x53, 0x8b, 0x10, 0x64, - 0xbd, 0x40, 0x84, 0x40, 0x1c, 0xec, 0xa1, 0xfd, - 0x12, 0x66, 0x3a, 0x19, 0x59, 0x38, 0x80, 0x00 }, - .result = { 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0x2b, 0x55, 0xd3, 0xaa, 0x4a, 0x8f, 0x80, 0xc8, - 0xc0, 0xb2, 0xae, 0x5f, 0x93, 0x3e, 0x85, 0xaf, - 0x49, 0xbe, 0xac, 0x36, 0xc2, 0xfa, 0x73, 0x94, - 0xba, 0xb7, 0x6c, 0x89, 0x33, 0xf8, 0xf8, 0x1d }, - .result = { 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0x63, 0xe5, 0xb1, 0xfe, 0x96, 0x01, 0xfe, 0x84, - 0x38, 0x5d, 0x88, 0x66, 0xb0, 0x42, 0x12, 0x62, - 0xf7, 0x8f, 0xbf, 0xa5, 0xaf, 0xf9, 0x58, 0x5e, - 0x62, 0x66, 0x79, 0xb1, 0x85, 0x47, 0xd9, 0x59 }, - .result = { 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0xe4, 0x28, 0xf3, 0xda, 0xc1, 0x78, 0x09, 0xf8, - 0x27, 0xa5, 0x22, 0xce, 0x32, 0x35, 0x50, 0x58, - 0xd0, 0x73, 0x69, 0x36, 0x4a, 0xa7, 0x89, 0x02, - 0xee, 0x10, 0x13, 0x9b, 0x9f, 0x9d, 0xd6, 0x53 }, - .result = { 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0xb3, 0xb5, 0x0e, 0x3e, 0xd3, 0xa4, 0x07, 0xb9, - 0x5d, 0xe9, 0x42, 0xef, 0x74, 0x57, 0x5b, 0x5a, - 0xb8, 0xa1, 0x0c, 0x09, 0xee, 0x10, 0x35, 0x44, - 0xd6, 0x0b, 0xdf, 0xed, 0x81, 0x38, 0xab, 0x2b }, - .result = { 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0x21, 0x3f, 0xff, 0xe9, 0x3d, 0x5e, 0xa8, 0xcd, - 0x24, 0x2e, 0x46, 0x28, 0x44, 0x02, 0x99, 0x22, - 0xc4, 0x3c, 0x77, 0xc9, 0xe3, 0xe4, 0x2f, 0x56, - 0x2f, 0x48, 0x5d, 0x24, 0xc5, 0x01, 0xa2, 0x0b }, - .result = { 0xf3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0x91, 0xb2, 0x32, 0xa1, 0x78, 0xb3, 0xcd, 0x53, - 0x09, 0x32, 0x44, 0x1e, 0x61, 0x39, 0x41, 0x8f, - 0x72, 0x17, 0x22, 0x92, 0xf1, 0xda, 0x4c, 0x18, - 0x34, 0xfc, 0x5e, 0xbf, 0xef, 0xb5, 0x1e, 0x3f }, - .result = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0x04, 0x5c, 0x6e, 0x11, 0xc5, 0xd3, 0x32, 0x55, - 0x6c, 0x78, 0x22, 0xfe, 0x94, 0xeb, 0xf8, 0x9b, - 0x56, 0xa3, 0x87, 0x8d, 0xc2, 0x7c, 0xa0, 0x79, - 0x10, 0x30, 0x58, 0x84, 0x9f, 0xab, 0xcb, 0x4f }, - .result = { 0xe5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0x1c, 0xa2, 0x19, 0x0b, 0x71, 0x16, 0x35, 0x39, - 0x06, 0x3c, 0x35, 0x77, 0x3b, 0xda, 0x0c, 0x9c, - 0x92, 0x8e, 0x91, 0x36, 0xf0, 0x62, 0x0a, 0xeb, - 0x09, 0x3f, 0x09, 0x91, 0x97, 0xb7, 0xf7, 0x4e }, - .result = { 0xe3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0xf7, 0x6e, 0x90, 0x10, 0xac, 0x33, 0xc5, 0x04, - 0x3b, 0x2d, 0x3b, 0x76, 0xa8, 0x42, 0x17, 0x10, - 0x00, 0xc4, 0x91, 0x62, 0x22, 0xe9, 0xe8, 0x58, - 0x97, 0xa0, 0xae, 0xc7, 0xf6, 0x35, 0x0b, 0x3c }, - .result = { 0xdd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0xbb, 0x72, 0x68, 0x8d, 0x8f, 0x8a, 0xa7, 0xa3, - 0x9c, 0xd6, 0x06, 0x0c, 0xd5, 0xc8, 0x09, 0x3c, - 0xde, 0xc6, 0xfe, 0x34, 0x19, 0x37, 0xc3, 0x88, - 0x6a, 0x99, 0x34, 0x6c, 0xd0, 0x7f, 0xaa, 0x55 }, - .result = { 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0x88, 0xfd, 0xde, 0xa1, 0x93, 0x39, 0x1c, 0x6a, - 0x59, 0x33, 0xef, 0x9b, 0x71, 0x90, 0x15, 0x49, - 0x44, 0x72, 0x05, 0xaa, 0xe9, 0xda, 0x92, 0x8a, - 0x6b, 0x91, 0xa3, 0x52, 0xba, 0x10, 0xf4, 0x1f }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 }, - .valid = true - }, - /* wycheproof - edge case for shared secret */ - { - .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .public = { 0x30, 0x3b, 0x39, 0x2f, 0x15, 0x31, 0x16, 0xca, - 0xd9, 0xcc, 0x68, 0x2a, 0x00, 0xcc, 0xc4, 0x4c, - 0x95, 0xff, 0x0d, 0x3b, 0xbe, 0x56, 0x8b, 0xeb, - 0x6c, 0x4e, 0x73, 0x9b, 0xaf, 0xdc, 0x2c, 0x68 }, - .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 }, - .valid = true - }, - /* wycheproof - checking for overflow */ - { - .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .public = { 0xfd, 0x30, 0x0a, 0xeb, 0x40, 0xe1, 0xfa, 0x58, - 0x25, 0x18, 0x41, 0x2b, 0x49, 0xb2, 0x08, 0xa7, - 0x84, 0x2b, 0x1e, 0x1f, 0x05, 0x6a, 0x04, 0x01, - 0x78, 0xea, 0x41, 0x41, 0x53, 0x4f, 0x65, 0x2d }, - .result = { 0xb7, 0x34, 0x10, 0x5d, 0xc2, 0x57, 0x58, 0x5d, - 0x73, 0xb5, 0x66, 0xcc, 0xb7, 0x6f, 0x06, 0x27, - 0x95, 0xcc, 0xbe, 0xc8, 0x91, 0x28, 0xe5, 0x2b, - 0x02, 0xf3, 0xe5, 0x96, 0x39, 0xf1, 0x3c, 0x46 }, - .valid = true - }, - /* wycheproof - checking for overflow */ - { - .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .public = { 0xc8, 0xef, 0x79, 0xb5, 0x14, 0xd7, 0x68, 0x26, - 0x77, 0xbc, 0x79, 0x31, 0xe0, 0x6e, 0xe5, 0xc2, - 0x7c, 0x9b, 0x39, 0x2b, 0x4a, 0xe9, 0x48, 0x44, - 0x73, 0xf5, 0x54, 0xe6, 0x67, 0x8e, 0xcc, 0x2e }, - .result = { 0x64, 0x7a, 0x46, 0xb6, 0xfc, 0x3f, 0x40, 0xd6, - 0x21, 0x41, 0xee, 0x3c, 0xee, 0x70, 0x6b, 0x4d, - 0x7a, 0x92, 0x71, 0x59, 0x3a, 0x7b, 0x14, 0x3e, - 0x8e, 0x2e, 0x22, 0x79, 0x88, 0x3e, 0x45, 0x50 }, - .valid = true - }, - /* wycheproof - checking for overflow */ - { - .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .public = { 0x64, 0xae, 0xac, 0x25, 0x04, 0x14, 0x48, 0x61, - 0x53, 0x2b, 0x7b, 0xbc, 0xb6, 0xc8, 0x7d, 0x67, - 0xdd, 0x4c, 0x1f, 0x07, 0xeb, 0xc2, 0xe0, 0x6e, - 0xff, 0xb9, 0x5a, 0xec, 0xc6, 0x17, 0x0b, 0x2c }, - .result = { 0x4f, 0xf0, 0x3d, 0x5f, 0xb4, 0x3c, 0xd8, 0x65, - 0x7a, 0x3c, 0xf3, 0x7c, 0x13, 0x8c, 0xad, 0xce, - 0xcc, 0xe5, 0x09, 0xe4, 0xeb, 0xa0, 0x89, 0xd0, - 0xef, 0x40, 0xb4, 0xe4, 0xfb, 0x94, 0x61, 0x55 }, - .valid = true - }, - /* wycheproof - checking for overflow */ - { - .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .public = { 0xbf, 0x68, 0xe3, 0x5e, 0x9b, 0xdb, 0x7e, 0xee, - 0x1b, 0x50, 0x57, 0x02, 0x21, 0x86, 0x0f, 0x5d, - 0xcd, 0xad, 0x8a, 0xcb, 0xab, 0x03, 0x1b, 0x14, - 0x97, 0x4c, 0xc4, 0x90, 0x13, 0xc4, 0x98, 0x31 }, - .result = { 0x21, 0xce, 0xe5, 0x2e, 0xfd, 0xbc, 0x81, 0x2e, - 0x1d, 0x02, 0x1a, 0x4a, 0xf1, 0xe1, 0xd8, 0xbc, - 0x4d, 0xb3, 0xc4, 0x00, 0xe4, 0xd2, 0xa2, 0xc5, - 0x6a, 0x39, 0x26, 0xdb, 0x4d, 0x99, 0xc6, 0x5b }, - .valid = true - }, - /* wycheproof - checking for overflow */ - { - .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .public = { 0x53, 0x47, 0xc4, 0x91, 0x33, 0x1a, 0x64, 0xb4, - 0x3d, 0xdc, 0x68, 0x30, 0x34, 0xe6, 0x77, 0xf5, - 0x3d, 0xc3, 0x2b, 0x52, 0xa5, 0x2a, 0x57, 0x7c, - 0x15, 0xa8, 0x3b, 0xf2, 0x98, 0xe9, 0x9f, 0x19 }, - .result = { 0x18, 0xcb, 0x89, 0xe4, 0xe2, 0x0c, 0x0c, 0x2b, - 0xd3, 0x24, 0x30, 0x52, 0x45, 0x26, 0x6c, 0x93, - 0x27, 0x69, 0x0b, 0xbe, 0x79, 0xac, 0xb8, 0x8f, - 0x5b, 0x8f, 0xb3, 0xf7, 0x4e, 0xca, 0x3e, 0x52 }, - .valid = true - }, - /* wycheproof - private key == -1 (mod order) */ - { - .private = { 0xa0, 0x23, 0xcd, 0xd0, 0x83, 0xef, 0x5b, 0xb8, - 0x2f, 0x10, 0xd6, 0x2e, 0x59, 0xe1, 0x5a, 0x68, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50 }, - .public = { 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e, - 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57, - 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f, - 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 }, - .result = { 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e, - 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57, - 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f, - 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 }, - .valid = true - }, - /* wycheproof - private key == 1 (mod order) on twist */ - { - .private = { 0x58, 0x08, 0x3d, 0xd2, 0x61, 0xad, 0x91, 0xef, - 0xf9, 0x52, 0x32, 0x2e, 0xc8, 0x24, 0xc6, 0x82, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x5f }, - .public = { 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f, - 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6, - 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64, - 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 }, - .result = { 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f, - 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6, - 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64, - 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 }, - .valid = true - } -}; - -bool __init curve25519_selftest(void) -{ - bool success = true, ret, ret2; - size_t i = 0, j; - u8 in[CURVE25519_KEY_SIZE]; - u8 out[CURVE25519_KEY_SIZE], out2[CURVE25519_KEY_SIZE], - out3[CURVE25519_KEY_SIZE]; - - for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) { - memset(out, 0, CURVE25519_KEY_SIZE); - ret = curve25519(out, curve25519_test_vectors[i].private, - curve25519_test_vectors[i].public); - if (ret != curve25519_test_vectors[i].valid || - memcmp(out, curve25519_test_vectors[i].result, - CURVE25519_KEY_SIZE)) { - pr_err("curve25519 self-test %zu: FAIL\n", i + 1); - success = false; - } - } - - for (i = 0; i < 5; ++i) { - get_random_bytes(in, sizeof(in)); - ret = curve25519_generate_public(out, in); - ret2 = curve25519(out2, in, (u8[CURVE25519_KEY_SIZE]){ 9 }); - curve25519_generic(out3, in, (u8[CURVE25519_KEY_SIZE]){ 9 }); - if (ret != ret2 || - memcmp(out, out2, CURVE25519_KEY_SIZE) || - memcmp(out, out3, CURVE25519_KEY_SIZE)) { - pr_err("curve25519 basepoint self-test %zu: FAIL: input - 0x", - i + 1); - for (j = CURVE25519_KEY_SIZE; j-- > 0;) - printk(KERN_CONT "%02x", in[j]); - printk(KERN_CONT "\n"); - success = false; - } - } - - return success; -} diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c index 6850b76a80c9..25f16777865b 100644 --- a/lib/crypto/curve25519.c +++ b/lib/crypto/curve25519.c @@ -15,9 +15,6 @@ static int __init curve25519_init(void) { - if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) && - WARN_ON(!curve25519_selftest())) - return -ENODEV; return 0; } diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig index fd341aa12f15..eaca60d3e0a3 100644 --- a/lib/crypto/tests/Kconfig +++ b/lib/crypto/tests/Kconfig @@ -10,6 +10,15 @@ config CRYPTO_LIB_BLAKE2S_KUNIT_TEST help KUnit tests for the BLAKE2s cryptographic hash function. +config CRYPTO_LIB_CURVE25519_KUNIT_TEST + tristate "KUnit tests for Curve25519" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_CURVE25519 + help + KUnit tests for the Curve25519 Diffie-Hellman function. + config CRYPTO_LIB_MD5_KUNIT_TEST tristate "KUnit tests for MD5" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile index be7de929af2c..a71fad19922b 100644 --- a/lib/crypto/tests/Makefile +++ b/lib/crypto/tests/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-or-later obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o +obj-$(CONFIG_CRYPTO_LIB_CURVE25519_KUNIT_TEST) += curve25519_kunit.o obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o diff --git a/lib/crypto/tests/curve25519_kunit.c b/lib/crypto/tests/curve25519_kunit.c new file mode 100644 index 000000000000..68eab75d40dc --- /dev/null +++ b/lib/crypto/tests/curve25519_kunit.c @@ -0,0 +1,1332 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + */ + +#include +#include + +struct curve25519_test_vector { + u8 private[CURVE25519_KEY_SIZE]; + u8 public[CURVE25519_KEY_SIZE]; + u8 result[CURVE25519_KEY_SIZE]; + bool valid; +}; +static const struct curve25519_test_vector curve25519_test_vectors[] = { + { + .private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, + 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45, + 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, + 0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a }, + .public = { 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4, + 0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37, + 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d, + 0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f }, + .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, + 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, + 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, + 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, + .valid = true + }, + { + .private = { 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, + 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6, + 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, + 0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb }, + .public = { 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54, + 0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a, + 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4, + 0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a }, + .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, + 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, + 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, + 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, + .valid = true + }, + { + .private = { 1 }, + .public = { 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .result = { 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64, + 0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d, + 0x0b, 0x95, 0x48, 0xdc, 0x0c, 0xd8, 0x19, 0x98, + 0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f }, + .valid = true + }, + { + .private = { 1 }, + .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f, + 0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d, + 0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x08, 0xed, 0xe3, + 0x0b, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 }, + .valid = true + }, + { + .private = { 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, + 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, + 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, + 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 }, + .public = { 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, + 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, + 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, + 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, + .result = { 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, + 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, + 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, + 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 }, + .valid = true + }, + { + .private = { 1, 2, 3, 4 }, + .public = { 0 }, + .result = { 0 }, + .valid = false + }, + { + .private = { 2, 4, 6, 8 }, + .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, + 0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, + 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd, + 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8 }, + .result = { 0 }, + .valid = false + }, + { + .private = { 0xff, 0xff, 0xff, 0xff, 0x0a, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x0a, 0x00, 0xfb, 0x9f }, + .result = { 0x77, 0x52, 0xb6, 0x18, 0xc1, 0x2d, 0x48, 0xd2, + 0xc6, 0x93, 0x46, 0x83, 0x81, 0x7c, 0xc6, 0x57, + 0xf3, 0x31, 0x03, 0x19, 0x49, 0x48, 0x20, 0x05, + 0x42, 0x2b, 0x4e, 0xae, 0x8d, 0x1d, 0x43, 0x23 }, + .valid = true + }, + { + .private = { 0x8e, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .public = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x06 }, + .result = { 0x5a, 0xdf, 0xaa, 0x25, 0x86, 0x8e, 0x32, 0x3d, + 0xae, 0x49, 0x62, 0xc1, 0x01, 0x5c, 0xb3, 0x12, + 0xe1, 0xc5, 0xc7, 0x9e, 0x95, 0x3f, 0x03, 0x99, + 0xb0, 0xba, 0x16, 0x22, 0xf3, 0xb6, 0xf7, 0x0c }, + .valid = true + }, + /* wycheproof - normal case */ + { + .private = { 0x48, 0x52, 0x83, 0x4d, 0x9d, 0x6b, 0x77, 0xda, + 0xde, 0xab, 0xaa, 0xf2, 0xe1, 0x1d, 0xca, 0x66, + 0xd1, 0x9f, 0xe7, 0x49, 0x93, 0xa7, 0xbe, 0xc3, + 0x6c, 0x6e, 0x16, 0xa0, 0x98, 0x3f, 0xea, 0xba }, + .public = { 0x9c, 0x64, 0x7d, 0x9a, 0xe5, 0x89, 0xb9, 0xf5, + 0x8f, 0xdc, 0x3c, 0xa4, 0x94, 0x7e, 0xfb, 0xc9, + 0x15, 0xc4, 0xb2, 0xe0, 0x8e, 0x74, 0x4a, 0x0e, + 0xdf, 0x46, 0x9d, 0xac, 0x59, 0xc8, 0xf8, 0x5a }, + .result = { 0x87, 0xb7, 0xf2, 0x12, 0xb6, 0x27, 0xf7, 0xa5, + 0x4c, 0xa5, 0xe0, 0xbc, 0xda, 0xdd, 0xd5, 0x38, + 0x9d, 0x9d, 0xe6, 0x15, 0x6c, 0xdb, 0xcf, 0x8e, + 0xbe, 0x14, 0xff, 0xbc, 0xfb, 0x43, 0x65, 0x51 }, + .valid = true + }, + /* wycheproof - public key on twist */ + { + .private = { 0x58, 0x8c, 0x06, 0x1a, 0x50, 0x80, 0x4a, 0xc4, + 0x88, 0xad, 0x77, 0x4a, 0xc7, 0x16, 0xc3, 0xf5, + 0xba, 0x71, 0x4b, 0x27, 0x12, 0xe0, 0x48, 0x49, + 0x13, 0x79, 0xa5, 0x00, 0x21, 0x19, 0x98, 0xa8 }, + .public = { 0x63, 0xaa, 0x40, 0xc6, 0xe3, 0x83, 0x46, 0xc5, + 0xca, 0xf2, 0x3a, 0x6d, 0xf0, 0xa5, 0xe6, 0xc8, + 0x08, 0x89, 0xa0, 0x86, 0x47, 0xe5, 0x51, 0xb3, + 0x56, 0x34, 0x49, 0xbe, 0xfc, 0xfc, 0x97, 0x33 }, + .result = { 0xb1, 0xa7, 0x07, 0x51, 0x94, 0x95, 0xff, 0xff, + 0xb2, 0x98, 0xff, 0x94, 0x17, 0x16, 0xb0, 0x6d, + 0xfa, 0xb8, 0x7c, 0xf8, 0xd9, 0x11, 0x23, 0xfe, + 0x2b, 0xe9, 0xa2, 0x33, 0xdd, 0xa2, 0x22, 0x12 }, + .valid = true + }, + /* wycheproof - public key on twist */ + { + .private = { 0xb0, 0x5b, 0xfd, 0x32, 0xe5, 0x53, 0x25, 0xd9, + 0xfd, 0x64, 0x8c, 0xb3, 0x02, 0x84, 0x80, 0x39, + 0x00, 0x0b, 0x39, 0x0e, 0x44, 0xd5, 0x21, 0xe5, + 0x8a, 0xab, 0x3b, 0x29, 0xa6, 0x96, 0x0b, 0xa8 }, + .public = { 0x0f, 0x83, 0xc3, 0x6f, 0xde, 0xd9, 0xd3, 0x2f, + 0xad, 0xf4, 0xef, 0xa3, 0xae, 0x93, 0xa9, 0x0b, + 0xb5, 0xcf, 0xa6, 0x68, 0x93, 0xbc, 0x41, 0x2c, + 0x43, 0xfa, 0x72, 0x87, 0xdb, 0xb9, 0x97, 0x79 }, + .result = { 0x67, 0xdd, 0x4a, 0x6e, 0x16, 0x55, 0x33, 0x53, + 0x4c, 0x0e, 0x3f, 0x17, 0x2e, 0x4a, 0xb8, 0x57, + 0x6b, 0xca, 0x92, 0x3a, 0x5f, 0x07, 0xb2, 0xc0, + 0x69, 0xb4, 0xc3, 0x10, 0xff, 0x2e, 0x93, 0x5b }, + .valid = true + }, + /* wycheproof - public key on twist */ + { + .private = { 0x70, 0xe3, 0x4b, 0xcb, 0xe1, 0xf4, 0x7f, 0xbc, + 0x0f, 0xdd, 0xfd, 0x7c, 0x1e, 0x1a, 0xa5, 0x3d, + 0x57, 0xbf, 0xe0, 0xf6, 0x6d, 0x24, 0x30, 0x67, + 0xb4, 0x24, 0xbb, 0x62, 0x10, 0xbe, 0xd1, 0x9c }, + .public = { 0x0b, 0x82, 0x11, 0xa2, 0xb6, 0x04, 0x90, 0x97, + 0xf6, 0x87, 0x1c, 0x6c, 0x05, 0x2d, 0x3c, 0x5f, + 0xc1, 0xba, 0x17, 0xda, 0x9e, 0x32, 0xae, 0x45, + 0x84, 0x03, 0xb0, 0x5b, 0xb2, 0x83, 0x09, 0x2a }, + .result = { 0x4a, 0x06, 0x38, 0xcf, 0xaa, 0x9e, 0xf1, 0x93, + 0x3b, 0x47, 0xf8, 0x93, 0x92, 0x96, 0xa6, 0xb2, + 0x5b, 0xe5, 0x41, 0xef, 0x7f, 0x70, 0xe8, 0x44, + 0xc0, 0xbc, 0xc0, 0x0b, 0x13, 0x4d, 0xe6, 0x4a }, + .valid = true + }, + /* wycheproof - public key on twist */ + { + .private = { 0x68, 0xc1, 0xf3, 0xa6, 0x53, 0xa4, 0xcd, 0xb1, + 0xd3, 0x7b, 0xba, 0x94, 0x73, 0x8f, 0x8b, 0x95, + 0x7a, 0x57, 0xbe, 0xb2, 0x4d, 0x64, 0x6e, 0x99, + 0x4d, 0xc2, 0x9a, 0x27, 0x6a, 0xad, 0x45, 0x8d }, + .public = { 0x34, 0x3a, 0xc2, 0x0a, 0x3b, 0x9c, 0x6a, 0x27, + 0xb1, 0x00, 0x81, 0x76, 0x50, 0x9a, 0xd3, 0x07, + 0x35, 0x85, 0x6e, 0xc1, 0xc8, 0xd8, 0xfc, 0xae, + 0x13, 0x91, 0x2d, 0x08, 0xd1, 0x52, 0xf4, 0x6c }, + .result = { 0x39, 0x94, 0x91, 0xfc, 0xe8, 0xdf, 0xab, 0x73, + 0xb4, 0xf9, 0xf6, 0x11, 0xde, 0x8e, 0xa0, 0xb2, + 0x7b, 0x28, 0xf8, 0x59, 0x94, 0x25, 0x0b, 0x0f, + 0x47, 0x5d, 0x58, 0x5d, 0x04, 0x2a, 0xc2, 0x07 }, + .valid = true + }, + /* wycheproof - public key on twist */ + { + .private = { 0xd8, 0x77, 0xb2, 0x6d, 0x06, 0xdf, 0xf9, 0xd9, + 0xf7, 0xfd, 0x4c, 0x5b, 0x37, 0x69, 0xf8, 0xcd, + 0xd5, 0xb3, 0x05, 0x16, 0xa5, 0xab, 0x80, 0x6b, + 0xe3, 0x24, 0xff, 0x3e, 0xb6, 0x9e, 0xa0, 0xb2 }, + .public = { 0xfa, 0x69, 0x5f, 0xc7, 0xbe, 0x8d, 0x1b, 0xe5, + 0xbf, 0x70, 0x48, 0x98, 0xf3, 0x88, 0xc4, 0x52, + 0xba, 0xfd, 0xd3, 0xb8, 0xea, 0xe8, 0x05, 0xf8, + 0x68, 0x1a, 0x8d, 0x15, 0xc2, 0xd4, 0xe1, 0x42 }, + .result = { 0x2c, 0x4f, 0xe1, 0x1d, 0x49, 0x0a, 0x53, 0x86, + 0x17, 0x76, 0xb1, 0x3b, 0x43, 0x54, 0xab, 0xd4, + 0xcf, 0x5a, 0x97, 0x69, 0x9d, 0xb6, 0xe6, 0xc6, + 0x8c, 0x16, 0x26, 0xd0, 0x76, 0x62, 0xf7, 0x58 }, + .valid = true + }, + /* wycheproof - public key = 0 */ + { + .private = { 0x20, 0x74, 0x94, 0x03, 0x8f, 0x2b, 0xb8, 0x11, + 0xd4, 0x78, 0x05, 0xbc, 0xdf, 0x04, 0xa2, 0xac, + 0x58, 0x5a, 0xda, 0x7f, 0x2f, 0x23, 0x38, 0x9b, + 0xfd, 0x46, 0x58, 0xf9, 0xdd, 0xd4, 0xde, 0xbc }, + .public = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key = 1 */ + { + .private = { 0x20, 0x2e, 0x89, 0x72, 0xb6, 0x1c, 0x7e, 0x61, + 0x93, 0x0e, 0xb9, 0x45, 0x0b, 0x50, 0x70, 0xea, + 0xe1, 0xc6, 0x70, 0x47, 0x56, 0x85, 0x54, 0x1f, + 0x04, 0x76, 0x21, 0x7e, 0x48, 0x18, 0xcf, 0xab }, + .public = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - edge case on twist */ + { + .private = { 0x38, 0xdd, 0xe9, 0xf3, 0xe7, 0xb7, 0x99, 0x04, + 0x5f, 0x9a, 0xc3, 0x79, 0x3d, 0x4a, 0x92, 0x77, + 0xda, 0xde, 0xad, 0xc4, 0x1b, 0xec, 0x02, 0x90, + 0xf8, 0x1f, 0x74, 0x4f, 0x73, 0x77, 0x5f, 0x84 }, + .public = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .result = { 0x9a, 0x2c, 0xfe, 0x84, 0xff, 0x9c, 0x4a, 0x97, + 0x39, 0x62, 0x5c, 0xae, 0x4a, 0x3b, 0x82, 0xa9, + 0x06, 0x87, 0x7a, 0x44, 0x19, 0x46, 0xf8, 0xd7, + 0xb3, 0xd7, 0x95, 0xfe, 0x8f, 0x5d, 0x16, 0x39 }, + .valid = true + }, + /* wycheproof - edge case on twist */ + { + .private = { 0x98, 0x57, 0xa9, 0x14, 0xe3, 0xc2, 0x90, 0x36, + 0xfd, 0x9a, 0x44, 0x2b, 0xa5, 0x26, 0xb5, 0xcd, + 0xcd, 0xf2, 0x82, 0x16, 0x15, 0x3e, 0x63, 0x6c, + 0x10, 0x67, 0x7a, 0xca, 0xb6, 0xbd, 0x6a, 0xa5 }, + .public = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .result = { 0x4d, 0xa4, 0xe0, 0xaa, 0x07, 0x2c, 0x23, 0x2e, + 0xe2, 0xf0, 0xfa, 0x4e, 0x51, 0x9a, 0xe5, 0x0b, + 0x52, 0xc1, 0xed, 0xd0, 0x8a, 0x53, 0x4d, 0x4e, + 0xf3, 0x46, 0xc2, 0xe1, 0x06, 0xd2, 0x1d, 0x60 }, + .valid = true + }, + /* wycheproof - edge case on twist */ + { + .private = { 0x48, 0xe2, 0x13, 0x0d, 0x72, 0x33, 0x05, 0xed, + 0x05, 0xe6, 0xe5, 0x89, 0x4d, 0x39, 0x8a, 0x5e, + 0x33, 0x36, 0x7a, 0x8c, 0x6a, 0xac, 0x8f, 0xcd, + 0xf0, 0xa8, 0x8e, 0x4b, 0x42, 0x82, 0x0d, 0xb7 }, + .public = { 0xff, 0xff, 0xff, 0x03, 0x00, 0x00, 0xf8, 0xff, + 0xff, 0x1f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 0x00, + 0x00, 0xf0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00 }, + .result = { 0x9e, 0xd1, 0x0c, 0x53, 0x74, 0x7f, 0x64, 0x7f, + 0x82, 0xf4, 0x51, 0x25, 0xd3, 0xde, 0x15, 0xa1, + 0xe6, 0xb8, 0x24, 0x49, 0x6a, 0xb4, 0x04, 0x10, + 0xff, 0xcc, 0x3c, 0xfe, 0x95, 0x76, 0x0f, 0x3b }, + .valid = true + }, + /* wycheproof - edge case on twist */ + { + .private = { 0x28, 0xf4, 0x10, 0x11, 0x69, 0x18, 0x51, 0xb3, + 0xa6, 0x2b, 0x64, 0x15, 0x53, 0xb3, 0x0d, 0x0d, + 0xfd, 0xdc, 0xb8, 0xff, 0xfc, 0xf5, 0x37, 0x00, + 0xa7, 0xbe, 0x2f, 0x6a, 0x87, 0x2e, 0x9f, 0xb0 }, + .public = { 0x00, 0x00, 0x00, 0xfc, 0xff, 0xff, 0x07, 0x00, + 0x00, 0xe0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0xf8, 0xff, + 0xff, 0x0f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0x7f }, + .result = { 0xcf, 0x72, 0xb4, 0xaa, 0x6a, 0xa1, 0xc9, 0xf8, + 0x94, 0xf4, 0x16, 0x5b, 0x86, 0x10, 0x9a, 0xa4, + 0x68, 0x51, 0x76, 0x48, 0xe1, 0xf0, 0xcc, 0x70, + 0xe1, 0xab, 0x08, 0x46, 0x01, 0x76, 0x50, 0x6b }, + .valid = true + }, + /* wycheproof - edge case on twist */ + { + .private = { 0x18, 0xa9, 0x3b, 0x64, 0x99, 0xb9, 0xf6, 0xb3, + 0x22, 0x5c, 0xa0, 0x2f, 0xef, 0x41, 0x0e, 0x0a, + 0xde, 0xc2, 0x35, 0x32, 0x32, 0x1d, 0x2d, 0x8e, + 0xf1, 0xa6, 0xd6, 0x02, 0xa8, 0xc6, 0x5b, 0x83 }, + .public = { 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x7f }, + .result = { 0x5d, 0x50, 0xb6, 0x28, 0x36, 0xbb, 0x69, 0x57, + 0x94, 0x10, 0x38, 0x6c, 0xf7, 0xbb, 0x81, 0x1c, + 0x14, 0xbf, 0x85, 0xb1, 0xc7, 0xb1, 0x7e, 0x59, + 0x24, 0xc7, 0xff, 0xea, 0x91, 0xef, 0x9e, 0x12 }, + .valid = true + }, + /* wycheproof - edge case on twist */ + { + .private = { 0xc0, 0x1d, 0x13, 0x05, 0xa1, 0x33, 0x8a, 0x1f, + 0xca, 0xc2, 0xba, 0x7e, 0x2e, 0x03, 0x2b, 0x42, + 0x7e, 0x0b, 0x04, 0x90, 0x31, 0x65, 0xac, 0xa9, + 0x57, 0xd8, 0xd0, 0x55, 0x3d, 0x87, 0x17, 0xb0 }, + .public = { 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .result = { 0x19, 0x23, 0x0e, 0xb1, 0x48, 0xd5, 0xd6, 0x7c, + 0x3c, 0x22, 0xab, 0x1d, 0xae, 0xff, 0x80, 0xa5, + 0x7e, 0xae, 0x42, 0x65, 0xce, 0x28, 0x72, 0x65, + 0x7b, 0x2c, 0x80, 0x99, 0xfc, 0x69, 0x8e, 0x50 }, + .valid = true + }, + /* wycheproof - edge case for public key */ + { + .private = { 0x38, 0x6f, 0x7f, 0x16, 0xc5, 0x07, 0x31, 0xd6, + 0x4f, 0x82, 0xe6, 0xa1, 0x70, 0xb1, 0x42, 0xa4, + 0xe3, 0x4f, 0x31, 0xfd, 0x77, 0x68, 0xfc, 0xb8, + 0x90, 0x29, 0x25, 0xe7, 0xd1, 0xe2, 0x1a, 0xbe }, + .public = { 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .result = { 0x0f, 0xca, 0xb5, 0xd8, 0x42, 0xa0, 0x78, 0xd7, + 0xa7, 0x1f, 0xc5, 0x9b, 0x57, 0xbf, 0xb4, 0xca, + 0x0b, 0xe6, 0x87, 0x3b, 0x49, 0xdc, 0xdb, 0x9f, + 0x44, 0xe1, 0x4a, 0xe8, 0xfb, 0xdf, 0xa5, 0x42 }, + .valid = true + }, + /* wycheproof - edge case for public key */ + { + .private = { 0xe0, 0x23, 0xa2, 0x89, 0xbd, 0x5e, 0x90, 0xfa, + 0x28, 0x04, 0xdd, 0xc0, 0x19, 0xa0, 0x5e, 0xf3, + 0xe7, 0x9d, 0x43, 0x4b, 0xb6, 0xea, 0x2f, 0x52, + 0x2e, 0xcb, 0x64, 0x3a, 0x75, 0x29, 0x6e, 0x95 }, + .public = { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, + .result = { 0x54, 0xce, 0x8f, 0x22, 0x75, 0xc0, 0x77, 0xe3, + 0xb1, 0x30, 0x6a, 0x39, 0x39, 0xc5, 0xe0, 0x3e, + 0xef, 0x6b, 0xbb, 0x88, 0x06, 0x05, 0x44, 0x75, + 0x8d, 0x9f, 0xef, 0x59, 0xb0, 0xbc, 0x3e, 0x4f }, + .valid = true + }, + /* wycheproof - edge case for public key */ + { + .private = { 0x68, 0xf0, 0x10, 0xd6, 0x2e, 0xe8, 0xd9, 0x26, + 0x05, 0x3a, 0x36, 0x1c, 0x3a, 0x75, 0xc6, 0xea, + 0x4e, 0xbd, 0xc8, 0x60, 0x6a, 0xb2, 0x85, 0x00, + 0x3a, 0x6f, 0x8f, 0x40, 0x76, 0xb0, 0x1e, 0x83 }, + .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 }, + .result = { 0xf1, 0x36, 0x77, 0x5c, 0x5b, 0xeb, 0x0a, 0xf8, + 0x11, 0x0a, 0xf1, 0x0b, 0x20, 0x37, 0x23, 0x32, + 0x04, 0x3c, 0xab, 0x75, 0x24, 0x19, 0x67, 0x87, + 0x75, 0xa2, 0x23, 0xdf, 0x57, 0xc9, 0xd3, 0x0d }, + .valid = true + }, + /* wycheproof - edge case for public key */ + { + .private = { 0x58, 0xeb, 0xcb, 0x35, 0xb0, 0xf8, 0x84, 0x5c, + 0xaf, 0x1e, 0xc6, 0x30, 0xf9, 0x65, 0x76, 0xb6, + 0x2c, 0x4b, 0x7b, 0x6c, 0x36, 0xb2, 0x9d, 0xeb, + 0x2c, 0xb0, 0x08, 0x46, 0x51, 0x75, 0x5c, 0x96 }, + .public = { 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xfb, 0xff, + 0xff, 0xdf, 0xff, 0xff, 0xdf, 0xff, 0xff, 0xff, + 0xfe, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xf7, 0xff, + 0xff, 0xf7, 0xff, 0xff, 0xbf, 0xff, 0xff, 0x3f }, + .result = { 0xbf, 0x9a, 0xff, 0xd0, 0x6b, 0x84, 0x40, 0x85, + 0x58, 0x64, 0x60, 0x96, 0x2e, 0xf2, 0x14, 0x6f, + 0xf3, 0xd4, 0x53, 0x3d, 0x94, 0x44, 0xaa, 0xb0, + 0x06, 0xeb, 0x88, 0xcc, 0x30, 0x54, 0x40, 0x7d }, + .valid = true + }, + /* wycheproof - edge case for public key */ + { + .private = { 0x18, 0x8c, 0x4b, 0xc5, 0xb9, 0xc4, 0x4b, 0x38, + 0xbb, 0x65, 0x8b, 0x9b, 0x2a, 0xe8, 0x2d, 0x5b, + 0x01, 0x01, 0x5e, 0x09, 0x31, 0x84, 0xb1, 0x7c, + 0xb7, 0x86, 0x35, 0x03, 0xa7, 0x83, 0xe1, 0xbb }, + .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, + .result = { 0xd4, 0x80, 0xde, 0x04, 0xf6, 0x99, 0xcb, 0x3b, + 0xe0, 0x68, 0x4a, 0x9c, 0xc2, 0xe3, 0x12, 0x81, + 0xea, 0x0b, 0xc5, 0xa9, 0xdc, 0xc1, 0x57, 0xd3, + 0xd2, 0x01, 0x58, 0xd4, 0x6c, 0xa5, 0x24, 0x6d }, + .valid = true + }, + /* wycheproof - edge case for public key */ + { + .private = { 0xe0, 0x6c, 0x11, 0xbb, 0x2e, 0x13, 0xce, 0x3d, + 0xc7, 0x67, 0x3f, 0x67, 0xf5, 0x48, 0x22, 0x42, + 0x90, 0x94, 0x23, 0xa9, 0xae, 0x95, 0xee, 0x98, + 0x6a, 0x98, 0x8d, 0x98, 0xfa, 0xee, 0x23, 0xa2 }, + .public = { 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, + 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, + 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, + 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f }, + .result = { 0x4c, 0x44, 0x01, 0xcc, 0xe6, 0xb5, 0x1e, 0x4c, + 0xb1, 0x8f, 0x27, 0x90, 0x24, 0x6c, 0x9b, 0xf9, + 0x14, 0xdb, 0x66, 0x77, 0x50, 0xa1, 0xcb, 0x89, + 0x06, 0x90, 0x92, 0xaf, 0x07, 0x29, 0x22, 0x76 }, + .valid = true + }, + /* wycheproof - edge case for public key */ + { + .private = { 0xc0, 0x65, 0x8c, 0x46, 0xdd, 0xe1, 0x81, 0x29, + 0x29, 0x38, 0x77, 0x53, 0x5b, 0x11, 0x62, 0xb6, + 0xf9, 0xf5, 0x41, 0x4a, 0x23, 0xcf, 0x4d, 0x2c, + 0xbc, 0x14, 0x0a, 0x4d, 0x99, 0xda, 0x2b, 0x8f }, + .public = { 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .result = { 0x57, 0x8b, 0xa8, 0xcc, 0x2d, 0xbd, 0xc5, 0x75, + 0xaf, 0xcf, 0x9d, 0xf2, 0xb3, 0xee, 0x61, 0x89, + 0xf5, 0x33, 0x7d, 0x68, 0x54, 0xc7, 0x9b, 0x4c, + 0xe1, 0x65, 0xea, 0x12, 0x29, 0x3b, 0x3a, 0x0f }, + .valid = true + }, + /* wycheproof - public key with low order */ + { + .private = { 0x10, 0x25, 0x5c, 0x92, 0x30, 0xa9, 0x7a, 0x30, + 0xa4, 0x58, 0xca, 0x28, 0x4a, 0x62, 0x96, 0x69, + 0x29, 0x3a, 0x31, 0x89, 0x0c, 0xda, 0x9d, 0x14, + 0x7f, 0xeb, 0xc7, 0xd1, 0xe2, 0x2d, 0x6b, 0xb1 }, + .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, + 0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, + 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd, + 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8, 0x00 }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0x78, 0xf1, 0xe8, 0xed, 0xf1, 0x44, 0x81, 0xb3, + 0x89, 0x44, 0x8d, 0xac, 0x8f, 0x59, 0xc7, 0x0b, + 0x03, 0x8e, 0x7c, 0xf9, 0x2e, 0xf2, 0xc7, 0xef, + 0xf5, 0x7a, 0x72, 0x46, 0x6e, 0x11, 0x52, 0x96 }, + .public = { 0x5f, 0x9c, 0x95, 0xbc, 0xa3, 0x50, 0x8c, 0x24, + 0xb1, 0xd0, 0xb1, 0x55, 0x9c, 0x83, 0xef, 0x5b, + 0x04, 0x44, 0x5c, 0xc4, 0x58, 0x1c, 0x8e, 0x86, + 0xd8, 0x22, 0x4e, 0xdd, 0xd0, 0x9f, 0x11, 0x57 }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0xa0, 0xa0, 0x5a, 0x3e, 0x8f, 0x9f, 0x44, 0x20, + 0x4d, 0x5f, 0x80, 0x59, 0xa9, 0x4a, 0xc7, 0xdf, + 0xc3, 0x9a, 0x49, 0xac, 0x01, 0x6d, 0xd7, 0x43, + 0xdb, 0xfa, 0x43, 0xc5, 0xd6, 0x71, 0xfd, 0x88 }, + .public = { 0xec, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0xd0, 0xdb, 0xb3, 0xed, 0x19, 0x06, 0x66, 0x3f, + 0x15, 0x42, 0x0a, 0xf3, 0x1f, 0x4e, 0xaf, 0x65, + 0x09, 0xd9, 0xa9, 0x94, 0x97, 0x23, 0x50, 0x06, + 0x05, 0xad, 0x7c, 0x1c, 0x6e, 0x74, 0x50, 0xa9 }, + .public = { 0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0xc0, 0xb1, 0xd0, 0xeb, 0x22, 0xb2, 0x44, 0xfe, + 0x32, 0x91, 0x14, 0x00, 0x72, 0xcd, 0xd9, 0xd9, + 0x89, 0xb5, 0xf0, 0xec, 0xd9, 0x6c, 0x10, 0x0f, + 0xeb, 0x5b, 0xca, 0x24, 0x1c, 0x1d, 0x9f, 0x8f }, + .public = { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0x48, 0x0b, 0xf4, 0x5f, 0x59, 0x49, 0x42, 0xa8, + 0xbc, 0x0f, 0x33, 0x53, 0xc6, 0xe8, 0xb8, 0x85, + 0x3d, 0x77, 0xf3, 0x51, 0xf1, 0xc2, 0xca, 0x6c, + 0x2d, 0x1a, 0xbf, 0x8a, 0x00, 0xb4, 0x22, 0x9c }, + .public = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0x30, 0xf9, 0x93, 0xfc, 0xf8, 0x51, 0x4f, 0xc8, + 0x9b, 0xd8, 0xdb, 0x14, 0xcd, 0x43, 0xba, 0x0d, + 0x4b, 0x25, 0x30, 0xe7, 0x3c, 0x42, 0x76, 0xa0, + 0x5e, 0x1b, 0x14, 0x5d, 0x42, 0x0c, 0xed, 0xb4 }, + .public = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0xc0, 0x49, 0x74, 0xb7, 0x58, 0x38, 0x0e, 0x2a, + 0x5b, 0x5d, 0xf6, 0xeb, 0x09, 0xbb, 0x2f, 0x6b, + 0x34, 0x34, 0xf9, 0x82, 0x72, 0x2a, 0x8e, 0x67, + 0x6d, 0x3d, 0xa2, 0x51, 0xd1, 0xb3, 0xde, 0x83 }, + .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, + 0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, + 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd, + 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8, 0x80 }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0x50, 0x2a, 0x31, 0x37, 0x3d, 0xb3, 0x24, 0x46, + 0x84, 0x2f, 0xe5, 0xad, 0xd3, 0xe0, 0x24, 0x02, + 0x2e, 0xa5, 0x4f, 0x27, 0x41, 0x82, 0xaf, 0xc3, + 0xd9, 0xf1, 0xbb, 0x3d, 0x39, 0x53, 0x4e, 0xb5 }, + .public = { 0x5f, 0x9c, 0x95, 0xbc, 0xa3, 0x50, 0x8c, 0x24, + 0xb1, 0xd0, 0xb1, 0x55, 0x9c, 0x83, 0xef, 0x5b, + 0x04, 0x44, 0x5c, 0xc4, 0x58, 0x1c, 0x8e, 0x86, + 0xd8, 0x22, 0x4e, 0xdd, 0xd0, 0x9f, 0x11, 0xd7 }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0x90, 0xfa, 0x64, 0x17, 0xb0, 0xe3, 0x70, 0x30, + 0xfd, 0x6e, 0x43, 0xef, 0xf2, 0xab, 0xae, 0xf1, + 0x4c, 0x67, 0x93, 0x11, 0x7a, 0x03, 0x9c, 0xf6, + 0x21, 0x31, 0x8b, 0xa9, 0x0f, 0x4e, 0x98, 0xbe }, + .public = { 0xec, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0x78, 0xad, 0x3f, 0x26, 0x02, 0x7f, 0x1c, 0x9f, + 0xdd, 0x97, 0x5a, 0x16, 0x13, 0xb9, 0x47, 0x77, + 0x9b, 0xad, 0x2c, 0xf2, 0xb7, 0x41, 0xad, 0xe0, + 0x18, 0x40, 0x88, 0x5a, 0x30, 0xbb, 0x97, 0x9c }, + .public = { 0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key with low order */ + { + .private = { 0x98, 0xe2, 0x3d, 0xe7, 0xb1, 0xe0, 0x92, 0x6e, + 0xd9, 0xc8, 0x7e, 0x7b, 0x14, 0xba, 0xf5, 0x5f, + 0x49, 0x7a, 0x1d, 0x70, 0x96, 0xf9, 0x39, 0x77, + 0x68, 0x0e, 0x44, 0xdc, 0x1c, 0x7b, 0x7b, 0x8b }, + .public = { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = false + }, + /* wycheproof - public key >= p */ + { + .private = { 0xf0, 0x1e, 0x48, 0xda, 0xfa, 0xc9, 0xd7, 0xbc, + 0xf5, 0x89, 0xcb, 0xc3, 0x82, 0xc8, 0x78, 0xd1, + 0x8b, 0xda, 0x35, 0x50, 0x58, 0x9f, 0xfb, 0x5d, + 0x50, 0xb5, 0x23, 0xbe, 0xbe, 0x32, 0x9d, 0xae }, + .public = { 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .result = { 0xbd, 0x36, 0xa0, 0x79, 0x0e, 0xb8, 0x83, 0x09, + 0x8c, 0x98, 0x8b, 0x21, 0x78, 0x67, 0x73, 0xde, + 0x0b, 0x3a, 0x4d, 0xf1, 0x62, 0x28, 0x2c, 0xf1, + 0x10, 0xde, 0x18, 0xdd, 0x48, 0x4c, 0xe7, 0x4b }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0x28, 0x87, 0x96, 0xbc, 0x5a, 0xff, 0x4b, 0x81, + 0xa3, 0x75, 0x01, 0x75, 0x7b, 0xc0, 0x75, 0x3a, + 0x3c, 0x21, 0x96, 0x47, 0x90, 0xd3, 0x86, 0x99, + 0x30, 0x8d, 0xeb, 0xc1, 0x7a, 0x6e, 0xaf, 0x8d }, + .public = { 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .result = { 0xb4, 0xe0, 0xdd, 0x76, 0xda, 0x7b, 0x07, 0x17, + 0x28, 0xb6, 0x1f, 0x85, 0x67, 0x71, 0xaa, 0x35, + 0x6e, 0x57, 0xed, 0xa7, 0x8a, 0x5b, 0x16, 0x55, + 0xcc, 0x38, 0x20, 0xfb, 0x5f, 0x85, 0x4c, 0x5c }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0x98, 0xdf, 0x84, 0x5f, 0x66, 0x51, 0xbf, 0x11, + 0x38, 0x22, 0x1f, 0x11, 0x90, 0x41, 0xf7, 0x2b, + 0x6d, 0xbc, 0x3c, 0x4a, 0xce, 0x71, 0x43, 0xd9, + 0x9f, 0xd5, 0x5a, 0xd8, 0x67, 0x48, 0x0d, 0xa8 }, + .public = { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .result = { 0x6f, 0xdf, 0x6c, 0x37, 0x61, 0x1d, 0xbd, 0x53, + 0x04, 0xdc, 0x0f, 0x2e, 0xb7, 0xc9, 0x51, 0x7e, + 0xb3, 0xc5, 0x0e, 0x12, 0xfd, 0x05, 0x0a, 0xc6, + 0xde, 0xc2, 0x70, 0x71, 0xd4, 0xbf, 0xc0, 0x34 }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0xf0, 0x94, 0x98, 0xe4, 0x6f, 0x02, 0xf8, 0x78, + 0x82, 0x9e, 0x78, 0xb8, 0x03, 0xd3, 0x16, 0xa2, + 0xed, 0x69, 0x5d, 0x04, 0x98, 0xa0, 0x8a, 0xbd, + 0xf8, 0x27, 0x69, 0x30, 0xe2, 0x4e, 0xdc, 0xb0 }, + .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .result = { 0x4c, 0x8f, 0xc4, 0xb1, 0xc6, 0xab, 0x88, 0xfb, + 0x21, 0xf1, 0x8f, 0x6d, 0x4c, 0x81, 0x02, 0x40, + 0xd4, 0xe9, 0x46, 0x51, 0xba, 0x44, 0xf7, 0xa2, + 0xc8, 0x63, 0xce, 0xc7, 0xdc, 0x56, 0x60, 0x2d }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0x18, 0x13, 0xc1, 0x0a, 0x5c, 0x7f, 0x21, 0xf9, + 0x6e, 0x17, 0xf2, 0x88, 0xc0, 0xcc, 0x37, 0x60, + 0x7c, 0x04, 0xc5, 0xf5, 0xae, 0xa2, 0xdb, 0x13, + 0x4f, 0x9e, 0x2f, 0xfc, 0x66, 0xbd, 0x9d, 0xb8 }, + .public = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, + .result = { 0x1c, 0xd0, 0xb2, 0x82, 0x67, 0xdc, 0x54, 0x1c, + 0x64, 0x2d, 0x6d, 0x7d, 0xca, 0x44, 0xa8, 0xb3, + 0x8a, 0x63, 0x73, 0x6e, 0xef, 0x5c, 0x4e, 0x65, + 0x01, 0xff, 0xbb, 0xb1, 0x78, 0x0c, 0x03, 0x3c }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0x78, 0x57, 0xfb, 0x80, 0x86, 0x53, 0x64, 0x5a, + 0x0b, 0xeb, 0x13, 0x8a, 0x64, 0xf5, 0xf4, 0xd7, + 0x33, 0xa4, 0x5e, 0xa8, 0x4c, 0x3c, 0xda, 0x11, + 0xa9, 0xc0, 0x6f, 0x7e, 0x71, 0x39, 0x14, 0x9e }, + .public = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, + .result = { 0x87, 0x55, 0xbe, 0x01, 0xc6, 0x0a, 0x7e, 0x82, + 0x5c, 0xff, 0x3e, 0x0e, 0x78, 0xcb, 0x3a, 0xa4, + 0x33, 0x38, 0x61, 0x51, 0x6a, 0xa5, 0x9b, 0x1c, + 0x51, 0xa8, 0xb2, 0xa5, 0x43, 0xdf, 0xa8, 0x22 }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0xe0, 0x3a, 0xa8, 0x42, 0xe2, 0xab, 0xc5, 0x6e, + 0x81, 0xe8, 0x7b, 0x8b, 0x9f, 0x41, 0x7b, 0x2a, + 0x1e, 0x59, 0x13, 0xc7, 0x23, 0xee, 0xd2, 0x8d, + 0x75, 0x2f, 0x8d, 0x47, 0xa5, 0x9f, 0x49, 0x8f }, + .public = { 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, + .result = { 0x54, 0xc9, 0xa1, 0xed, 0x95, 0xe5, 0x46, 0xd2, + 0x78, 0x22, 0xa3, 0x60, 0x93, 0x1d, 0xda, 0x60, + 0xa1, 0xdf, 0x04, 0x9d, 0xa6, 0xf9, 0x04, 0x25, + 0x3c, 0x06, 0x12, 0xbb, 0xdc, 0x08, 0x74, 0x76 }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0xf8, 0xf7, 0x07, 0xb7, 0x99, 0x9b, 0x18, 0xcb, + 0x0d, 0x6b, 0x96, 0x12, 0x4f, 0x20, 0x45, 0x97, + 0x2c, 0xa2, 0x74, 0xbf, 0xc1, 0x54, 0xad, 0x0c, + 0x87, 0x03, 0x8c, 0x24, 0xc6, 0xd0, 0xd4, 0xb2 }, + .public = { 0xda, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0xcc, 0x1f, 0x40, 0xd7, 0x43, 0xcd, 0xc2, 0x23, + 0x0e, 0x10, 0x43, 0xda, 0xba, 0x8b, 0x75, 0xe8, + 0x10, 0xf1, 0xfb, 0xab, 0x7f, 0x25, 0x52, 0x69, + 0xbd, 0x9e, 0xbb, 0x29, 0xe6, 0xbf, 0x49, 0x4f }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0xa0, 0x34, 0xf6, 0x84, 0xfa, 0x63, 0x1e, 0x1a, + 0x34, 0x81, 0x18, 0xc1, 0xce, 0x4c, 0x98, 0x23, + 0x1f, 0x2d, 0x9e, 0xec, 0x9b, 0xa5, 0x36, 0x5b, + 0x4a, 0x05, 0xd6, 0x9a, 0x78, 0x5b, 0x07, 0x96 }, + .public = { 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0x54, 0x99, 0x8e, 0xe4, 0x3a, 0x5b, 0x00, 0x7b, + 0xf4, 0x99, 0xf0, 0x78, 0xe7, 0x36, 0x52, 0x44, + 0x00, 0xa8, 0xb5, 0xc7, 0xe9, 0xb9, 0xb4, 0x37, + 0x71, 0x74, 0x8c, 0x7c, 0xdf, 0x88, 0x04, 0x12 }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0x30, 0xb6, 0xc6, 0xa0, 0xf2, 0xff, 0xa6, 0x80, + 0x76, 0x8f, 0x99, 0x2b, 0xa8, 0x9e, 0x15, 0x2d, + 0x5b, 0xc9, 0x89, 0x3d, 0x38, 0xc9, 0x11, 0x9b, + 0xe4, 0xf7, 0x67, 0xbf, 0xab, 0x6e, 0x0c, 0xa5 }, + .public = { 0xdc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0xea, 0xd9, 0xb3, 0x8e, 0xfd, 0xd7, 0x23, 0x63, + 0x79, 0x34, 0xe5, 0x5a, 0xb7, 0x17, 0xa7, 0xae, + 0x09, 0xeb, 0x86, 0xa2, 0x1d, 0xc3, 0x6a, 0x3f, + 0xee, 0xb8, 0x8b, 0x75, 0x9e, 0x39, 0x1e, 0x09 }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0x90, 0x1b, 0x9d, 0xcf, 0x88, 0x1e, 0x01, 0xe0, + 0x27, 0x57, 0x50, 0x35, 0xd4, 0x0b, 0x43, 0xbd, + 0xc1, 0xc5, 0x24, 0x2e, 0x03, 0x08, 0x47, 0x49, + 0x5b, 0x0c, 0x72, 0x86, 0x46, 0x9b, 0x65, 0x91 }, + .public = { 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0x60, 0x2f, 0xf4, 0x07, 0x89, 0xb5, 0x4b, 0x41, + 0x80, 0x59, 0x15, 0xfe, 0x2a, 0x62, 0x21, 0xf0, + 0x7a, 0x50, 0xff, 0xc2, 0xc3, 0xfc, 0x94, 0xcf, + 0x61, 0xf1, 0x3d, 0x79, 0x04, 0xe8, 0x8e, 0x0e }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0x80, 0x46, 0x67, 0x7c, 0x28, 0xfd, 0x82, 0xc9, + 0xa1, 0xbd, 0xb7, 0x1a, 0x1a, 0x1a, 0x34, 0xfa, + 0xba, 0x12, 0x25, 0xe2, 0x50, 0x7f, 0xe3, 0xf5, + 0x4d, 0x10, 0xbd, 0x5b, 0x0d, 0x86, 0x5f, 0x8e }, + .public = { 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0xe0, 0x0a, 0xe8, 0xb1, 0x43, 0x47, 0x12, 0x47, + 0xba, 0x24, 0xf1, 0x2c, 0x88, 0x55, 0x36, 0xc3, + 0xcb, 0x98, 0x1b, 0x58, 0xe1, 0xe5, 0x6b, 0x2b, + 0xaf, 0x35, 0xc1, 0x2a, 0xe1, 0xf7, 0x9c, 0x26 }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0x60, 0x2f, 0x7e, 0x2f, 0x68, 0xa8, 0x46, 0xb8, + 0x2c, 0xc2, 0x69, 0xb1, 0xd4, 0x8e, 0x93, 0x98, + 0x86, 0xae, 0x54, 0xfd, 0x63, 0x6c, 0x1f, 0xe0, + 0x74, 0xd7, 0x10, 0x12, 0x7d, 0x47, 0x24, 0x91 }, + .public = { 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0x98, 0xcb, 0x9b, 0x50, 0xdd, 0x3f, 0xc2, 0xb0, + 0xd4, 0xf2, 0xd2, 0xbf, 0x7c, 0x5c, 0xfd, 0xd1, + 0x0c, 0x8f, 0xcd, 0x31, 0xfc, 0x40, 0xaf, 0x1a, + 0xd4, 0x4f, 0x47, 0xc1, 0x31, 0x37, 0x63, 0x62 }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0x60, 0x88, 0x7b, 0x3d, 0xc7, 0x24, 0x43, 0x02, + 0x6e, 0xbe, 0xdb, 0xbb, 0xb7, 0x06, 0x65, 0xf4, + 0x2b, 0x87, 0xad, 0xd1, 0x44, 0x0e, 0x77, 0x68, + 0xfb, 0xd7, 0xe8, 0xe2, 0xce, 0x5f, 0x63, 0x9d }, + .public = { 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0x38, 0xd6, 0x30, 0x4c, 0x4a, 0x7e, 0x6d, 0x9f, + 0x79, 0x59, 0x33, 0x4f, 0xb5, 0x24, 0x5b, 0xd2, + 0xc7, 0x54, 0x52, 0x5d, 0x4c, 0x91, 0xdb, 0x95, + 0x02, 0x06, 0x92, 0x62, 0x34, 0xc1, 0xf6, 0x33 }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0x78, 0xd3, 0x1d, 0xfa, 0x85, 0x44, 0x97, 0xd7, + 0x2d, 0x8d, 0xef, 0x8a, 0x1b, 0x7f, 0xb0, 0x06, + 0xce, 0xc2, 0xd8, 0xc4, 0x92, 0x46, 0x47, 0xc9, + 0x38, 0x14, 0xae, 0x56, 0xfa, 0xed, 0xa4, 0x95 }, + .public = { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0x78, 0x6c, 0xd5, 0x49, 0x96, 0xf0, 0x14, 0xa5, + 0xa0, 0x31, 0xec, 0x14, 0xdb, 0x81, 0x2e, 0xd0, + 0x83, 0x55, 0x06, 0x1f, 0xdb, 0x5d, 0xe6, 0x80, + 0xa8, 0x00, 0xac, 0x52, 0x1f, 0x31, 0x8e, 0x23 }, + .valid = true + }, + /* wycheproof - public key >= p */ + { + .private = { 0xc0, 0x4c, 0x5b, 0xae, 0xfa, 0x83, 0x02, 0xdd, + 0xde, 0xd6, 0xa4, 0xbb, 0x95, 0x77, 0x61, 0xb4, + 0xeb, 0x97, 0xae, 0xfa, 0x4f, 0xc3, 0xb8, 0x04, + 0x30, 0x85, 0xf9, 0x6a, 0x56, 0x59, 0xb3, 0xa5 }, + .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0x29, 0xae, 0x8b, 0xc7, 0x3e, 0x9b, 0x10, 0xa0, + 0x8b, 0x4f, 0x68, 0x1c, 0x43, 0xc3, 0xe0, 0xac, + 0x1a, 0x17, 0x1d, 0x31, 0xb3, 0x8f, 0x1a, 0x48, + 0xef, 0xba, 0x29, 0xae, 0x63, 0x9e, 0xa1, 0x34 }, + .valid = true + }, + /* wycheproof - RFC 7748 */ + { + .private = { 0xa0, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, + 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, + 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, + 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0x44 }, + .public = { 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, + 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, + 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, + 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, + .result = { 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, + 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, + 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, + 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 }, + .valid = true + }, + /* wycheproof - RFC 7748 */ + { + .private = { 0x48, 0x66, 0xe9, 0xd4, 0xd1, 0xb4, 0x67, 0x3c, + 0x5a, 0xd2, 0x26, 0x91, 0x95, 0x7d, 0x6a, 0xf5, + 0xc1, 0x1b, 0x64, 0x21, 0xe0, 0xea, 0x01, 0xd4, + 0x2c, 0xa4, 0x16, 0x9e, 0x79, 0x18, 0xba, 0x4d }, + .public = { 0xe5, 0x21, 0x0f, 0x12, 0x78, 0x68, 0x11, 0xd3, + 0xf4, 0xb7, 0x95, 0x9d, 0x05, 0x38, 0xae, 0x2c, + 0x31, 0xdb, 0xe7, 0x10, 0x6f, 0xc0, 0x3c, 0x3e, + 0xfc, 0x4c, 0xd5, 0x49, 0xc7, 0x15, 0xa4, 0x13 }, + .result = { 0x95, 0xcb, 0xde, 0x94, 0x76, 0xe8, 0x90, 0x7d, + 0x7a, 0xad, 0xe4, 0x5c, 0xb4, 0xb8, 0x73, 0xf8, + 0x8b, 0x59, 0x5a, 0x68, 0x79, 0x9f, 0xa1, 0x52, + 0xe6, 0xf8, 0xf7, 0x64, 0x7a, 0xac, 0x79, 0x57 }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0x0a, 0xb4, 0xe7, 0x63, 0x80, 0xd8, 0x4d, 0xde, + 0x4f, 0x68, 0x33, 0xc5, 0x8f, 0x2a, 0x9f, 0xb8, + 0xf8, 0x3b, 0xb0, 0x16, 0x9b, 0x17, 0x2b, 0xe4, + 0xb6, 0xe0, 0x59, 0x28, 0x87, 0x74, 0x1a, 0x36 }, + .result = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0x89, 0xe1, 0x0d, 0x57, 0x01, 0xb4, 0x33, 0x7d, + 0x2d, 0x03, 0x21, 0x81, 0x53, 0x8b, 0x10, 0x64, + 0xbd, 0x40, 0x84, 0x40, 0x1c, 0xec, 0xa1, 0xfd, + 0x12, 0x66, 0x3a, 0x19, 0x59, 0x38, 0x80, 0x00 }, + .result = { 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0x2b, 0x55, 0xd3, 0xaa, 0x4a, 0x8f, 0x80, 0xc8, + 0xc0, 0xb2, 0xae, 0x5f, 0x93, 0x3e, 0x85, 0xaf, + 0x49, 0xbe, 0xac, 0x36, 0xc2, 0xfa, 0x73, 0x94, + 0xba, 0xb7, 0x6c, 0x89, 0x33, 0xf8, 0xf8, 0x1d }, + .result = { 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0x63, 0xe5, 0xb1, 0xfe, 0x96, 0x01, 0xfe, 0x84, + 0x38, 0x5d, 0x88, 0x66, 0xb0, 0x42, 0x12, 0x62, + 0xf7, 0x8f, 0xbf, 0xa5, 0xaf, 0xf9, 0x58, 0x5e, + 0x62, 0x66, 0x79, 0xb1, 0x85, 0x47, 0xd9, 0x59 }, + .result = { 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0xe4, 0x28, 0xf3, 0xda, 0xc1, 0x78, 0x09, 0xf8, + 0x27, 0xa5, 0x22, 0xce, 0x32, 0x35, 0x50, 0x58, + 0xd0, 0x73, 0x69, 0x36, 0x4a, 0xa7, 0x89, 0x02, + 0xee, 0x10, 0x13, 0x9b, 0x9f, 0x9d, 0xd6, 0x53 }, + .result = { 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0xb3, 0xb5, 0x0e, 0x3e, 0xd3, 0xa4, 0x07, 0xb9, + 0x5d, 0xe9, 0x42, 0xef, 0x74, 0x57, 0x5b, 0x5a, + 0xb8, 0xa1, 0x0c, 0x09, 0xee, 0x10, 0x35, 0x44, + 0xd6, 0x0b, 0xdf, 0xed, 0x81, 0x38, 0xab, 0x2b }, + .result = { 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0x21, 0x3f, 0xff, 0xe9, 0x3d, 0x5e, 0xa8, 0xcd, + 0x24, 0x2e, 0x46, 0x28, 0x44, 0x02, 0x99, 0x22, + 0xc4, 0x3c, 0x77, 0xc9, 0xe3, 0xe4, 0x2f, 0x56, + 0x2f, 0x48, 0x5d, 0x24, 0xc5, 0x01, 0xa2, 0x0b }, + .result = { 0xf3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0x91, 0xb2, 0x32, 0xa1, 0x78, 0xb3, 0xcd, 0x53, + 0x09, 0x32, 0x44, 0x1e, 0x61, 0x39, 0x41, 0x8f, + 0x72, 0x17, 0x22, 0x92, 0xf1, 0xda, 0x4c, 0x18, + 0x34, 0xfc, 0x5e, 0xbf, 0xef, 0xb5, 0x1e, 0x3f }, + .result = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0x04, 0x5c, 0x6e, 0x11, 0xc5, 0xd3, 0x32, 0x55, + 0x6c, 0x78, 0x22, 0xfe, 0x94, 0xeb, 0xf8, 0x9b, + 0x56, 0xa3, 0x87, 0x8d, 0xc2, 0x7c, 0xa0, 0x79, + 0x10, 0x30, 0x58, 0x84, 0x9f, 0xab, 0xcb, 0x4f }, + .result = { 0xe5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0x1c, 0xa2, 0x19, 0x0b, 0x71, 0x16, 0x35, 0x39, + 0x06, 0x3c, 0x35, 0x77, 0x3b, 0xda, 0x0c, 0x9c, + 0x92, 0x8e, 0x91, 0x36, 0xf0, 0x62, 0x0a, 0xeb, + 0x09, 0x3f, 0x09, 0x91, 0x97, 0xb7, 0xf7, 0x4e }, + .result = { 0xe3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0xf7, 0x6e, 0x90, 0x10, 0xac, 0x33, 0xc5, 0x04, + 0x3b, 0x2d, 0x3b, 0x76, 0xa8, 0x42, 0x17, 0x10, + 0x00, 0xc4, 0x91, 0x62, 0x22, 0xe9, 0xe8, 0x58, + 0x97, 0xa0, 0xae, 0xc7, 0xf6, 0x35, 0x0b, 0x3c }, + .result = { 0xdd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0xbb, 0x72, 0x68, 0x8d, 0x8f, 0x8a, 0xa7, 0xa3, + 0x9c, 0xd6, 0x06, 0x0c, 0xd5, 0xc8, 0x09, 0x3c, + 0xde, 0xc6, 0xfe, 0x34, 0x19, 0x37, 0xc3, 0x88, + 0x6a, 0x99, 0x34, 0x6c, 0xd0, 0x7f, 0xaa, 0x55 }, + .result = { 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0x88, 0xfd, 0xde, 0xa1, 0x93, 0x39, 0x1c, 0x6a, + 0x59, 0x33, 0xef, 0x9b, 0x71, 0x90, 0x15, 0x49, + 0x44, 0x72, 0x05, 0xaa, 0xe9, 0xda, 0x92, 0x8a, + 0x6b, 0x91, 0xa3, 0x52, 0xba, 0x10, 0xf4, 0x1f }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 }, + .valid = true + }, + /* wycheproof - edge case for shared secret */ + { + .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, + 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, + 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, + 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, + .public = { 0x30, 0x3b, 0x39, 0x2f, 0x15, 0x31, 0x16, 0xca, + 0xd9, 0xcc, 0x68, 0x2a, 0x00, 0xcc, 0xc4, 0x4c, + 0x95, 0xff, 0x0d, 0x3b, 0xbe, 0x56, 0x8b, 0xeb, + 0x6c, 0x4e, 0x73, 0x9b, 0xaf, 0xdc, 0x2c, 0x68 }, + .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 }, + .valid = true + }, + /* wycheproof - checking for overflow */ + { + .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, + 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, + 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, + 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, + .public = { 0xfd, 0x30, 0x0a, 0xeb, 0x40, 0xe1, 0xfa, 0x58, + 0x25, 0x18, 0x41, 0x2b, 0x49, 0xb2, 0x08, 0xa7, + 0x84, 0x2b, 0x1e, 0x1f, 0x05, 0x6a, 0x04, 0x01, + 0x78, 0xea, 0x41, 0x41, 0x53, 0x4f, 0x65, 0x2d }, + .result = { 0xb7, 0x34, 0x10, 0x5d, 0xc2, 0x57, 0x58, 0x5d, + 0x73, 0xb5, 0x66, 0xcc, 0xb7, 0x6f, 0x06, 0x27, + 0x95, 0xcc, 0xbe, 0xc8, 0x91, 0x28, 0xe5, 0x2b, + 0x02, 0xf3, 0xe5, 0x96, 0x39, 0xf1, 0x3c, 0x46 }, + .valid = true + }, + /* wycheproof - checking for overflow */ + { + .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, + 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, + 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, + 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, + .public = { 0xc8, 0xef, 0x79, 0xb5, 0x14, 0xd7, 0x68, 0x26, + 0x77, 0xbc, 0x79, 0x31, 0xe0, 0x6e, 0xe5, 0xc2, + 0x7c, 0x9b, 0x39, 0x2b, 0x4a, 0xe9, 0x48, 0x44, + 0x73, 0xf5, 0x54, 0xe6, 0x67, 0x8e, 0xcc, 0x2e }, + .result = { 0x64, 0x7a, 0x46, 0xb6, 0xfc, 0x3f, 0x40, 0xd6, + 0x21, 0x41, 0xee, 0x3c, 0xee, 0x70, 0x6b, 0x4d, + 0x7a, 0x92, 0x71, 0x59, 0x3a, 0x7b, 0x14, 0x3e, + 0x8e, 0x2e, 0x22, 0x79, 0x88, 0x3e, 0x45, 0x50 }, + .valid = true + }, + /* wycheproof - checking for overflow */ + { + .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, + 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, + 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, + 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, + .public = { 0x64, 0xae, 0xac, 0x25, 0x04, 0x14, 0x48, 0x61, + 0x53, 0x2b, 0x7b, 0xbc, 0xb6, 0xc8, 0x7d, 0x67, + 0xdd, 0x4c, 0x1f, 0x07, 0xeb, 0xc2, 0xe0, 0x6e, + 0xff, 0xb9, 0x5a, 0xec, 0xc6, 0x17, 0x0b, 0x2c }, + .result = { 0x4f, 0xf0, 0x3d, 0x5f, 0xb4, 0x3c, 0xd8, 0x65, + 0x7a, 0x3c, 0xf3, 0x7c, 0x13, 0x8c, 0xad, 0xce, + 0xcc, 0xe5, 0x09, 0xe4, 0xeb, 0xa0, 0x89, 0xd0, + 0xef, 0x40, 0xb4, 0xe4, 0xfb, 0x94, 0x61, 0x55 }, + .valid = true + }, + /* wycheproof - checking for overflow */ + { + .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, + 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, + 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, + 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, + .public = { 0xbf, 0x68, 0xe3, 0x5e, 0x9b, 0xdb, 0x7e, 0xee, + 0x1b, 0x50, 0x57, 0x02, 0x21, 0x86, 0x0f, 0x5d, + 0xcd, 0xad, 0x8a, 0xcb, 0xab, 0x03, 0x1b, 0x14, + 0x97, 0x4c, 0xc4, 0x90, 0x13, 0xc4, 0x98, 0x31 }, + .result = { 0x21, 0xce, 0xe5, 0x2e, 0xfd, 0xbc, 0x81, 0x2e, + 0x1d, 0x02, 0x1a, 0x4a, 0xf1, 0xe1, 0xd8, 0xbc, + 0x4d, 0xb3, 0xc4, 0x00, 0xe4, 0xd2, 0xa2, 0xc5, + 0x6a, 0x39, 0x26, 0xdb, 0x4d, 0x99, 0xc6, 0x5b }, + .valid = true + }, + /* wycheproof - checking for overflow */ + { + .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, + 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, + 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, + 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, + .public = { 0x53, 0x47, 0xc4, 0x91, 0x33, 0x1a, 0x64, 0xb4, + 0x3d, 0xdc, 0x68, 0x30, 0x34, 0xe6, 0x77, 0xf5, + 0x3d, 0xc3, 0x2b, 0x52, 0xa5, 0x2a, 0x57, 0x7c, + 0x15, 0xa8, 0x3b, 0xf2, 0x98, 0xe9, 0x9f, 0x19 }, + .result = { 0x18, 0xcb, 0x89, 0xe4, 0xe2, 0x0c, 0x0c, 0x2b, + 0xd3, 0x24, 0x30, 0x52, 0x45, 0x26, 0x6c, 0x93, + 0x27, 0x69, 0x0b, 0xbe, 0x79, 0xac, 0xb8, 0x8f, + 0x5b, 0x8f, 0xb3, 0xf7, 0x4e, 0xca, 0x3e, 0x52 }, + .valid = true + }, + /* wycheproof - private key == -1 (mod order) */ + { + .private = { 0xa0, 0x23, 0xcd, 0xd0, 0x83, 0xef, 0x5b, 0xb8, + 0x2f, 0x10, 0xd6, 0x2e, 0x59, 0xe1, 0x5a, 0x68, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50 }, + .public = { 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e, + 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57, + 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f, + 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 }, + .result = { 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e, + 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57, + 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f, + 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 }, + .valid = true + }, + /* wycheproof - private key == 1 (mod order) on twist */ + { + .private = { 0x58, 0x08, 0x3d, 0xd2, 0x61, 0xad, 0x91, 0xef, + 0xf9, 0x52, 0x32, 0x2e, 0xc8, 0x24, 0xc6, 0x82, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x5f }, + .public = { 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f, + 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6, + 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64, + 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 }, + .result = { 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f, + 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6, + 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64, + 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 }, + .valid = true + } +}; + +static void test_curve25519(struct kunit *test) +{ + for (size_t i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) { + const struct curve25519_test_vector *vec = + &curve25519_test_vectors[i]; + u8 out[CURVE25519_KEY_SIZE] = {}; + bool ret; + + ret = curve25519(out, vec->private, vec->public); + KUNIT_EXPECT_EQ_MSG(test, ret, vec->valid, + "Wrong return value with test vector %zu", + i); + KUNIT_EXPECT_MEMEQ_MSG(test, out, vec->result, sizeof(out), + "Wrong output with test vector %zu", i); + } +} + +static void test_curve25519_basepoint(struct kunit *test) +{ + for (size_t i = 0; i < 5; ++i) { + u8 in[CURVE25519_KEY_SIZE]; + u8 out[CURVE25519_KEY_SIZE]; + u8 out2[CURVE25519_KEY_SIZE]; + bool ret, ret2; + + get_random_bytes(in, sizeof(in)); + ret = curve25519_generate_public(out, in); + ret2 = curve25519(out2, in, (u8[CURVE25519_KEY_SIZE]){ 9 }); + KUNIT_EXPECT_EQ_MSG(test, ret, ret2, + "in=%*phN", CURVE25519_KEY_SIZE, in); + KUNIT_EXPECT_MEMEQ_MSG(test, out, out2, CURVE25519_KEY_SIZE, + "in=%*phN", CURVE25519_KEY_SIZE, in); + } +} + +static struct kunit_case curve25519_test_cases[] = { + KUNIT_CASE(test_curve25519), + KUNIT_CASE(test_curve25519_basepoint), + {}, +}; + +static struct kunit_suite curve25519_test_suite = { + .name = "curve25519", + .test_cases = curve25519_test_cases, +}; +kunit_test_suite(curve25519_test_suite); + +MODULE_DESCRIPTION("KUnit tests for Curve25519"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 8c06b330e8f79834924305362227e38e4e2469ae Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:20 -0700 Subject: lib/crypto: curve25519: Move a couple functions out-of-line Move curve25519() and curve25519_generate_public() from curve25519.h to curve25519.c. There's no good reason for them to be inline. Link: https://lore.kernel.org/r/20250906213523.84915-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/curve25519.h | 28 +++------------------------- lib/crypto/curve25519.c | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h index 4e6dc840b159..78aa5f28c847 100644 --- a/include/crypto/curve25519.h +++ b/include/crypto/curve25519.h @@ -6,7 +6,6 @@ #ifndef CURVE25519_H #define CURVE25519_H -#include // For crypto_memneq. #include #include @@ -28,33 +27,12 @@ void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE]); -static inline bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) - curve25519_arch(mypublic, secret, basepoint); - else - curve25519_generic(mypublic, secret, basepoint); - return crypto_memneq(mypublic, curve25519_null_point, - CURVE25519_KEY_SIZE); -} + const u8 basepoint[CURVE25519_KEY_SIZE]); -static inline bool -__must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) -{ - if (unlikely(!crypto_memneq(secret, curve25519_null_point, - CURVE25519_KEY_SIZE))) - return false; - - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) - curve25519_base_arch(pub, secret); - else - curve25519_generic(pub, secret, curve25519_base_point); - return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE); -} +bool __must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]); static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE]) { diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c index 25f16777865b..1b786389d714 100644 --- a/lib/crypto/curve25519.c +++ b/lib/crypto/curve25519.c @@ -10,8 +10,40 @@ */ #include -#include +#include +#include #include +#include + +bool __must_check +curve25519(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) + curve25519_arch(mypublic, secret, basepoint); + else + curve25519_generic(mypublic, secret, basepoint); + return crypto_memneq(mypublic, curve25519_null_point, + CURVE25519_KEY_SIZE); +} +EXPORT_SYMBOL(curve25519); + +bool __must_check +curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + if (unlikely(!crypto_memneq(secret, curve25519_null_point, + CURVE25519_KEY_SIZE))) + return false; + + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) + curve25519_base_arch(pub, secret); + else + curve25519_generic(pub, secret, curve25519_base_point); + return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE); +} +EXPORT_SYMBOL(curve25519_generate_public); static int __init curve25519_init(void) { -- cgit v1.2.3 From 68546e5632c0b982663af575ae12cc5d81facc91 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 14:35:21 -0700 Subject: lib/crypto: curve25519: Consolidate into single module Reorganize the Curve25519 library code: - Build a single libcurve25519 module, instead of up to three modules: libcurve25519, libcurve25519-generic, and an arch-specific module. - Move the arch-specific Curve25519 code from arch/$(SRCARCH)/crypto/ to lib/crypto/$(SRCARCH)/. Centralize the build rules into lib/crypto/Makefile and lib/crypto/Kconfig. - Include the arch-specific code directly in lib/crypto/curve25519.c via a header, rather than using a separate .c file. - Eliminate the entanglement with CRYPTO. CRYPTO_LIB_CURVE25519 no longer selects CRYPTO, and the arch-specific Curve25519 code no longer depends on CRYPTO. This brings Curve25519 in line with the latest conventions for lib/crypto/, used by other algorithms. The exception is that I kept the generic code in separate translation units for now. (Some of the function names collide between the x86 and generic Curve25519 code. And the Curve25519 functions are very long anyway, so inlining doesn't matter as much for Curve25519 as it does for some other algorithms.) Link: https://lore.kernel.org/r/20250906213523.84915-11-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/crypto/Kconfig | 12 - arch/arm/crypto/Makefile | 2 - arch/arm/crypto/curve25519-core.S | 2062 ------------------------- arch/arm/crypto/curve25519-glue.c | 62 - arch/powerpc/crypto/Kconfig | 12 - arch/powerpc/crypto/Makefile | 2 - arch/powerpc/crypto/curve25519-ppc64le-core.c | 195 --- arch/powerpc/crypto/curve25519-ppc64le_asm.S | 671 -------- arch/x86/crypto/Kconfig | 12 - arch/x86/crypto/Makefile | 5 - arch/x86/crypto/curve25519-x86_64.c | 1630 ------------------- include/crypto/curve25519.h | 10 - lib/crypto/Kconfig | 37 +- lib/crypto/Makefile | 26 +- lib/crypto/arm/curve25519-core.S | 2062 +++++++++++++++++++++++++ lib/crypto/arm/curve25519.h | 47 + lib/crypto/curve25519-generic.c | 25 - lib/crypto/curve25519.c | 50 +- lib/crypto/powerpc/curve25519-ppc64le_asm.S | 671 ++++++++ lib/crypto/powerpc/curve25519.h | 186 +++ lib/crypto/x86/curve25519.h | 1613 +++++++++++++++++++ 21 files changed, 4645 insertions(+), 4747 deletions(-) delete mode 100644 arch/arm/crypto/curve25519-core.S delete mode 100644 arch/arm/crypto/curve25519-glue.c delete mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c delete mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S delete mode 100644 arch/x86/crypto/curve25519-x86_64.c create mode 100644 lib/crypto/arm/curve25519-core.S create mode 100644 lib/crypto/arm/curve25519.h delete mode 100644 lib/crypto/curve25519-generic.c create mode 100644 lib/crypto/powerpc/curve25519-ppc64le_asm.S create mode 100644 lib/crypto/powerpc/curve25519.h create mode 100644 lib/crypto/x86/curve25519.h (limited to 'include') diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 97718d86f600..c436eec22d86 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -2,18 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (arm)" -config CRYPTO_CURVE25519_NEON - tristate - depends on KERNEL_MODE_NEON - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CURVE25519 - default CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 algorithm - - Architecture: arm with - - NEON (Advanced SIMD) extensions - config CRYPTO_GHASH_ARM_CE tristate "Hash functions: GHASH (PMULL/NEON/ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 4f23999ae17d..6346a73effc0 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -7,7 +7,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o -obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o @@ -18,4 +17,3 @@ blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o -curve25519-neon-y := curve25519-core.o curve25519-glue.o diff --git a/arch/arm/crypto/curve25519-core.S b/arch/arm/crypto/curve25519-core.S deleted file mode 100644 index b697fa5d059a..000000000000 --- a/arch/arm/crypto/curve25519-core.S +++ /dev/null @@ -1,2062 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - * - * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This - * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been - * manually reworked for use in kernel space. - */ - -#include - -.text -.arch armv7-a -.fpu neon -.align 4 - -ENTRY(curve25519_neon) - push {r4-r11, lr} - mov ip, sp - sub r3, sp, #704 - and r3, r3, #0xfffffff0 - mov sp, r3 - movw r4, #0 - movw r5, #254 - vmov.i32 q0, #1 - vshr.u64 q1, q0, #7 - vshr.u64 q0, q0, #8 - vmov.i32 d4, #19 - vmov.i32 d5, #38 - add r6, sp, #480 - vst1.8 {d2-d3}, [r6, : 128]! - vst1.8 {d0-d1}, [r6, : 128]! - vst1.8 {d4-d5}, [r6, : 128] - add r6, r3, #0 - vmov.i32 q2, #0 - vst1.8 {d4-d5}, [r6, : 128]! - vst1.8 {d4-d5}, [r6, : 128]! - vst1.8 d4, [r6, : 64] - add r6, r3, #0 - movw r7, #960 - sub r7, r7, #2 - neg r7, r7 - sub r7, r7, r7, LSL #7 - str r7, [r6] - add r6, sp, #672 - vld1.8 {d4-d5}, [r1]! - vld1.8 {d6-d7}, [r1] - vst1.8 {d4-d5}, [r6, : 128]! - vst1.8 {d6-d7}, [r6, : 128] - sub r1, r6, #16 - ldrb r6, [r1] - and r6, r6, #248 - strb r6, [r1] - ldrb r6, [r1, #31] - and r6, r6, #127 - orr r6, r6, #64 - strb r6, [r1, #31] - vmov.i64 q2, #0xffffffff - vshr.u64 q3, q2, #7 - vshr.u64 q2, q2, #6 - vld1.8 {d8}, [r2] - vld1.8 {d10}, [r2] - add r2, r2, #6 - vld1.8 {d12}, [r2] - vld1.8 {d14}, [r2] - add r2, r2, #6 - vld1.8 {d16}, [r2] - add r2, r2, #4 - vld1.8 {d18}, [r2] - vld1.8 {d20}, [r2] - add r2, r2, #6 - vld1.8 {d22}, [r2] - add r2, r2, #2 - vld1.8 {d24}, [r2] - vld1.8 {d26}, [r2] - vshr.u64 q5, q5, #26 - vshr.u64 q6, q6, #3 - vshr.u64 q7, q7, #29 - vshr.u64 q8, q8, #6 - vshr.u64 q10, q10, #25 - vshr.u64 q11, q11, #3 - vshr.u64 q12, q12, #12 - vshr.u64 q13, q13, #38 - vand q4, q4, q2 - vand q6, q6, q2 - vand q8, q8, q2 - vand q10, q10, q2 - vand q2, q12, q2 - vand q5, q5, q3 - vand q7, q7, q3 - vand q9, q9, q3 - vand q11, q11, q3 - vand q3, q13, q3 - add r2, r3, #48 - vadd.i64 q12, q4, q1 - vadd.i64 q13, q10, q1 - vshr.s64 q12, q12, #26 - vshr.s64 q13, q13, #26 - vadd.i64 q5, q5, q12 - vshl.i64 q12, q12, #26 - vadd.i64 q14, q5, q0 - vadd.i64 q11, q11, q13 - vshl.i64 q13, q13, #26 - vadd.i64 q15, q11, q0 - vsub.i64 q4, q4, q12 - vshr.s64 q12, q14, #25 - vsub.i64 q10, q10, q13 - vshr.s64 q13, q15, #25 - vadd.i64 q6, q6, q12 - vshl.i64 q12, q12, #25 - vadd.i64 q14, q6, q1 - vadd.i64 q2, q2, q13 - vsub.i64 q5, q5, q12 - vshr.s64 q12, q14, #26 - vshl.i64 q13, q13, #25 - vadd.i64 q14, q2, q1 - vadd.i64 q7, q7, q12 - vshl.i64 q12, q12, #26 - vadd.i64 q15, q7, q0 - vsub.i64 q11, q11, q13 - vshr.s64 q13, q14, #26 - vsub.i64 q6, q6, q12 - vshr.s64 q12, q15, #25 - vadd.i64 q3, q3, q13 - vshl.i64 q13, q13, #26 - vadd.i64 q14, q3, q0 - vadd.i64 q8, q8, q12 - vshl.i64 q12, q12, #25 - vadd.i64 q15, q8, q1 - add r2, r2, #8 - vsub.i64 q2, q2, q13 - vshr.s64 q13, q14, #25 - vsub.i64 q7, q7, q12 - vshr.s64 q12, q15, #26 - vadd.i64 q14, q13, q13 - vadd.i64 q9, q9, q12 - vtrn.32 d12, d14 - vshl.i64 q12, q12, #26 - vtrn.32 d13, d15 - vadd.i64 q0, q9, q0 - vadd.i64 q4, q4, q14 - vst1.8 d12, [r2, : 64]! - vshl.i64 q6, q13, #4 - vsub.i64 q7, q8, q12 - vshr.s64 q0, q0, #25 - vadd.i64 q4, q4, q6 - vadd.i64 q6, q10, q0 - vshl.i64 q0, q0, #25 - vadd.i64 q8, q6, q1 - vadd.i64 q4, q4, q13 - vshl.i64 q10, q13, #25 - vadd.i64 q1, q4, q1 - vsub.i64 q0, q9, q0 - vshr.s64 q8, q8, #26 - vsub.i64 q3, q3, q10 - vtrn.32 d14, d0 - vshr.s64 q1, q1, #26 - vtrn.32 d15, d1 - vadd.i64 q0, q11, q8 - vst1.8 d14, [r2, : 64] - vshl.i64 q7, q8, #26 - vadd.i64 q5, q5, q1 - vtrn.32 d4, d6 - vshl.i64 q1, q1, #26 - vtrn.32 d5, d7 - vsub.i64 q3, q6, q7 - add r2, r2, #16 - vsub.i64 q1, q4, q1 - vst1.8 d4, [r2, : 64] - vtrn.32 d6, d0 - vtrn.32 d7, d1 - sub r2, r2, #8 - vtrn.32 d2, d10 - vtrn.32 d3, d11 - vst1.8 d6, [r2, : 64] - sub r2, r2, #24 - vst1.8 d2, [r2, : 64] - add r2, r3, #96 - vmov.i32 q0, #0 - vmov.i64 d2, #0xff - vmov.i64 d3, #0 - vshr.u32 q1, q1, #7 - vst1.8 {d2-d3}, [r2, : 128]! - vst1.8 {d0-d1}, [r2, : 128]! - vst1.8 d0, [r2, : 64] - add r2, r3, #144 - vmov.i32 q0, #0 - vst1.8 {d0-d1}, [r2, : 128]! - vst1.8 {d0-d1}, [r2, : 128]! - vst1.8 d0, [r2, : 64] - add r2, r3, #240 - vmov.i32 q0, #0 - vmov.i64 d2, #0xff - vmov.i64 d3, #0 - vshr.u32 q1, q1, #7 - vst1.8 {d2-d3}, [r2, : 128]! - vst1.8 {d0-d1}, [r2, : 128]! - vst1.8 d0, [r2, : 64] - add r2, r3, #48 - add r6, r3, #192 - vld1.8 {d0-d1}, [r2, : 128]! - vld1.8 {d2-d3}, [r2, : 128]! - vld1.8 {d4}, [r2, : 64] - vst1.8 {d0-d1}, [r6, : 128]! - vst1.8 {d2-d3}, [r6, : 128]! - vst1.8 d4, [r6, : 64] -.Lmainloop: - mov r2, r5, LSR #3 - and r6, r5, #7 - ldrb r2, [r1, r2] - mov r2, r2, LSR r6 - and r2, r2, #1 - str r5, [sp, #456] - eor r4, r4, r2 - str r2, [sp, #460] - neg r2, r4 - add r4, r3, #96 - add r5, r3, #192 - add r6, r3, #144 - vld1.8 {d8-d9}, [r4, : 128]! - add r7, r3, #240 - vld1.8 {d10-d11}, [r5, : 128]! - veor q6, q4, q5 - vld1.8 {d14-d15}, [r6, : 128]! - vdup.i32 q8, r2 - vld1.8 {d18-d19}, [r7, : 128]! - veor q10, q7, q9 - vld1.8 {d22-d23}, [r4, : 128]! - vand q6, q6, q8 - vld1.8 {d24-d25}, [r5, : 128]! - vand q10, q10, q8 - vld1.8 {d26-d27}, [r6, : 128]! - veor q4, q4, q6 - vld1.8 {d28-d29}, [r7, : 128]! - veor q5, q5, q6 - vld1.8 {d0}, [r4, : 64] - veor q6, q7, q10 - vld1.8 {d2}, [r5, : 64] - veor q7, q9, q10 - vld1.8 {d4}, [r6, : 64] - veor q9, q11, q12 - vld1.8 {d6}, [r7, : 64] - veor q10, q0, q1 - sub r2, r4, #32 - vand q9, q9, q8 - sub r4, r5, #32 - vand q10, q10, q8 - sub r5, r6, #32 - veor q11, q11, q9 - sub r6, r7, #32 - veor q0, q0, q10 - veor q9, q12, q9 - veor q1, q1, q10 - veor q10, q13, q14 - veor q12, q2, q3 - vand q10, q10, q8 - vand q8, q12, q8 - veor q12, q13, q10 - veor q2, q2, q8 - veor q10, q14, q10 - veor q3, q3, q8 - vadd.i32 q8, q4, q6 - vsub.i32 q4, q4, q6 - vst1.8 {d16-d17}, [r2, : 128]! - vadd.i32 q6, q11, q12 - vst1.8 {d8-d9}, [r5, : 128]! - vsub.i32 q4, q11, q12 - vst1.8 {d12-d13}, [r2, : 128]! - vadd.i32 q6, q0, q2 - vst1.8 {d8-d9}, [r5, : 128]! - vsub.i32 q0, q0, q2 - vst1.8 d12, [r2, : 64] - vadd.i32 q2, q5, q7 - vst1.8 d0, [r5, : 64] - vsub.i32 q0, q5, q7 - vst1.8 {d4-d5}, [r4, : 128]! - vadd.i32 q2, q9, q10 - vst1.8 {d0-d1}, [r6, : 128]! - vsub.i32 q0, q9, q10 - vst1.8 {d4-d5}, [r4, : 128]! - vadd.i32 q2, q1, q3 - vst1.8 {d0-d1}, [r6, : 128]! - vsub.i32 q0, q1, q3 - vst1.8 d4, [r4, : 64] - vst1.8 d0, [r6, : 64] - add r2, sp, #512 - add r4, r3, #96 - add r5, r3, #144 - vld1.8 {d0-d1}, [r2, : 128] - vld1.8 {d2-d3}, [r4, : 128]! - vld1.8 {d4-d5}, [r5, : 128]! - vzip.i32 q1, q2 - vld1.8 {d6-d7}, [r4, : 128]! - vld1.8 {d8-d9}, [r5, : 128]! - vshl.i32 q5, q1, #1 - vzip.i32 q3, q4 - vshl.i32 q6, q2, #1 - vld1.8 {d14}, [r4, : 64] - vshl.i32 q8, q3, #1 - vld1.8 {d15}, [r5, : 64] - vshl.i32 q9, q4, #1 - vmul.i32 d21, d7, d1 - vtrn.32 d14, d15 - vmul.i32 q11, q4, q0 - vmul.i32 q0, q7, q0 - vmull.s32 q12, d2, d2 - vmlal.s32 q12, d11, d1 - vmlal.s32 q12, d12, d0 - vmlal.s32 q12, d13, d23 - vmlal.s32 q12, d16, d22 - vmlal.s32 q12, d7, d21 - vmull.s32 q10, d2, d11 - vmlal.s32 q10, d4, d1 - vmlal.s32 q10, d13, d0 - vmlal.s32 q10, d6, d23 - vmlal.s32 q10, d17, d22 - vmull.s32 q13, d10, d4 - vmlal.s32 q13, d11, d3 - vmlal.s32 q13, d13, d1 - vmlal.s32 q13, d16, d0 - vmlal.s32 q13, d17, d23 - vmlal.s32 q13, d8, d22 - vmull.s32 q1, d10, d5 - vmlal.s32 q1, d11, d4 - vmlal.s32 q1, d6, d1 - vmlal.s32 q1, d17, d0 - vmlal.s32 q1, d8, d23 - vmull.s32 q14, d10, d6 - vmlal.s32 q14, d11, d13 - vmlal.s32 q14, d4, d4 - vmlal.s32 q14, d17, d1 - vmlal.s32 q14, d18, d0 - vmlal.s32 q14, d9, d23 - vmull.s32 q11, d10, d7 - vmlal.s32 q11, d11, d6 - vmlal.s32 q11, d12, d5 - vmlal.s32 q11, d8, d1 - vmlal.s32 q11, d19, d0 - vmull.s32 q15, d10, d8 - vmlal.s32 q15, d11, d17 - vmlal.s32 q15, d12, d6 - vmlal.s32 q15, d13, d5 - vmlal.s32 q15, d19, d1 - vmlal.s32 q15, d14, d0 - vmull.s32 q2, d10, d9 - vmlal.s32 q2, d11, d8 - vmlal.s32 q2, d12, d7 - vmlal.s32 q2, d13, d6 - vmlal.s32 q2, d14, d1 - vmull.s32 q0, d15, d1 - vmlal.s32 q0, d10, d14 - vmlal.s32 q0, d11, d19 - vmlal.s32 q0, d12, d8 - vmlal.s32 q0, d13, d17 - vmlal.s32 q0, d6, d6 - add r2, sp, #480 - vld1.8 {d18-d19}, [r2, : 128]! - vmull.s32 q3, d16, d7 - vmlal.s32 q3, d10, d15 - vmlal.s32 q3, d11, d14 - vmlal.s32 q3, d12, d9 - vmlal.s32 q3, d13, d8 - vld1.8 {d8-d9}, [r2, : 128] - vadd.i64 q5, q12, q9 - vadd.i64 q6, q15, q9 - vshr.s64 q5, q5, #26 - vshr.s64 q6, q6, #26 - vadd.i64 q7, q10, q5 - vshl.i64 q5, q5, #26 - vadd.i64 q8, q7, q4 - vadd.i64 q2, q2, q6 - vshl.i64 q6, q6, #26 - vadd.i64 q10, q2, q4 - vsub.i64 q5, q12, q5 - vshr.s64 q8, q8, #25 - vsub.i64 q6, q15, q6 - vshr.s64 q10, q10, #25 - vadd.i64 q12, q13, q8 - vshl.i64 q8, q8, #25 - vadd.i64 q13, q12, q9 - vadd.i64 q0, q0, q10 - vsub.i64 q7, q7, q8 - vshr.s64 q8, q13, #26 - vshl.i64 q10, q10, #25 - vadd.i64 q13, q0, q9 - vadd.i64 q1, q1, q8 - vshl.i64 q8, q8, #26 - vadd.i64 q15, q1, q4 - vsub.i64 q2, q2, q10 - vshr.s64 q10, q13, #26 - vsub.i64 q8, q12, q8 - vshr.s64 q12, q15, #25 - vadd.i64 q3, q3, q10 - vshl.i64 q10, q10, #26 - vadd.i64 q13, q3, q4 - vadd.i64 q14, q14, q12 - add r2, r3, #288 - vshl.i64 q12, q12, #25 - add r4, r3, #336 - vadd.i64 q15, q14, q9 - add r2, r2, #8 - vsub.i64 q0, q0, q10 - add r4, r4, #8 - vshr.s64 q10, q13, #25 - vsub.i64 q1, q1, q12 - vshr.s64 q12, q15, #26 - vadd.i64 q13, q10, q10 - vadd.i64 q11, q11, q12 - vtrn.32 d16, d2 - vshl.i64 q12, q12, #26 - vtrn.32 d17, d3 - vadd.i64 q1, q11, q4 - vadd.i64 q4, q5, q13 - vst1.8 d16, [r2, : 64]! - vshl.i64 q5, q10, #4 - vst1.8 d17, [r4, : 64]! - vsub.i64 q8, q14, q12 - vshr.s64 q1, q1, #25 - vadd.i64 q4, q4, q5 - vadd.i64 q5, q6, q1 - vshl.i64 q1, q1, #25 - vadd.i64 q6, q5, q9 - vadd.i64 q4, q4, q10 - vshl.i64 q10, q10, #25 - vadd.i64 q9, q4, q9 - vsub.i64 q1, q11, q1 - vshr.s64 q6, q6, #26 - vsub.i64 q3, q3, q10 - vtrn.32 d16, d2 - vshr.s64 q9, q9, #26 - vtrn.32 d17, d3 - vadd.i64 q1, q2, q6 - vst1.8 d16, [r2, : 64] - vshl.i64 q2, q6, #26 - vst1.8 d17, [r4, : 64] - vadd.i64 q6, q7, q9 - vtrn.32 d0, d6 - vshl.i64 q7, q9, #26 - vtrn.32 d1, d7 - vsub.i64 q2, q5, q2 - add r2, r2, #16 - vsub.i64 q3, q4, q7 - vst1.8 d0, [r2, : 64] - add r4, r4, #16 - vst1.8 d1, [r4, : 64] - vtrn.32 d4, d2 - vtrn.32 d5, d3 - sub r2, r2, #8 - sub r4, r4, #8 - vtrn.32 d6, d12 - vtrn.32 d7, d13 - vst1.8 d4, [r2, : 64] - vst1.8 d5, [r4, : 64] - sub r2, r2, #24 - sub r4, r4, #24 - vst1.8 d6, [r2, : 64] - vst1.8 d7, [r4, : 64] - add r2, r3, #240 - add r4, r3, #96 - vld1.8 {d0-d1}, [r4, : 128]! - vld1.8 {d2-d3}, [r4, : 128]! - vld1.8 {d4}, [r4, : 64] - add r4, r3, #144 - vld1.8 {d6-d7}, [r4, : 128]! - vtrn.32 q0, q3 - vld1.8 {d8-d9}, [r4, : 128]! - vshl.i32 q5, q0, #4 - vtrn.32 q1, q4 - vshl.i32 q6, q3, #4 - vadd.i32 q5, q5, q0 - vadd.i32 q6, q6, q3 - vshl.i32 q7, q1, #4 - vld1.8 {d5}, [r4, : 64] - vshl.i32 q8, q4, #4 - vtrn.32 d4, d5 - vadd.i32 q7, q7, q1 - vadd.i32 q8, q8, q4 - vld1.8 {d18-d19}, [r2, : 128]! - vshl.i32 q10, q2, #4 - vld1.8 {d22-d23}, [r2, : 128]! - vadd.i32 q10, q10, q2 - vld1.8 {d24}, [r2, : 64] - vadd.i32 q5, q5, q0 - add r2, r3, #192 - vld1.8 {d26-d27}, [r2, : 128]! - vadd.i32 q6, q6, q3 - vld1.8 {d28-d29}, [r2, : 128]! - vadd.i32 q8, q8, q4 - vld1.8 {d25}, [r2, : 64] - vadd.i32 q10, q10, q2 - vtrn.32 q9, q13 - vadd.i32 q7, q7, q1 - vadd.i32 q5, q5, q0 - vtrn.32 q11, q14 - vadd.i32 q6, q6, q3 - add r2, sp, #528 - vadd.i32 q10, q10, q2 - vtrn.32 d24, d25 - vst1.8 {d12-d13}, [r2, : 128]! - vshl.i32 q6, q13, #1 - vst1.8 {d20-d21}, [r2, : 128]! - vshl.i32 q10, q14, #1 - vst1.8 {d12-d13}, [r2, : 128]! - vshl.i32 q15, q12, #1 - vadd.i32 q8, q8, q4 - vext.32 d10, d31, d30, #0 - vadd.i32 q7, q7, q1 - vst1.8 {d16-d17}, [r2, : 128]! - vmull.s32 q8, d18, d5 - vmlal.s32 q8, d26, d4 - vmlal.s32 q8, d19, d9 - vmlal.s32 q8, d27, d3 - vmlal.s32 q8, d22, d8 - vmlal.s32 q8, d28, d2 - vmlal.s32 q8, d23, d7 - vmlal.s32 q8, d29, d1 - vmlal.s32 q8, d24, d6 - vmlal.s32 q8, d25, d0 - vst1.8 {d14-d15}, [r2, : 128]! - vmull.s32 q2, d18, d4 - vmlal.s32 q2, d12, d9 - vmlal.s32 q2, d13, d8 - vmlal.s32 q2, d19, d3 - vmlal.s32 q2, d22, d2 - vmlal.s32 q2, d23, d1 - vmlal.s32 q2, d24, d0 - vst1.8 {d20-d21}, [r2, : 128]! - vmull.s32 q7, d18, d9 - vmlal.s32 q7, d26, d3 - vmlal.s32 q7, d19, d8 - vmlal.s32 q7, d27, d2 - vmlal.s32 q7, d22, d7 - vmlal.s32 q7, d28, d1 - vmlal.s32 q7, d23, d6 - vmlal.s32 q7, d29, d0 - vst1.8 {d10-d11}, [r2, : 128]! - vmull.s32 q5, d18, d3 - vmlal.s32 q5, d19, d2 - vmlal.s32 q5, d22, d1 - vmlal.s32 q5, d23, d0 - vmlal.s32 q5, d12, d8 - vst1.8 {d16-d17}, [r2, : 128] - vmull.s32 q4, d18, d8 - vmlal.s32 q4, d26, d2 - vmlal.s32 q4, d19, d7 - vmlal.s32 q4, d27, d1 - vmlal.s32 q4, d22, d6 - vmlal.s32 q4, d28, d0 - vmull.s32 q8, d18, d7 - vmlal.s32 q8, d26, d1 - vmlal.s32 q8, d19, d6 - vmlal.s32 q8, d27, d0 - add r2, sp, #544 - vld1.8 {d20-d21}, [r2, : 128] - vmlal.s32 q7, d24, d21 - vmlal.s32 q7, d25, d20 - vmlal.s32 q4, d23, d21 - vmlal.s32 q4, d29, d20 - vmlal.s32 q8, d22, d21 - vmlal.s32 q8, d28, d20 - vmlal.s32 q5, d24, d20 - vst1.8 {d14-d15}, [r2, : 128] - vmull.s32 q7, d18, d6 - vmlal.s32 q7, d26, d0 - add r2, sp, #624 - vld1.8 {d30-d31}, [r2, : 128] - vmlal.s32 q2, d30, d21 - vmlal.s32 q7, d19, d21 - vmlal.s32 q7, d27, d20 - add r2, sp, #592 - vld1.8 {d26-d27}, [r2, : 128] - vmlal.s32 q4, d25, d27 - vmlal.s32 q8, d29, d27 - vmlal.s32 q8, d25, d26 - vmlal.s32 q7, d28, d27 - vmlal.s32 q7, d29, d26 - add r2, sp, #576 - vld1.8 {d28-d29}, [r2, : 128] - vmlal.s32 q4, d24, d29 - vmlal.s32 q8, d23, d29 - vmlal.s32 q8, d24, d28 - vmlal.s32 q7, d22, d29 - vmlal.s32 q7, d23, d28 - vst1.8 {d8-d9}, [r2, : 128] - add r2, sp, #528 - vld1.8 {d8-d9}, [r2, : 128] - vmlal.s32 q7, d24, d9 - vmlal.s32 q7, d25, d31 - vmull.s32 q1, d18, d2 - vmlal.s32 q1, d19, d1 - vmlal.s32 q1, d22, d0 - vmlal.s32 q1, d24, d27 - vmlal.s32 q1, d23, d20 - vmlal.s32 q1, d12, d7 - vmlal.s32 q1, d13, d6 - vmull.s32 q6, d18, d1 - vmlal.s32 q6, d19, d0 - vmlal.s32 q6, d23, d27 - vmlal.s32 q6, d22, d20 - vmlal.s32 q6, d24, d26 - vmull.s32 q0, d18, d0 - vmlal.s32 q0, d22, d27 - vmlal.s32 q0, d23, d26 - vmlal.s32 q0, d24, d31 - vmlal.s32 q0, d19, d20 - add r2, sp, #608 - vld1.8 {d18-d19}, [r2, : 128] - vmlal.s32 q2, d18, d7 - vmlal.s32 q5, d18, d6 - vmlal.s32 q1, d18, d21 - vmlal.s32 q0, d18, d28 - vmlal.s32 q6, d18, d29 - vmlal.s32 q2, d19, d6 - vmlal.s32 q5, d19, d21 - vmlal.s32 q1, d19, d29 - vmlal.s32 q0, d19, d9 - vmlal.s32 q6, d19, d28 - add r2, sp, #560 - vld1.8 {d18-d19}, [r2, : 128] - add r2, sp, #480 - vld1.8 {d22-d23}, [r2, : 128] - vmlal.s32 q5, d19, d7 - vmlal.s32 q0, d18, d21 - vmlal.s32 q0, d19, d29 - vmlal.s32 q6, d18, d6 - add r2, sp, #496 - vld1.8 {d6-d7}, [r2, : 128] - vmlal.s32 q6, d19, d21 - add r2, sp, #544 - vld1.8 {d18-d19}, [r2, : 128] - vmlal.s32 q0, d30, d8 - add r2, sp, #640 - vld1.8 {d20-d21}, [r2, : 128] - vmlal.s32 q5, d30, d29 - add r2, sp, #576 - vld1.8 {d24-d25}, [r2, : 128] - vmlal.s32 q1, d30, d28 - vadd.i64 q13, q0, q11 - vadd.i64 q14, q5, q11 - vmlal.s32 q6, d30, d9 - vshr.s64 q4, q13, #26 - vshr.s64 q13, q14, #26 - vadd.i64 q7, q7, q4 - vshl.i64 q4, q4, #26 - vadd.i64 q14, q7, q3 - vadd.i64 q9, q9, q13 - vshl.i64 q13, q13, #26 - vadd.i64 q15, q9, q3 - vsub.i64 q0, q0, q4 - vshr.s64 q4, q14, #25 - vsub.i64 q5, q5, q13 - vshr.s64 q13, q15, #25 - vadd.i64 q6, q6, q4 - vshl.i64 q4, q4, #25 - vadd.i64 q14, q6, q11 - vadd.i64 q2, q2, q13 - vsub.i64 q4, q7, q4 - vshr.s64 q7, q14, #26 - vshl.i64 q13, q13, #25 - vadd.i64 q14, q2, q11 - vadd.i64 q8, q8, q7 - vshl.i64 q7, q7, #26 - vadd.i64 q15, q8, q3 - vsub.i64 q9, q9, q13 - vshr.s64 q13, q14, #26 - vsub.i64 q6, q6, q7 - vshr.s64 q7, q15, #25 - vadd.i64 q10, q10, q13 - vshl.i64 q13, q13, #26 - vadd.i64 q14, q10, q3 - vadd.i64 q1, q1, q7 - add r2, r3, #144 - vshl.i64 q7, q7, #25 - add r4, r3, #96 - vadd.i64 q15, q1, q11 - add r2, r2, #8 - vsub.i64 q2, q2, q13 - add r4, r4, #8 - vshr.s64 q13, q14, #25 - vsub.i64 q7, q8, q7 - vshr.s64 q8, q15, #26 - vadd.i64 q14, q13, q13 - vadd.i64 q12, q12, q8 - vtrn.32 d12, d14 - vshl.i64 q8, q8, #26 - vtrn.32 d13, d15 - vadd.i64 q3, q12, q3 - vadd.i64 q0, q0, q14 - vst1.8 d12, [r2, : 64]! - vshl.i64 q7, q13, #4 - vst1.8 d13, [r4, : 64]! - vsub.i64 q1, q1, q8 - vshr.s64 q3, q3, #25 - vadd.i64 q0, q0, q7 - vadd.i64 q5, q5, q3 - vshl.i64 q3, q3, #25 - vadd.i64 q6, q5, q11 - vadd.i64 q0, q0, q13 - vshl.i64 q7, q13, #25 - vadd.i64 q8, q0, q11 - vsub.i64 q3, q12, q3 - vshr.s64 q6, q6, #26 - vsub.i64 q7, q10, q7 - vtrn.32 d2, d6 - vshr.s64 q8, q8, #26 - vtrn.32 d3, d7 - vadd.i64 q3, q9, q6 - vst1.8 d2, [r2, : 64] - vshl.i64 q6, q6, #26 - vst1.8 d3, [r4, : 64] - vadd.i64 q1, q4, q8 - vtrn.32 d4, d14 - vshl.i64 q4, q8, #26 - vtrn.32 d5, d15 - vsub.i64 q5, q5, q6 - add r2, r2, #16 - vsub.i64 q0, q0, q4 - vst1.8 d4, [r2, : 64] - add r4, r4, #16 - vst1.8 d5, [r4, : 64] - vtrn.32 d10, d6 - vtrn.32 d11, d7 - sub r2, r2, #8 - sub r4, r4, #8 - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vst1.8 d10, [r2, : 64] - vst1.8 d11, [r4, : 64] - sub r2, r2, #24 - sub r4, r4, #24 - vst1.8 d0, [r2, : 64] - vst1.8 d1, [r4, : 64] - add r2, r3, #288 - add r4, r3, #336 - vld1.8 {d0-d1}, [r2, : 128]! - vld1.8 {d2-d3}, [r4, : 128]! - vsub.i32 q0, q0, q1 - vld1.8 {d2-d3}, [r2, : 128]! - vld1.8 {d4-d5}, [r4, : 128]! - vsub.i32 q1, q1, q2 - add r5, r3, #240 - vld1.8 {d4}, [r2, : 64] - vld1.8 {d6}, [r4, : 64] - vsub.i32 q2, q2, q3 - vst1.8 {d0-d1}, [r5, : 128]! - vst1.8 {d2-d3}, [r5, : 128]! - vst1.8 d4, [r5, : 64] - add r2, r3, #144 - add r4, r3, #96 - add r5, r3, #144 - add r6, r3, #192 - vld1.8 {d0-d1}, [r2, : 128]! - vld1.8 {d2-d3}, [r4, : 128]! - vsub.i32 q2, q0, q1 - vadd.i32 q0, q0, q1 - vld1.8 {d2-d3}, [r2, : 128]! - vld1.8 {d6-d7}, [r4, : 128]! - vsub.i32 q4, q1, q3 - vadd.i32 q1, q1, q3 - vld1.8 {d6}, [r2, : 64] - vld1.8 {d10}, [r4, : 64] - vsub.i32 q6, q3, q5 - vadd.i32 q3, q3, q5 - vst1.8 {d4-d5}, [r5, : 128]! - vst1.8 {d0-d1}, [r6, : 128]! - vst1.8 {d8-d9}, [r5, : 128]! - vst1.8 {d2-d3}, [r6, : 128]! - vst1.8 d12, [r5, : 64] - vst1.8 d6, [r6, : 64] - add r2, r3, #0 - add r4, r3, #240 - vld1.8 {d0-d1}, [r4, : 128]! - vld1.8 {d2-d3}, [r4, : 128]! - vld1.8 {d4}, [r4, : 64] - add r4, r3, #336 - vld1.8 {d6-d7}, [r4, : 128]! - vtrn.32 q0, q3 - vld1.8 {d8-d9}, [r4, : 128]! - vshl.i32 q5, q0, #4 - vtrn.32 q1, q4 - vshl.i32 q6, q3, #4 - vadd.i32 q5, q5, q0 - vadd.i32 q6, q6, q3 - vshl.i32 q7, q1, #4 - vld1.8 {d5}, [r4, : 64] - vshl.i32 q8, q4, #4 - vtrn.32 d4, d5 - vadd.i32 q7, q7, q1 - vadd.i32 q8, q8, q4 - vld1.8 {d18-d19}, [r2, : 128]! - vshl.i32 q10, q2, #4 - vld1.8 {d22-d23}, [r2, : 128]! - vadd.i32 q10, q10, q2 - vld1.8 {d24}, [r2, : 64] - vadd.i32 q5, q5, q0 - add r2, r3, #288 - vld1.8 {d26-d27}, [r2, : 128]! - vadd.i32 q6, q6, q3 - vld1.8 {d28-d29}, [r2, : 128]! - vadd.i32 q8, q8, q4 - vld1.8 {d25}, [r2, : 64] - vadd.i32 q10, q10, q2 - vtrn.32 q9, q13 - vadd.i32 q7, q7, q1 - vadd.i32 q5, q5, q0 - vtrn.32 q11, q14 - vadd.i32 q6, q6, q3 - add r2, sp, #528 - vadd.i32 q10, q10, q2 - vtrn.32 d24, d25 - vst1.8 {d12-d13}, [r2, : 128]! - vshl.i32 q6, q13, #1 - vst1.8 {d20-d21}, [r2, : 128]! - vshl.i32 q10, q14, #1 - vst1.8 {d12-d13}, [r2, : 128]! - vshl.i32 q15, q12, #1 - vadd.i32 q8, q8, q4 - vext.32 d10, d31, d30, #0 - vadd.i32 q7, q7, q1 - vst1.8 {d16-d17}, [r2, : 128]! - vmull.s32 q8, d18, d5 - vmlal.s32 q8, d26, d4 - vmlal.s32 q8, d19, d9 - vmlal.s32 q8, d27, d3 - vmlal.s32 q8, d22, d8 - vmlal.s32 q8, d28, d2 - vmlal.s32 q8, d23, d7 - vmlal.s32 q8, d29, d1 - vmlal.s32 q8, d24, d6 - vmlal.s32 q8, d25, d0 - vst1.8 {d14-d15}, [r2, : 128]! - vmull.s32 q2, d18, d4 - vmlal.s32 q2, d12, d9 - vmlal.s32 q2, d13, d8 - vmlal.s32 q2, d19, d3 - vmlal.s32 q2, d22, d2 - vmlal.s32 q2, d23, d1 - vmlal.s32 q2, d24, d0 - vst1.8 {d20-d21}, [r2, : 128]! - vmull.s32 q7, d18, d9 - vmlal.s32 q7, d26, d3 - vmlal.s32 q7, d19, d8 - vmlal.s32 q7, d27, d2 - vmlal.s32 q7, d22, d7 - vmlal.s32 q7, d28, d1 - vmlal.s32 q7, d23, d6 - vmlal.s32 q7, d29, d0 - vst1.8 {d10-d11}, [r2, : 128]! - vmull.s32 q5, d18, d3 - vmlal.s32 q5, d19, d2 - vmlal.s32 q5, d22, d1 - vmlal.s32 q5, d23, d0 - vmlal.s32 q5, d12, d8 - vst1.8 {d16-d17}, [r2, : 128]! - vmull.s32 q4, d18, d8 - vmlal.s32 q4, d26, d2 - vmlal.s32 q4, d19, d7 - vmlal.s32 q4, d27, d1 - vmlal.s32 q4, d22, d6 - vmlal.s32 q4, d28, d0 - vmull.s32 q8, d18, d7 - vmlal.s32 q8, d26, d1 - vmlal.s32 q8, d19, d6 - vmlal.s32 q8, d27, d0 - add r2, sp, #544 - vld1.8 {d20-d21}, [r2, : 128] - vmlal.s32 q7, d24, d21 - vmlal.s32 q7, d25, d20 - vmlal.s32 q4, d23, d21 - vmlal.s32 q4, d29, d20 - vmlal.s32 q8, d22, d21 - vmlal.s32 q8, d28, d20 - vmlal.s32 q5, d24, d20 - vst1.8 {d14-d15}, [r2, : 128] - vmull.s32 q7, d18, d6 - vmlal.s32 q7, d26, d0 - add r2, sp, #624 - vld1.8 {d30-d31}, [r2, : 128] - vmlal.s32 q2, d30, d21 - vmlal.s32 q7, d19, d21 - vmlal.s32 q7, d27, d20 - add r2, sp, #592 - vld1.8 {d26-d27}, [r2, : 128] - vmlal.s32 q4, d25, d27 - vmlal.s32 q8, d29, d27 - vmlal.s32 q8, d25, d26 - vmlal.s32 q7, d28, d27 - vmlal.s32 q7, d29, d26 - add r2, sp, #576 - vld1.8 {d28-d29}, [r2, : 128] - vmlal.s32 q4, d24, d29 - vmlal.s32 q8, d23, d29 - vmlal.s32 q8, d24, d28 - vmlal.s32 q7, d22, d29 - vmlal.s32 q7, d23, d28 - vst1.8 {d8-d9}, [r2, : 128] - add r2, sp, #528 - vld1.8 {d8-d9}, [r2, : 128] - vmlal.s32 q7, d24, d9 - vmlal.s32 q7, d25, d31 - vmull.s32 q1, d18, d2 - vmlal.s32 q1, d19, d1 - vmlal.s32 q1, d22, d0 - vmlal.s32 q1, d24, d27 - vmlal.s32 q1, d23, d20 - vmlal.s32 q1, d12, d7 - vmlal.s32 q1, d13, d6 - vmull.s32 q6, d18, d1 - vmlal.s32 q6, d19, d0 - vmlal.s32 q6, d23, d27 - vmlal.s32 q6, d22, d20 - vmlal.s32 q6, d24, d26 - vmull.s32 q0, d18, d0 - vmlal.s32 q0, d22, d27 - vmlal.s32 q0, d23, d26 - vmlal.s32 q0, d24, d31 - vmlal.s32 q0, d19, d20 - add r2, sp, #608 - vld1.8 {d18-d19}, [r2, : 128] - vmlal.s32 q2, d18, d7 - vmlal.s32 q5, d18, d6 - vmlal.s32 q1, d18, d21 - vmlal.s32 q0, d18, d28 - vmlal.s32 q6, d18, d29 - vmlal.s32 q2, d19, d6 - vmlal.s32 q5, d19, d21 - vmlal.s32 q1, d19, d29 - vmlal.s32 q0, d19, d9 - vmlal.s32 q6, d19, d28 - add r2, sp, #560 - vld1.8 {d18-d19}, [r2, : 128] - add r2, sp, #480 - vld1.8 {d22-d23}, [r2, : 128] - vmlal.s32 q5, d19, d7 - vmlal.s32 q0, d18, d21 - vmlal.s32 q0, d19, d29 - vmlal.s32 q6, d18, d6 - add r2, sp, #496 - vld1.8 {d6-d7}, [r2, : 128] - vmlal.s32 q6, d19, d21 - add r2, sp, #544 - vld1.8 {d18-d19}, [r2, : 128] - vmlal.s32 q0, d30, d8 - add r2, sp, #640 - vld1.8 {d20-d21}, [r2, : 128] - vmlal.s32 q5, d30, d29 - add r2, sp, #576 - vld1.8 {d24-d25}, [r2, : 128] - vmlal.s32 q1, d30, d28 - vadd.i64 q13, q0, q11 - vadd.i64 q14, q5, q11 - vmlal.s32 q6, d30, d9 - vshr.s64 q4, q13, #26 - vshr.s64 q13, q14, #26 - vadd.i64 q7, q7, q4 - vshl.i64 q4, q4, #26 - vadd.i64 q14, q7, q3 - vadd.i64 q9, q9, q13 - vshl.i64 q13, q13, #26 - vadd.i64 q15, q9, q3 - vsub.i64 q0, q0, q4 - vshr.s64 q4, q14, #25 - vsub.i64 q5, q5, q13 - vshr.s64 q13, q15, #25 - vadd.i64 q6, q6, q4 - vshl.i64 q4, q4, #25 - vadd.i64 q14, q6, q11 - vadd.i64 q2, q2, q13 - vsub.i64 q4, q7, q4 - vshr.s64 q7, q14, #26 - vshl.i64 q13, q13, #25 - vadd.i64 q14, q2, q11 - vadd.i64 q8, q8, q7 - vshl.i64 q7, q7, #26 - vadd.i64 q15, q8, q3 - vsub.i64 q9, q9, q13 - vshr.s64 q13, q14, #26 - vsub.i64 q6, q6, q7 - vshr.s64 q7, q15, #25 - vadd.i64 q10, q10, q13 - vshl.i64 q13, q13, #26 - vadd.i64 q14, q10, q3 - vadd.i64 q1, q1, q7 - add r2, r3, #288 - vshl.i64 q7, q7, #25 - add r4, r3, #96 - vadd.i64 q15, q1, q11 - add r2, r2, #8 - vsub.i64 q2, q2, q13 - add r4, r4, #8 - vshr.s64 q13, q14, #25 - vsub.i64 q7, q8, q7 - vshr.s64 q8, q15, #26 - vadd.i64 q14, q13, q13 - vadd.i64 q12, q12, q8 - vtrn.32 d12, d14 - vshl.i64 q8, q8, #26 - vtrn.32 d13, d15 - vadd.i64 q3, q12, q3 - vadd.i64 q0, q0, q14 - vst1.8 d12, [r2, : 64]! - vshl.i64 q7, q13, #4 - vst1.8 d13, [r4, : 64]! - vsub.i64 q1, q1, q8 - vshr.s64 q3, q3, #25 - vadd.i64 q0, q0, q7 - vadd.i64 q5, q5, q3 - vshl.i64 q3, q3, #25 - vadd.i64 q6, q5, q11 - vadd.i64 q0, q0, q13 - vshl.i64 q7, q13, #25 - vadd.i64 q8, q0, q11 - vsub.i64 q3, q12, q3 - vshr.s64 q6, q6, #26 - vsub.i64 q7, q10, q7 - vtrn.32 d2, d6 - vshr.s64 q8, q8, #26 - vtrn.32 d3, d7 - vadd.i64 q3, q9, q6 - vst1.8 d2, [r2, : 64] - vshl.i64 q6, q6, #26 - vst1.8 d3, [r4, : 64] - vadd.i64 q1, q4, q8 - vtrn.32 d4, d14 - vshl.i64 q4, q8, #26 - vtrn.32 d5, d15 - vsub.i64 q5, q5, q6 - add r2, r2, #16 - vsub.i64 q0, q0, q4 - vst1.8 d4, [r2, : 64] - add r4, r4, #16 - vst1.8 d5, [r4, : 64] - vtrn.32 d10, d6 - vtrn.32 d11, d7 - sub r2, r2, #8 - sub r4, r4, #8 - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vst1.8 d10, [r2, : 64] - vst1.8 d11, [r4, : 64] - sub r2, r2, #24 - sub r4, r4, #24 - vst1.8 d0, [r2, : 64] - vst1.8 d1, [r4, : 64] - add r2, sp, #512 - add r4, r3, #144 - add r5, r3, #192 - vld1.8 {d0-d1}, [r2, : 128] - vld1.8 {d2-d3}, [r4, : 128]! - vld1.8 {d4-d5}, [r5, : 128]! - vzip.i32 q1, q2 - vld1.8 {d6-d7}, [r4, : 128]! - vld1.8 {d8-d9}, [r5, : 128]! - vshl.i32 q5, q1, #1 - vzip.i32 q3, q4 - vshl.i32 q6, q2, #1 - vld1.8 {d14}, [r4, : 64] - vshl.i32 q8, q3, #1 - vld1.8 {d15}, [r5, : 64] - vshl.i32 q9, q4, #1 - vmul.i32 d21, d7, d1 - vtrn.32 d14, d15 - vmul.i32 q11, q4, q0 - vmul.i32 q0, q7, q0 - vmull.s32 q12, d2, d2 - vmlal.s32 q12, d11, d1 - vmlal.s32 q12, d12, d0 - vmlal.s32 q12, d13, d23 - vmlal.s32 q12, d16, d22 - vmlal.s32 q12, d7, d21 - vmull.s32 q10, d2, d11 - vmlal.s32 q10, d4, d1 - vmlal.s32 q10, d13, d0 - vmlal.s32 q10, d6, d23 - vmlal.s32 q10, d17, d22 - vmull.s32 q13, d10, d4 - vmlal.s32 q13, d11, d3 - vmlal.s32 q13, d13, d1 - vmlal.s32 q13, d16, d0 - vmlal.s32 q13, d17, d23 - vmlal.s32 q13, d8, d22 - vmull.s32 q1, d10, d5 - vmlal.s32 q1, d11, d4 - vmlal.s32 q1, d6, d1 - vmlal.s32 q1, d17, d0 - vmlal.s32 q1, d8, d23 - vmull.s32 q14, d10, d6 - vmlal.s32 q14, d11, d13 - vmlal.s32 q14, d4, d4 - vmlal.s32 q14, d17, d1 - vmlal.s32 q14, d18, d0 - vmlal.s32 q14, d9, d23 - vmull.s32 q11, d10, d7 - vmlal.s32 q11, d11, d6 - vmlal.s32 q11, d12, d5 - vmlal.s32 q11, d8, d1 - vmlal.s32 q11, d19, d0 - vmull.s32 q15, d10, d8 - vmlal.s32 q15, d11, d17 - vmlal.s32 q15, d12, d6 - vmlal.s32 q15, d13, d5 - vmlal.s32 q15, d19, d1 - vmlal.s32 q15, d14, d0 - vmull.s32 q2, d10, d9 - vmlal.s32 q2, d11, d8 - vmlal.s32 q2, d12, d7 - vmlal.s32 q2, d13, d6 - vmlal.s32 q2, d14, d1 - vmull.s32 q0, d15, d1 - vmlal.s32 q0, d10, d14 - vmlal.s32 q0, d11, d19 - vmlal.s32 q0, d12, d8 - vmlal.s32 q0, d13, d17 - vmlal.s32 q0, d6, d6 - add r2, sp, #480 - vld1.8 {d18-d19}, [r2, : 128]! - vmull.s32 q3, d16, d7 - vmlal.s32 q3, d10, d15 - vmlal.s32 q3, d11, d14 - vmlal.s32 q3, d12, d9 - vmlal.s32 q3, d13, d8 - vld1.8 {d8-d9}, [r2, : 128] - vadd.i64 q5, q12, q9 - vadd.i64 q6, q15, q9 - vshr.s64 q5, q5, #26 - vshr.s64 q6, q6, #26 - vadd.i64 q7, q10, q5 - vshl.i64 q5, q5, #26 - vadd.i64 q8, q7, q4 - vadd.i64 q2, q2, q6 - vshl.i64 q6, q6, #26 - vadd.i64 q10, q2, q4 - vsub.i64 q5, q12, q5 - vshr.s64 q8, q8, #25 - vsub.i64 q6, q15, q6 - vshr.s64 q10, q10, #25 - vadd.i64 q12, q13, q8 - vshl.i64 q8, q8, #25 - vadd.i64 q13, q12, q9 - vadd.i64 q0, q0, q10 - vsub.i64 q7, q7, q8 - vshr.s64 q8, q13, #26 - vshl.i64 q10, q10, #25 - vadd.i64 q13, q0, q9 - vadd.i64 q1, q1, q8 - vshl.i64 q8, q8, #26 - vadd.i64 q15, q1, q4 - vsub.i64 q2, q2, q10 - vshr.s64 q10, q13, #26 - vsub.i64 q8, q12, q8 - vshr.s64 q12, q15, #25 - vadd.i64 q3, q3, q10 - vshl.i64 q10, q10, #26 - vadd.i64 q13, q3, q4 - vadd.i64 q14, q14, q12 - add r2, r3, #144 - vshl.i64 q12, q12, #25 - add r4, r3, #192 - vadd.i64 q15, q14, q9 - add r2, r2, #8 - vsub.i64 q0, q0, q10 - add r4, r4, #8 - vshr.s64 q10, q13, #25 - vsub.i64 q1, q1, q12 - vshr.s64 q12, q15, #26 - vadd.i64 q13, q10, q10 - vadd.i64 q11, q11, q12 - vtrn.32 d16, d2 - vshl.i64 q12, q12, #26 - vtrn.32 d17, d3 - vadd.i64 q1, q11, q4 - vadd.i64 q4, q5, q13 - vst1.8 d16, [r2, : 64]! - vshl.i64 q5, q10, #4 - vst1.8 d17, [r4, : 64]! - vsub.i64 q8, q14, q12 - vshr.s64 q1, q1, #25 - vadd.i64 q4, q4, q5 - vadd.i64 q5, q6, q1 - vshl.i64 q1, q1, #25 - vadd.i64 q6, q5, q9 - vadd.i64 q4, q4, q10 - vshl.i64 q10, q10, #25 - vadd.i64 q9, q4, q9 - vsub.i64 q1, q11, q1 - vshr.s64 q6, q6, #26 - vsub.i64 q3, q3, q10 - vtrn.32 d16, d2 - vshr.s64 q9, q9, #26 - vtrn.32 d17, d3 - vadd.i64 q1, q2, q6 - vst1.8 d16, [r2, : 64] - vshl.i64 q2, q6, #26 - vst1.8 d17, [r4, : 64] - vadd.i64 q6, q7, q9 - vtrn.32 d0, d6 - vshl.i64 q7, q9, #26 - vtrn.32 d1, d7 - vsub.i64 q2, q5, q2 - add r2, r2, #16 - vsub.i64 q3, q4, q7 - vst1.8 d0, [r2, : 64] - add r4, r4, #16 - vst1.8 d1, [r4, : 64] - vtrn.32 d4, d2 - vtrn.32 d5, d3 - sub r2, r2, #8 - sub r4, r4, #8 - vtrn.32 d6, d12 - vtrn.32 d7, d13 - vst1.8 d4, [r2, : 64] - vst1.8 d5, [r4, : 64] - sub r2, r2, #24 - sub r4, r4, #24 - vst1.8 d6, [r2, : 64] - vst1.8 d7, [r4, : 64] - add r2, r3, #336 - add r4, r3, #288 - vld1.8 {d0-d1}, [r2, : 128]! - vld1.8 {d2-d3}, [r4, : 128]! - vadd.i32 q0, q0, q1 - vld1.8 {d2-d3}, [r2, : 128]! - vld1.8 {d4-d5}, [r4, : 128]! - vadd.i32 q1, q1, q2 - add r5, r3, #288 - vld1.8 {d4}, [r2, : 64] - vld1.8 {d6}, [r4, : 64] - vadd.i32 q2, q2, q3 - vst1.8 {d0-d1}, [r5, : 128]! - vst1.8 {d2-d3}, [r5, : 128]! - vst1.8 d4, [r5, : 64] - add r2, r3, #48 - add r4, r3, #144 - vld1.8 {d0-d1}, [r4, : 128]! - vld1.8 {d2-d3}, [r4, : 128]! - vld1.8 {d4}, [r4, : 64] - add r4, r3, #288 - vld1.8 {d6-d7}, [r4, : 128]! - vtrn.32 q0, q3 - vld1.8 {d8-d9}, [r4, : 128]! - vshl.i32 q5, q0, #4 - vtrn.32 q1, q4 - vshl.i32 q6, q3, #4 - vadd.i32 q5, q5, q0 - vadd.i32 q6, q6, q3 - vshl.i32 q7, q1, #4 - vld1.8 {d5}, [r4, : 64] - vshl.i32 q8, q4, #4 - vtrn.32 d4, d5 - vadd.i32 q7, q7, q1 - vadd.i32 q8, q8, q4 - vld1.8 {d18-d19}, [r2, : 128]! - vshl.i32 q10, q2, #4 - vld1.8 {d22-d23}, [r2, : 128]! - vadd.i32 q10, q10, q2 - vld1.8 {d24}, [r2, : 64] - vadd.i32 q5, q5, q0 - add r2, r3, #240 - vld1.8 {d26-d27}, [r2, : 128]! - vadd.i32 q6, q6, q3 - vld1.8 {d28-d29}, [r2, : 128]! - vadd.i32 q8, q8, q4 - vld1.8 {d25}, [r2, : 64] - vadd.i32 q10, q10, q2 - vtrn.32 q9, q13 - vadd.i32 q7, q7, q1 - vadd.i32 q5, q5, q0 - vtrn.32 q11, q14 - vadd.i32 q6, q6, q3 - add r2, sp, #528 - vadd.i32 q10, q10, q2 - vtrn.32 d24, d25 - vst1.8 {d12-d13}, [r2, : 128]! - vshl.i32 q6, q13, #1 - vst1.8 {d20-d21}, [r2, : 128]! - vshl.i32 q10, q14, #1 - vst1.8 {d12-d13}, [r2, : 128]! - vshl.i32 q15, q12, #1 - vadd.i32 q8, q8, q4 - vext.32 d10, d31, d30, #0 - vadd.i32 q7, q7, q1 - vst1.8 {d16-d17}, [r2, : 128]! - vmull.s32 q8, d18, d5 - vmlal.s32 q8, d26, d4 - vmlal.s32 q8, d19, d9 - vmlal.s32 q8, d27, d3 - vmlal.s32 q8, d22, d8 - vmlal.s32 q8, d28, d2 - vmlal.s32 q8, d23, d7 - vmlal.s32 q8, d29, d1 - vmlal.s32 q8, d24, d6 - vmlal.s32 q8, d25, d0 - vst1.8 {d14-d15}, [r2, : 128]! - vmull.s32 q2, d18, d4 - vmlal.s32 q2, d12, d9 - vmlal.s32 q2, d13, d8 - vmlal.s32 q2, d19, d3 - vmlal.s32 q2, d22, d2 - vmlal.s32 q2, d23, d1 - vmlal.s32 q2, d24, d0 - vst1.8 {d20-d21}, [r2, : 128]! - vmull.s32 q7, d18, d9 - vmlal.s32 q7, d26, d3 - vmlal.s32 q7, d19, d8 - vmlal.s32 q7, d27, d2 - vmlal.s32 q7, d22, d7 - vmlal.s32 q7, d28, d1 - vmlal.s32 q7, d23, d6 - vmlal.s32 q7, d29, d0 - vst1.8 {d10-d11}, [r2, : 128]! - vmull.s32 q5, d18, d3 - vmlal.s32 q5, d19, d2 - vmlal.s32 q5, d22, d1 - vmlal.s32 q5, d23, d0 - vmlal.s32 q5, d12, d8 - vst1.8 {d16-d17}, [r2, : 128]! - vmull.s32 q4, d18, d8 - vmlal.s32 q4, d26, d2 - vmlal.s32 q4, d19, d7 - vmlal.s32 q4, d27, d1 - vmlal.s32 q4, d22, d6 - vmlal.s32 q4, d28, d0 - vmull.s32 q8, d18, d7 - vmlal.s32 q8, d26, d1 - vmlal.s32 q8, d19, d6 - vmlal.s32 q8, d27, d0 - add r2, sp, #544 - vld1.8 {d20-d21}, [r2, : 128] - vmlal.s32 q7, d24, d21 - vmlal.s32 q7, d25, d20 - vmlal.s32 q4, d23, d21 - vmlal.s32 q4, d29, d20 - vmlal.s32 q8, d22, d21 - vmlal.s32 q8, d28, d20 - vmlal.s32 q5, d24, d20 - vst1.8 {d14-d15}, [r2, : 128] - vmull.s32 q7, d18, d6 - vmlal.s32 q7, d26, d0 - add r2, sp, #624 - vld1.8 {d30-d31}, [r2, : 128] - vmlal.s32 q2, d30, d21 - vmlal.s32 q7, d19, d21 - vmlal.s32 q7, d27, d20 - add r2, sp, #592 - vld1.8 {d26-d27}, [r2, : 128] - vmlal.s32 q4, d25, d27 - vmlal.s32 q8, d29, d27 - vmlal.s32 q8, d25, d26 - vmlal.s32 q7, d28, d27 - vmlal.s32 q7, d29, d26 - add r2, sp, #576 - vld1.8 {d28-d29}, [r2, : 128] - vmlal.s32 q4, d24, d29 - vmlal.s32 q8, d23, d29 - vmlal.s32 q8, d24, d28 - vmlal.s32 q7, d22, d29 - vmlal.s32 q7, d23, d28 - vst1.8 {d8-d9}, [r2, : 128] - add r2, sp, #528 - vld1.8 {d8-d9}, [r2, : 128] - vmlal.s32 q7, d24, d9 - vmlal.s32 q7, d25, d31 - vmull.s32 q1, d18, d2 - vmlal.s32 q1, d19, d1 - vmlal.s32 q1, d22, d0 - vmlal.s32 q1, d24, d27 - vmlal.s32 q1, d23, d20 - vmlal.s32 q1, d12, d7 - vmlal.s32 q1, d13, d6 - vmull.s32 q6, d18, d1 - vmlal.s32 q6, d19, d0 - vmlal.s32 q6, d23, d27 - vmlal.s32 q6, d22, d20 - vmlal.s32 q6, d24, d26 - vmull.s32 q0, d18, d0 - vmlal.s32 q0, d22, d27 - vmlal.s32 q0, d23, d26 - vmlal.s32 q0, d24, d31 - vmlal.s32 q0, d19, d20 - add r2, sp, #608 - vld1.8 {d18-d19}, [r2, : 128] - vmlal.s32 q2, d18, d7 - vmlal.s32 q5, d18, d6 - vmlal.s32 q1, d18, d21 - vmlal.s32 q0, d18, d28 - vmlal.s32 q6, d18, d29 - vmlal.s32 q2, d19, d6 - vmlal.s32 q5, d19, d21 - vmlal.s32 q1, d19, d29 - vmlal.s32 q0, d19, d9 - vmlal.s32 q6, d19, d28 - add r2, sp, #560 - vld1.8 {d18-d19}, [r2, : 128] - add r2, sp, #480 - vld1.8 {d22-d23}, [r2, : 128] - vmlal.s32 q5, d19, d7 - vmlal.s32 q0, d18, d21 - vmlal.s32 q0, d19, d29 - vmlal.s32 q6, d18, d6 - add r2, sp, #496 - vld1.8 {d6-d7}, [r2, : 128] - vmlal.s32 q6, d19, d21 - add r2, sp, #544 - vld1.8 {d18-d19}, [r2, : 128] - vmlal.s32 q0, d30, d8 - add r2, sp, #640 - vld1.8 {d20-d21}, [r2, : 128] - vmlal.s32 q5, d30, d29 - add r2, sp, #576 - vld1.8 {d24-d25}, [r2, : 128] - vmlal.s32 q1, d30, d28 - vadd.i64 q13, q0, q11 - vadd.i64 q14, q5, q11 - vmlal.s32 q6, d30, d9 - vshr.s64 q4, q13, #26 - vshr.s64 q13, q14, #26 - vadd.i64 q7, q7, q4 - vshl.i64 q4, q4, #26 - vadd.i64 q14, q7, q3 - vadd.i64 q9, q9, q13 - vshl.i64 q13, q13, #26 - vadd.i64 q15, q9, q3 - vsub.i64 q0, q0, q4 - vshr.s64 q4, q14, #25 - vsub.i64 q5, q5, q13 - vshr.s64 q13, q15, #25 - vadd.i64 q6, q6, q4 - vshl.i64 q4, q4, #25 - vadd.i64 q14, q6, q11 - vadd.i64 q2, q2, q13 - vsub.i64 q4, q7, q4 - vshr.s64 q7, q14, #26 - vshl.i64 q13, q13, #25 - vadd.i64 q14, q2, q11 - vadd.i64 q8, q8, q7 - vshl.i64 q7, q7, #26 - vadd.i64 q15, q8, q3 - vsub.i64 q9, q9, q13 - vshr.s64 q13, q14, #26 - vsub.i64 q6, q6, q7 - vshr.s64 q7, q15, #25 - vadd.i64 q10, q10, q13 - vshl.i64 q13, q13, #26 - vadd.i64 q14, q10, q3 - vadd.i64 q1, q1, q7 - add r2, r3, #240 - vshl.i64 q7, q7, #25 - add r4, r3, #144 - vadd.i64 q15, q1, q11 - add r2, r2, #8 - vsub.i64 q2, q2, q13 - add r4, r4, #8 - vshr.s64 q13, q14, #25 - vsub.i64 q7, q8, q7 - vshr.s64 q8, q15, #26 - vadd.i64 q14, q13, q13 - vadd.i64 q12, q12, q8 - vtrn.32 d12, d14 - vshl.i64 q8, q8, #26 - vtrn.32 d13, d15 - vadd.i64 q3, q12, q3 - vadd.i64 q0, q0, q14 - vst1.8 d12, [r2, : 64]! - vshl.i64 q7, q13, #4 - vst1.8 d13, [r4, : 64]! - vsub.i64 q1, q1, q8 - vshr.s64 q3, q3, #25 - vadd.i64 q0, q0, q7 - vadd.i64 q5, q5, q3 - vshl.i64 q3, q3, #25 - vadd.i64 q6, q5, q11 - vadd.i64 q0, q0, q13 - vshl.i64 q7, q13, #25 - vadd.i64 q8, q0, q11 - vsub.i64 q3, q12, q3 - vshr.s64 q6, q6, #26 - vsub.i64 q7, q10, q7 - vtrn.32 d2, d6 - vshr.s64 q8, q8, #26 - vtrn.32 d3, d7 - vadd.i64 q3, q9, q6 - vst1.8 d2, [r2, : 64] - vshl.i64 q6, q6, #26 - vst1.8 d3, [r4, : 64] - vadd.i64 q1, q4, q8 - vtrn.32 d4, d14 - vshl.i64 q4, q8, #26 - vtrn.32 d5, d15 - vsub.i64 q5, q5, q6 - add r2, r2, #16 - vsub.i64 q0, q0, q4 - vst1.8 d4, [r2, : 64] - add r4, r4, #16 - vst1.8 d5, [r4, : 64] - vtrn.32 d10, d6 - vtrn.32 d11, d7 - sub r2, r2, #8 - sub r4, r4, #8 - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vst1.8 d10, [r2, : 64] - vst1.8 d11, [r4, : 64] - sub r2, r2, #24 - sub r4, r4, #24 - vst1.8 d0, [r2, : 64] - vst1.8 d1, [r4, : 64] - ldr r2, [sp, #456] - ldr r4, [sp, #460] - subs r5, r2, #1 - bge .Lmainloop - add r1, r3, #144 - add r2, r3, #336 - vld1.8 {d0-d1}, [r1, : 128]! - vld1.8 {d2-d3}, [r1, : 128]! - vld1.8 {d4}, [r1, : 64] - vst1.8 {d0-d1}, [r2, : 128]! - vst1.8 {d2-d3}, [r2, : 128]! - vst1.8 d4, [r2, : 64] - movw r1, #0 -.Linvertloop: - add r2, r3, #144 - movw r4, #0 - movw r5, #2 - cmp r1, #1 - moveq r5, #1 - addeq r2, r3, #336 - addeq r4, r3, #48 - cmp r1, #2 - moveq r5, #1 - addeq r2, r3, #48 - cmp r1, #3 - moveq r5, #5 - addeq r4, r3, #336 - cmp r1, #4 - moveq r5, #10 - cmp r1, #5 - moveq r5, #20 - cmp r1, #6 - moveq r5, #10 - addeq r2, r3, #336 - addeq r4, r3, #336 - cmp r1, #7 - moveq r5, #50 - cmp r1, #8 - moveq r5, #100 - cmp r1, #9 - moveq r5, #50 - addeq r2, r3, #336 - cmp r1, #10 - moveq r5, #5 - addeq r2, r3, #48 - cmp r1, #11 - moveq r5, #0 - addeq r2, r3, #96 - add r6, r3, #144 - add r7, r3, #288 - vld1.8 {d0-d1}, [r6, : 128]! - vld1.8 {d2-d3}, [r6, : 128]! - vld1.8 {d4}, [r6, : 64] - vst1.8 {d0-d1}, [r7, : 128]! - vst1.8 {d2-d3}, [r7, : 128]! - vst1.8 d4, [r7, : 64] - cmp r5, #0 - beq .Lskipsquaringloop -.Lsquaringloop: - add r6, r3, #288 - add r7, r3, #288 - add r8, r3, #288 - vmov.i32 q0, #19 - vmov.i32 q1, #0 - vmov.i32 q2, #1 - vzip.i32 q1, q2 - vld1.8 {d4-d5}, [r7, : 128]! - vld1.8 {d6-d7}, [r7, : 128]! - vld1.8 {d9}, [r7, : 64] - vld1.8 {d10-d11}, [r6, : 128]! - add r7, sp, #384 - vld1.8 {d12-d13}, [r6, : 128]! - vmul.i32 q7, q2, q0 - vld1.8 {d8}, [r6, : 64] - vext.32 d17, d11, d10, #1 - vmul.i32 q9, q3, q0 - vext.32 d16, d10, d8, #1 - vshl.u32 q10, q5, q1 - vext.32 d22, d14, d4, #1 - vext.32 d24, d18, d6, #1 - vshl.u32 q13, q6, q1 - vshl.u32 d28, d8, d2 - vrev64.i32 d22, d22 - vmul.i32 d1, d9, d1 - vrev64.i32 d24, d24 - vext.32 d29, d8, d13, #1 - vext.32 d0, d1, d9, #1 - vrev64.i32 d0, d0 - vext.32 d2, d9, d1, #1 - vext.32 d23, d15, d5, #1 - vmull.s32 q4, d20, d4 - vrev64.i32 d23, d23 - vmlal.s32 q4, d21, d1 - vrev64.i32 d2, d2 - vmlal.s32 q4, d26, d19 - vext.32 d3, d5, d15, #1 - vmlal.s32 q4, d27, d18 - vrev64.i32 d3, d3 - vmlal.s32 q4, d28, d15 - vext.32 d14, d12, d11, #1 - vmull.s32 q5, d16, d23 - vext.32 d15, d13, d12, #1 - vmlal.s32 q5, d17, d4 - vst1.8 d8, [r7, : 64]! - vmlal.s32 q5, d14, d1 - vext.32 d12, d9, d8, #0 - vmlal.s32 q5, d15, d19 - vmov.i64 d13, #0 - vmlal.s32 q5, d29, d18 - vext.32 d25, d19, d7, #1 - vmlal.s32 q6, d20, d5 - vrev64.i32 d25, d25 - vmlal.s32 q6, d21, d4 - vst1.8 d11, [r7, : 64]! - vmlal.s32 q6, d26, d1 - vext.32 d9, d10, d10, #0 - vmlal.s32 q6, d27, d19 - vmov.i64 d8, #0 - vmlal.s32 q6, d28, d18 - vmlal.s32 q4, d16, d24 - vmlal.s32 q4, d17, d5 - vmlal.s32 q4, d14, d4 - vst1.8 d12, [r7, : 64]! - vmlal.s32 q4, d15, d1 - vext.32 d10, d13, d12, #0 - vmlal.s32 q4, d29, d19 - vmov.i64 d11, #0 - vmlal.s32 q5, d20, d6 - vmlal.s32 q5, d21, d5 - vmlal.s32 q5, d26, d4 - vext.32 d13, d8, d8, #0 - vmlal.s32 q5, d27, d1 - vmov.i64 d12, #0 - vmlal.s32 q5, d28, d19 - vst1.8 d9, [r7, : 64]! - vmlal.s32 q6, d16, d25 - vmlal.s32 q6, d17, d6 - vst1.8 d10, [r7, : 64] - vmlal.s32 q6, d14, d5 - vext.32 d8, d11, d10, #0 - vmlal.s32 q6, d15, d4 - vmov.i64 d9, #0 - vmlal.s32 q6, d29, d1 - vmlal.s32 q4, d20, d7 - vmlal.s32 q4, d21, d6 - vmlal.s32 q4, d26, d5 - vext.32 d11, d12, d12, #0 - vmlal.s32 q4, d27, d4 - vmov.i64 d10, #0 - vmlal.s32 q4, d28, d1 - vmlal.s32 q5, d16, d0 - sub r6, r7, #32 - vmlal.s32 q5, d17, d7 - vmlal.s32 q5, d14, d6 - vext.32 d30, d9, d8, #0 - vmlal.s32 q5, d15, d5 - vld1.8 {d31}, [r6, : 64]! - vmlal.s32 q5, d29, d4 - vmlal.s32 q15, d20, d0 - vext.32 d0, d6, d18, #1 - vmlal.s32 q15, d21, d25 - vrev64.i32 d0, d0 - vmlal.s32 q15, d26, d24 - vext.32 d1, d7, d19, #1 - vext.32 d7, d10, d10, #0 - vmlal.s32 q15, d27, d23 - vrev64.i32 d1, d1 - vld1.8 {d6}, [r6, : 64] - vmlal.s32 q15, d28, d22 - vmlal.s32 q3, d16, d4 - add r6, r6, #24 - vmlal.s32 q3, d17, d2 - vext.32 d4, d31, d30, #0 - vmov d17, d11 - vmlal.s32 q3, d14, d1 - vext.32 d11, d13, d13, #0 - vext.32 d13, d30, d30, #0 - vmlal.s32 q3, d15, d0 - vext.32 d1, d8, d8, #0 - vmlal.s32 q3, d29, d3 - vld1.8 {d5}, [r6, : 64] - sub r6, r6, #16 - vext.32 d10, d6, d6, #0 - vmov.i32 q1, #0xffffffff - vshl.i64 q4, q1, #25 - add r7, sp, #480 - vld1.8 {d14-d15}, [r7, : 128] - vadd.i64 q9, q2, q7 - vshl.i64 q1, q1, #26 - vshr.s64 q10, q9, #26 - vld1.8 {d0}, [r6, : 64]! - vadd.i64 q5, q5, q10 - vand q9, q9, q1 - vld1.8 {d16}, [r6, : 64]! - add r6, sp, #496 - vld1.8 {d20-d21}, [r6, : 128] - vadd.i64 q11, q5, q10 - vsub.i64 q2, q2, q9 - vshr.s64 q9, q11, #25 - vext.32 d12, d5, d4, #0 - vand q11, q11, q4 - vadd.i64 q0, q0, q9 - vmov d19, d7 - vadd.i64 q3, q0, q7 - vsub.i64 q5, q5, q11 - vshr.s64 q11, q3, #26 - vext.32 d18, d11, d10, #0 - vand q3, q3, q1 - vadd.i64 q8, q8, q11 - vadd.i64 q11, q8, q10 - vsub.i64 q0, q0, q3 - vshr.s64 q3, q11, #25 - vand q11, q11, q4 - vadd.i64 q3, q6, q3 - vadd.i64 q6, q3, q7 - vsub.i64 q8, q8, q11 - vshr.s64 q11, q6, #26 - vand q6, q6, q1 - vadd.i64 q9, q9, q11 - vadd.i64 d25, d19, d21 - vsub.i64 q3, q3, q6 - vshr.s64 d23, d25, #25 - vand q4, q12, q4 - vadd.i64 d21, d23, d23 - vshl.i64 d25, d23, #4 - vadd.i64 d21, d21, d23 - vadd.i64 d25, d25, d21 - vadd.i64 d4, d4, d25 - vzip.i32 q0, q8 - vadd.i64 d12, d4, d14 - add r6, r8, #8 - vst1.8 d0, [r6, : 64] - vsub.i64 d19, d19, d9 - add r6, r6, #16 - vst1.8 d16, [r6, : 64] - vshr.s64 d22, d12, #26 - vand q0, q6, q1 - vadd.i64 d10, d10, d22 - vzip.i32 q3, q9 - vsub.i64 d4, d4, d0 - sub r6, r6, #8 - vst1.8 d6, [r6, : 64] - add r6, r6, #16 - vst1.8 d18, [r6, : 64] - vzip.i32 q2, q5 - sub r6, r6, #32 - vst1.8 d4, [r6, : 64] - subs r5, r5, #1 - bhi .Lsquaringloop -.Lskipsquaringloop: - mov r2, r2 - add r5, r3, #288 - add r6, r3, #144 - vmov.i32 q0, #19 - vmov.i32 q1, #0 - vmov.i32 q2, #1 - vzip.i32 q1, q2 - vld1.8 {d4-d5}, [r5, : 128]! - vld1.8 {d6-d7}, [r5, : 128]! - vld1.8 {d9}, [r5, : 64] - vld1.8 {d10-d11}, [r2, : 128]! - add r5, sp, #384 - vld1.8 {d12-d13}, [r2, : 128]! - vmul.i32 q7, q2, q0 - vld1.8 {d8}, [r2, : 64] - vext.32 d17, d11, d10, #1 - vmul.i32 q9, q3, q0 - vext.32 d16, d10, d8, #1 - vshl.u32 q10, q5, q1 - vext.32 d22, d14, d4, #1 - vext.32 d24, d18, d6, #1 - vshl.u32 q13, q6, q1 - vshl.u32 d28, d8, d2 - vrev64.i32 d22, d22 - vmul.i32 d1, d9, d1 - vrev64.i32 d24, d24 - vext.32 d29, d8, d13, #1 - vext.32 d0, d1, d9, #1 - vrev64.i32 d0, d0 - vext.32 d2, d9, d1, #1 - vext.32 d23, d15, d5, #1 - vmull.s32 q4, d20, d4 - vrev64.i32 d23, d23 - vmlal.s32 q4, d21, d1 - vrev64.i32 d2, d2 - vmlal.s32 q4, d26, d19 - vext.32 d3, d5, d15, #1 - vmlal.s32 q4, d27, d18 - vrev64.i32 d3, d3 - vmlal.s32 q4, d28, d15 - vext.32 d14, d12, d11, #1 - vmull.s32 q5, d16, d23 - vext.32 d15, d13, d12, #1 - vmlal.s32 q5, d17, d4 - vst1.8 d8, [r5, : 64]! - vmlal.s32 q5, d14, d1 - vext.32 d12, d9, d8, #0 - vmlal.s32 q5, d15, d19 - vmov.i64 d13, #0 - vmlal.s32 q5, d29, d18 - vext.32 d25, d19, d7, #1 - vmlal.s32 q6, d20, d5 - vrev64.i32 d25, d25 - vmlal.s32 q6, d21, d4 - vst1.8 d11, [r5, : 64]! - vmlal.s32 q6, d26, d1 - vext.32 d9, d10, d10, #0 - vmlal.s32 q6, d27, d19 - vmov.i64 d8, #0 - vmlal.s32 q6, d28, d18 - vmlal.s32 q4, d16, d24 - vmlal.s32 q4, d17, d5 - vmlal.s32 q4, d14, d4 - vst1.8 d12, [r5, : 64]! - vmlal.s32 q4, d15, d1 - vext.32 d10, d13, d12, #0 - vmlal.s32 q4, d29, d19 - vmov.i64 d11, #0 - vmlal.s32 q5, d20, d6 - vmlal.s32 q5, d21, d5 - vmlal.s32 q5, d26, d4 - vext.32 d13, d8, d8, #0 - vmlal.s32 q5, d27, d1 - vmov.i64 d12, #0 - vmlal.s32 q5, d28, d19 - vst1.8 d9, [r5, : 64]! - vmlal.s32 q6, d16, d25 - vmlal.s32 q6, d17, d6 - vst1.8 d10, [r5, : 64] - vmlal.s32 q6, d14, d5 - vext.32 d8, d11, d10, #0 - vmlal.s32 q6, d15, d4 - vmov.i64 d9, #0 - vmlal.s32 q6, d29, d1 - vmlal.s32 q4, d20, d7 - vmlal.s32 q4, d21, d6 - vmlal.s32 q4, d26, d5 - vext.32 d11, d12, d12, #0 - vmlal.s32 q4, d27, d4 - vmov.i64 d10, #0 - vmlal.s32 q4, d28, d1 - vmlal.s32 q5, d16, d0 - sub r2, r5, #32 - vmlal.s32 q5, d17, d7 - vmlal.s32 q5, d14, d6 - vext.32 d30, d9, d8, #0 - vmlal.s32 q5, d15, d5 - vld1.8 {d31}, [r2, : 64]! - vmlal.s32 q5, d29, d4 - vmlal.s32 q15, d20, d0 - vext.32 d0, d6, d18, #1 - vmlal.s32 q15, d21, d25 - vrev64.i32 d0, d0 - vmlal.s32 q15, d26, d24 - vext.32 d1, d7, d19, #1 - vext.32 d7, d10, d10, #0 - vmlal.s32 q15, d27, d23 - vrev64.i32 d1, d1 - vld1.8 {d6}, [r2, : 64] - vmlal.s32 q15, d28, d22 - vmlal.s32 q3, d16, d4 - add r2, r2, #24 - vmlal.s32 q3, d17, d2 - vext.32 d4, d31, d30, #0 - vmov d17, d11 - vmlal.s32 q3, d14, d1 - vext.32 d11, d13, d13, #0 - vext.32 d13, d30, d30, #0 - vmlal.s32 q3, d15, d0 - vext.32 d1, d8, d8, #0 - vmlal.s32 q3, d29, d3 - vld1.8 {d5}, [r2, : 64] - sub r2, r2, #16 - vext.32 d10, d6, d6, #0 - vmov.i32 q1, #0xffffffff - vshl.i64 q4, q1, #25 - add r5, sp, #480 - vld1.8 {d14-d15}, [r5, : 128] - vadd.i64 q9, q2, q7 - vshl.i64 q1, q1, #26 - vshr.s64 q10, q9, #26 - vld1.8 {d0}, [r2, : 64]! - vadd.i64 q5, q5, q10 - vand q9, q9, q1 - vld1.8 {d16}, [r2, : 64]! - add r2, sp, #496 - vld1.8 {d20-d21}, [r2, : 128] - vadd.i64 q11, q5, q10 - vsub.i64 q2, q2, q9 - vshr.s64 q9, q11, #25 - vext.32 d12, d5, d4, #0 - vand q11, q11, q4 - vadd.i64 q0, q0, q9 - vmov d19, d7 - vadd.i64 q3, q0, q7 - vsub.i64 q5, q5, q11 - vshr.s64 q11, q3, #26 - vext.32 d18, d11, d10, #0 - vand q3, q3, q1 - vadd.i64 q8, q8, q11 - vadd.i64 q11, q8, q10 - vsub.i64 q0, q0, q3 - vshr.s64 q3, q11, #25 - vand q11, q11, q4 - vadd.i64 q3, q6, q3 - vadd.i64 q6, q3, q7 - vsub.i64 q8, q8, q11 - vshr.s64 q11, q6, #26 - vand q6, q6, q1 - vadd.i64 q9, q9, q11 - vadd.i64 d25, d19, d21 - vsub.i64 q3, q3, q6 - vshr.s64 d23, d25, #25 - vand q4, q12, q4 - vadd.i64 d21, d23, d23 - vshl.i64 d25, d23, #4 - vadd.i64 d21, d21, d23 - vadd.i64 d25, d25, d21 - vadd.i64 d4, d4, d25 - vzip.i32 q0, q8 - vadd.i64 d12, d4, d14 - add r2, r6, #8 - vst1.8 d0, [r2, : 64] - vsub.i64 d19, d19, d9 - add r2, r2, #16 - vst1.8 d16, [r2, : 64] - vshr.s64 d22, d12, #26 - vand q0, q6, q1 - vadd.i64 d10, d10, d22 - vzip.i32 q3, q9 - vsub.i64 d4, d4, d0 - sub r2, r2, #8 - vst1.8 d6, [r2, : 64] - add r2, r2, #16 - vst1.8 d18, [r2, : 64] - vzip.i32 q2, q5 - sub r2, r2, #32 - vst1.8 d4, [r2, : 64] - cmp r4, #0 - beq .Lskippostcopy - add r2, r3, #144 - mov r4, r4 - vld1.8 {d0-d1}, [r2, : 128]! - vld1.8 {d2-d3}, [r2, : 128]! - vld1.8 {d4}, [r2, : 64] - vst1.8 {d0-d1}, [r4, : 128]! - vst1.8 {d2-d3}, [r4, : 128]! - vst1.8 d4, [r4, : 64] -.Lskippostcopy: - cmp r1, #1 - bne .Lskipfinalcopy - add r2, r3, #288 - add r4, r3, #144 - vld1.8 {d0-d1}, [r2, : 128]! - vld1.8 {d2-d3}, [r2, : 128]! - vld1.8 {d4}, [r2, : 64] - vst1.8 {d0-d1}, [r4, : 128]! - vst1.8 {d2-d3}, [r4, : 128]! - vst1.8 d4, [r4, : 64] -.Lskipfinalcopy: - add r1, r1, #1 - cmp r1, #12 - blo .Linvertloop - add r1, r3, #144 - ldr r2, [r1], #4 - ldr r3, [r1], #4 - ldr r4, [r1], #4 - ldr r5, [r1], #4 - ldr r6, [r1], #4 - ldr r7, [r1], #4 - ldr r8, [r1], #4 - ldr r9, [r1], #4 - ldr r10, [r1], #4 - ldr r1, [r1] - add r11, r1, r1, LSL #4 - add r11, r11, r1, LSL #1 - add r11, r11, #16777216 - mov r11, r11, ASR #25 - add r11, r11, r2 - mov r11, r11, ASR #26 - add r11, r11, r3 - mov r11, r11, ASR #25 - add r11, r11, r4 - mov r11, r11, ASR #26 - add r11, r11, r5 - mov r11, r11, ASR #25 - add r11, r11, r6 - mov r11, r11, ASR #26 - add r11, r11, r7 - mov r11, r11, ASR #25 - add r11, r11, r8 - mov r11, r11, ASR #26 - add r11, r11, r9 - mov r11, r11, ASR #25 - add r11, r11, r10 - mov r11, r11, ASR #26 - add r11, r11, r1 - mov r11, r11, ASR #25 - add r2, r2, r11 - add r2, r2, r11, LSL #1 - add r2, r2, r11, LSL #4 - mov r11, r2, ASR #26 - add r3, r3, r11 - sub r2, r2, r11, LSL #26 - mov r11, r3, ASR #25 - add r4, r4, r11 - sub r3, r3, r11, LSL #25 - mov r11, r4, ASR #26 - add r5, r5, r11 - sub r4, r4, r11, LSL #26 - mov r11, r5, ASR #25 - add r6, r6, r11 - sub r5, r5, r11, LSL #25 - mov r11, r6, ASR #26 - add r7, r7, r11 - sub r6, r6, r11, LSL #26 - mov r11, r7, ASR #25 - add r8, r8, r11 - sub r7, r7, r11, LSL #25 - mov r11, r8, ASR #26 - add r9, r9, r11 - sub r8, r8, r11, LSL #26 - mov r11, r9, ASR #25 - add r10, r10, r11 - sub r9, r9, r11, LSL #25 - mov r11, r10, ASR #26 - add r1, r1, r11 - sub r10, r10, r11, LSL #26 - mov r11, r1, ASR #25 - sub r1, r1, r11, LSL #25 - add r2, r2, r3, LSL #26 - mov r3, r3, LSR #6 - add r3, r3, r4, LSL #19 - mov r4, r4, LSR #13 - add r4, r4, r5, LSL #13 - mov r5, r5, LSR #19 - add r5, r5, r6, LSL #6 - add r6, r7, r8, LSL #25 - mov r7, r8, LSR #7 - add r7, r7, r9, LSL #19 - mov r8, r9, LSR #13 - add r8, r8, r10, LSL #12 - mov r9, r10, LSR #20 - add r1, r9, r1, LSL #6 - str r2, [r0] - str r3, [r0, #4] - str r4, [r0, #8] - str r5, [r0, #12] - str r6, [r0, #16] - str r7, [r0, #20] - str r8, [r0, #24] - str r1, [r0, #28] - movw r0, #0 - mov sp, ip - pop {r4-r11, pc} -ENDPROC(curve25519_neon) diff --git a/arch/arm/crypto/curve25519-glue.c b/arch/arm/crypto/curve25519-glue.c deleted file mode 100644 index 3076020d8fbe..000000000000 --- a/arch/arm/crypto/curve25519-glue.c +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - * - * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This - * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been - * manually reworked for use in kernel space. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], - const u8 scalar[CURVE25519_KEY_SIZE], - const u8 point[CURVE25519_KEY_SIZE]) -{ - if (static_branch_likely(&have_neon) && crypto_simd_usable()) { - kernel_neon_begin(); - curve25519_neon(out, scalar, point); - kernel_neon_end(); - } else { - curve25519_generic(out, scalar, point); - } -} -EXPORT_SYMBOL(curve25519_arch); - -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) -{ - return curve25519_arch(pub, secret, curve25519_base_point); -} -EXPORT_SYMBOL(curve25519_base_arch); - -static int __init arm_curve25519_init(void) -{ - if (elf_hwcap & HWCAP_NEON) - static_branch_enable(&have_neon); - return 0; -} - -static void __exit arm_curve25519_exit(void) -{ -} - -module_init(arm_curve25519_init); -module_exit(arm_curve25519_exit); - -MODULE_DESCRIPTION("Public key crypto: Curve25519 (NEON-accelerated)"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 6106a219da6a..662aed46f9c7 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -2,18 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (powerpc)" -config CRYPTO_CURVE25519_PPC64 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CURVE25519 - default CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 algorithm - - Architecture: PowerPC64 - - Little-endian - config CRYPTO_AES_PPC_SPE tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)" depends on SPE diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 9eb59dce67f3..5960e5300db7 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -8,12 +8,10 @@ obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o -obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o -curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) override flavour := linux-ppc64le diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c deleted file mode 100644 index 6eb18ee19cad..000000000000 --- a/arch/powerpc/crypto/curve25519-ppc64le-core.c +++ /dev/null @@ -1,195 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright 2024- IBM Corp. - * - * X25519 scalar multiplication with 51 bits limbs for PPC64le. - * Based on RFC7748 and AArch64 optimized implementation for X25519 - * - Algorithm 1 Scalar multiplication of a variable point - */ - -#include - -#include -#include -#include -#include - -#include -#include - -typedef uint64_t fe51[5]; - -asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g); -asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f); -asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f); -asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n); -asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s); -asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h); -asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit); - -#define fmul x25519_fe51_mul -#define fsqr x25519_fe51_sqr -#define fmul121666 x25519_fe51_mul121666 -#define fe51_tobytes x25519_fe51_tobytes - -static void fadd(fe51 h, const fe51 f, const fe51 g) -{ - h[0] = f[0] + g[0]; - h[1] = f[1] + g[1]; - h[2] = f[2] + g[2]; - h[3] = f[3] + g[3]; - h[4] = f[4] + g[4]; -} - -/* - * Prime = 2 ** 255 - 19, 255 bits - * (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed) - * - * Prime in 5 51-bit limbs - */ -static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff}; - -static void fsub(fe51 h, const fe51 f, const fe51 g) -{ - h[0] = (f[0] + ((prime51[0] * 2))) - g[0]; - h[1] = (f[1] + ((prime51[1] * 2))) - g[1]; - h[2] = (f[2] + ((prime51[2] * 2))) - g[2]; - h[3] = (f[3] + ((prime51[3] * 2))) - g[3]; - h[4] = (f[4] + ((prime51[4] * 2))) - g[4]; -} - -static void fe51_frombytes(fe51 h, const uint8_t *s) -{ - /* - * Make sure 64-bit aligned. - */ - unsigned char sbuf[32+8]; - unsigned char *sb = PTR_ALIGN((void *)sbuf, 8); - - memcpy(sb, s, 32); - x25519_fe51_frombytes(h, sb); -} - -static void finv(fe51 o, const fe51 i) -{ - fe51 a0, b, c, t00; - - fsqr(a0, i); - x25519_fe51_sqr_times(t00, a0, 2); - - fmul(b, t00, i); - fmul(a0, b, a0); - - fsqr(t00, a0); - - fmul(b, t00, b); - x25519_fe51_sqr_times(t00, b, 5); - - fmul(b, t00, b); - x25519_fe51_sqr_times(t00, b, 10); - - fmul(c, t00, b); - x25519_fe51_sqr_times(t00, c, 20); - - fmul(t00, t00, c); - x25519_fe51_sqr_times(t00, t00, 10); - - fmul(b, t00, b); - x25519_fe51_sqr_times(t00, b, 50); - - fmul(c, t00, b); - x25519_fe51_sqr_times(t00, c, 100); - - fmul(t00, t00, c); - x25519_fe51_sqr_times(t00, t00, 50); - - fmul(t00, t00, b); - x25519_fe51_sqr_times(t00, t00, 5); - - fmul(o, t00, a0); -} - -static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32], - const uint8_t point[32]) -{ - fe51 x1, x2, z2, x3, z3; - uint8_t s[32]; - unsigned int swap = 0; - int i; - - memcpy(s, scalar, 32); - s[0] &= 0xf8; - s[31] &= 0x7f; - s[31] |= 0x40; - fe51_frombytes(x1, point); - - z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0; - x3[0] = x1[0]; - x3[1] = x1[1]; - x3[2] = x1[2]; - x3[3] = x1[3]; - x3[4] = x1[4]; - - x2[0] = z3[0] = 1; - x2[1] = z3[1] = 0; - x2[2] = z3[2] = 0; - x2[3] = z3[3] = 0; - x2[4] = z3[4] = 0; - - for (i = 254; i >= 0; --i) { - unsigned int k_t = 1 & (s[i / 8] >> (i & 7)); - fe51 a, b, c, d, e; - fe51 da, cb, aa, bb; - fe51 dacb_p, dacb_m; - - swap ^= k_t; - x25519_cswap(x2, x3, swap); - x25519_cswap(z2, z3, swap); - swap = k_t; - - fsub(b, x2, z2); // B = x_2 - z_2 - fadd(a, x2, z2); // A = x_2 + z_2 - fsub(d, x3, z3); // D = x_3 - z_3 - fadd(c, x3, z3); // C = x_3 + z_3 - - fsqr(bb, b); // BB = B^2 - fsqr(aa, a); // AA = A^2 - fmul(da, d, a); // DA = D * A - fmul(cb, c, b); // CB = C * B - - fsub(e, aa, bb); // E = AA - BB - fmul(x2, aa, bb); // x2 = AA * BB - fadd(dacb_p, da, cb); // DA + CB - fsub(dacb_m, da, cb); // DA - CB - - fmul121666(z3, e); // 121666 * E - fsqr(z2, dacb_m); // (DA - CB)^2 - fsqr(x3, dacb_p); // x3 = (DA + CB)^2 - fadd(b, bb, z3); // BB + 121666 * E - fmul(z3, x1, z2); // z3 = x1 * (DA - CB)^2 - fmul(z2, e, b); // z2 = e * (BB + (DA + CB)^2) - } - - finv(z2, z2); - fmul(x2, x2, z2); - fe51_tobytes(out, x2); -} - -void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]) -{ - curve25519_fe51(mypublic, secret, basepoint); -} -EXPORT_SYMBOL(curve25519_arch); - -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) -{ - curve25519_fe51(pub, secret, curve25519_base_point); -} -EXPORT_SYMBOL(curve25519_base_arch); - -MODULE_DESCRIPTION("PPC64le Curve25519 scalar multiplication with 51 bits limbs"); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Danny Tsen "); diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S deleted file mode 100644 index 06c1febe24b9..000000000000 --- a/arch/powerpc/crypto/curve25519-ppc64le_asm.S +++ /dev/null @@ -1,671 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -# -# This code is taken from CRYPTOGAMs[1] and is included here using the option -# in the license to distribute the code under the GPL. Therefore this program -# is free software; you can redistribute it and/or modify it under the terms of -# the GNU General Public License version 2 as published by the Free Software -# Foundation. -# -# [1] https://github.com/dot-asm/cryptogams/ - -# Copyright (c) 2006-2017, CRYPTOGAMS by -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain copyright notices, -# this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# * Neither the name of the CRYPTOGAMS nor the names of its -# copyright holder and contributors may be used to endorse or -# promote products derived from this software without specific -# prior written permission. -# -# ALTERNATIVELY, provided that this notice is retained in full, this -# product may be distributed under the terms of the GNU General Public -# License (GPL), in which case the provisions of the GPL apply INSTEAD OF -# those given above. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see https://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# -# ==================================================================== -# Written and Modified by Danny Tsen -# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes -# and x25519_cswap -# -# Copyright 2024- IBM Corp. -# -# X25519 lower-level primitives for PPC64. -# - -#include - -.text - -.align 5 -SYM_FUNC_START(x25519_fe51_mul) - - stdu 1,-144(1) - std 21,56(1) - std 22,64(1) - std 23,72(1) - std 24,80(1) - std 25,88(1) - std 26,96(1) - std 27,104(1) - std 28,112(1) - std 29,120(1) - std 30,128(1) - std 31,136(1) - - ld 6,0(5) - ld 7,0(4) - ld 8,8(4) - ld 9,16(4) - ld 10,24(4) - ld 11,32(4) - - mulld 22,7,6 - mulhdu 23,7,6 - - mulld 24,8,6 - mulhdu 25,8,6 - - mulld 30,11,6 - mulhdu 31,11,6 - ld 4,8(5) - mulli 11,11,19 - - mulld 26,9,6 - mulhdu 27,9,6 - - mulld 28,10,6 - mulhdu 29,10,6 - mulld 12,11,4 - mulhdu 21,11,4 - addc 22,22,12 - adde 23,23,21 - - mulld 12,7,4 - mulhdu 21,7,4 - addc 24,24,12 - adde 25,25,21 - - mulld 12,10,4 - mulhdu 21,10,4 - ld 6,16(5) - mulli 10,10,19 - addc 30,30,12 - adde 31,31,21 - - mulld 12,8,4 - mulhdu 21,8,4 - addc 26,26,12 - adde 27,27,21 - - mulld 12,9,4 - mulhdu 21,9,4 - addc 28,28,12 - adde 29,29,21 - mulld 12,10,6 - mulhdu 21,10,6 - addc 22,22,12 - adde 23,23,21 - - mulld 12,11,6 - mulhdu 21,11,6 - addc 24,24,12 - adde 25,25,21 - - mulld 12,9,6 - mulhdu 21,9,6 - ld 4,24(5) - mulli 9,9,19 - addc 30,30,12 - adde 31,31,21 - - mulld 12,7,6 - mulhdu 21,7,6 - addc 26,26,12 - adde 27,27,21 - - mulld 12,8,6 - mulhdu 21,8,6 - addc 28,28,12 - adde 29,29,21 - mulld 12,9,4 - mulhdu 21,9,4 - addc 22,22,12 - adde 23,23,21 - - mulld 12,10,4 - mulhdu 21,10,4 - addc 24,24,12 - adde 25,25,21 - - mulld 12,8,4 - mulhdu 21,8,4 - ld 6,32(5) - mulli 8,8,19 - addc 30,30,12 - adde 31,31,21 - - mulld 12,11,4 - mulhdu 21,11,4 - addc 26,26,12 - adde 27,27,21 - - mulld 12,7,4 - mulhdu 21,7,4 - addc 28,28,12 - adde 29,29,21 - mulld 12,8,6 - mulhdu 21,8,6 - addc 22,22,12 - adde 23,23,21 - - mulld 12,9,6 - mulhdu 21,9,6 - addc 24,24,12 - adde 25,25,21 - - mulld 12,10,6 - mulhdu 21,10,6 - addc 26,26,12 - adde 27,27,21 - - mulld 12,11,6 - mulhdu 21,11,6 - addc 28,28,12 - adde 29,29,21 - - mulld 12,7,6 - mulhdu 21,7,6 - addc 30,30,12 - adde 31,31,21 - -.Lfe51_reduce: - li 0,-1 - srdi 0,0,13 - - srdi 12,26,51 - and 9,26,0 - insrdi 12,27,51,0 - srdi 21,22,51 - and 7,22,0 - insrdi 21,23,51,0 - addc 28,28,12 - addze 29,29 - addc 24,24,21 - addze 25,25 - - srdi 12,28,51 - and 10,28,0 - insrdi 12,29,51,0 - srdi 21,24,51 - and 8,24,0 - insrdi 21,25,51,0 - addc 30,30,12 - addze 31,31 - add 9,9,21 - - srdi 12,30,51 - and 11,30,0 - insrdi 12,31,51,0 - mulli 12,12,19 - - add 7,7,12 - - srdi 21,9,51 - and 9,9,0 - add 10,10,21 - - srdi 12,7,51 - and 7,7,0 - add 8,8,12 - - std 9,16(3) - std 10,24(3) - std 11,32(3) - std 7,0(3) - std 8,8(3) - - ld 21,56(1) - ld 22,64(1) - ld 23,72(1) - ld 24,80(1) - ld 25,88(1) - ld 26,96(1) - ld 27,104(1) - ld 28,112(1) - ld 29,120(1) - ld 30,128(1) - ld 31,136(1) - addi 1,1,144 - blr -SYM_FUNC_END(x25519_fe51_mul) - -.align 5 -SYM_FUNC_START(x25519_fe51_sqr) - - stdu 1,-144(1) - std 21,56(1) - std 22,64(1) - std 23,72(1) - std 24,80(1) - std 25,88(1) - std 26,96(1) - std 27,104(1) - std 28,112(1) - std 29,120(1) - std 30,128(1) - std 31,136(1) - - ld 7,0(4) - ld 8,8(4) - ld 9,16(4) - ld 10,24(4) - ld 11,32(4) - - add 6,7,7 - mulli 21,11,19 - - mulld 22,7,7 - mulhdu 23,7,7 - mulld 24,8,6 - mulhdu 25,8,6 - mulld 26,9,6 - mulhdu 27,9,6 - mulld 28,10,6 - mulhdu 29,10,6 - mulld 30,11,6 - mulhdu 31,11,6 - add 6,8,8 - mulld 12,11,21 - mulhdu 11,11,21 - addc 28,28,12 - adde 29,29,11 - - mulli 5,10,19 - - mulld 12,8,8 - mulhdu 11,8,8 - addc 26,26,12 - adde 27,27,11 - mulld 12,9,6 - mulhdu 11,9,6 - addc 28,28,12 - adde 29,29,11 - mulld 12,10,6 - mulhdu 11,10,6 - addc 30,30,12 - adde 31,31,11 - mulld 12,21,6 - mulhdu 11,21,6 - add 6,10,10 - addc 22,22,12 - adde 23,23,11 - mulld 12,10,5 - mulhdu 10,10,5 - addc 24,24,12 - adde 25,25,10 - mulld 12,6,21 - mulhdu 10,6,21 - add 6,9,9 - addc 26,26,12 - adde 27,27,10 - - mulld 12,9,9 - mulhdu 10,9,9 - addc 30,30,12 - adde 31,31,10 - mulld 12,5,6 - mulhdu 10,5,6 - addc 22,22,12 - adde 23,23,10 - mulld 12,21,6 - mulhdu 10,21,6 - addc 24,24,12 - adde 25,25,10 - - b .Lfe51_reduce -SYM_FUNC_END(x25519_fe51_sqr) - -.align 5 -SYM_FUNC_START(x25519_fe51_mul121666) - - stdu 1,-144(1) - std 21,56(1) - std 22,64(1) - std 23,72(1) - std 24,80(1) - std 25,88(1) - std 26,96(1) - std 27,104(1) - std 28,112(1) - std 29,120(1) - std 30,128(1) - std 31,136(1) - - lis 6,1 - ori 6,6,56130 - ld 7,0(4) - ld 8,8(4) - ld 9,16(4) - ld 10,24(4) - ld 11,32(4) - - mulld 22,7,6 - mulhdu 23,7,6 - mulld 24,8,6 - mulhdu 25,8,6 - mulld 26,9,6 - mulhdu 27,9,6 - mulld 28,10,6 - mulhdu 29,10,6 - mulld 30,11,6 - mulhdu 31,11,6 - - b .Lfe51_reduce -SYM_FUNC_END(x25519_fe51_mul121666) - -.align 5 -SYM_FUNC_START(x25519_fe51_sqr_times) - - stdu 1,-144(1) - std 21,56(1) - std 22,64(1) - std 23,72(1) - std 24,80(1) - std 25,88(1) - std 26,96(1) - std 27,104(1) - std 28,112(1) - std 29,120(1) - std 30,128(1) - std 31,136(1) - - ld 7,0(4) - ld 8,8(4) - ld 9,16(4) - ld 10,24(4) - ld 11,32(4) - - mtctr 5 - -.Lsqr_times_loop: - add 6,7,7 - mulli 21,11,19 - - mulld 22,7,7 - mulhdu 23,7,7 - mulld 24,8,6 - mulhdu 25,8,6 - mulld 26,9,6 - mulhdu 27,9,6 - mulld 28,10,6 - mulhdu 29,10,6 - mulld 30,11,6 - mulhdu 31,11,6 - add 6,8,8 - mulld 12,11,21 - mulhdu 11,11,21 - addc 28,28,12 - adde 29,29,11 - - mulli 5,10,19 - - mulld 12,8,8 - mulhdu 11,8,8 - addc 26,26,12 - adde 27,27,11 - mulld 12,9,6 - mulhdu 11,9,6 - addc 28,28,12 - adde 29,29,11 - mulld 12,10,6 - mulhdu 11,10,6 - addc 30,30,12 - adde 31,31,11 - mulld 12,21,6 - mulhdu 11,21,6 - add 6,10,10 - addc 22,22,12 - adde 23,23,11 - mulld 12,10,5 - mulhdu 10,10,5 - addc 24,24,12 - adde 25,25,10 - mulld 12,6,21 - mulhdu 10,6,21 - add 6,9,9 - addc 26,26,12 - adde 27,27,10 - - mulld 12,9,9 - mulhdu 10,9,9 - addc 30,30,12 - adde 31,31,10 - mulld 12,5,6 - mulhdu 10,5,6 - addc 22,22,12 - adde 23,23,10 - mulld 12,21,6 - mulhdu 10,21,6 - addc 24,24,12 - adde 25,25,10 - - # fe51_reduce - li 0,-1 - srdi 0,0,13 - - srdi 12,26,51 - and 9,26,0 - insrdi 12,27,51,0 - srdi 21,22,51 - and 7,22,0 - insrdi 21,23,51,0 - addc 28,28,12 - addze 29,29 - addc 24,24,21 - addze 25,25 - - srdi 12,28,51 - and 10,28,0 - insrdi 12,29,51,0 - srdi 21,24,51 - and 8,24,0 - insrdi 21,25,51,0 - addc 30,30,12 - addze 31,31 - add 9,9,21 - - srdi 12,30,51 - and 11,30,0 - insrdi 12,31,51,0 - mulli 12,12,19 - - add 7,7,12 - - srdi 21,9,51 - and 9,9,0 - add 10,10,21 - - srdi 12,7,51 - and 7,7,0 - add 8,8,12 - - bdnz .Lsqr_times_loop - - std 9,16(3) - std 10,24(3) - std 11,32(3) - std 7,0(3) - std 8,8(3) - - ld 21,56(1) - ld 22,64(1) - ld 23,72(1) - ld 24,80(1) - ld 25,88(1) - ld 26,96(1) - ld 27,104(1) - ld 28,112(1) - ld 29,120(1) - ld 30,128(1) - ld 31,136(1) - addi 1,1,144 - blr -SYM_FUNC_END(x25519_fe51_sqr_times) - -.align 5 -SYM_FUNC_START(x25519_fe51_frombytes) - - li 12, -1 - srdi 12, 12, 13 # 0x7ffffffffffff - - ld 5, 0(4) - ld 6, 8(4) - ld 7, 16(4) - ld 8, 24(4) - - srdi 10, 5, 51 - and 5, 5, 12 # h0 - - sldi 11, 6, 13 - or 11, 10, 11 # h1t - srdi 10, 6, 38 - and 6, 11, 12 # h1 - - sldi 11, 7, 26 - or 10, 10, 11 # h2t - - srdi 11, 7, 25 - and 7, 10, 12 # h2 - sldi 10, 8, 39 - or 11, 11, 10 # h3t - - srdi 9, 8, 12 - and 8, 11, 12 # h3 - and 9, 9, 12 # h4 - - std 5, 0(3) - std 6, 8(3) - std 7, 16(3) - std 8, 24(3) - std 9, 32(3) - - blr -SYM_FUNC_END(x25519_fe51_frombytes) - -.align 5 -SYM_FUNC_START(x25519_fe51_tobytes) - - ld 5, 0(4) - ld 6, 8(4) - ld 7, 16(4) - ld 8, 24(4) - ld 9, 32(4) - - li 12, -1 - srdi 12, 12, 13 # 0x7ffffffffffff - - # Full reducuction - addi 10, 5, 19 - srdi 10, 10, 51 - add 10, 10, 6 - srdi 10, 10, 51 - add 10, 10, 7 - srdi 10, 10, 51 - add 10, 10, 8 - srdi 10, 10, 51 - add 10, 10, 9 - srdi 10, 10, 51 - - mulli 10, 10, 19 - add 5, 5, 10 - srdi 11, 5, 51 - add 6, 6, 11 - srdi 11, 6, 51 - add 7, 7, 11 - srdi 11, 7, 51 - add 8, 8, 11 - srdi 11, 8, 51 - add 9, 9, 11 - - and 5, 5, 12 - and 6, 6, 12 - and 7, 7, 12 - and 8, 8, 12 - and 9, 9, 12 - - sldi 10, 6, 51 - or 5, 5, 10 # s0 - - srdi 11, 6, 13 - sldi 10, 7, 38 - or 6, 11, 10 # s1 - - srdi 11, 7, 26 - sldi 10, 8, 25 - or 7, 11, 10 # s2 - - srdi 11, 8, 39 - sldi 10, 9, 12 - or 8, 11, 10 # s4 - - std 5, 0(3) - std 6, 8(3) - std 7, 16(3) - std 8, 24(3) - - blr -SYM_FUNC_END(x25519_fe51_tobytes) - -.align 5 -SYM_FUNC_START(x25519_cswap) - - li 7, 5 - neg 6, 5 - mtctr 7 - -.Lswap_loop: - ld 8, 0(3) - ld 9, 0(4) - xor 10, 8, 9 - and 10, 10, 6 - xor 11, 8, 10 - xor 12, 9, 10 - std 11, 0(3) - addi 3, 3, 8 - std 12, 0(4) - addi 4, 4, 8 - bdnz .Lswap_loop - - blr -SYM_FUNC_END(x25519_cswap) diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 6a895a571b00..d9c6fc78cf33 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -2,18 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)" -config CRYPTO_CURVE25519_X86 - tristate - depends on 64BIT - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CURVE25519 - default CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 algorithm - - Architecture: x86_64 using: - - ADX (large integer arithmetic) - config CRYPTO_AES_NI_INTEL tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)" select CRYPTO_AEAD diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index d402963d6b57..dfba7e5e88ea 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -62,8 +62,6 @@ nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o -obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o - obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o @@ -81,6 +79,3 @@ aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o - -# Disable GCOV in odd or sensitive code -GCOV_PROFILE_curve25519-x86_64.o := n diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c deleted file mode 100644 index ab91368284a4..000000000000 --- a/arch/x86/crypto/curve25519-x86_64.c +++ /dev/null @@ -1,1630 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2020 Jason A. Donenfeld . All Rights Reserved. - * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation - */ - -#include - -#include -#include -#include -#include -#include - -#include -#include - -static __always_inline u64 eq_mask(u64 a, u64 b) -{ - u64 x = a ^ b; - u64 minus_x = ~x + (u64)1U; - u64 x_or_minus_x = x | minus_x; - u64 xnx = x_or_minus_x >> (u32)63U; - return xnx - (u64)1U; -} - -static __always_inline u64 gte_mask(u64 a, u64 b) -{ - u64 x = a; - u64 y = b; - u64 x_xor_y = x ^ y; - u64 x_sub_y = x - y; - u64 x_sub_y_xor_y = x_sub_y ^ y; - u64 q = x_xor_y | x_sub_y_xor_y; - u64 x_xor_q = x ^ q; - u64 x_xor_q_ = x_xor_q >> (u32)63U; - return x_xor_q_ - (u64)1U; -} - -/* Computes the addition of four-element f1 with value in f2 - * and returns the carry (if any) */ -static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2) -{ - u64 carry_r; - - asm volatile( - /* Clear registers to propagate the carry bit */ - " xor %%r8d, %%r8d;" - " xor %%r9d, %%r9d;" - " xor %%r10d, %%r10d;" - " xor %%r11d, %%r11d;" - " xor %k1, %k1;" - - /* Begin addition chain */ - " addq 0(%3), %0;" - " movq %0, 0(%2);" - " adcxq 8(%3), %%r8;" - " movq %%r8, 8(%2);" - " adcxq 16(%3), %%r9;" - " movq %%r9, 16(%2);" - " adcxq 24(%3), %%r10;" - " movq %%r10, 24(%2);" - - /* Return the carry bit in a register */ - " adcx %%r11, %1;" - : "+&r"(f2), "=&r"(carry_r) - : "r"(out), "r"(f1) - : "%r8", "%r9", "%r10", "%r11", "memory", "cc"); - - return carry_r; -} - -/* Computes the field addition of two field elements */ -static inline void fadd(u64 *out, const u64 *f1, const u64 *f2) -{ - asm volatile( - /* Compute the raw addition of f1 + f2 */ - " movq 0(%0), %%r8;" - " addq 0(%2), %%r8;" - " movq 8(%0), %%r9;" - " adcxq 8(%2), %%r9;" - " movq 16(%0), %%r10;" - " adcxq 16(%2), %%r10;" - " movq 24(%0), %%r11;" - " adcxq 24(%2), %%r11;" - - /* Wrap the result back into the field */ - - /* Step 1: Compute carry*38 */ - " mov $0, %%rax;" - " mov $38, %0;" - " cmovc %0, %%rax;" - - /* Step 2: Add carry*38 to the original sum */ - " xor %%ecx, %%ecx;" - " add %%rax, %%r8;" - " adcx %%rcx, %%r9;" - " movq %%r9, 8(%1);" - " adcx %%rcx, %%r10;" - " movq %%r10, 16(%1);" - " adcx %%rcx, %%r11;" - " movq %%r11, 24(%1);" - - /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ - " mov $0, %%rax;" - " cmovc %0, %%rax;" - " add %%rax, %%r8;" - " movq %%r8, 0(%1);" - : "+&r"(f2) - : "r"(out), "r"(f1) - : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"); -} - -/* Computes the field subtraction of two field elements */ -static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) -{ - asm volatile( - /* Compute the raw subtraction of f1-f2 */ - " movq 0(%1), %%r8;" - " subq 0(%2), %%r8;" - " movq 8(%1), %%r9;" - " sbbq 8(%2), %%r9;" - " movq 16(%1), %%r10;" - " sbbq 16(%2), %%r10;" - " movq 24(%1), %%r11;" - " sbbq 24(%2), %%r11;" - - /* Wrap the result back into the field */ - - /* Step 1: Compute carry*38 */ - " mov $0, %%rax;" - " mov $38, %%rcx;" - " cmovc %%rcx, %%rax;" - - /* Step 2: Subtract carry*38 from the original difference */ - " sub %%rax, %%r8;" - " sbb $0, %%r9;" - " sbb $0, %%r10;" - " sbb $0, %%r11;" - - /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ - " mov $0, %%rax;" - " cmovc %%rcx, %%rax;" - " sub %%rax, %%r8;" - - /* Store the result */ - " movq %%r8, 0(%0);" - " movq %%r9, 8(%0);" - " movq %%r10, 16(%0);" - " movq %%r11, 24(%0);" - : - : "r"(out), "r"(f1), "r"(f2) - : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"); -} - -/* Computes a field multiplication: out <- f1 * f2 - * Uses the 8-element buffer tmp for intermediate results */ -static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) -{ - asm volatile( - - /* Compute the raw multiplication: tmp <- src1 * src2 */ - - /* Compute src1[0] * src2 */ - " movq 0(%0), %%rdx;" - " mulxq 0(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " movq %%r8, 0(%2);" - " mulxq 8(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " movq %%r10, 8(%2);" - " mulxq 16(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " mulxq 24(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - - /* Compute src1[1] * src2 */ - " movq 8(%0), %%rdx;" - " mulxq 0(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " adcxq 8(%2), %%r8;" - " movq %%r8, 8(%2);" - " mulxq 8(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " adcx %%rbx, %%r10;" - " movq %%r10, 16(%2);" - " mulxq 16(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " adcx %%r14, %%rbx;" - " mov $0, %%r8;" - " mulxq 24(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " adcx %%rax, %%r14;" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - " adcx %%r8, %%rax;" - - /* Compute src1[2] * src2 */ - " movq 16(%0), %%rdx;" - " mulxq 0(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " adcxq 16(%2), %%r8;" - " movq %%r8, 16(%2);" - " mulxq 8(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " adcx %%rbx, %%r10;" - " movq %%r10, 24(%2);" - " mulxq 16(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " adcx %%r14, %%rbx;" - " mov $0, %%r8;" - " mulxq 24(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " adcx %%rax, %%r14;" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - " adcx %%r8, %%rax;" - - /* Compute src1[3] * src2 */ - " movq 24(%0), %%rdx;" - " mulxq 0(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " adcxq 24(%2), %%r8;" - " movq %%r8, 24(%2);" - " mulxq 8(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " adcx %%rbx, %%r10;" - " movq %%r10, 32(%2);" - " mulxq 16(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " adcx %%r14, %%rbx;" - " movq %%rbx, 40(%2);" - " mov $0, %%r8;" - " mulxq 24(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " adcx %%rax, %%r14;" - " movq %%r14, 48(%2);" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - " adcx %%r8, %%rax;" - " movq %%rax, 56(%2);" - - /* Line up pointers */ - " mov %2, %0;" - " mov %3, %2;" - - /* Wrap the result back into the field */ - - /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ - " mov $38, %%rdx;" - " mulxq 32(%0), %%r8, %%r13;" - " xor %k1, %k1;" - " adoxq 0(%0), %%r8;" - " mulxq 40(%0), %%r9, %%rbx;" - " adcx %%r13, %%r9;" - " adoxq 8(%0), %%r9;" - " mulxq 48(%0), %%r10, %%r13;" - " adcx %%rbx, %%r10;" - " adoxq 16(%0), %%r10;" - " mulxq 56(%0), %%r11, %%rax;" - " adcx %%r13, %%r11;" - " adoxq 24(%0), %%r11;" - " adcx %1, %%rax;" - " adox %1, %%rax;" - " imul %%rdx, %%rax;" - - /* Step 2: Fold the carry back into dst */ - " add %%rax, %%r8;" - " adcx %1, %%r9;" - " movq %%r9, 8(%2);" - " adcx %1, %%r10;" - " movq %%r10, 16(%2);" - " adcx %1, %%r11;" - " movq %%r11, 24(%2);" - - /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ - " mov $0, %%rax;" - " cmovc %%rdx, %%rax;" - " add %%rax, %%r8;" - " movq %%r8, 0(%2);" - : "+&r"(f1), "+&r"(f2), "+&r"(tmp) - : "r"(out) - : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", - "%r14", "memory", "cc"); -} - -/* Computes two field multiplications: - * out[0] <- f1[0] * f2[0] - * out[1] <- f1[1] * f2[1] - * Uses the 16-element buffer tmp for intermediate results: */ -static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) -{ - asm volatile( - - /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */ - - /* Compute src1[0] * src2 */ - " movq 0(%0), %%rdx;" - " mulxq 0(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " movq %%r8, 0(%2);" - " mulxq 8(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " movq %%r10, 8(%2);" - " mulxq 16(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " mulxq 24(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - - /* Compute src1[1] * src2 */ - " movq 8(%0), %%rdx;" - " mulxq 0(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " adcxq 8(%2), %%r8;" - " movq %%r8, 8(%2);" - " mulxq 8(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " adcx %%rbx, %%r10;" - " movq %%r10, 16(%2);" - " mulxq 16(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " adcx %%r14, %%rbx;" - " mov $0, %%r8;" - " mulxq 24(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " adcx %%rax, %%r14;" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - " adcx %%r8, %%rax;" - - /* Compute src1[2] * src2 */ - " movq 16(%0), %%rdx;" - " mulxq 0(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " adcxq 16(%2), %%r8;" - " movq %%r8, 16(%2);" - " mulxq 8(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " adcx %%rbx, %%r10;" - " movq %%r10, 24(%2);" - " mulxq 16(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " adcx %%r14, %%rbx;" - " mov $0, %%r8;" - " mulxq 24(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " adcx %%rax, %%r14;" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - " adcx %%r8, %%rax;" - - /* Compute src1[3] * src2 */ - " movq 24(%0), %%rdx;" - " mulxq 0(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " adcxq 24(%2), %%r8;" - " movq %%r8, 24(%2);" - " mulxq 8(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " adcx %%rbx, %%r10;" - " movq %%r10, 32(%2);" - " mulxq 16(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " adcx %%r14, %%rbx;" - " movq %%rbx, 40(%2);" - " mov $0, %%r8;" - " mulxq 24(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " adcx %%rax, %%r14;" - " movq %%r14, 48(%2);" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - " adcx %%r8, %%rax;" - " movq %%rax, 56(%2);" - - /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */ - - /* Compute src1[0] * src2 */ - " movq 32(%0), %%rdx;" - " mulxq 32(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " movq %%r8, 64(%2);" - " mulxq 40(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " movq %%r10, 72(%2);" - " mulxq 48(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " mulxq 56(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - - /* Compute src1[1] * src2 */ - " movq 40(%0), %%rdx;" - " mulxq 32(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " adcxq 72(%2), %%r8;" - " movq %%r8, 72(%2);" - " mulxq 40(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " adcx %%rbx, %%r10;" - " movq %%r10, 80(%2);" - " mulxq 48(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " adcx %%r14, %%rbx;" - " mov $0, %%r8;" - " mulxq 56(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " adcx %%rax, %%r14;" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - " adcx %%r8, %%rax;" - - /* Compute src1[2] * src2 */ - " movq 48(%0), %%rdx;" - " mulxq 32(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " adcxq 80(%2), %%r8;" - " movq %%r8, 80(%2);" - " mulxq 40(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " adcx %%rbx, %%r10;" - " movq %%r10, 88(%2);" - " mulxq 48(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " adcx %%r14, %%rbx;" - " mov $0, %%r8;" - " mulxq 56(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " adcx %%rax, %%r14;" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - " adcx %%r8, %%rax;" - - /* Compute src1[3] * src2 */ - " movq 56(%0), %%rdx;" - " mulxq 32(%1), %%r8, %%r9;" - " xor %%r10d, %%r10d;" - " adcxq 88(%2), %%r8;" - " movq %%r8, 88(%2);" - " mulxq 40(%1), %%r10, %%r11;" - " adox %%r9, %%r10;" - " adcx %%rbx, %%r10;" - " movq %%r10, 96(%2);" - " mulxq 48(%1), %%rbx, %%r13;" - " adox %%r11, %%rbx;" - " adcx %%r14, %%rbx;" - " movq %%rbx, 104(%2);" - " mov $0, %%r8;" - " mulxq 56(%1), %%r14, %%rdx;" - " adox %%r13, %%r14;" - " adcx %%rax, %%r14;" - " movq %%r14, 112(%2);" - " mov $0, %%rax;" - " adox %%rdx, %%rax;" - " adcx %%r8, %%rax;" - " movq %%rax, 120(%2);" - - /* Line up pointers */ - " mov %2, %0;" - " mov %3, %2;" - - /* Wrap the results back into the field */ - - /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ - " mov $38, %%rdx;" - " mulxq 32(%0), %%r8, %%r13;" - " xor %k1, %k1;" - " adoxq 0(%0), %%r8;" - " mulxq 40(%0), %%r9, %%rbx;" - " adcx %%r13, %%r9;" - " adoxq 8(%0), %%r9;" - " mulxq 48(%0), %%r10, %%r13;" - " adcx %%rbx, %%r10;" - " adoxq 16(%0), %%r10;" - " mulxq 56(%0), %%r11, %%rax;" - " adcx %%r13, %%r11;" - " adoxq 24(%0), %%r11;" - " adcx %1, %%rax;" - " adox %1, %%rax;" - " imul %%rdx, %%rax;" - - /* Step 2: Fold the carry back into dst */ - " add %%rax, %%r8;" - " adcx %1, %%r9;" - " movq %%r9, 8(%2);" - " adcx %1, %%r10;" - " movq %%r10, 16(%2);" - " adcx %1, %%r11;" - " movq %%r11, 24(%2);" - - /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ - " mov $0, %%rax;" - " cmovc %%rdx, %%rax;" - " add %%rax, %%r8;" - " movq %%r8, 0(%2);" - - /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ - " mov $38, %%rdx;" - " mulxq 96(%0), %%r8, %%r13;" - " xor %k1, %k1;" - " adoxq 64(%0), %%r8;" - " mulxq 104(%0), %%r9, %%rbx;" - " adcx %%r13, %%r9;" - " adoxq 72(%0), %%r9;" - " mulxq 112(%0), %%r10, %%r13;" - " adcx %%rbx, %%r10;" - " adoxq 80(%0), %%r10;" - " mulxq 120(%0), %%r11, %%rax;" - " adcx %%r13, %%r11;" - " adoxq 88(%0), %%r11;" - " adcx %1, %%rax;" - " adox %1, %%rax;" - " imul %%rdx, %%rax;" - - /* Step 2: Fold the carry back into dst */ - " add %%rax, %%r8;" - " adcx %1, %%r9;" - " movq %%r9, 40(%2);" - " adcx %1, %%r10;" - " movq %%r10, 48(%2);" - " adcx %1, %%r11;" - " movq %%r11, 56(%2);" - - /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ - " mov $0, %%rax;" - " cmovc %%rdx, %%rax;" - " add %%rax, %%r8;" - " movq %%r8, 32(%2);" - : "+&r"(f1), "+&r"(f2), "+&r"(tmp) - : "r"(out) - : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", - "%r14", "memory", "cc"); -} - -/* Computes the field multiplication of four-element f1 with value in f2 - * Requires f2 to be smaller than 2^17 */ -static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2) -{ - register u64 f2_r asm("rdx") = f2; - - asm volatile( - /* Compute the raw multiplication of f1*f2 */ - " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */ - " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */ - " add %%rcx, %%r9;" - " mov $0, %%rcx;" - " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */ - " adcx %%rbx, %%r10;" - " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */ - " adcx %%r13, %%r11;" - " adcx %%rcx, %%rax;" - - /* Wrap the result back into the field */ - - /* Step 1: Compute carry*38 */ - " mov $38, %%rdx;" - " imul %%rdx, %%rax;" - - /* Step 2: Fold the carry back into dst */ - " add %%rax, %%r8;" - " adcx %%rcx, %%r9;" - " movq %%r9, 8(%1);" - " adcx %%rcx, %%r10;" - " movq %%r10, 16(%1);" - " adcx %%rcx, %%r11;" - " movq %%r11, 24(%1);" - - /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ - " mov $0, %%rax;" - " cmovc %%rdx, %%rax;" - " add %%rax, %%r8;" - " movq %%r8, 0(%1);" - : "+&r"(f2_r) - : "r"(out), "r"(f1) - : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13", - "memory", "cc"); -} - -/* Computes p1 <- bit ? p2 : p1 in constant time */ -static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2) -{ - asm volatile( - /* Transfer bit into CF flag */ - " add $18446744073709551615, %0;" - - /* cswap p1[0], p2[0] */ - " movq 0(%1), %%r8;" - " movq 0(%2), %%r9;" - " mov %%r8, %%r10;" - " cmovc %%r9, %%r8;" - " cmovc %%r10, %%r9;" - " movq %%r8, 0(%1);" - " movq %%r9, 0(%2);" - - /* cswap p1[1], p2[1] */ - " movq 8(%1), %%r8;" - " movq 8(%2), %%r9;" - " mov %%r8, %%r10;" - " cmovc %%r9, %%r8;" - " cmovc %%r10, %%r9;" - " movq %%r8, 8(%1);" - " movq %%r9, 8(%2);" - - /* cswap p1[2], p2[2] */ - " movq 16(%1), %%r8;" - " movq 16(%2), %%r9;" - " mov %%r8, %%r10;" - " cmovc %%r9, %%r8;" - " cmovc %%r10, %%r9;" - " movq %%r8, 16(%1);" - " movq %%r9, 16(%2);" - - /* cswap p1[3], p2[3] */ - " movq 24(%1), %%r8;" - " movq 24(%2), %%r9;" - " mov %%r8, %%r10;" - " cmovc %%r9, %%r8;" - " cmovc %%r10, %%r9;" - " movq %%r8, 24(%1);" - " movq %%r9, 24(%2);" - - /* cswap p1[4], p2[4] */ - " movq 32(%1), %%r8;" - " movq 32(%2), %%r9;" - " mov %%r8, %%r10;" - " cmovc %%r9, %%r8;" - " cmovc %%r10, %%r9;" - " movq %%r8, 32(%1);" - " movq %%r9, 32(%2);" - - /* cswap p1[5], p2[5] */ - " movq 40(%1), %%r8;" - " movq 40(%2), %%r9;" - " mov %%r8, %%r10;" - " cmovc %%r9, %%r8;" - " cmovc %%r10, %%r9;" - " movq %%r8, 40(%1);" - " movq %%r9, 40(%2);" - - /* cswap p1[6], p2[6] */ - " movq 48(%1), %%r8;" - " movq 48(%2), %%r9;" - " mov %%r8, %%r10;" - " cmovc %%r9, %%r8;" - " cmovc %%r10, %%r9;" - " movq %%r8, 48(%1);" - " movq %%r9, 48(%2);" - - /* cswap p1[7], p2[7] */ - " movq 56(%1), %%r8;" - " movq 56(%2), %%r9;" - " mov %%r8, %%r10;" - " cmovc %%r9, %%r8;" - " cmovc %%r10, %%r9;" - " movq %%r8, 56(%1);" - " movq %%r9, 56(%2);" - : "+&r"(bit) - : "r"(p1), "r"(p2) - : "%r8", "%r9", "%r10", "memory", "cc"); -} - -/* Computes the square of a field element: out <- f * f - * Uses the 8-element buffer tmp for intermediate results */ -static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) -{ - asm volatile( - /* Compute the raw multiplication: tmp <- f * f */ - - /* Step 1: Compute all partial products */ - " movq 0(%0), %%rdx;" /* f[0] */ - " mulxq 8(%0), %%r8, %%r14;" - " xor %%r15d, %%r15d;" /* f[1]*f[0] */ - " mulxq 16(%0), %%r9, %%r10;" - " adcx %%r14, %%r9;" /* f[2]*f[0] */ - " mulxq 24(%0), %%rax, %%rcx;" - " adcx %%rax, %%r10;" /* f[3]*f[0] */ - " movq 24(%0), %%rdx;" /* f[3] */ - " mulxq 8(%0), %%r11, %%rbx;" - " adcx %%rcx, %%r11;" /* f[1]*f[3] */ - " mulxq 16(%0), %%rax, %%r13;" - " adcx %%rax, %%rbx;" /* f[2]*f[3] */ - " movq 8(%0), %%rdx;" - " adcx %%r15, %%r13;" /* f1 */ - " mulxq 16(%0), %%rax, %%rcx;" - " mov $0, %%r14;" /* f[2]*f[1] */ - - /* Step 2: Compute two parallel carry chains */ - " xor %%r15d, %%r15d;" - " adox %%rax, %%r10;" - " adcx %%r8, %%r8;" - " adox %%rcx, %%r11;" - " adcx %%r9, %%r9;" - " adox %%r15, %%rbx;" - " adcx %%r10, %%r10;" - " adox %%r15, %%r13;" - " adcx %%r11, %%r11;" - " adox %%r15, %%r14;" - " adcx %%rbx, %%rbx;" - " adcx %%r13, %%r13;" - " adcx %%r14, %%r14;" - - /* Step 3: Compute intermediate squares */ - " movq 0(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ - " movq %%rax, 0(%1);" - " add %%rcx, %%r8;" - " movq %%r8, 8(%1);" - " movq 8(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ - " adcx %%rax, %%r9;" - " movq %%r9, 16(%1);" - " adcx %%rcx, %%r10;" - " movq %%r10, 24(%1);" - " movq 16(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ - " adcx %%rax, %%r11;" - " movq %%r11, 32(%1);" - " adcx %%rcx, %%rbx;" - " movq %%rbx, 40(%1);" - " movq 24(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ - " adcx %%rax, %%r13;" - " movq %%r13, 48(%1);" - " adcx %%rcx, %%r14;" - " movq %%r14, 56(%1);" - - /* Line up pointers */ - " mov %1, %0;" - " mov %2, %1;" - - /* Wrap the result back into the field */ - - /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ - " mov $38, %%rdx;" - " mulxq 32(%0), %%r8, %%r13;" - " xor %%ecx, %%ecx;" - " adoxq 0(%0), %%r8;" - " mulxq 40(%0), %%r9, %%rbx;" - " adcx %%r13, %%r9;" - " adoxq 8(%0), %%r9;" - " mulxq 48(%0), %%r10, %%r13;" - " adcx %%rbx, %%r10;" - " adoxq 16(%0), %%r10;" - " mulxq 56(%0), %%r11, %%rax;" - " adcx %%r13, %%r11;" - " adoxq 24(%0), %%r11;" - " adcx %%rcx, %%rax;" - " adox %%rcx, %%rax;" - " imul %%rdx, %%rax;" - - /* Step 2: Fold the carry back into dst */ - " add %%rax, %%r8;" - " adcx %%rcx, %%r9;" - " movq %%r9, 8(%1);" - " adcx %%rcx, %%r10;" - " movq %%r10, 16(%1);" - " adcx %%rcx, %%r11;" - " movq %%r11, 24(%1);" - - /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ - " mov $0, %%rax;" - " cmovc %%rdx, %%rax;" - " add %%rax, %%r8;" - " movq %%r8, 0(%1);" - : "+&r"(f), "+&r"(tmp) - : "r"(out) - : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", - "%r13", "%r14", "%r15", "memory", "cc"); -} - -/* Computes two field squarings: - * out[0] <- f[0] * f[0] - * out[1] <- f[1] * f[1] - * Uses the 16-element buffer tmp for intermediate results */ -static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) -{ - asm volatile( - /* Step 1: Compute all partial products */ - " movq 0(%0), %%rdx;" /* f[0] */ - " mulxq 8(%0), %%r8, %%r14;" - " xor %%r15d, %%r15d;" /* f[1]*f[0] */ - " mulxq 16(%0), %%r9, %%r10;" - " adcx %%r14, %%r9;" /* f[2]*f[0] */ - " mulxq 24(%0), %%rax, %%rcx;" - " adcx %%rax, %%r10;" /* f[3]*f[0] */ - " movq 24(%0), %%rdx;" /* f[3] */ - " mulxq 8(%0), %%r11, %%rbx;" - " adcx %%rcx, %%r11;" /* f[1]*f[3] */ - " mulxq 16(%0), %%rax, %%r13;" - " adcx %%rax, %%rbx;" /* f[2]*f[3] */ - " movq 8(%0), %%rdx;" - " adcx %%r15, %%r13;" /* f1 */ - " mulxq 16(%0), %%rax, %%rcx;" - " mov $0, %%r14;" /* f[2]*f[1] */ - - /* Step 2: Compute two parallel carry chains */ - " xor %%r15d, %%r15d;" - " adox %%rax, %%r10;" - " adcx %%r8, %%r8;" - " adox %%rcx, %%r11;" - " adcx %%r9, %%r9;" - " adox %%r15, %%rbx;" - " adcx %%r10, %%r10;" - " adox %%r15, %%r13;" - " adcx %%r11, %%r11;" - " adox %%r15, %%r14;" - " adcx %%rbx, %%rbx;" - " adcx %%r13, %%r13;" - " adcx %%r14, %%r14;" - - /* Step 3: Compute intermediate squares */ - " movq 0(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ - " movq %%rax, 0(%1);" - " add %%rcx, %%r8;" - " movq %%r8, 8(%1);" - " movq 8(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ - " adcx %%rax, %%r9;" - " movq %%r9, 16(%1);" - " adcx %%rcx, %%r10;" - " movq %%r10, 24(%1);" - " movq 16(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ - " adcx %%rax, %%r11;" - " movq %%r11, 32(%1);" - " adcx %%rcx, %%rbx;" - " movq %%rbx, 40(%1);" - " movq 24(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ - " adcx %%rax, %%r13;" - " movq %%r13, 48(%1);" - " adcx %%rcx, %%r14;" - " movq %%r14, 56(%1);" - - /* Step 1: Compute all partial products */ - " movq 32(%0), %%rdx;" /* f[0] */ - " mulxq 40(%0), %%r8, %%r14;" - " xor %%r15d, %%r15d;" /* f[1]*f[0] */ - " mulxq 48(%0), %%r9, %%r10;" - " adcx %%r14, %%r9;" /* f[2]*f[0] */ - " mulxq 56(%0), %%rax, %%rcx;" - " adcx %%rax, %%r10;" /* f[3]*f[0] */ - " movq 56(%0), %%rdx;" /* f[3] */ - " mulxq 40(%0), %%r11, %%rbx;" - " adcx %%rcx, %%r11;" /* f[1]*f[3] */ - " mulxq 48(%0), %%rax, %%r13;" - " adcx %%rax, %%rbx;" /* f[2]*f[3] */ - " movq 40(%0), %%rdx;" - " adcx %%r15, %%r13;" /* f1 */ - " mulxq 48(%0), %%rax, %%rcx;" - " mov $0, %%r14;" /* f[2]*f[1] */ - - /* Step 2: Compute two parallel carry chains */ - " xor %%r15d, %%r15d;" - " adox %%rax, %%r10;" - " adcx %%r8, %%r8;" - " adox %%rcx, %%r11;" - " adcx %%r9, %%r9;" - " adox %%r15, %%rbx;" - " adcx %%r10, %%r10;" - " adox %%r15, %%r13;" - " adcx %%r11, %%r11;" - " adox %%r15, %%r14;" - " adcx %%rbx, %%rbx;" - " adcx %%r13, %%r13;" - " adcx %%r14, %%r14;" - - /* Step 3: Compute intermediate squares */ - " movq 32(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ - " movq %%rax, 64(%1);" - " add %%rcx, %%r8;" - " movq %%r8, 72(%1);" - " movq 40(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ - " adcx %%rax, %%r9;" - " movq %%r9, 80(%1);" - " adcx %%rcx, %%r10;" - " movq %%r10, 88(%1);" - " movq 48(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ - " adcx %%rax, %%r11;" - " movq %%r11, 96(%1);" - " adcx %%rcx, %%rbx;" - " movq %%rbx, 104(%1);" - " movq 56(%0), %%rdx;" - " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ - " adcx %%rax, %%r13;" - " movq %%r13, 112(%1);" - " adcx %%rcx, %%r14;" - " movq %%r14, 120(%1);" - - /* Line up pointers */ - " mov %1, %0;" - " mov %2, %1;" - - /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ - " mov $38, %%rdx;" - " mulxq 32(%0), %%r8, %%r13;" - " xor %%ecx, %%ecx;" - " adoxq 0(%0), %%r8;" - " mulxq 40(%0), %%r9, %%rbx;" - " adcx %%r13, %%r9;" - " adoxq 8(%0), %%r9;" - " mulxq 48(%0), %%r10, %%r13;" - " adcx %%rbx, %%r10;" - " adoxq 16(%0), %%r10;" - " mulxq 56(%0), %%r11, %%rax;" - " adcx %%r13, %%r11;" - " adoxq 24(%0), %%r11;" - " adcx %%rcx, %%rax;" - " adox %%rcx, %%rax;" - " imul %%rdx, %%rax;" - - /* Step 2: Fold the carry back into dst */ - " add %%rax, %%r8;" - " adcx %%rcx, %%r9;" - " movq %%r9, 8(%1);" - " adcx %%rcx, %%r10;" - " movq %%r10, 16(%1);" - " adcx %%rcx, %%r11;" - " movq %%r11, 24(%1);" - - /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ - " mov $0, %%rax;" - " cmovc %%rdx, %%rax;" - " add %%rax, %%r8;" - " movq %%r8, 0(%1);" - - /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ - " mov $38, %%rdx;" - " mulxq 96(%0), %%r8, %%r13;" - " xor %%ecx, %%ecx;" - " adoxq 64(%0), %%r8;" - " mulxq 104(%0), %%r9, %%rbx;" - " adcx %%r13, %%r9;" - " adoxq 72(%0), %%r9;" - " mulxq 112(%0), %%r10, %%r13;" - " adcx %%rbx, %%r10;" - " adoxq 80(%0), %%r10;" - " mulxq 120(%0), %%r11, %%rax;" - " adcx %%r13, %%r11;" - " adoxq 88(%0), %%r11;" - " adcx %%rcx, %%rax;" - " adox %%rcx, %%rax;" - " imul %%rdx, %%rax;" - - /* Step 2: Fold the carry back into dst */ - " add %%rax, %%r8;" - " adcx %%rcx, %%r9;" - " movq %%r9, 40(%1);" - " adcx %%rcx, %%r10;" - " movq %%r10, 48(%1);" - " adcx %%rcx, %%r11;" - " movq %%r11, 56(%1);" - - /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ - " mov $0, %%rax;" - " cmovc %%rdx, %%rax;" - " add %%rax, %%r8;" - " movq %%r8, 32(%1);" - : "+&r"(f), "+&r"(tmp) - : "r"(out) - : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", - "%r13", "%r14", "%r15", "memory", "cc"); -} - -static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2) -{ - u64 *nq = p01_tmp1; - u64 *nq_p1 = p01_tmp1 + (u32)8U; - u64 *tmp1 = p01_tmp1 + (u32)16U; - u64 *x1 = q; - u64 *x2 = nq; - u64 *z2 = nq + (u32)4U; - u64 *z3 = nq_p1 + (u32)4U; - u64 *a = tmp1; - u64 *b = tmp1 + (u32)4U; - u64 *ab = tmp1; - u64 *dc = tmp1 + (u32)8U; - u64 *x3; - u64 *z31; - u64 *d0; - u64 *c0; - u64 *a1; - u64 *b1; - u64 *d; - u64 *c; - u64 *ab1; - u64 *dc1; - fadd(a, x2, z2); - fsub(b, x2, z2); - x3 = nq_p1; - z31 = nq_p1 + (u32)4U; - d0 = dc; - c0 = dc + (u32)4U; - fadd(c0, x3, z31); - fsub(d0, x3, z31); - fmul2(dc, dc, ab, tmp2); - fadd(x3, d0, c0); - fsub(z31, d0, c0); - a1 = tmp1; - b1 = tmp1 + (u32)4U; - d = tmp1 + (u32)8U; - c = tmp1 + (u32)12U; - ab1 = tmp1; - dc1 = tmp1 + (u32)8U; - fsqr2(dc1, ab1, tmp2); - fsqr2(nq_p1, nq_p1, tmp2); - a1[0U] = c[0U]; - a1[1U] = c[1U]; - a1[2U] = c[2U]; - a1[3U] = c[3U]; - fsub(c, d, c); - fmul_scalar(b1, c, (u64)121665U); - fadd(b1, b1, d); - fmul2(nq, dc1, ab1, tmp2); - fmul(z3, z3, x1, tmp2); -} - -static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2) -{ - u64 *x2 = nq; - u64 *z2 = nq + (u32)4U; - u64 *a = tmp1; - u64 *b = tmp1 + (u32)4U; - u64 *d = tmp1 + (u32)8U; - u64 *c = tmp1 + (u32)12U; - u64 *ab = tmp1; - u64 *dc = tmp1 + (u32)8U; - fadd(a, x2, z2); - fsub(b, x2, z2); - fsqr2(dc, ab, tmp2); - a[0U] = c[0U]; - a[1U] = c[1U]; - a[2U] = c[2U]; - a[3U] = c[3U]; - fsub(c, d, c); - fmul_scalar(b, c, (u64)121665U); - fadd(b, b, d); - fmul2(nq, dc, ab, tmp2); -} - -static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1) -{ - u64 tmp2[16U] = { 0U }; - u64 p01_tmp1_swap[33U] = { 0U }; - u64 *p0 = p01_tmp1_swap; - u64 *p01 = p01_tmp1_swap; - u64 *p03 = p01; - u64 *p11 = p01 + (u32)8U; - u64 *x0; - u64 *z0; - u64 *p01_tmp1; - u64 *p01_tmp11; - u64 *nq10; - u64 *nq_p11; - u64 *swap1; - u64 sw0; - u64 *nq1; - u64 *tmp1; - memcpy(p11, init1, (u32)8U * sizeof(init1[0U])); - x0 = p03; - z0 = p03 + (u32)4U; - x0[0U] = (u64)1U; - x0[1U] = (u64)0U; - x0[2U] = (u64)0U; - x0[3U] = (u64)0U; - z0[0U] = (u64)0U; - z0[1U] = (u64)0U; - z0[2U] = (u64)0U; - z0[3U] = (u64)0U; - p01_tmp1 = p01_tmp1_swap; - p01_tmp11 = p01_tmp1_swap; - nq10 = p01_tmp1_swap; - nq_p11 = p01_tmp1_swap + (u32)8U; - swap1 = p01_tmp1_swap + (u32)32U; - cswap2((u64)1U, nq10, nq_p11); - point_add_and_double(init1, p01_tmp11, tmp2); - swap1[0U] = (u64)1U; - { - u32 i; - for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) { - u64 *p01_tmp12 = p01_tmp1_swap; - u64 *swap2 = p01_tmp1_swap + (u32)32U; - u64 *nq2 = p01_tmp12; - u64 *nq_p12 = p01_tmp12 + (u32)8U; - u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U); - u64 sw = swap2[0U] ^ bit; - cswap2(sw, nq2, nq_p12); - point_add_and_double(init1, p01_tmp12, tmp2); - swap2[0U] = bit; - } - } - sw0 = swap1[0U]; - cswap2(sw0, nq10, nq_p11); - nq1 = p01_tmp1; - tmp1 = p01_tmp1 + (u32)16U; - point_double(nq1, tmp1, tmp2); - point_double(nq1, tmp1, tmp2); - point_double(nq1, tmp1, tmp2); - memcpy(out, p0, (u32)8U * sizeof(p0[0U])); - - memzero_explicit(tmp2, sizeof(tmp2)); - memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap)); -} - -static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1) -{ - u32 i; - fsqr(o, inp, tmp); - for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U) - fsqr(o, o, tmp); -} - -static void finv(u64 *o, const u64 *i, u64 *tmp) -{ - u64 t1[16U] = { 0U }; - u64 *a0 = t1; - u64 *b = t1 + (u32)4U; - u64 *c = t1 + (u32)8U; - u64 *t00 = t1 + (u32)12U; - u64 *tmp1 = tmp; - u64 *a; - u64 *t0; - fsquare_times(a0, i, tmp1, (u32)1U); - fsquare_times(t00, a0, tmp1, (u32)2U); - fmul(b, t00, i, tmp); - fmul(a0, b, a0, tmp); - fsquare_times(t00, a0, tmp1, (u32)1U); - fmul(b, t00, b, tmp); - fsquare_times(t00, b, tmp1, (u32)5U); - fmul(b, t00, b, tmp); - fsquare_times(t00, b, tmp1, (u32)10U); - fmul(c, t00, b, tmp); - fsquare_times(t00, c, tmp1, (u32)20U); - fmul(t00, t00, c, tmp); - fsquare_times(t00, t00, tmp1, (u32)10U); - fmul(b, t00, b, tmp); - fsquare_times(t00, b, tmp1, (u32)50U); - fmul(c, t00, b, tmp); - fsquare_times(t00, c, tmp1, (u32)100U); - fmul(t00, t00, c, tmp); - fsquare_times(t00, t00, tmp1, (u32)50U); - fmul(t00, t00, b, tmp); - fsquare_times(t00, t00, tmp1, (u32)5U); - a = t1; - t0 = t1 + (u32)12U; - fmul(o, t0, a, tmp); -} - -static void store_felem(u64 *b, u64 *f) -{ - u64 f30 = f[3U]; - u64 top_bit0 = f30 >> (u32)63U; - u64 f31; - u64 top_bit; - u64 f0; - u64 f1; - u64 f2; - u64 f3; - u64 m0; - u64 m1; - u64 m2; - u64 m3; - u64 mask; - u64 f0_; - u64 f1_; - u64 f2_; - u64 f3_; - u64 o0; - u64 o1; - u64 o2; - u64 o3; - f[3U] = f30 & (u64)0x7fffffffffffffffU; - add_scalar(f, f, (u64)19U * top_bit0); - f31 = f[3U]; - top_bit = f31 >> (u32)63U; - f[3U] = f31 & (u64)0x7fffffffffffffffU; - add_scalar(f, f, (u64)19U * top_bit); - f0 = f[0U]; - f1 = f[1U]; - f2 = f[2U]; - f3 = f[3U]; - m0 = gte_mask(f0, (u64)0xffffffffffffffedU); - m1 = eq_mask(f1, (u64)0xffffffffffffffffU); - m2 = eq_mask(f2, (u64)0xffffffffffffffffU); - m3 = eq_mask(f3, (u64)0x7fffffffffffffffU); - mask = ((m0 & m1) & m2) & m3; - f0_ = f0 - (mask & (u64)0xffffffffffffffedU); - f1_ = f1 - (mask & (u64)0xffffffffffffffffU); - f2_ = f2 - (mask & (u64)0xffffffffffffffffU); - f3_ = f3 - (mask & (u64)0x7fffffffffffffffU); - o0 = f0_; - o1 = f1_; - o2 = f2_; - o3 = f3_; - b[0U] = o0; - b[1U] = o1; - b[2U] = o2; - b[3U] = o3; -} - -static void encode_point(u8 *o, const u64 *i) -{ - const u64 *x = i; - const u64 *z = i + (u32)4U; - u64 tmp[4U] = { 0U }; - u64 tmp_w[16U] = { 0U }; - finv(tmp, z, tmp_w); - fmul(tmp, tmp, x, tmp_w); - store_felem((u64 *)o, tmp); -} - -static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub) -{ - u64 init1[8U] = { 0U }; - u64 tmp[4U] = { 0U }; - u64 tmp3; - u64 *x; - u64 *z; - { - u32 i; - for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) { - u64 *os = tmp; - const u8 *bj = pub + i * (u32)8U; - u64 u = *(u64 *)bj; - u64 r = u; - u64 x0 = r; - os[i] = x0; - } - } - tmp3 = tmp[3U]; - tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU; - x = init1; - z = init1 + (u32)4U; - z[0U] = (u64)1U; - z[1U] = (u64)0U; - z[2U] = (u64)0U; - z[3U] = (u64)0U; - x[0U] = tmp[0U]; - x[1U] = tmp[1U]; - x[2U] = tmp[2U]; - x[3U] = tmp[3U]; - montgomery_ladder(init1, priv, init1); - encode_point(out, init1); -} - -/* The below constants were generated using this sage script: - * - * #!/usr/bin/env sage - * import sys - * from sage.all import * - * def limbs(n): - * n = int(n) - * l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64) - * return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l - * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0]) - * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0] - * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s)) - * print("static const u64 table_ladder[] = {") - * p = ec.lift_x(9) - * for i in range(252): - * l = (p[0] + p[2]) / (p[0] - p[2]) - * print(("\t%s" + ("," if i != 251 else "")) % limbs(l)) - * p = p * 2 - * print("};") - * - */ - -static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL }; - -static const u64 table_ladder[] = { - 0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL, - 0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL, - 0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL, - 0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL, - 0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL, - 0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL, - 0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL, - 0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL, - 0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL, - 0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL, - 0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL, - 0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL, - 0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL, - 0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL, - 0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL, - 0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL, - 0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL, - 0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL, - 0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL, - 0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL, - 0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL, - 0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL, - 0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL, - 0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL, - 0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL, - 0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL, - 0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL, - 0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL, - 0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL, - 0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL, - 0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL, - 0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL, - 0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL, - 0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL, - 0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL, - 0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL, - 0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL, - 0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL, - 0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL, - 0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL, - 0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL, - 0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL, - 0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL, - 0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL, - 0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL, - 0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL, - 0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL, - 0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL, - 0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL, - 0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL, - 0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL, - 0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL, - 0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL, - 0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL, - 0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL, - 0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL, - 0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL, - 0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL, - 0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL, - 0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL, - 0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL, - 0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL, - 0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL, - 0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL, - 0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL, - 0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL, - 0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL, - 0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL, - 0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL, - 0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL, - 0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL, - 0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL, - 0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL, - 0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL, - 0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL, - 0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL, - 0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL, - 0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL, - 0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL, - 0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL, - 0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL, - 0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL, - 0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL, - 0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL, - 0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL, - 0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL, - 0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL, - 0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL, - 0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL, - 0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL, - 0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL, - 0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL, - 0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL, - 0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL, - 0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL, - 0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL, - 0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL, - 0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL, - 0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL, - 0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL, - 0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL, - 0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL, - 0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL, - 0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL, - 0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL, - 0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL, - 0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL, - 0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL, - 0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL, - 0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL, - 0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL, - 0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL, - 0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL, - 0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL, - 0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL, - 0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL, - 0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL, - 0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL, - 0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL, - 0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL, - 0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL, - 0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL, - 0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL, - 0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL, - 0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL, - 0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL, - 0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL, - 0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL, - 0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL, - 0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL, - 0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL, - 0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL, - 0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL, - 0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL, - 0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL, - 0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL, - 0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL, - 0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL, - 0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL, - 0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL, - 0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL, - 0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL, - 0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL, - 0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL, - 0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL, - 0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL, - 0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL, - 0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL, - 0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL, - 0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL, - 0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL, - 0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL, - 0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL, - 0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL, - 0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL, - 0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL, - 0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL, - 0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL, - 0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL, - 0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL, - 0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL, - 0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL, - 0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL, - 0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL, - 0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL, - 0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL, - 0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL, - 0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL, - 0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL, - 0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL, - 0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL, - 0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL, - 0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL, - 0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL, - 0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL, - 0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL, - 0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL, - 0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL, - 0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL, - 0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL, - 0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL, - 0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL, - 0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL, - 0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL, - 0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL, - 0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL, - 0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL, - 0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL, - 0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL, - 0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL, - 0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL, - 0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL, - 0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL, - 0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL, - 0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL, - 0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL, - 0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL, - 0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL, - 0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL, - 0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL, - 0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL, - 0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL, - 0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL, - 0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL, - 0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL, - 0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL, - 0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL, - 0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL, - 0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL, - 0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL, - 0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL, - 0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL, - 0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL, - 0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL, - 0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL, - 0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL, - 0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL, - 0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL, - 0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL, - 0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL, - 0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL, - 0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL, - 0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL, - 0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL, - 0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL, - 0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL, - 0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL, - 0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL, - 0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL, - 0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL, - 0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL, - 0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL, - 0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL, - 0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL, - 0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL, - 0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL, - 0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL, - 0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL, - 0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL, - 0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL, - 0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL, - 0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL, - 0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL, - 0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL, - 0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL, - 0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL, - 0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL, - 0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL, - 0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL, - 0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL, - 0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL, - 0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL -}; - -static void curve25519_ever64_base(u8 *out, const u8 *priv) -{ - u64 swap = 1; - int i, j, k; - u64 tmp[16 + 32 + 4]; - u64 *x1 = &tmp[0]; - u64 *z1 = &tmp[4]; - u64 *x2 = &tmp[8]; - u64 *z2 = &tmp[12]; - u64 *xz1 = &tmp[0]; - u64 *xz2 = &tmp[8]; - u64 *a = &tmp[0 + 16]; - u64 *b = &tmp[4 + 16]; - u64 *c = &tmp[8 + 16]; - u64 *ab = &tmp[0 + 16]; - u64 *abcd = &tmp[0 + 16]; - u64 *ef = &tmp[16 + 16]; - u64 *efgh = &tmp[16 + 16]; - u64 *key = &tmp[0 + 16 + 32]; - - memcpy(key, priv, 32); - ((u8 *)key)[0] &= 248; - ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64; - - x1[0] = 1, x1[1] = x1[2] = x1[3] = 0; - z1[0] = 1, z1[1] = z1[2] = z1[3] = 0; - z2[0] = 1, z2[1] = z2[2] = z2[3] = 0; - memcpy(x2, p_minus_s, sizeof(p_minus_s)); - - j = 3; - for (i = 0; i < 4; ++i) { - while (j < (const int[]){ 64, 64, 64, 63 }[i]) { - u64 bit = (key[i] >> j) & 1; - k = (64 * i + j - 3); - swap = swap ^ bit; - cswap2(swap, xz1, xz2); - swap = bit; - fsub(b, x1, z1); - fadd(a, x1, z1); - fmul(c, &table_ladder[4 * k], b, ef); - fsub(b, a, c); - fadd(a, a, c); - fsqr2(ab, ab, efgh); - fmul2(xz1, xz2, ab, efgh); - ++j; - } - j = 0; - } - - point_double(xz1, abcd, efgh); - point_double(xz1, abcd, efgh); - point_double(xz1, abcd, efgh); - encode_point(out, xz1); - - memzero_explicit(tmp, sizeof(tmp)); -} - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx); - -void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]) -{ - if (static_branch_likely(&curve25519_use_bmi2_adx)) - curve25519_ever64(mypublic, secret, basepoint); - else - curve25519_generic(mypublic, secret, basepoint); -} -EXPORT_SYMBOL(curve25519_arch); - -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) -{ - if (static_branch_likely(&curve25519_use_bmi2_adx)) - curve25519_ever64_base(pub, secret); - else - curve25519_generic(pub, secret, curve25519_base_point); -} -EXPORT_SYMBOL(curve25519_base_arch); - -static int __init curve25519_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) - static_branch_enable(&curve25519_use_bmi2_adx); - return 0; -} - -static void __exit curve25519_mod_exit(void) -{ -} - -module_init(curve25519_mod_init); -module_exit(curve25519_mod_exit); - -MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized"); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Jason A. Donenfeld "); diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h index 78aa5f28c847..db63a5577c00 100644 --- a/include/crypto/curve25519.h +++ b/include/crypto/curve25519.h @@ -13,20 +13,10 @@ enum curve25519_lengths { CURVE25519_KEY_SIZE = 32 }; -extern const u8 curve25519_null_point[]; -extern const u8 curve25519_base_point[]; - void curve25519_generic(u8 out[CURVE25519_KEY_SIZE], const u8 scalar[CURVE25519_KEY_SIZE], const u8 point[CURVE25519_KEY_SIZE]); -void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], - const u8 scalar[CURVE25519_KEY_SIZE], - const u8 point[CURVE25519_KEY_SIZE]); - -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]); - bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], const u8 basepoint[CURVE25519_KEY_SIZE]); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 37d85e0c9b97..eea17e36a22b 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -54,35 +54,24 @@ config CRYPTO_LIB_CHACHA_ARCH default y if S390 default y if X86_64 -config CRYPTO_ARCH_HAVE_LIB_CURVE25519 - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the Curve25519 library interface, - either builtin or as a module. - -config CRYPTO_LIB_CURVE25519_GENERIC +config CRYPTO_LIB_CURVE25519 tristate select CRYPTO_LIB_UTILS help - This symbol can be depended upon by arch implementations of the - Curve25519 library interface that require the generic code as a - fallback, e.g., for SIMD implementations. If no arch specific - implementation is enabled, this implementation serves the users - of CRYPTO_LIB_CURVE25519. + The Curve25519 library functions. Select this if your module uses any + of the functions from . -config CRYPTO_LIB_CURVE25519_INTERNAL - tristate - select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n +config CRYPTO_LIB_CURVE25519_ARCH + bool + depends on CRYPTO_LIB_CURVE25519 && !UML && !KMSAN + default y if ARM && KERNEL_MODE_NEON + default y if PPC64 && CPU_LITTLE_ENDIAN + default y if X86_64 -config CRYPTO_LIB_CURVE25519 - tristate - select CRYPTO - select CRYPTO_LIB_CURVE25519_INTERNAL - help - Enable the Curve25519 library interface. This interface may be - fulfilled by either the generic implementation or an arch-specific - one, if one is available and enabled. +config CRYPTO_LIB_CURVE25519_GENERIC + bool + depends on CRYPTO_LIB_CURVE25519 + default y if !CRYPTO_LIB_CURVE25519_ARCH || ARM || X86_64 config CRYPTO_LIB_DES tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 6c3be971ace0..bded351aeace 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -76,17 +76,31 @@ obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o libchacha20poly1305-y += chacha20poly1305.o libchacha20poly1305-$(CONFIG_CRYPTO_SELFTESTS) += chacha20poly1305-selftest.o -obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519-generic.o -libcurve25519-generic-y := curve25519-fiat32.o -libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o -libcurve25519-generic-y += curve25519-generic.o +################################################################################ + +obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o +libcurve25519-y := curve25519.o + +# Disable GCOV in odd or sensitive code +GCOV_PROFILE_curve25519.o := n + +ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y) +libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-hacl64.o +else +libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-fiat32.o +endif # clang versions prior to 18 may blow out the stack with KASAN ifeq ($(call clang-min-version, 180000),) KASAN_SANITIZE_curve25519-hacl64.o := n endif -obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o -libcurve25519-y += curve25519.o +ifeq ($(CONFIG_CRYPTO_LIB_CURVE25519_ARCH),y) +CFLAGS_curve25519.o += -I$(src)/$(SRCARCH) +libcurve25519-$(CONFIG_ARM) += arm/curve25519-core.o +libcurve25519-$(CONFIG_PPC) += powerpc/curve25519-ppc64le_asm.o +endif + +################################################################################ obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o libdes-y := des.o diff --git a/lib/crypto/arm/curve25519-core.S b/lib/crypto/arm/curve25519-core.S new file mode 100644 index 000000000000..b697fa5d059a --- /dev/null +++ b/lib/crypto/arm/curve25519-core.S @@ -0,0 +1,2062 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + * + * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This + * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been + * manually reworked for use in kernel space. + */ + +#include + +.text +.arch armv7-a +.fpu neon +.align 4 + +ENTRY(curve25519_neon) + push {r4-r11, lr} + mov ip, sp + sub r3, sp, #704 + and r3, r3, #0xfffffff0 + mov sp, r3 + movw r4, #0 + movw r5, #254 + vmov.i32 q0, #1 + vshr.u64 q1, q0, #7 + vshr.u64 q0, q0, #8 + vmov.i32 d4, #19 + vmov.i32 d5, #38 + add r6, sp, #480 + vst1.8 {d2-d3}, [r6, : 128]! + vst1.8 {d0-d1}, [r6, : 128]! + vst1.8 {d4-d5}, [r6, : 128] + add r6, r3, #0 + vmov.i32 q2, #0 + vst1.8 {d4-d5}, [r6, : 128]! + vst1.8 {d4-d5}, [r6, : 128]! + vst1.8 d4, [r6, : 64] + add r6, r3, #0 + movw r7, #960 + sub r7, r7, #2 + neg r7, r7 + sub r7, r7, r7, LSL #7 + str r7, [r6] + add r6, sp, #672 + vld1.8 {d4-d5}, [r1]! + vld1.8 {d6-d7}, [r1] + vst1.8 {d4-d5}, [r6, : 128]! + vst1.8 {d6-d7}, [r6, : 128] + sub r1, r6, #16 + ldrb r6, [r1] + and r6, r6, #248 + strb r6, [r1] + ldrb r6, [r1, #31] + and r6, r6, #127 + orr r6, r6, #64 + strb r6, [r1, #31] + vmov.i64 q2, #0xffffffff + vshr.u64 q3, q2, #7 + vshr.u64 q2, q2, #6 + vld1.8 {d8}, [r2] + vld1.8 {d10}, [r2] + add r2, r2, #6 + vld1.8 {d12}, [r2] + vld1.8 {d14}, [r2] + add r2, r2, #6 + vld1.8 {d16}, [r2] + add r2, r2, #4 + vld1.8 {d18}, [r2] + vld1.8 {d20}, [r2] + add r2, r2, #6 + vld1.8 {d22}, [r2] + add r2, r2, #2 + vld1.8 {d24}, [r2] + vld1.8 {d26}, [r2] + vshr.u64 q5, q5, #26 + vshr.u64 q6, q6, #3 + vshr.u64 q7, q7, #29 + vshr.u64 q8, q8, #6 + vshr.u64 q10, q10, #25 + vshr.u64 q11, q11, #3 + vshr.u64 q12, q12, #12 + vshr.u64 q13, q13, #38 + vand q4, q4, q2 + vand q6, q6, q2 + vand q8, q8, q2 + vand q10, q10, q2 + vand q2, q12, q2 + vand q5, q5, q3 + vand q7, q7, q3 + vand q9, q9, q3 + vand q11, q11, q3 + vand q3, q13, q3 + add r2, r3, #48 + vadd.i64 q12, q4, q1 + vadd.i64 q13, q10, q1 + vshr.s64 q12, q12, #26 + vshr.s64 q13, q13, #26 + vadd.i64 q5, q5, q12 + vshl.i64 q12, q12, #26 + vadd.i64 q14, q5, q0 + vadd.i64 q11, q11, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q15, q11, q0 + vsub.i64 q4, q4, q12 + vshr.s64 q12, q14, #25 + vsub.i64 q10, q10, q13 + vshr.s64 q13, q15, #25 + vadd.i64 q6, q6, q12 + vshl.i64 q12, q12, #25 + vadd.i64 q14, q6, q1 + vadd.i64 q2, q2, q13 + vsub.i64 q5, q5, q12 + vshr.s64 q12, q14, #26 + vshl.i64 q13, q13, #25 + vadd.i64 q14, q2, q1 + vadd.i64 q7, q7, q12 + vshl.i64 q12, q12, #26 + vadd.i64 q15, q7, q0 + vsub.i64 q11, q11, q13 + vshr.s64 q13, q14, #26 + vsub.i64 q6, q6, q12 + vshr.s64 q12, q15, #25 + vadd.i64 q3, q3, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q14, q3, q0 + vadd.i64 q8, q8, q12 + vshl.i64 q12, q12, #25 + vadd.i64 q15, q8, q1 + add r2, r2, #8 + vsub.i64 q2, q2, q13 + vshr.s64 q13, q14, #25 + vsub.i64 q7, q7, q12 + vshr.s64 q12, q15, #26 + vadd.i64 q14, q13, q13 + vadd.i64 q9, q9, q12 + vtrn.32 d12, d14 + vshl.i64 q12, q12, #26 + vtrn.32 d13, d15 + vadd.i64 q0, q9, q0 + vadd.i64 q4, q4, q14 + vst1.8 d12, [r2, : 64]! + vshl.i64 q6, q13, #4 + vsub.i64 q7, q8, q12 + vshr.s64 q0, q0, #25 + vadd.i64 q4, q4, q6 + vadd.i64 q6, q10, q0 + vshl.i64 q0, q0, #25 + vadd.i64 q8, q6, q1 + vadd.i64 q4, q4, q13 + vshl.i64 q10, q13, #25 + vadd.i64 q1, q4, q1 + vsub.i64 q0, q9, q0 + vshr.s64 q8, q8, #26 + vsub.i64 q3, q3, q10 + vtrn.32 d14, d0 + vshr.s64 q1, q1, #26 + vtrn.32 d15, d1 + vadd.i64 q0, q11, q8 + vst1.8 d14, [r2, : 64] + vshl.i64 q7, q8, #26 + vadd.i64 q5, q5, q1 + vtrn.32 d4, d6 + vshl.i64 q1, q1, #26 + vtrn.32 d5, d7 + vsub.i64 q3, q6, q7 + add r2, r2, #16 + vsub.i64 q1, q4, q1 + vst1.8 d4, [r2, : 64] + vtrn.32 d6, d0 + vtrn.32 d7, d1 + sub r2, r2, #8 + vtrn.32 d2, d10 + vtrn.32 d3, d11 + vst1.8 d6, [r2, : 64] + sub r2, r2, #24 + vst1.8 d2, [r2, : 64] + add r2, r3, #96 + vmov.i32 q0, #0 + vmov.i64 d2, #0xff + vmov.i64 d3, #0 + vshr.u32 q1, q1, #7 + vst1.8 {d2-d3}, [r2, : 128]! + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 d0, [r2, : 64] + add r2, r3, #144 + vmov.i32 q0, #0 + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 d0, [r2, : 64] + add r2, r3, #240 + vmov.i32 q0, #0 + vmov.i64 d2, #0xff + vmov.i64 d3, #0 + vshr.u32 q1, q1, #7 + vst1.8 {d2-d3}, [r2, : 128]! + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 d0, [r2, : 64] + add r2, r3, #48 + add r6, r3, #192 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d4}, [r2, : 64] + vst1.8 {d0-d1}, [r6, : 128]! + vst1.8 {d2-d3}, [r6, : 128]! + vst1.8 d4, [r6, : 64] +.Lmainloop: + mov r2, r5, LSR #3 + and r6, r5, #7 + ldrb r2, [r1, r2] + mov r2, r2, LSR r6 + and r2, r2, #1 + str r5, [sp, #456] + eor r4, r4, r2 + str r2, [sp, #460] + neg r2, r4 + add r4, r3, #96 + add r5, r3, #192 + add r6, r3, #144 + vld1.8 {d8-d9}, [r4, : 128]! + add r7, r3, #240 + vld1.8 {d10-d11}, [r5, : 128]! + veor q6, q4, q5 + vld1.8 {d14-d15}, [r6, : 128]! + vdup.i32 q8, r2 + vld1.8 {d18-d19}, [r7, : 128]! + veor q10, q7, q9 + vld1.8 {d22-d23}, [r4, : 128]! + vand q6, q6, q8 + vld1.8 {d24-d25}, [r5, : 128]! + vand q10, q10, q8 + vld1.8 {d26-d27}, [r6, : 128]! + veor q4, q4, q6 + vld1.8 {d28-d29}, [r7, : 128]! + veor q5, q5, q6 + vld1.8 {d0}, [r4, : 64] + veor q6, q7, q10 + vld1.8 {d2}, [r5, : 64] + veor q7, q9, q10 + vld1.8 {d4}, [r6, : 64] + veor q9, q11, q12 + vld1.8 {d6}, [r7, : 64] + veor q10, q0, q1 + sub r2, r4, #32 + vand q9, q9, q8 + sub r4, r5, #32 + vand q10, q10, q8 + sub r5, r6, #32 + veor q11, q11, q9 + sub r6, r7, #32 + veor q0, q0, q10 + veor q9, q12, q9 + veor q1, q1, q10 + veor q10, q13, q14 + veor q12, q2, q3 + vand q10, q10, q8 + vand q8, q12, q8 + veor q12, q13, q10 + veor q2, q2, q8 + veor q10, q14, q10 + veor q3, q3, q8 + vadd.i32 q8, q4, q6 + vsub.i32 q4, q4, q6 + vst1.8 {d16-d17}, [r2, : 128]! + vadd.i32 q6, q11, q12 + vst1.8 {d8-d9}, [r5, : 128]! + vsub.i32 q4, q11, q12 + vst1.8 {d12-d13}, [r2, : 128]! + vadd.i32 q6, q0, q2 + vst1.8 {d8-d9}, [r5, : 128]! + vsub.i32 q0, q0, q2 + vst1.8 d12, [r2, : 64] + vadd.i32 q2, q5, q7 + vst1.8 d0, [r5, : 64] + vsub.i32 q0, q5, q7 + vst1.8 {d4-d5}, [r4, : 128]! + vadd.i32 q2, q9, q10 + vst1.8 {d0-d1}, [r6, : 128]! + vsub.i32 q0, q9, q10 + vst1.8 {d4-d5}, [r4, : 128]! + vadd.i32 q2, q1, q3 + vst1.8 {d0-d1}, [r6, : 128]! + vsub.i32 q0, q1, q3 + vst1.8 d4, [r4, : 64] + vst1.8 d0, [r6, : 64] + add r2, sp, #512 + add r4, r3, #96 + add r5, r3, #144 + vld1.8 {d0-d1}, [r2, : 128] + vld1.8 {d2-d3}, [r4, : 128]! + vld1.8 {d4-d5}, [r5, : 128]! + vzip.i32 q1, q2 + vld1.8 {d6-d7}, [r4, : 128]! + vld1.8 {d8-d9}, [r5, : 128]! + vshl.i32 q5, q1, #1 + vzip.i32 q3, q4 + vshl.i32 q6, q2, #1 + vld1.8 {d14}, [r4, : 64] + vshl.i32 q8, q3, #1 + vld1.8 {d15}, [r5, : 64] + vshl.i32 q9, q4, #1 + vmul.i32 d21, d7, d1 + vtrn.32 d14, d15 + vmul.i32 q11, q4, q0 + vmul.i32 q0, q7, q0 + vmull.s32 q12, d2, d2 + vmlal.s32 q12, d11, d1 + vmlal.s32 q12, d12, d0 + vmlal.s32 q12, d13, d23 + vmlal.s32 q12, d16, d22 + vmlal.s32 q12, d7, d21 + vmull.s32 q10, d2, d11 + vmlal.s32 q10, d4, d1 + vmlal.s32 q10, d13, d0 + vmlal.s32 q10, d6, d23 + vmlal.s32 q10, d17, d22 + vmull.s32 q13, d10, d4 + vmlal.s32 q13, d11, d3 + vmlal.s32 q13, d13, d1 + vmlal.s32 q13, d16, d0 + vmlal.s32 q13, d17, d23 + vmlal.s32 q13, d8, d22 + vmull.s32 q1, d10, d5 + vmlal.s32 q1, d11, d4 + vmlal.s32 q1, d6, d1 + vmlal.s32 q1, d17, d0 + vmlal.s32 q1, d8, d23 + vmull.s32 q14, d10, d6 + vmlal.s32 q14, d11, d13 + vmlal.s32 q14, d4, d4 + vmlal.s32 q14, d17, d1 + vmlal.s32 q14, d18, d0 + vmlal.s32 q14, d9, d23 + vmull.s32 q11, d10, d7 + vmlal.s32 q11, d11, d6 + vmlal.s32 q11, d12, d5 + vmlal.s32 q11, d8, d1 + vmlal.s32 q11, d19, d0 + vmull.s32 q15, d10, d8 + vmlal.s32 q15, d11, d17 + vmlal.s32 q15, d12, d6 + vmlal.s32 q15, d13, d5 + vmlal.s32 q15, d19, d1 + vmlal.s32 q15, d14, d0 + vmull.s32 q2, d10, d9 + vmlal.s32 q2, d11, d8 + vmlal.s32 q2, d12, d7 + vmlal.s32 q2, d13, d6 + vmlal.s32 q2, d14, d1 + vmull.s32 q0, d15, d1 + vmlal.s32 q0, d10, d14 + vmlal.s32 q0, d11, d19 + vmlal.s32 q0, d12, d8 + vmlal.s32 q0, d13, d17 + vmlal.s32 q0, d6, d6 + add r2, sp, #480 + vld1.8 {d18-d19}, [r2, : 128]! + vmull.s32 q3, d16, d7 + vmlal.s32 q3, d10, d15 + vmlal.s32 q3, d11, d14 + vmlal.s32 q3, d12, d9 + vmlal.s32 q3, d13, d8 + vld1.8 {d8-d9}, [r2, : 128] + vadd.i64 q5, q12, q9 + vadd.i64 q6, q15, q9 + vshr.s64 q5, q5, #26 + vshr.s64 q6, q6, #26 + vadd.i64 q7, q10, q5 + vshl.i64 q5, q5, #26 + vadd.i64 q8, q7, q4 + vadd.i64 q2, q2, q6 + vshl.i64 q6, q6, #26 + vadd.i64 q10, q2, q4 + vsub.i64 q5, q12, q5 + vshr.s64 q8, q8, #25 + vsub.i64 q6, q15, q6 + vshr.s64 q10, q10, #25 + vadd.i64 q12, q13, q8 + vshl.i64 q8, q8, #25 + vadd.i64 q13, q12, q9 + vadd.i64 q0, q0, q10 + vsub.i64 q7, q7, q8 + vshr.s64 q8, q13, #26 + vshl.i64 q10, q10, #25 + vadd.i64 q13, q0, q9 + vadd.i64 q1, q1, q8 + vshl.i64 q8, q8, #26 + vadd.i64 q15, q1, q4 + vsub.i64 q2, q2, q10 + vshr.s64 q10, q13, #26 + vsub.i64 q8, q12, q8 + vshr.s64 q12, q15, #25 + vadd.i64 q3, q3, q10 + vshl.i64 q10, q10, #26 + vadd.i64 q13, q3, q4 + vadd.i64 q14, q14, q12 + add r2, r3, #288 + vshl.i64 q12, q12, #25 + add r4, r3, #336 + vadd.i64 q15, q14, q9 + add r2, r2, #8 + vsub.i64 q0, q0, q10 + add r4, r4, #8 + vshr.s64 q10, q13, #25 + vsub.i64 q1, q1, q12 + vshr.s64 q12, q15, #26 + vadd.i64 q13, q10, q10 + vadd.i64 q11, q11, q12 + vtrn.32 d16, d2 + vshl.i64 q12, q12, #26 + vtrn.32 d17, d3 + vadd.i64 q1, q11, q4 + vadd.i64 q4, q5, q13 + vst1.8 d16, [r2, : 64]! + vshl.i64 q5, q10, #4 + vst1.8 d17, [r4, : 64]! + vsub.i64 q8, q14, q12 + vshr.s64 q1, q1, #25 + vadd.i64 q4, q4, q5 + vadd.i64 q5, q6, q1 + vshl.i64 q1, q1, #25 + vadd.i64 q6, q5, q9 + vadd.i64 q4, q4, q10 + vshl.i64 q10, q10, #25 + vadd.i64 q9, q4, q9 + vsub.i64 q1, q11, q1 + vshr.s64 q6, q6, #26 + vsub.i64 q3, q3, q10 + vtrn.32 d16, d2 + vshr.s64 q9, q9, #26 + vtrn.32 d17, d3 + vadd.i64 q1, q2, q6 + vst1.8 d16, [r2, : 64] + vshl.i64 q2, q6, #26 + vst1.8 d17, [r4, : 64] + vadd.i64 q6, q7, q9 + vtrn.32 d0, d6 + vshl.i64 q7, q9, #26 + vtrn.32 d1, d7 + vsub.i64 q2, q5, q2 + add r2, r2, #16 + vsub.i64 q3, q4, q7 + vst1.8 d0, [r2, : 64] + add r4, r4, #16 + vst1.8 d1, [r4, : 64] + vtrn.32 d4, d2 + vtrn.32 d5, d3 + sub r2, r2, #8 + sub r4, r4, #8 + vtrn.32 d6, d12 + vtrn.32 d7, d13 + vst1.8 d4, [r2, : 64] + vst1.8 d5, [r4, : 64] + sub r2, r2, #24 + sub r4, r4, #24 + vst1.8 d6, [r2, : 64] + vst1.8 d7, [r4, : 64] + add r2, r3, #240 + add r4, r3, #96 + vld1.8 {d0-d1}, [r4, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vld1.8 {d4}, [r4, : 64] + add r4, r3, #144 + vld1.8 {d6-d7}, [r4, : 128]! + vtrn.32 q0, q3 + vld1.8 {d8-d9}, [r4, : 128]! + vshl.i32 q5, q0, #4 + vtrn.32 q1, q4 + vshl.i32 q6, q3, #4 + vadd.i32 q5, q5, q0 + vadd.i32 q6, q6, q3 + vshl.i32 q7, q1, #4 + vld1.8 {d5}, [r4, : 64] + vshl.i32 q8, q4, #4 + vtrn.32 d4, d5 + vadd.i32 q7, q7, q1 + vadd.i32 q8, q8, q4 + vld1.8 {d18-d19}, [r2, : 128]! + vshl.i32 q10, q2, #4 + vld1.8 {d22-d23}, [r2, : 128]! + vadd.i32 q10, q10, q2 + vld1.8 {d24}, [r2, : 64] + vadd.i32 q5, q5, q0 + add r2, r3, #192 + vld1.8 {d26-d27}, [r2, : 128]! + vadd.i32 q6, q6, q3 + vld1.8 {d28-d29}, [r2, : 128]! + vadd.i32 q8, q8, q4 + vld1.8 {d25}, [r2, : 64] + vadd.i32 q10, q10, q2 + vtrn.32 q9, q13 + vadd.i32 q7, q7, q1 + vadd.i32 q5, q5, q0 + vtrn.32 q11, q14 + vadd.i32 q6, q6, q3 + add r2, sp, #528 + vadd.i32 q10, q10, q2 + vtrn.32 d24, d25 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q6, q13, #1 + vst1.8 {d20-d21}, [r2, : 128]! + vshl.i32 q10, q14, #1 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q15, q12, #1 + vadd.i32 q8, q8, q4 + vext.32 d10, d31, d30, #0 + vadd.i32 q7, q7, q1 + vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q8, d18, d5 + vmlal.s32 q8, d26, d4 + vmlal.s32 q8, d19, d9 + vmlal.s32 q8, d27, d3 + vmlal.s32 q8, d22, d8 + vmlal.s32 q8, d28, d2 + vmlal.s32 q8, d23, d7 + vmlal.s32 q8, d29, d1 + vmlal.s32 q8, d24, d6 + vmlal.s32 q8, d25, d0 + vst1.8 {d14-d15}, [r2, : 128]! + vmull.s32 q2, d18, d4 + vmlal.s32 q2, d12, d9 + vmlal.s32 q2, d13, d8 + vmlal.s32 q2, d19, d3 + vmlal.s32 q2, d22, d2 + vmlal.s32 q2, d23, d1 + vmlal.s32 q2, d24, d0 + vst1.8 {d20-d21}, [r2, : 128]! + vmull.s32 q7, d18, d9 + vmlal.s32 q7, d26, d3 + vmlal.s32 q7, d19, d8 + vmlal.s32 q7, d27, d2 + vmlal.s32 q7, d22, d7 + vmlal.s32 q7, d28, d1 + vmlal.s32 q7, d23, d6 + vmlal.s32 q7, d29, d0 + vst1.8 {d10-d11}, [r2, : 128]! + vmull.s32 q5, d18, d3 + vmlal.s32 q5, d19, d2 + vmlal.s32 q5, d22, d1 + vmlal.s32 q5, d23, d0 + vmlal.s32 q5, d12, d8 + vst1.8 {d16-d17}, [r2, : 128] + vmull.s32 q4, d18, d8 + vmlal.s32 q4, d26, d2 + vmlal.s32 q4, d19, d7 + vmlal.s32 q4, d27, d1 + vmlal.s32 q4, d22, d6 + vmlal.s32 q4, d28, d0 + vmull.s32 q8, d18, d7 + vmlal.s32 q8, d26, d1 + vmlal.s32 q8, d19, d6 + vmlal.s32 q8, d27, d0 + add r2, sp, #544 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q7, d24, d21 + vmlal.s32 q7, d25, d20 + vmlal.s32 q4, d23, d21 + vmlal.s32 q4, d29, d20 + vmlal.s32 q8, d22, d21 + vmlal.s32 q8, d28, d20 + vmlal.s32 q5, d24, d20 + vst1.8 {d14-d15}, [r2, : 128] + vmull.s32 q7, d18, d6 + vmlal.s32 q7, d26, d0 + add r2, sp, #624 + vld1.8 {d30-d31}, [r2, : 128] + vmlal.s32 q2, d30, d21 + vmlal.s32 q7, d19, d21 + vmlal.s32 q7, d27, d20 + add r2, sp, #592 + vld1.8 {d26-d27}, [r2, : 128] + vmlal.s32 q4, d25, d27 + vmlal.s32 q8, d29, d27 + vmlal.s32 q8, d25, d26 + vmlal.s32 q7, d28, d27 + vmlal.s32 q7, d29, d26 + add r2, sp, #576 + vld1.8 {d28-d29}, [r2, : 128] + vmlal.s32 q4, d24, d29 + vmlal.s32 q8, d23, d29 + vmlal.s32 q8, d24, d28 + vmlal.s32 q7, d22, d29 + vmlal.s32 q7, d23, d28 + vst1.8 {d8-d9}, [r2, : 128] + add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vmlal.s32 q7, d24, d9 + vmlal.s32 q7, d25, d31 + vmull.s32 q1, d18, d2 + vmlal.s32 q1, d19, d1 + vmlal.s32 q1, d22, d0 + vmlal.s32 q1, d24, d27 + vmlal.s32 q1, d23, d20 + vmlal.s32 q1, d12, d7 + vmlal.s32 q1, d13, d6 + vmull.s32 q6, d18, d1 + vmlal.s32 q6, d19, d0 + vmlal.s32 q6, d23, d27 + vmlal.s32 q6, d22, d20 + vmlal.s32 q6, d24, d26 + vmull.s32 q0, d18, d0 + vmlal.s32 q0, d22, d27 + vmlal.s32 q0, d23, d26 + vmlal.s32 q0, d24, d31 + vmlal.s32 q0, d19, d20 + add r2, sp, #608 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q2, d18, d7 + vmlal.s32 q5, d18, d6 + vmlal.s32 q1, d18, d21 + vmlal.s32 q0, d18, d28 + vmlal.s32 q6, d18, d29 + vmlal.s32 q2, d19, d6 + vmlal.s32 q5, d19, d21 + vmlal.s32 q1, d19, d29 + vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d19, d28 + add r2, sp, #560 + vld1.8 {d18-d19}, [r2, : 128] + add r2, sp, #480 + vld1.8 {d22-d23}, [r2, : 128] + vmlal.s32 q5, d19, d7 + vmlal.s32 q0, d18, d21 + vmlal.s32 q0, d19, d29 + vmlal.s32 q6, d18, d6 + add r2, sp, #496 + vld1.8 {d6-d7}, [r2, : 128] + vmlal.s32 q6, d19, d21 + add r2, sp, #544 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q0, d30, d8 + add r2, sp, #640 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q5, d30, d29 + add r2, sp, #576 + vld1.8 {d24-d25}, [r2, : 128] + vmlal.s32 q1, d30, d28 + vadd.i64 q13, q0, q11 + vadd.i64 q14, q5, q11 + vmlal.s32 q6, d30, d9 + vshr.s64 q4, q13, #26 + vshr.s64 q13, q14, #26 + vadd.i64 q7, q7, q4 + vshl.i64 q4, q4, #26 + vadd.i64 q14, q7, q3 + vadd.i64 q9, q9, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q15, q9, q3 + vsub.i64 q0, q0, q4 + vshr.s64 q4, q14, #25 + vsub.i64 q5, q5, q13 + vshr.s64 q13, q15, #25 + vadd.i64 q6, q6, q4 + vshl.i64 q4, q4, #25 + vadd.i64 q14, q6, q11 + vadd.i64 q2, q2, q13 + vsub.i64 q4, q7, q4 + vshr.s64 q7, q14, #26 + vshl.i64 q13, q13, #25 + vadd.i64 q14, q2, q11 + vadd.i64 q8, q8, q7 + vshl.i64 q7, q7, #26 + vadd.i64 q15, q8, q3 + vsub.i64 q9, q9, q13 + vshr.s64 q13, q14, #26 + vsub.i64 q6, q6, q7 + vshr.s64 q7, q15, #25 + vadd.i64 q10, q10, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q14, q10, q3 + vadd.i64 q1, q1, q7 + add r2, r3, #144 + vshl.i64 q7, q7, #25 + add r4, r3, #96 + vadd.i64 q15, q1, q11 + add r2, r2, #8 + vsub.i64 q2, q2, q13 + add r4, r4, #8 + vshr.s64 q13, q14, #25 + vsub.i64 q7, q8, q7 + vshr.s64 q8, q15, #26 + vadd.i64 q14, q13, q13 + vadd.i64 q12, q12, q8 + vtrn.32 d12, d14 + vshl.i64 q8, q8, #26 + vtrn.32 d13, d15 + vadd.i64 q3, q12, q3 + vadd.i64 q0, q0, q14 + vst1.8 d12, [r2, : 64]! + vshl.i64 q7, q13, #4 + vst1.8 d13, [r4, : 64]! + vsub.i64 q1, q1, q8 + vshr.s64 q3, q3, #25 + vadd.i64 q0, q0, q7 + vadd.i64 q5, q5, q3 + vshl.i64 q3, q3, #25 + vadd.i64 q6, q5, q11 + vadd.i64 q0, q0, q13 + vshl.i64 q7, q13, #25 + vadd.i64 q8, q0, q11 + vsub.i64 q3, q12, q3 + vshr.s64 q6, q6, #26 + vsub.i64 q7, q10, q7 + vtrn.32 d2, d6 + vshr.s64 q8, q8, #26 + vtrn.32 d3, d7 + vadd.i64 q3, q9, q6 + vst1.8 d2, [r2, : 64] + vshl.i64 q6, q6, #26 + vst1.8 d3, [r4, : 64] + vadd.i64 q1, q4, q8 + vtrn.32 d4, d14 + vshl.i64 q4, q8, #26 + vtrn.32 d5, d15 + vsub.i64 q5, q5, q6 + add r2, r2, #16 + vsub.i64 q0, q0, q4 + vst1.8 d4, [r2, : 64] + add r4, r4, #16 + vst1.8 d5, [r4, : 64] + vtrn.32 d10, d6 + vtrn.32 d11, d7 + sub r2, r2, #8 + sub r4, r4, #8 + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vst1.8 d10, [r2, : 64] + vst1.8 d11, [r4, : 64] + sub r2, r2, #24 + sub r4, r4, #24 + vst1.8 d0, [r2, : 64] + vst1.8 d1, [r4, : 64] + add r2, r3, #288 + add r4, r3, #336 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vsub.i32 q0, q0, q1 + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d4-d5}, [r4, : 128]! + vsub.i32 q1, q1, q2 + add r5, r3, #240 + vld1.8 {d4}, [r2, : 64] + vld1.8 {d6}, [r4, : 64] + vsub.i32 q2, q2, q3 + vst1.8 {d0-d1}, [r5, : 128]! + vst1.8 {d2-d3}, [r5, : 128]! + vst1.8 d4, [r5, : 64] + add r2, r3, #144 + add r4, r3, #96 + add r5, r3, #144 + add r6, r3, #192 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vsub.i32 q2, q0, q1 + vadd.i32 q0, q0, q1 + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d6-d7}, [r4, : 128]! + vsub.i32 q4, q1, q3 + vadd.i32 q1, q1, q3 + vld1.8 {d6}, [r2, : 64] + vld1.8 {d10}, [r4, : 64] + vsub.i32 q6, q3, q5 + vadd.i32 q3, q3, q5 + vst1.8 {d4-d5}, [r5, : 128]! + vst1.8 {d0-d1}, [r6, : 128]! + vst1.8 {d8-d9}, [r5, : 128]! + vst1.8 {d2-d3}, [r6, : 128]! + vst1.8 d12, [r5, : 64] + vst1.8 d6, [r6, : 64] + add r2, r3, #0 + add r4, r3, #240 + vld1.8 {d0-d1}, [r4, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vld1.8 {d4}, [r4, : 64] + add r4, r3, #336 + vld1.8 {d6-d7}, [r4, : 128]! + vtrn.32 q0, q3 + vld1.8 {d8-d9}, [r4, : 128]! + vshl.i32 q5, q0, #4 + vtrn.32 q1, q4 + vshl.i32 q6, q3, #4 + vadd.i32 q5, q5, q0 + vadd.i32 q6, q6, q3 + vshl.i32 q7, q1, #4 + vld1.8 {d5}, [r4, : 64] + vshl.i32 q8, q4, #4 + vtrn.32 d4, d5 + vadd.i32 q7, q7, q1 + vadd.i32 q8, q8, q4 + vld1.8 {d18-d19}, [r2, : 128]! + vshl.i32 q10, q2, #4 + vld1.8 {d22-d23}, [r2, : 128]! + vadd.i32 q10, q10, q2 + vld1.8 {d24}, [r2, : 64] + vadd.i32 q5, q5, q0 + add r2, r3, #288 + vld1.8 {d26-d27}, [r2, : 128]! + vadd.i32 q6, q6, q3 + vld1.8 {d28-d29}, [r2, : 128]! + vadd.i32 q8, q8, q4 + vld1.8 {d25}, [r2, : 64] + vadd.i32 q10, q10, q2 + vtrn.32 q9, q13 + vadd.i32 q7, q7, q1 + vadd.i32 q5, q5, q0 + vtrn.32 q11, q14 + vadd.i32 q6, q6, q3 + add r2, sp, #528 + vadd.i32 q10, q10, q2 + vtrn.32 d24, d25 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q6, q13, #1 + vst1.8 {d20-d21}, [r2, : 128]! + vshl.i32 q10, q14, #1 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q15, q12, #1 + vadd.i32 q8, q8, q4 + vext.32 d10, d31, d30, #0 + vadd.i32 q7, q7, q1 + vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q8, d18, d5 + vmlal.s32 q8, d26, d4 + vmlal.s32 q8, d19, d9 + vmlal.s32 q8, d27, d3 + vmlal.s32 q8, d22, d8 + vmlal.s32 q8, d28, d2 + vmlal.s32 q8, d23, d7 + vmlal.s32 q8, d29, d1 + vmlal.s32 q8, d24, d6 + vmlal.s32 q8, d25, d0 + vst1.8 {d14-d15}, [r2, : 128]! + vmull.s32 q2, d18, d4 + vmlal.s32 q2, d12, d9 + vmlal.s32 q2, d13, d8 + vmlal.s32 q2, d19, d3 + vmlal.s32 q2, d22, d2 + vmlal.s32 q2, d23, d1 + vmlal.s32 q2, d24, d0 + vst1.8 {d20-d21}, [r2, : 128]! + vmull.s32 q7, d18, d9 + vmlal.s32 q7, d26, d3 + vmlal.s32 q7, d19, d8 + vmlal.s32 q7, d27, d2 + vmlal.s32 q7, d22, d7 + vmlal.s32 q7, d28, d1 + vmlal.s32 q7, d23, d6 + vmlal.s32 q7, d29, d0 + vst1.8 {d10-d11}, [r2, : 128]! + vmull.s32 q5, d18, d3 + vmlal.s32 q5, d19, d2 + vmlal.s32 q5, d22, d1 + vmlal.s32 q5, d23, d0 + vmlal.s32 q5, d12, d8 + vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q4, d18, d8 + vmlal.s32 q4, d26, d2 + vmlal.s32 q4, d19, d7 + vmlal.s32 q4, d27, d1 + vmlal.s32 q4, d22, d6 + vmlal.s32 q4, d28, d0 + vmull.s32 q8, d18, d7 + vmlal.s32 q8, d26, d1 + vmlal.s32 q8, d19, d6 + vmlal.s32 q8, d27, d0 + add r2, sp, #544 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q7, d24, d21 + vmlal.s32 q7, d25, d20 + vmlal.s32 q4, d23, d21 + vmlal.s32 q4, d29, d20 + vmlal.s32 q8, d22, d21 + vmlal.s32 q8, d28, d20 + vmlal.s32 q5, d24, d20 + vst1.8 {d14-d15}, [r2, : 128] + vmull.s32 q7, d18, d6 + vmlal.s32 q7, d26, d0 + add r2, sp, #624 + vld1.8 {d30-d31}, [r2, : 128] + vmlal.s32 q2, d30, d21 + vmlal.s32 q7, d19, d21 + vmlal.s32 q7, d27, d20 + add r2, sp, #592 + vld1.8 {d26-d27}, [r2, : 128] + vmlal.s32 q4, d25, d27 + vmlal.s32 q8, d29, d27 + vmlal.s32 q8, d25, d26 + vmlal.s32 q7, d28, d27 + vmlal.s32 q7, d29, d26 + add r2, sp, #576 + vld1.8 {d28-d29}, [r2, : 128] + vmlal.s32 q4, d24, d29 + vmlal.s32 q8, d23, d29 + vmlal.s32 q8, d24, d28 + vmlal.s32 q7, d22, d29 + vmlal.s32 q7, d23, d28 + vst1.8 {d8-d9}, [r2, : 128] + add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vmlal.s32 q7, d24, d9 + vmlal.s32 q7, d25, d31 + vmull.s32 q1, d18, d2 + vmlal.s32 q1, d19, d1 + vmlal.s32 q1, d22, d0 + vmlal.s32 q1, d24, d27 + vmlal.s32 q1, d23, d20 + vmlal.s32 q1, d12, d7 + vmlal.s32 q1, d13, d6 + vmull.s32 q6, d18, d1 + vmlal.s32 q6, d19, d0 + vmlal.s32 q6, d23, d27 + vmlal.s32 q6, d22, d20 + vmlal.s32 q6, d24, d26 + vmull.s32 q0, d18, d0 + vmlal.s32 q0, d22, d27 + vmlal.s32 q0, d23, d26 + vmlal.s32 q0, d24, d31 + vmlal.s32 q0, d19, d20 + add r2, sp, #608 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q2, d18, d7 + vmlal.s32 q5, d18, d6 + vmlal.s32 q1, d18, d21 + vmlal.s32 q0, d18, d28 + vmlal.s32 q6, d18, d29 + vmlal.s32 q2, d19, d6 + vmlal.s32 q5, d19, d21 + vmlal.s32 q1, d19, d29 + vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d19, d28 + add r2, sp, #560 + vld1.8 {d18-d19}, [r2, : 128] + add r2, sp, #480 + vld1.8 {d22-d23}, [r2, : 128] + vmlal.s32 q5, d19, d7 + vmlal.s32 q0, d18, d21 + vmlal.s32 q0, d19, d29 + vmlal.s32 q6, d18, d6 + add r2, sp, #496 + vld1.8 {d6-d7}, [r2, : 128] + vmlal.s32 q6, d19, d21 + add r2, sp, #544 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q0, d30, d8 + add r2, sp, #640 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q5, d30, d29 + add r2, sp, #576 + vld1.8 {d24-d25}, [r2, : 128] + vmlal.s32 q1, d30, d28 + vadd.i64 q13, q0, q11 + vadd.i64 q14, q5, q11 + vmlal.s32 q6, d30, d9 + vshr.s64 q4, q13, #26 + vshr.s64 q13, q14, #26 + vadd.i64 q7, q7, q4 + vshl.i64 q4, q4, #26 + vadd.i64 q14, q7, q3 + vadd.i64 q9, q9, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q15, q9, q3 + vsub.i64 q0, q0, q4 + vshr.s64 q4, q14, #25 + vsub.i64 q5, q5, q13 + vshr.s64 q13, q15, #25 + vadd.i64 q6, q6, q4 + vshl.i64 q4, q4, #25 + vadd.i64 q14, q6, q11 + vadd.i64 q2, q2, q13 + vsub.i64 q4, q7, q4 + vshr.s64 q7, q14, #26 + vshl.i64 q13, q13, #25 + vadd.i64 q14, q2, q11 + vadd.i64 q8, q8, q7 + vshl.i64 q7, q7, #26 + vadd.i64 q15, q8, q3 + vsub.i64 q9, q9, q13 + vshr.s64 q13, q14, #26 + vsub.i64 q6, q6, q7 + vshr.s64 q7, q15, #25 + vadd.i64 q10, q10, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q14, q10, q3 + vadd.i64 q1, q1, q7 + add r2, r3, #288 + vshl.i64 q7, q7, #25 + add r4, r3, #96 + vadd.i64 q15, q1, q11 + add r2, r2, #8 + vsub.i64 q2, q2, q13 + add r4, r4, #8 + vshr.s64 q13, q14, #25 + vsub.i64 q7, q8, q7 + vshr.s64 q8, q15, #26 + vadd.i64 q14, q13, q13 + vadd.i64 q12, q12, q8 + vtrn.32 d12, d14 + vshl.i64 q8, q8, #26 + vtrn.32 d13, d15 + vadd.i64 q3, q12, q3 + vadd.i64 q0, q0, q14 + vst1.8 d12, [r2, : 64]! + vshl.i64 q7, q13, #4 + vst1.8 d13, [r4, : 64]! + vsub.i64 q1, q1, q8 + vshr.s64 q3, q3, #25 + vadd.i64 q0, q0, q7 + vadd.i64 q5, q5, q3 + vshl.i64 q3, q3, #25 + vadd.i64 q6, q5, q11 + vadd.i64 q0, q0, q13 + vshl.i64 q7, q13, #25 + vadd.i64 q8, q0, q11 + vsub.i64 q3, q12, q3 + vshr.s64 q6, q6, #26 + vsub.i64 q7, q10, q7 + vtrn.32 d2, d6 + vshr.s64 q8, q8, #26 + vtrn.32 d3, d7 + vadd.i64 q3, q9, q6 + vst1.8 d2, [r2, : 64] + vshl.i64 q6, q6, #26 + vst1.8 d3, [r4, : 64] + vadd.i64 q1, q4, q8 + vtrn.32 d4, d14 + vshl.i64 q4, q8, #26 + vtrn.32 d5, d15 + vsub.i64 q5, q5, q6 + add r2, r2, #16 + vsub.i64 q0, q0, q4 + vst1.8 d4, [r2, : 64] + add r4, r4, #16 + vst1.8 d5, [r4, : 64] + vtrn.32 d10, d6 + vtrn.32 d11, d7 + sub r2, r2, #8 + sub r4, r4, #8 + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vst1.8 d10, [r2, : 64] + vst1.8 d11, [r4, : 64] + sub r2, r2, #24 + sub r4, r4, #24 + vst1.8 d0, [r2, : 64] + vst1.8 d1, [r4, : 64] + add r2, sp, #512 + add r4, r3, #144 + add r5, r3, #192 + vld1.8 {d0-d1}, [r2, : 128] + vld1.8 {d2-d3}, [r4, : 128]! + vld1.8 {d4-d5}, [r5, : 128]! + vzip.i32 q1, q2 + vld1.8 {d6-d7}, [r4, : 128]! + vld1.8 {d8-d9}, [r5, : 128]! + vshl.i32 q5, q1, #1 + vzip.i32 q3, q4 + vshl.i32 q6, q2, #1 + vld1.8 {d14}, [r4, : 64] + vshl.i32 q8, q3, #1 + vld1.8 {d15}, [r5, : 64] + vshl.i32 q9, q4, #1 + vmul.i32 d21, d7, d1 + vtrn.32 d14, d15 + vmul.i32 q11, q4, q0 + vmul.i32 q0, q7, q0 + vmull.s32 q12, d2, d2 + vmlal.s32 q12, d11, d1 + vmlal.s32 q12, d12, d0 + vmlal.s32 q12, d13, d23 + vmlal.s32 q12, d16, d22 + vmlal.s32 q12, d7, d21 + vmull.s32 q10, d2, d11 + vmlal.s32 q10, d4, d1 + vmlal.s32 q10, d13, d0 + vmlal.s32 q10, d6, d23 + vmlal.s32 q10, d17, d22 + vmull.s32 q13, d10, d4 + vmlal.s32 q13, d11, d3 + vmlal.s32 q13, d13, d1 + vmlal.s32 q13, d16, d0 + vmlal.s32 q13, d17, d23 + vmlal.s32 q13, d8, d22 + vmull.s32 q1, d10, d5 + vmlal.s32 q1, d11, d4 + vmlal.s32 q1, d6, d1 + vmlal.s32 q1, d17, d0 + vmlal.s32 q1, d8, d23 + vmull.s32 q14, d10, d6 + vmlal.s32 q14, d11, d13 + vmlal.s32 q14, d4, d4 + vmlal.s32 q14, d17, d1 + vmlal.s32 q14, d18, d0 + vmlal.s32 q14, d9, d23 + vmull.s32 q11, d10, d7 + vmlal.s32 q11, d11, d6 + vmlal.s32 q11, d12, d5 + vmlal.s32 q11, d8, d1 + vmlal.s32 q11, d19, d0 + vmull.s32 q15, d10, d8 + vmlal.s32 q15, d11, d17 + vmlal.s32 q15, d12, d6 + vmlal.s32 q15, d13, d5 + vmlal.s32 q15, d19, d1 + vmlal.s32 q15, d14, d0 + vmull.s32 q2, d10, d9 + vmlal.s32 q2, d11, d8 + vmlal.s32 q2, d12, d7 + vmlal.s32 q2, d13, d6 + vmlal.s32 q2, d14, d1 + vmull.s32 q0, d15, d1 + vmlal.s32 q0, d10, d14 + vmlal.s32 q0, d11, d19 + vmlal.s32 q0, d12, d8 + vmlal.s32 q0, d13, d17 + vmlal.s32 q0, d6, d6 + add r2, sp, #480 + vld1.8 {d18-d19}, [r2, : 128]! + vmull.s32 q3, d16, d7 + vmlal.s32 q3, d10, d15 + vmlal.s32 q3, d11, d14 + vmlal.s32 q3, d12, d9 + vmlal.s32 q3, d13, d8 + vld1.8 {d8-d9}, [r2, : 128] + vadd.i64 q5, q12, q9 + vadd.i64 q6, q15, q9 + vshr.s64 q5, q5, #26 + vshr.s64 q6, q6, #26 + vadd.i64 q7, q10, q5 + vshl.i64 q5, q5, #26 + vadd.i64 q8, q7, q4 + vadd.i64 q2, q2, q6 + vshl.i64 q6, q6, #26 + vadd.i64 q10, q2, q4 + vsub.i64 q5, q12, q5 + vshr.s64 q8, q8, #25 + vsub.i64 q6, q15, q6 + vshr.s64 q10, q10, #25 + vadd.i64 q12, q13, q8 + vshl.i64 q8, q8, #25 + vadd.i64 q13, q12, q9 + vadd.i64 q0, q0, q10 + vsub.i64 q7, q7, q8 + vshr.s64 q8, q13, #26 + vshl.i64 q10, q10, #25 + vadd.i64 q13, q0, q9 + vadd.i64 q1, q1, q8 + vshl.i64 q8, q8, #26 + vadd.i64 q15, q1, q4 + vsub.i64 q2, q2, q10 + vshr.s64 q10, q13, #26 + vsub.i64 q8, q12, q8 + vshr.s64 q12, q15, #25 + vadd.i64 q3, q3, q10 + vshl.i64 q10, q10, #26 + vadd.i64 q13, q3, q4 + vadd.i64 q14, q14, q12 + add r2, r3, #144 + vshl.i64 q12, q12, #25 + add r4, r3, #192 + vadd.i64 q15, q14, q9 + add r2, r2, #8 + vsub.i64 q0, q0, q10 + add r4, r4, #8 + vshr.s64 q10, q13, #25 + vsub.i64 q1, q1, q12 + vshr.s64 q12, q15, #26 + vadd.i64 q13, q10, q10 + vadd.i64 q11, q11, q12 + vtrn.32 d16, d2 + vshl.i64 q12, q12, #26 + vtrn.32 d17, d3 + vadd.i64 q1, q11, q4 + vadd.i64 q4, q5, q13 + vst1.8 d16, [r2, : 64]! + vshl.i64 q5, q10, #4 + vst1.8 d17, [r4, : 64]! + vsub.i64 q8, q14, q12 + vshr.s64 q1, q1, #25 + vadd.i64 q4, q4, q5 + vadd.i64 q5, q6, q1 + vshl.i64 q1, q1, #25 + vadd.i64 q6, q5, q9 + vadd.i64 q4, q4, q10 + vshl.i64 q10, q10, #25 + vadd.i64 q9, q4, q9 + vsub.i64 q1, q11, q1 + vshr.s64 q6, q6, #26 + vsub.i64 q3, q3, q10 + vtrn.32 d16, d2 + vshr.s64 q9, q9, #26 + vtrn.32 d17, d3 + vadd.i64 q1, q2, q6 + vst1.8 d16, [r2, : 64] + vshl.i64 q2, q6, #26 + vst1.8 d17, [r4, : 64] + vadd.i64 q6, q7, q9 + vtrn.32 d0, d6 + vshl.i64 q7, q9, #26 + vtrn.32 d1, d7 + vsub.i64 q2, q5, q2 + add r2, r2, #16 + vsub.i64 q3, q4, q7 + vst1.8 d0, [r2, : 64] + add r4, r4, #16 + vst1.8 d1, [r4, : 64] + vtrn.32 d4, d2 + vtrn.32 d5, d3 + sub r2, r2, #8 + sub r4, r4, #8 + vtrn.32 d6, d12 + vtrn.32 d7, d13 + vst1.8 d4, [r2, : 64] + vst1.8 d5, [r4, : 64] + sub r2, r2, #24 + sub r4, r4, #24 + vst1.8 d6, [r2, : 64] + vst1.8 d7, [r4, : 64] + add r2, r3, #336 + add r4, r3, #288 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vadd.i32 q0, q0, q1 + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d4-d5}, [r4, : 128]! + vadd.i32 q1, q1, q2 + add r5, r3, #288 + vld1.8 {d4}, [r2, : 64] + vld1.8 {d6}, [r4, : 64] + vadd.i32 q2, q2, q3 + vst1.8 {d0-d1}, [r5, : 128]! + vst1.8 {d2-d3}, [r5, : 128]! + vst1.8 d4, [r5, : 64] + add r2, r3, #48 + add r4, r3, #144 + vld1.8 {d0-d1}, [r4, : 128]! + vld1.8 {d2-d3}, [r4, : 128]! + vld1.8 {d4}, [r4, : 64] + add r4, r3, #288 + vld1.8 {d6-d7}, [r4, : 128]! + vtrn.32 q0, q3 + vld1.8 {d8-d9}, [r4, : 128]! + vshl.i32 q5, q0, #4 + vtrn.32 q1, q4 + vshl.i32 q6, q3, #4 + vadd.i32 q5, q5, q0 + vadd.i32 q6, q6, q3 + vshl.i32 q7, q1, #4 + vld1.8 {d5}, [r4, : 64] + vshl.i32 q8, q4, #4 + vtrn.32 d4, d5 + vadd.i32 q7, q7, q1 + vadd.i32 q8, q8, q4 + vld1.8 {d18-d19}, [r2, : 128]! + vshl.i32 q10, q2, #4 + vld1.8 {d22-d23}, [r2, : 128]! + vadd.i32 q10, q10, q2 + vld1.8 {d24}, [r2, : 64] + vadd.i32 q5, q5, q0 + add r2, r3, #240 + vld1.8 {d26-d27}, [r2, : 128]! + vadd.i32 q6, q6, q3 + vld1.8 {d28-d29}, [r2, : 128]! + vadd.i32 q8, q8, q4 + vld1.8 {d25}, [r2, : 64] + vadd.i32 q10, q10, q2 + vtrn.32 q9, q13 + vadd.i32 q7, q7, q1 + vadd.i32 q5, q5, q0 + vtrn.32 q11, q14 + vadd.i32 q6, q6, q3 + add r2, sp, #528 + vadd.i32 q10, q10, q2 + vtrn.32 d24, d25 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q6, q13, #1 + vst1.8 {d20-d21}, [r2, : 128]! + vshl.i32 q10, q14, #1 + vst1.8 {d12-d13}, [r2, : 128]! + vshl.i32 q15, q12, #1 + vadd.i32 q8, q8, q4 + vext.32 d10, d31, d30, #0 + vadd.i32 q7, q7, q1 + vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q8, d18, d5 + vmlal.s32 q8, d26, d4 + vmlal.s32 q8, d19, d9 + vmlal.s32 q8, d27, d3 + vmlal.s32 q8, d22, d8 + vmlal.s32 q8, d28, d2 + vmlal.s32 q8, d23, d7 + vmlal.s32 q8, d29, d1 + vmlal.s32 q8, d24, d6 + vmlal.s32 q8, d25, d0 + vst1.8 {d14-d15}, [r2, : 128]! + vmull.s32 q2, d18, d4 + vmlal.s32 q2, d12, d9 + vmlal.s32 q2, d13, d8 + vmlal.s32 q2, d19, d3 + vmlal.s32 q2, d22, d2 + vmlal.s32 q2, d23, d1 + vmlal.s32 q2, d24, d0 + vst1.8 {d20-d21}, [r2, : 128]! + vmull.s32 q7, d18, d9 + vmlal.s32 q7, d26, d3 + vmlal.s32 q7, d19, d8 + vmlal.s32 q7, d27, d2 + vmlal.s32 q7, d22, d7 + vmlal.s32 q7, d28, d1 + vmlal.s32 q7, d23, d6 + vmlal.s32 q7, d29, d0 + vst1.8 {d10-d11}, [r2, : 128]! + vmull.s32 q5, d18, d3 + vmlal.s32 q5, d19, d2 + vmlal.s32 q5, d22, d1 + vmlal.s32 q5, d23, d0 + vmlal.s32 q5, d12, d8 + vst1.8 {d16-d17}, [r2, : 128]! + vmull.s32 q4, d18, d8 + vmlal.s32 q4, d26, d2 + vmlal.s32 q4, d19, d7 + vmlal.s32 q4, d27, d1 + vmlal.s32 q4, d22, d6 + vmlal.s32 q4, d28, d0 + vmull.s32 q8, d18, d7 + vmlal.s32 q8, d26, d1 + vmlal.s32 q8, d19, d6 + vmlal.s32 q8, d27, d0 + add r2, sp, #544 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q7, d24, d21 + vmlal.s32 q7, d25, d20 + vmlal.s32 q4, d23, d21 + vmlal.s32 q4, d29, d20 + vmlal.s32 q8, d22, d21 + vmlal.s32 q8, d28, d20 + vmlal.s32 q5, d24, d20 + vst1.8 {d14-d15}, [r2, : 128] + vmull.s32 q7, d18, d6 + vmlal.s32 q7, d26, d0 + add r2, sp, #624 + vld1.8 {d30-d31}, [r2, : 128] + vmlal.s32 q2, d30, d21 + vmlal.s32 q7, d19, d21 + vmlal.s32 q7, d27, d20 + add r2, sp, #592 + vld1.8 {d26-d27}, [r2, : 128] + vmlal.s32 q4, d25, d27 + vmlal.s32 q8, d29, d27 + vmlal.s32 q8, d25, d26 + vmlal.s32 q7, d28, d27 + vmlal.s32 q7, d29, d26 + add r2, sp, #576 + vld1.8 {d28-d29}, [r2, : 128] + vmlal.s32 q4, d24, d29 + vmlal.s32 q8, d23, d29 + vmlal.s32 q8, d24, d28 + vmlal.s32 q7, d22, d29 + vmlal.s32 q7, d23, d28 + vst1.8 {d8-d9}, [r2, : 128] + add r2, sp, #528 + vld1.8 {d8-d9}, [r2, : 128] + vmlal.s32 q7, d24, d9 + vmlal.s32 q7, d25, d31 + vmull.s32 q1, d18, d2 + vmlal.s32 q1, d19, d1 + vmlal.s32 q1, d22, d0 + vmlal.s32 q1, d24, d27 + vmlal.s32 q1, d23, d20 + vmlal.s32 q1, d12, d7 + vmlal.s32 q1, d13, d6 + vmull.s32 q6, d18, d1 + vmlal.s32 q6, d19, d0 + vmlal.s32 q6, d23, d27 + vmlal.s32 q6, d22, d20 + vmlal.s32 q6, d24, d26 + vmull.s32 q0, d18, d0 + vmlal.s32 q0, d22, d27 + vmlal.s32 q0, d23, d26 + vmlal.s32 q0, d24, d31 + vmlal.s32 q0, d19, d20 + add r2, sp, #608 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q2, d18, d7 + vmlal.s32 q5, d18, d6 + vmlal.s32 q1, d18, d21 + vmlal.s32 q0, d18, d28 + vmlal.s32 q6, d18, d29 + vmlal.s32 q2, d19, d6 + vmlal.s32 q5, d19, d21 + vmlal.s32 q1, d19, d29 + vmlal.s32 q0, d19, d9 + vmlal.s32 q6, d19, d28 + add r2, sp, #560 + vld1.8 {d18-d19}, [r2, : 128] + add r2, sp, #480 + vld1.8 {d22-d23}, [r2, : 128] + vmlal.s32 q5, d19, d7 + vmlal.s32 q0, d18, d21 + vmlal.s32 q0, d19, d29 + vmlal.s32 q6, d18, d6 + add r2, sp, #496 + vld1.8 {d6-d7}, [r2, : 128] + vmlal.s32 q6, d19, d21 + add r2, sp, #544 + vld1.8 {d18-d19}, [r2, : 128] + vmlal.s32 q0, d30, d8 + add r2, sp, #640 + vld1.8 {d20-d21}, [r2, : 128] + vmlal.s32 q5, d30, d29 + add r2, sp, #576 + vld1.8 {d24-d25}, [r2, : 128] + vmlal.s32 q1, d30, d28 + vadd.i64 q13, q0, q11 + vadd.i64 q14, q5, q11 + vmlal.s32 q6, d30, d9 + vshr.s64 q4, q13, #26 + vshr.s64 q13, q14, #26 + vadd.i64 q7, q7, q4 + vshl.i64 q4, q4, #26 + vadd.i64 q14, q7, q3 + vadd.i64 q9, q9, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q15, q9, q3 + vsub.i64 q0, q0, q4 + vshr.s64 q4, q14, #25 + vsub.i64 q5, q5, q13 + vshr.s64 q13, q15, #25 + vadd.i64 q6, q6, q4 + vshl.i64 q4, q4, #25 + vadd.i64 q14, q6, q11 + vadd.i64 q2, q2, q13 + vsub.i64 q4, q7, q4 + vshr.s64 q7, q14, #26 + vshl.i64 q13, q13, #25 + vadd.i64 q14, q2, q11 + vadd.i64 q8, q8, q7 + vshl.i64 q7, q7, #26 + vadd.i64 q15, q8, q3 + vsub.i64 q9, q9, q13 + vshr.s64 q13, q14, #26 + vsub.i64 q6, q6, q7 + vshr.s64 q7, q15, #25 + vadd.i64 q10, q10, q13 + vshl.i64 q13, q13, #26 + vadd.i64 q14, q10, q3 + vadd.i64 q1, q1, q7 + add r2, r3, #240 + vshl.i64 q7, q7, #25 + add r4, r3, #144 + vadd.i64 q15, q1, q11 + add r2, r2, #8 + vsub.i64 q2, q2, q13 + add r4, r4, #8 + vshr.s64 q13, q14, #25 + vsub.i64 q7, q8, q7 + vshr.s64 q8, q15, #26 + vadd.i64 q14, q13, q13 + vadd.i64 q12, q12, q8 + vtrn.32 d12, d14 + vshl.i64 q8, q8, #26 + vtrn.32 d13, d15 + vadd.i64 q3, q12, q3 + vadd.i64 q0, q0, q14 + vst1.8 d12, [r2, : 64]! + vshl.i64 q7, q13, #4 + vst1.8 d13, [r4, : 64]! + vsub.i64 q1, q1, q8 + vshr.s64 q3, q3, #25 + vadd.i64 q0, q0, q7 + vadd.i64 q5, q5, q3 + vshl.i64 q3, q3, #25 + vadd.i64 q6, q5, q11 + vadd.i64 q0, q0, q13 + vshl.i64 q7, q13, #25 + vadd.i64 q8, q0, q11 + vsub.i64 q3, q12, q3 + vshr.s64 q6, q6, #26 + vsub.i64 q7, q10, q7 + vtrn.32 d2, d6 + vshr.s64 q8, q8, #26 + vtrn.32 d3, d7 + vadd.i64 q3, q9, q6 + vst1.8 d2, [r2, : 64] + vshl.i64 q6, q6, #26 + vst1.8 d3, [r4, : 64] + vadd.i64 q1, q4, q8 + vtrn.32 d4, d14 + vshl.i64 q4, q8, #26 + vtrn.32 d5, d15 + vsub.i64 q5, q5, q6 + add r2, r2, #16 + vsub.i64 q0, q0, q4 + vst1.8 d4, [r2, : 64] + add r4, r4, #16 + vst1.8 d5, [r4, : 64] + vtrn.32 d10, d6 + vtrn.32 d11, d7 + sub r2, r2, #8 + sub r4, r4, #8 + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vst1.8 d10, [r2, : 64] + vst1.8 d11, [r4, : 64] + sub r2, r2, #24 + sub r4, r4, #24 + vst1.8 d0, [r2, : 64] + vst1.8 d1, [r4, : 64] + ldr r2, [sp, #456] + ldr r4, [sp, #460] + subs r5, r2, #1 + bge .Lmainloop + add r1, r3, #144 + add r2, r3, #336 + vld1.8 {d0-d1}, [r1, : 128]! + vld1.8 {d2-d3}, [r1, : 128]! + vld1.8 {d4}, [r1, : 64] + vst1.8 {d0-d1}, [r2, : 128]! + vst1.8 {d2-d3}, [r2, : 128]! + vst1.8 d4, [r2, : 64] + movw r1, #0 +.Linvertloop: + add r2, r3, #144 + movw r4, #0 + movw r5, #2 + cmp r1, #1 + moveq r5, #1 + addeq r2, r3, #336 + addeq r4, r3, #48 + cmp r1, #2 + moveq r5, #1 + addeq r2, r3, #48 + cmp r1, #3 + moveq r5, #5 + addeq r4, r3, #336 + cmp r1, #4 + moveq r5, #10 + cmp r1, #5 + moveq r5, #20 + cmp r1, #6 + moveq r5, #10 + addeq r2, r3, #336 + addeq r4, r3, #336 + cmp r1, #7 + moveq r5, #50 + cmp r1, #8 + moveq r5, #100 + cmp r1, #9 + moveq r5, #50 + addeq r2, r3, #336 + cmp r1, #10 + moveq r5, #5 + addeq r2, r3, #48 + cmp r1, #11 + moveq r5, #0 + addeq r2, r3, #96 + add r6, r3, #144 + add r7, r3, #288 + vld1.8 {d0-d1}, [r6, : 128]! + vld1.8 {d2-d3}, [r6, : 128]! + vld1.8 {d4}, [r6, : 64] + vst1.8 {d0-d1}, [r7, : 128]! + vst1.8 {d2-d3}, [r7, : 128]! + vst1.8 d4, [r7, : 64] + cmp r5, #0 + beq .Lskipsquaringloop +.Lsquaringloop: + add r6, r3, #288 + add r7, r3, #288 + add r8, r3, #288 + vmov.i32 q0, #19 + vmov.i32 q1, #0 + vmov.i32 q2, #1 + vzip.i32 q1, q2 + vld1.8 {d4-d5}, [r7, : 128]! + vld1.8 {d6-d7}, [r7, : 128]! + vld1.8 {d9}, [r7, : 64] + vld1.8 {d10-d11}, [r6, : 128]! + add r7, sp, #384 + vld1.8 {d12-d13}, [r6, : 128]! + vmul.i32 q7, q2, q0 + vld1.8 {d8}, [r6, : 64] + vext.32 d17, d11, d10, #1 + vmul.i32 q9, q3, q0 + vext.32 d16, d10, d8, #1 + vshl.u32 q10, q5, q1 + vext.32 d22, d14, d4, #1 + vext.32 d24, d18, d6, #1 + vshl.u32 q13, q6, q1 + vshl.u32 d28, d8, d2 + vrev64.i32 d22, d22 + vmul.i32 d1, d9, d1 + vrev64.i32 d24, d24 + vext.32 d29, d8, d13, #1 + vext.32 d0, d1, d9, #1 + vrev64.i32 d0, d0 + vext.32 d2, d9, d1, #1 + vext.32 d23, d15, d5, #1 + vmull.s32 q4, d20, d4 + vrev64.i32 d23, d23 + vmlal.s32 q4, d21, d1 + vrev64.i32 d2, d2 + vmlal.s32 q4, d26, d19 + vext.32 d3, d5, d15, #1 + vmlal.s32 q4, d27, d18 + vrev64.i32 d3, d3 + vmlal.s32 q4, d28, d15 + vext.32 d14, d12, d11, #1 + vmull.s32 q5, d16, d23 + vext.32 d15, d13, d12, #1 + vmlal.s32 q5, d17, d4 + vst1.8 d8, [r7, : 64]! + vmlal.s32 q5, d14, d1 + vext.32 d12, d9, d8, #0 + vmlal.s32 q5, d15, d19 + vmov.i64 d13, #0 + vmlal.s32 q5, d29, d18 + vext.32 d25, d19, d7, #1 + vmlal.s32 q6, d20, d5 + vrev64.i32 d25, d25 + vmlal.s32 q6, d21, d4 + vst1.8 d11, [r7, : 64]! + vmlal.s32 q6, d26, d1 + vext.32 d9, d10, d10, #0 + vmlal.s32 q6, d27, d19 + vmov.i64 d8, #0 + vmlal.s32 q6, d28, d18 + vmlal.s32 q4, d16, d24 + vmlal.s32 q4, d17, d5 + vmlal.s32 q4, d14, d4 + vst1.8 d12, [r7, : 64]! + vmlal.s32 q4, d15, d1 + vext.32 d10, d13, d12, #0 + vmlal.s32 q4, d29, d19 + vmov.i64 d11, #0 + vmlal.s32 q5, d20, d6 + vmlal.s32 q5, d21, d5 + vmlal.s32 q5, d26, d4 + vext.32 d13, d8, d8, #0 + vmlal.s32 q5, d27, d1 + vmov.i64 d12, #0 + vmlal.s32 q5, d28, d19 + vst1.8 d9, [r7, : 64]! + vmlal.s32 q6, d16, d25 + vmlal.s32 q6, d17, d6 + vst1.8 d10, [r7, : 64] + vmlal.s32 q6, d14, d5 + vext.32 d8, d11, d10, #0 + vmlal.s32 q6, d15, d4 + vmov.i64 d9, #0 + vmlal.s32 q6, d29, d1 + vmlal.s32 q4, d20, d7 + vmlal.s32 q4, d21, d6 + vmlal.s32 q4, d26, d5 + vext.32 d11, d12, d12, #0 + vmlal.s32 q4, d27, d4 + vmov.i64 d10, #0 + vmlal.s32 q4, d28, d1 + vmlal.s32 q5, d16, d0 + sub r6, r7, #32 + vmlal.s32 q5, d17, d7 + vmlal.s32 q5, d14, d6 + vext.32 d30, d9, d8, #0 + vmlal.s32 q5, d15, d5 + vld1.8 {d31}, [r6, : 64]! + vmlal.s32 q5, d29, d4 + vmlal.s32 q15, d20, d0 + vext.32 d0, d6, d18, #1 + vmlal.s32 q15, d21, d25 + vrev64.i32 d0, d0 + vmlal.s32 q15, d26, d24 + vext.32 d1, d7, d19, #1 + vext.32 d7, d10, d10, #0 + vmlal.s32 q15, d27, d23 + vrev64.i32 d1, d1 + vld1.8 {d6}, [r6, : 64] + vmlal.s32 q15, d28, d22 + vmlal.s32 q3, d16, d4 + add r6, r6, #24 + vmlal.s32 q3, d17, d2 + vext.32 d4, d31, d30, #0 + vmov d17, d11 + vmlal.s32 q3, d14, d1 + vext.32 d11, d13, d13, #0 + vext.32 d13, d30, d30, #0 + vmlal.s32 q3, d15, d0 + vext.32 d1, d8, d8, #0 + vmlal.s32 q3, d29, d3 + vld1.8 {d5}, [r6, : 64] + sub r6, r6, #16 + vext.32 d10, d6, d6, #0 + vmov.i32 q1, #0xffffffff + vshl.i64 q4, q1, #25 + add r7, sp, #480 + vld1.8 {d14-d15}, [r7, : 128] + vadd.i64 q9, q2, q7 + vshl.i64 q1, q1, #26 + vshr.s64 q10, q9, #26 + vld1.8 {d0}, [r6, : 64]! + vadd.i64 q5, q5, q10 + vand q9, q9, q1 + vld1.8 {d16}, [r6, : 64]! + add r6, sp, #496 + vld1.8 {d20-d21}, [r6, : 128] + vadd.i64 q11, q5, q10 + vsub.i64 q2, q2, q9 + vshr.s64 q9, q11, #25 + vext.32 d12, d5, d4, #0 + vand q11, q11, q4 + vadd.i64 q0, q0, q9 + vmov d19, d7 + vadd.i64 q3, q0, q7 + vsub.i64 q5, q5, q11 + vshr.s64 q11, q3, #26 + vext.32 d18, d11, d10, #0 + vand q3, q3, q1 + vadd.i64 q8, q8, q11 + vadd.i64 q11, q8, q10 + vsub.i64 q0, q0, q3 + vshr.s64 q3, q11, #25 + vand q11, q11, q4 + vadd.i64 q3, q6, q3 + vadd.i64 q6, q3, q7 + vsub.i64 q8, q8, q11 + vshr.s64 q11, q6, #26 + vand q6, q6, q1 + vadd.i64 q9, q9, q11 + vadd.i64 d25, d19, d21 + vsub.i64 q3, q3, q6 + vshr.s64 d23, d25, #25 + vand q4, q12, q4 + vadd.i64 d21, d23, d23 + vshl.i64 d25, d23, #4 + vadd.i64 d21, d21, d23 + vadd.i64 d25, d25, d21 + vadd.i64 d4, d4, d25 + vzip.i32 q0, q8 + vadd.i64 d12, d4, d14 + add r6, r8, #8 + vst1.8 d0, [r6, : 64] + vsub.i64 d19, d19, d9 + add r6, r6, #16 + vst1.8 d16, [r6, : 64] + vshr.s64 d22, d12, #26 + vand q0, q6, q1 + vadd.i64 d10, d10, d22 + vzip.i32 q3, q9 + vsub.i64 d4, d4, d0 + sub r6, r6, #8 + vst1.8 d6, [r6, : 64] + add r6, r6, #16 + vst1.8 d18, [r6, : 64] + vzip.i32 q2, q5 + sub r6, r6, #32 + vst1.8 d4, [r6, : 64] + subs r5, r5, #1 + bhi .Lsquaringloop +.Lskipsquaringloop: + mov r2, r2 + add r5, r3, #288 + add r6, r3, #144 + vmov.i32 q0, #19 + vmov.i32 q1, #0 + vmov.i32 q2, #1 + vzip.i32 q1, q2 + vld1.8 {d4-d5}, [r5, : 128]! + vld1.8 {d6-d7}, [r5, : 128]! + vld1.8 {d9}, [r5, : 64] + vld1.8 {d10-d11}, [r2, : 128]! + add r5, sp, #384 + vld1.8 {d12-d13}, [r2, : 128]! + vmul.i32 q7, q2, q0 + vld1.8 {d8}, [r2, : 64] + vext.32 d17, d11, d10, #1 + vmul.i32 q9, q3, q0 + vext.32 d16, d10, d8, #1 + vshl.u32 q10, q5, q1 + vext.32 d22, d14, d4, #1 + vext.32 d24, d18, d6, #1 + vshl.u32 q13, q6, q1 + vshl.u32 d28, d8, d2 + vrev64.i32 d22, d22 + vmul.i32 d1, d9, d1 + vrev64.i32 d24, d24 + vext.32 d29, d8, d13, #1 + vext.32 d0, d1, d9, #1 + vrev64.i32 d0, d0 + vext.32 d2, d9, d1, #1 + vext.32 d23, d15, d5, #1 + vmull.s32 q4, d20, d4 + vrev64.i32 d23, d23 + vmlal.s32 q4, d21, d1 + vrev64.i32 d2, d2 + vmlal.s32 q4, d26, d19 + vext.32 d3, d5, d15, #1 + vmlal.s32 q4, d27, d18 + vrev64.i32 d3, d3 + vmlal.s32 q4, d28, d15 + vext.32 d14, d12, d11, #1 + vmull.s32 q5, d16, d23 + vext.32 d15, d13, d12, #1 + vmlal.s32 q5, d17, d4 + vst1.8 d8, [r5, : 64]! + vmlal.s32 q5, d14, d1 + vext.32 d12, d9, d8, #0 + vmlal.s32 q5, d15, d19 + vmov.i64 d13, #0 + vmlal.s32 q5, d29, d18 + vext.32 d25, d19, d7, #1 + vmlal.s32 q6, d20, d5 + vrev64.i32 d25, d25 + vmlal.s32 q6, d21, d4 + vst1.8 d11, [r5, : 64]! + vmlal.s32 q6, d26, d1 + vext.32 d9, d10, d10, #0 + vmlal.s32 q6, d27, d19 + vmov.i64 d8, #0 + vmlal.s32 q6, d28, d18 + vmlal.s32 q4, d16, d24 + vmlal.s32 q4, d17, d5 + vmlal.s32 q4, d14, d4 + vst1.8 d12, [r5, : 64]! + vmlal.s32 q4, d15, d1 + vext.32 d10, d13, d12, #0 + vmlal.s32 q4, d29, d19 + vmov.i64 d11, #0 + vmlal.s32 q5, d20, d6 + vmlal.s32 q5, d21, d5 + vmlal.s32 q5, d26, d4 + vext.32 d13, d8, d8, #0 + vmlal.s32 q5, d27, d1 + vmov.i64 d12, #0 + vmlal.s32 q5, d28, d19 + vst1.8 d9, [r5, : 64]! + vmlal.s32 q6, d16, d25 + vmlal.s32 q6, d17, d6 + vst1.8 d10, [r5, : 64] + vmlal.s32 q6, d14, d5 + vext.32 d8, d11, d10, #0 + vmlal.s32 q6, d15, d4 + vmov.i64 d9, #0 + vmlal.s32 q6, d29, d1 + vmlal.s32 q4, d20, d7 + vmlal.s32 q4, d21, d6 + vmlal.s32 q4, d26, d5 + vext.32 d11, d12, d12, #0 + vmlal.s32 q4, d27, d4 + vmov.i64 d10, #0 + vmlal.s32 q4, d28, d1 + vmlal.s32 q5, d16, d0 + sub r2, r5, #32 + vmlal.s32 q5, d17, d7 + vmlal.s32 q5, d14, d6 + vext.32 d30, d9, d8, #0 + vmlal.s32 q5, d15, d5 + vld1.8 {d31}, [r2, : 64]! + vmlal.s32 q5, d29, d4 + vmlal.s32 q15, d20, d0 + vext.32 d0, d6, d18, #1 + vmlal.s32 q15, d21, d25 + vrev64.i32 d0, d0 + vmlal.s32 q15, d26, d24 + vext.32 d1, d7, d19, #1 + vext.32 d7, d10, d10, #0 + vmlal.s32 q15, d27, d23 + vrev64.i32 d1, d1 + vld1.8 {d6}, [r2, : 64] + vmlal.s32 q15, d28, d22 + vmlal.s32 q3, d16, d4 + add r2, r2, #24 + vmlal.s32 q3, d17, d2 + vext.32 d4, d31, d30, #0 + vmov d17, d11 + vmlal.s32 q3, d14, d1 + vext.32 d11, d13, d13, #0 + vext.32 d13, d30, d30, #0 + vmlal.s32 q3, d15, d0 + vext.32 d1, d8, d8, #0 + vmlal.s32 q3, d29, d3 + vld1.8 {d5}, [r2, : 64] + sub r2, r2, #16 + vext.32 d10, d6, d6, #0 + vmov.i32 q1, #0xffffffff + vshl.i64 q4, q1, #25 + add r5, sp, #480 + vld1.8 {d14-d15}, [r5, : 128] + vadd.i64 q9, q2, q7 + vshl.i64 q1, q1, #26 + vshr.s64 q10, q9, #26 + vld1.8 {d0}, [r2, : 64]! + vadd.i64 q5, q5, q10 + vand q9, q9, q1 + vld1.8 {d16}, [r2, : 64]! + add r2, sp, #496 + vld1.8 {d20-d21}, [r2, : 128] + vadd.i64 q11, q5, q10 + vsub.i64 q2, q2, q9 + vshr.s64 q9, q11, #25 + vext.32 d12, d5, d4, #0 + vand q11, q11, q4 + vadd.i64 q0, q0, q9 + vmov d19, d7 + vadd.i64 q3, q0, q7 + vsub.i64 q5, q5, q11 + vshr.s64 q11, q3, #26 + vext.32 d18, d11, d10, #0 + vand q3, q3, q1 + vadd.i64 q8, q8, q11 + vadd.i64 q11, q8, q10 + vsub.i64 q0, q0, q3 + vshr.s64 q3, q11, #25 + vand q11, q11, q4 + vadd.i64 q3, q6, q3 + vadd.i64 q6, q3, q7 + vsub.i64 q8, q8, q11 + vshr.s64 q11, q6, #26 + vand q6, q6, q1 + vadd.i64 q9, q9, q11 + vadd.i64 d25, d19, d21 + vsub.i64 q3, q3, q6 + vshr.s64 d23, d25, #25 + vand q4, q12, q4 + vadd.i64 d21, d23, d23 + vshl.i64 d25, d23, #4 + vadd.i64 d21, d21, d23 + vadd.i64 d25, d25, d21 + vadd.i64 d4, d4, d25 + vzip.i32 q0, q8 + vadd.i64 d12, d4, d14 + add r2, r6, #8 + vst1.8 d0, [r2, : 64] + vsub.i64 d19, d19, d9 + add r2, r2, #16 + vst1.8 d16, [r2, : 64] + vshr.s64 d22, d12, #26 + vand q0, q6, q1 + vadd.i64 d10, d10, d22 + vzip.i32 q3, q9 + vsub.i64 d4, d4, d0 + sub r2, r2, #8 + vst1.8 d6, [r2, : 64] + add r2, r2, #16 + vst1.8 d18, [r2, : 64] + vzip.i32 q2, q5 + sub r2, r2, #32 + vst1.8 d4, [r2, : 64] + cmp r4, #0 + beq .Lskippostcopy + add r2, r3, #144 + mov r4, r4 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d4}, [r2, : 64] + vst1.8 {d0-d1}, [r4, : 128]! + vst1.8 {d2-d3}, [r4, : 128]! + vst1.8 d4, [r4, : 64] +.Lskippostcopy: + cmp r1, #1 + bne .Lskipfinalcopy + add r2, r3, #288 + add r4, r3, #144 + vld1.8 {d0-d1}, [r2, : 128]! + vld1.8 {d2-d3}, [r2, : 128]! + vld1.8 {d4}, [r2, : 64] + vst1.8 {d0-d1}, [r4, : 128]! + vst1.8 {d2-d3}, [r4, : 128]! + vst1.8 d4, [r4, : 64] +.Lskipfinalcopy: + add r1, r1, #1 + cmp r1, #12 + blo .Linvertloop + add r1, r3, #144 + ldr r2, [r1], #4 + ldr r3, [r1], #4 + ldr r4, [r1], #4 + ldr r5, [r1], #4 + ldr r6, [r1], #4 + ldr r7, [r1], #4 + ldr r8, [r1], #4 + ldr r9, [r1], #4 + ldr r10, [r1], #4 + ldr r1, [r1] + add r11, r1, r1, LSL #4 + add r11, r11, r1, LSL #1 + add r11, r11, #16777216 + mov r11, r11, ASR #25 + add r11, r11, r2 + mov r11, r11, ASR #26 + add r11, r11, r3 + mov r11, r11, ASR #25 + add r11, r11, r4 + mov r11, r11, ASR #26 + add r11, r11, r5 + mov r11, r11, ASR #25 + add r11, r11, r6 + mov r11, r11, ASR #26 + add r11, r11, r7 + mov r11, r11, ASR #25 + add r11, r11, r8 + mov r11, r11, ASR #26 + add r11, r11, r9 + mov r11, r11, ASR #25 + add r11, r11, r10 + mov r11, r11, ASR #26 + add r11, r11, r1 + mov r11, r11, ASR #25 + add r2, r2, r11 + add r2, r2, r11, LSL #1 + add r2, r2, r11, LSL #4 + mov r11, r2, ASR #26 + add r3, r3, r11 + sub r2, r2, r11, LSL #26 + mov r11, r3, ASR #25 + add r4, r4, r11 + sub r3, r3, r11, LSL #25 + mov r11, r4, ASR #26 + add r5, r5, r11 + sub r4, r4, r11, LSL #26 + mov r11, r5, ASR #25 + add r6, r6, r11 + sub r5, r5, r11, LSL #25 + mov r11, r6, ASR #26 + add r7, r7, r11 + sub r6, r6, r11, LSL #26 + mov r11, r7, ASR #25 + add r8, r8, r11 + sub r7, r7, r11, LSL #25 + mov r11, r8, ASR #26 + add r9, r9, r11 + sub r8, r8, r11, LSL #26 + mov r11, r9, ASR #25 + add r10, r10, r11 + sub r9, r9, r11, LSL #25 + mov r11, r10, ASR #26 + add r1, r1, r11 + sub r10, r10, r11, LSL #26 + mov r11, r1, ASR #25 + sub r1, r1, r11, LSL #25 + add r2, r2, r3, LSL #26 + mov r3, r3, LSR #6 + add r3, r3, r4, LSL #19 + mov r4, r4, LSR #13 + add r4, r4, r5, LSL #13 + mov r5, r5, LSR #19 + add r5, r5, r6, LSL #6 + add r6, r7, r8, LSL #25 + mov r7, r8, LSR #7 + add r7, r7, r9, LSL #19 + mov r8, r9, LSR #13 + add r8, r8, r10, LSL #12 + mov r9, r10, LSR #20 + add r1, r9, r1, LSL #6 + str r2, [r0] + str r3, [r0, #4] + str r4, [r0, #8] + str r5, [r0, #12] + str r6, [r0, #16] + str r7, [r0, #20] + str r8, [r0, #24] + str r1, [r0, #28] + movw r0, #0 + mov sp, ip + pop {r4-r11, pc} +ENDPROC(curve25519_neon) diff --git a/lib/crypto/arm/curve25519.h b/lib/crypto/arm/curve25519.h new file mode 100644 index 000000000000..f6d66494eb8f --- /dev/null +++ b/lib/crypto/arm/curve25519.h @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + * + * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This + * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been + * manually reworked for use in kernel space. + */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], + const u8 scalar[CURVE25519_KEY_SIZE], + const u8 point[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&have_neon) && crypto_simd_usable()) { + kernel_neon_begin(); + curve25519_neon(out, scalar, point); + kernel_neon_end(); + } else { + curve25519_generic(out, scalar, point); + } +} + +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + curve25519_arch(pub, secret, curve25519_base_point); +} + +#define curve25519_mod_init_arch curve25519_mod_init_arch +static void curve25519_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) + static_branch_enable(&have_neon); +} diff --git a/lib/crypto/curve25519-generic.c b/lib/crypto/curve25519-generic.c deleted file mode 100644 index f8aa70c9f559..000000000000 --- a/lib/crypto/curve25519-generic.c +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - * - * This is an implementation of the Curve25519 ECDH algorithm, using either - * a 32-bit implementation or a 64-bit implementation with 128-bit integers, - * depending on what is supported by the target compiler. - * - * Information: https://cr.yp.to/ecdh.html - */ - -#include -#include -#include - -const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 }; -const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 }; - -EXPORT_SYMBOL(curve25519_null_point); -EXPORT_SYMBOL(curve25519_base_point); -EXPORT_SYMBOL(curve25519_generic); - -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Curve25519 scalar multiplication"); -MODULE_AUTHOR("Jason A. Donenfeld "); diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c index 1b786389d714..01e265dfbcd9 100644 --- a/lib/crypto/curve25519.c +++ b/lib/crypto/curve25519.c @@ -2,8 +2,9 @@ /* * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. * - * This is an implementation of the Curve25519 ECDH algorithm, using either - * a 32-bit implementation or a 64-bit implementation with 128-bit integers, + * This is an implementation of the Curve25519 ECDH algorithm, using either an + * architecture-optimized implementation or a generic implementation. The + * generic implementation is either 32-bit, or 64-bit with 128-bit integers, * depending on what is supported by the target compiler. * * Information: https://cr.yp.to/ecdh.html @@ -15,15 +16,32 @@ #include #include +static const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 }; +static const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 }; + +#ifdef CONFIG_CRYPTO_LIB_CURVE25519_ARCH +#include "curve25519.h" /* $(SRCARCH)/curve25519.h */ +#else +static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + curve25519_generic(mypublic, secret, basepoint); +} + +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + curve25519_generic(pub, secret, curve25519_base_point); +} +#endif + bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], const u8 basepoint[CURVE25519_KEY_SIZE]) { - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) - curve25519_arch(mypublic, secret, basepoint); - else - curve25519_generic(mypublic, secret, basepoint); + curve25519_arch(mypublic, secret, basepoint); return crypto_memneq(mypublic, curve25519_null_point, CURVE25519_KEY_SIZE); } @@ -36,27 +54,25 @@ curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], if (unlikely(!crypto_memneq(secret, curve25519_null_point, CURVE25519_KEY_SIZE))) return false; - - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) - curve25519_base_arch(pub, secret); - else - curve25519_generic(pub, secret, curve25519_base_point); + curve25519_base_arch(pub, secret); return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE); } EXPORT_SYMBOL(curve25519_generate_public); -static int __init curve25519_init(void) +#ifdef curve25519_mod_init_arch +static int __init curve25519_mod_init(void) { + curve25519_mod_init_arch(); return 0; } +subsys_initcall(curve25519_mod_init); -static void __exit curve25519_exit(void) +static void __exit curve25519_mod_exit(void) { } - -module_init(curve25519_init); -module_exit(curve25519_exit); +module_exit(curve25519_mod_exit); +#endif MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Curve25519 scalar multiplication"); +MODULE_DESCRIPTION("Curve25519 algorithm"); MODULE_AUTHOR("Jason A. Donenfeld "); diff --git a/lib/crypto/powerpc/curve25519-ppc64le_asm.S b/lib/crypto/powerpc/curve25519-ppc64le_asm.S new file mode 100644 index 000000000000..06c1febe24b9 --- /dev/null +++ b/lib/crypto/powerpc/curve25519-ppc64le_asm.S @@ -0,0 +1,671 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# This code is taken from CRYPTOGAMs[1] and is included here using the option +# in the license to distribute the code under the GPL. Therefore this program +# is free software; you can redistribute it and/or modify it under the terms of +# the GNU General Public License version 2 as published by the Free Software +# Foundation. +# +# [1] https://github.com/dot-asm/cryptogams/ + +# Copyright (c) 2006-2017, CRYPTOGAMS by +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain copyright notices, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# * Neither the name of the CRYPTOGAMS nor the names of its +# copyright holder and contributors may be used to endorse or +# promote products derived from this software without specific +# prior written permission. +# +# ALTERNATIVELY, provided that this notice is retained in full, this +# product may be distributed under the terms of the GNU General Public +# License (GPL), in which case the provisions of the GPL apply INSTEAD OF +# those given above. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# +# ==================================================================== +# Written and Modified by Danny Tsen +# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes +# and x25519_cswap +# +# Copyright 2024- IBM Corp. +# +# X25519 lower-level primitives for PPC64. +# + +#include + +.text + +.align 5 +SYM_FUNC_START(x25519_fe51_mul) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + ld 6,0(5) + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + mulld 22,7,6 + mulhdu 23,7,6 + + mulld 24,8,6 + mulhdu 25,8,6 + + mulld 30,11,6 + mulhdu 31,11,6 + ld 4,8(5) + mulli 11,11,19 + + mulld 26,9,6 + mulhdu 27,9,6 + + mulld 28,10,6 + mulhdu 29,10,6 + mulld 12,11,4 + mulhdu 21,11,4 + addc 22,22,12 + adde 23,23,21 + + mulld 12,7,4 + mulhdu 21,7,4 + addc 24,24,12 + adde 25,25,21 + + mulld 12,10,4 + mulhdu 21,10,4 + ld 6,16(5) + mulli 10,10,19 + addc 30,30,12 + adde 31,31,21 + + mulld 12,8,4 + mulhdu 21,8,4 + addc 26,26,12 + adde 27,27,21 + + mulld 12,9,4 + mulhdu 21,9,4 + addc 28,28,12 + adde 29,29,21 + mulld 12,10,6 + mulhdu 21,10,6 + addc 22,22,12 + adde 23,23,21 + + mulld 12,11,6 + mulhdu 21,11,6 + addc 24,24,12 + adde 25,25,21 + + mulld 12,9,6 + mulhdu 21,9,6 + ld 4,24(5) + mulli 9,9,19 + addc 30,30,12 + adde 31,31,21 + + mulld 12,7,6 + mulhdu 21,7,6 + addc 26,26,12 + adde 27,27,21 + + mulld 12,8,6 + mulhdu 21,8,6 + addc 28,28,12 + adde 29,29,21 + mulld 12,9,4 + mulhdu 21,9,4 + addc 22,22,12 + adde 23,23,21 + + mulld 12,10,4 + mulhdu 21,10,4 + addc 24,24,12 + adde 25,25,21 + + mulld 12,8,4 + mulhdu 21,8,4 + ld 6,32(5) + mulli 8,8,19 + addc 30,30,12 + adde 31,31,21 + + mulld 12,11,4 + mulhdu 21,11,4 + addc 26,26,12 + adde 27,27,21 + + mulld 12,7,4 + mulhdu 21,7,4 + addc 28,28,12 + adde 29,29,21 + mulld 12,8,6 + mulhdu 21,8,6 + addc 22,22,12 + adde 23,23,21 + + mulld 12,9,6 + mulhdu 21,9,6 + addc 24,24,12 + adde 25,25,21 + + mulld 12,10,6 + mulhdu 21,10,6 + addc 26,26,12 + adde 27,27,21 + + mulld 12,11,6 + mulhdu 21,11,6 + addc 28,28,12 + adde 29,29,21 + + mulld 12,7,6 + mulhdu 21,7,6 + addc 30,30,12 + adde 31,31,21 + +.Lfe51_reduce: + li 0,-1 + srdi 0,0,13 + + srdi 12,26,51 + and 9,26,0 + insrdi 12,27,51,0 + srdi 21,22,51 + and 7,22,0 + insrdi 21,23,51,0 + addc 28,28,12 + addze 29,29 + addc 24,24,21 + addze 25,25 + + srdi 12,28,51 + and 10,28,0 + insrdi 12,29,51,0 + srdi 21,24,51 + and 8,24,0 + insrdi 21,25,51,0 + addc 30,30,12 + addze 31,31 + add 9,9,21 + + srdi 12,30,51 + and 11,30,0 + insrdi 12,31,51,0 + mulli 12,12,19 + + add 7,7,12 + + srdi 21,9,51 + and 9,9,0 + add 10,10,21 + + srdi 12,7,51 + and 7,7,0 + add 8,8,12 + + std 9,16(3) + std 10,24(3) + std 11,32(3) + std 7,0(3) + std 8,8(3) + + ld 21,56(1) + ld 22,64(1) + ld 23,72(1) + ld 24,80(1) + ld 25,88(1) + ld 26,96(1) + ld 27,104(1) + ld 28,112(1) + ld 29,120(1) + ld 30,128(1) + ld 31,136(1) + addi 1,1,144 + blr +SYM_FUNC_END(x25519_fe51_mul) + +.align 5 +SYM_FUNC_START(x25519_fe51_sqr) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + add 6,7,7 + mulli 21,11,19 + + mulld 22,7,7 + mulhdu 23,7,7 + mulld 24,8,6 + mulhdu 25,8,6 + mulld 26,9,6 + mulhdu 27,9,6 + mulld 28,10,6 + mulhdu 29,10,6 + mulld 30,11,6 + mulhdu 31,11,6 + add 6,8,8 + mulld 12,11,21 + mulhdu 11,11,21 + addc 28,28,12 + adde 29,29,11 + + mulli 5,10,19 + + mulld 12,8,8 + mulhdu 11,8,8 + addc 26,26,12 + adde 27,27,11 + mulld 12,9,6 + mulhdu 11,9,6 + addc 28,28,12 + adde 29,29,11 + mulld 12,10,6 + mulhdu 11,10,6 + addc 30,30,12 + adde 31,31,11 + mulld 12,21,6 + mulhdu 11,21,6 + add 6,10,10 + addc 22,22,12 + adde 23,23,11 + mulld 12,10,5 + mulhdu 10,10,5 + addc 24,24,12 + adde 25,25,10 + mulld 12,6,21 + mulhdu 10,6,21 + add 6,9,9 + addc 26,26,12 + adde 27,27,10 + + mulld 12,9,9 + mulhdu 10,9,9 + addc 30,30,12 + adde 31,31,10 + mulld 12,5,6 + mulhdu 10,5,6 + addc 22,22,12 + adde 23,23,10 + mulld 12,21,6 + mulhdu 10,21,6 + addc 24,24,12 + adde 25,25,10 + + b .Lfe51_reduce +SYM_FUNC_END(x25519_fe51_sqr) + +.align 5 +SYM_FUNC_START(x25519_fe51_mul121666) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + lis 6,1 + ori 6,6,56130 + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + mulld 22,7,6 + mulhdu 23,7,6 + mulld 24,8,6 + mulhdu 25,8,6 + mulld 26,9,6 + mulhdu 27,9,6 + mulld 28,10,6 + mulhdu 29,10,6 + mulld 30,11,6 + mulhdu 31,11,6 + + b .Lfe51_reduce +SYM_FUNC_END(x25519_fe51_mul121666) + +.align 5 +SYM_FUNC_START(x25519_fe51_sqr_times) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + mtctr 5 + +.Lsqr_times_loop: + add 6,7,7 + mulli 21,11,19 + + mulld 22,7,7 + mulhdu 23,7,7 + mulld 24,8,6 + mulhdu 25,8,6 + mulld 26,9,6 + mulhdu 27,9,6 + mulld 28,10,6 + mulhdu 29,10,6 + mulld 30,11,6 + mulhdu 31,11,6 + add 6,8,8 + mulld 12,11,21 + mulhdu 11,11,21 + addc 28,28,12 + adde 29,29,11 + + mulli 5,10,19 + + mulld 12,8,8 + mulhdu 11,8,8 + addc 26,26,12 + adde 27,27,11 + mulld 12,9,6 + mulhdu 11,9,6 + addc 28,28,12 + adde 29,29,11 + mulld 12,10,6 + mulhdu 11,10,6 + addc 30,30,12 + adde 31,31,11 + mulld 12,21,6 + mulhdu 11,21,6 + add 6,10,10 + addc 22,22,12 + adde 23,23,11 + mulld 12,10,5 + mulhdu 10,10,5 + addc 24,24,12 + adde 25,25,10 + mulld 12,6,21 + mulhdu 10,6,21 + add 6,9,9 + addc 26,26,12 + adde 27,27,10 + + mulld 12,9,9 + mulhdu 10,9,9 + addc 30,30,12 + adde 31,31,10 + mulld 12,5,6 + mulhdu 10,5,6 + addc 22,22,12 + adde 23,23,10 + mulld 12,21,6 + mulhdu 10,21,6 + addc 24,24,12 + adde 25,25,10 + + # fe51_reduce + li 0,-1 + srdi 0,0,13 + + srdi 12,26,51 + and 9,26,0 + insrdi 12,27,51,0 + srdi 21,22,51 + and 7,22,0 + insrdi 21,23,51,0 + addc 28,28,12 + addze 29,29 + addc 24,24,21 + addze 25,25 + + srdi 12,28,51 + and 10,28,0 + insrdi 12,29,51,0 + srdi 21,24,51 + and 8,24,0 + insrdi 21,25,51,0 + addc 30,30,12 + addze 31,31 + add 9,9,21 + + srdi 12,30,51 + and 11,30,0 + insrdi 12,31,51,0 + mulli 12,12,19 + + add 7,7,12 + + srdi 21,9,51 + and 9,9,0 + add 10,10,21 + + srdi 12,7,51 + and 7,7,0 + add 8,8,12 + + bdnz .Lsqr_times_loop + + std 9,16(3) + std 10,24(3) + std 11,32(3) + std 7,0(3) + std 8,8(3) + + ld 21,56(1) + ld 22,64(1) + ld 23,72(1) + ld 24,80(1) + ld 25,88(1) + ld 26,96(1) + ld 27,104(1) + ld 28,112(1) + ld 29,120(1) + ld 30,128(1) + ld 31,136(1) + addi 1,1,144 + blr +SYM_FUNC_END(x25519_fe51_sqr_times) + +.align 5 +SYM_FUNC_START(x25519_fe51_frombytes) + + li 12, -1 + srdi 12, 12, 13 # 0x7ffffffffffff + + ld 5, 0(4) + ld 6, 8(4) + ld 7, 16(4) + ld 8, 24(4) + + srdi 10, 5, 51 + and 5, 5, 12 # h0 + + sldi 11, 6, 13 + or 11, 10, 11 # h1t + srdi 10, 6, 38 + and 6, 11, 12 # h1 + + sldi 11, 7, 26 + or 10, 10, 11 # h2t + + srdi 11, 7, 25 + and 7, 10, 12 # h2 + sldi 10, 8, 39 + or 11, 11, 10 # h3t + + srdi 9, 8, 12 + and 8, 11, 12 # h3 + and 9, 9, 12 # h4 + + std 5, 0(3) + std 6, 8(3) + std 7, 16(3) + std 8, 24(3) + std 9, 32(3) + + blr +SYM_FUNC_END(x25519_fe51_frombytes) + +.align 5 +SYM_FUNC_START(x25519_fe51_tobytes) + + ld 5, 0(4) + ld 6, 8(4) + ld 7, 16(4) + ld 8, 24(4) + ld 9, 32(4) + + li 12, -1 + srdi 12, 12, 13 # 0x7ffffffffffff + + # Full reducuction + addi 10, 5, 19 + srdi 10, 10, 51 + add 10, 10, 6 + srdi 10, 10, 51 + add 10, 10, 7 + srdi 10, 10, 51 + add 10, 10, 8 + srdi 10, 10, 51 + add 10, 10, 9 + srdi 10, 10, 51 + + mulli 10, 10, 19 + add 5, 5, 10 + srdi 11, 5, 51 + add 6, 6, 11 + srdi 11, 6, 51 + add 7, 7, 11 + srdi 11, 7, 51 + add 8, 8, 11 + srdi 11, 8, 51 + add 9, 9, 11 + + and 5, 5, 12 + and 6, 6, 12 + and 7, 7, 12 + and 8, 8, 12 + and 9, 9, 12 + + sldi 10, 6, 51 + or 5, 5, 10 # s0 + + srdi 11, 6, 13 + sldi 10, 7, 38 + or 6, 11, 10 # s1 + + srdi 11, 7, 26 + sldi 10, 8, 25 + or 7, 11, 10 # s2 + + srdi 11, 8, 39 + sldi 10, 9, 12 + or 8, 11, 10 # s4 + + std 5, 0(3) + std 6, 8(3) + std 7, 16(3) + std 8, 24(3) + + blr +SYM_FUNC_END(x25519_fe51_tobytes) + +.align 5 +SYM_FUNC_START(x25519_cswap) + + li 7, 5 + neg 6, 5 + mtctr 7 + +.Lswap_loop: + ld 8, 0(3) + ld 9, 0(4) + xor 10, 8, 9 + and 10, 10, 6 + xor 11, 8, 10 + xor 12, 9, 10 + std 11, 0(3) + addi 3, 3, 8 + std 12, 0(4) + addi 4, 4, 8 + bdnz .Lswap_loop + + blr +SYM_FUNC_END(x25519_cswap) diff --git a/lib/crypto/powerpc/curve25519.h b/lib/crypto/powerpc/curve25519.h new file mode 100644 index 000000000000..dee6234c48e9 --- /dev/null +++ b/lib/crypto/powerpc/curve25519.h @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2024- IBM Corp. + * + * X25519 scalar multiplication with 51 bits limbs for PPC64le. + * Based on RFC7748 and AArch64 optimized implementation for X25519 + * - Algorithm 1 Scalar multiplication of a variable point + */ + +#include +#include +#include + +#include +#include + +typedef uint64_t fe51[5]; + +asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g); +asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f); +asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f); +asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n); +asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s); +asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h); +asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit); + +#define fmul x25519_fe51_mul +#define fsqr x25519_fe51_sqr +#define fmul121666 x25519_fe51_mul121666 +#define fe51_tobytes x25519_fe51_tobytes + +static void fadd(fe51 h, const fe51 f, const fe51 g) +{ + h[0] = f[0] + g[0]; + h[1] = f[1] + g[1]; + h[2] = f[2] + g[2]; + h[3] = f[3] + g[3]; + h[4] = f[4] + g[4]; +} + +/* + * Prime = 2 ** 255 - 19, 255 bits + * (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed) + * + * Prime in 5 51-bit limbs + */ +static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff}; + +static void fsub(fe51 h, const fe51 f, const fe51 g) +{ + h[0] = (f[0] + ((prime51[0] * 2))) - g[0]; + h[1] = (f[1] + ((prime51[1] * 2))) - g[1]; + h[2] = (f[2] + ((prime51[2] * 2))) - g[2]; + h[3] = (f[3] + ((prime51[3] * 2))) - g[3]; + h[4] = (f[4] + ((prime51[4] * 2))) - g[4]; +} + +static void fe51_frombytes(fe51 h, const uint8_t *s) +{ + /* + * Make sure 64-bit aligned. + */ + unsigned char sbuf[32+8]; + unsigned char *sb = PTR_ALIGN((void *)sbuf, 8); + + memcpy(sb, s, 32); + x25519_fe51_frombytes(h, sb); +} + +static void finv(fe51 o, const fe51 i) +{ + fe51 a0, b, c, t00; + + fsqr(a0, i); + x25519_fe51_sqr_times(t00, a0, 2); + + fmul(b, t00, i); + fmul(a0, b, a0); + + fsqr(t00, a0); + + fmul(b, t00, b); + x25519_fe51_sqr_times(t00, b, 5); + + fmul(b, t00, b); + x25519_fe51_sqr_times(t00, b, 10); + + fmul(c, t00, b); + x25519_fe51_sqr_times(t00, c, 20); + + fmul(t00, t00, c); + x25519_fe51_sqr_times(t00, t00, 10); + + fmul(b, t00, b); + x25519_fe51_sqr_times(t00, b, 50); + + fmul(c, t00, b); + x25519_fe51_sqr_times(t00, c, 100); + + fmul(t00, t00, c); + x25519_fe51_sqr_times(t00, t00, 50); + + fmul(t00, t00, b); + x25519_fe51_sqr_times(t00, t00, 5); + + fmul(o, t00, a0); +} + +static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) +{ + fe51 x1, x2, z2, x3, z3; + uint8_t s[32]; + unsigned int swap = 0; + int i; + + memcpy(s, scalar, 32); + s[0] &= 0xf8; + s[31] &= 0x7f; + s[31] |= 0x40; + fe51_frombytes(x1, point); + + z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0; + x3[0] = x1[0]; + x3[1] = x1[1]; + x3[2] = x1[2]; + x3[3] = x1[3]; + x3[4] = x1[4]; + + x2[0] = z3[0] = 1; + x2[1] = z3[1] = 0; + x2[2] = z3[2] = 0; + x2[3] = z3[3] = 0; + x2[4] = z3[4] = 0; + + for (i = 254; i >= 0; --i) { + unsigned int k_t = 1 & (s[i / 8] >> (i & 7)); + fe51 a, b, c, d, e; + fe51 da, cb, aa, bb; + fe51 dacb_p, dacb_m; + + swap ^= k_t; + x25519_cswap(x2, x3, swap); + x25519_cswap(z2, z3, swap); + swap = k_t; + + fsub(b, x2, z2); // B = x_2 - z_2 + fadd(a, x2, z2); // A = x_2 + z_2 + fsub(d, x3, z3); // D = x_3 - z_3 + fadd(c, x3, z3); // C = x_3 + z_3 + + fsqr(bb, b); // BB = B^2 + fsqr(aa, a); // AA = A^2 + fmul(da, d, a); // DA = D * A + fmul(cb, c, b); // CB = C * B + + fsub(e, aa, bb); // E = AA - BB + fmul(x2, aa, bb); // x2 = AA * BB + fadd(dacb_p, da, cb); // DA + CB + fsub(dacb_m, da, cb); // DA - CB + + fmul121666(z3, e); // 121666 * E + fsqr(z2, dacb_m); // (DA - CB)^2 + fsqr(x3, dacb_p); // x3 = (DA + CB)^2 + fadd(b, bb, z3); // BB + 121666 * E + fmul(z3, x1, z2); // z3 = x1 * (DA - CB)^2 + fmul(z2, e, b); // z2 = e * (BB + (DA + CB)^2) + } + + finv(z2, z2); + fmul(x2, x2, z2); + fe51_tobytes(out, x2); +} + +static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + curve25519_fe51(mypublic, secret, basepoint); +} + +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + curve25519_fe51(pub, secret, curve25519_base_point); +} diff --git a/lib/crypto/x86/curve25519.h b/lib/crypto/x86/curve25519.h new file mode 100644 index 000000000000..5c0b8408852d --- /dev/null +++ b/lib/crypto/x86/curve25519.h @@ -0,0 +1,1613 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2020 Jason A. Donenfeld . All Rights Reserved. + * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation + */ + +#include +#include +#include + +#include +#include + +static __always_inline u64 eq_mask(u64 a, u64 b) +{ + u64 x = a ^ b; + u64 minus_x = ~x + (u64)1U; + u64 x_or_minus_x = x | minus_x; + u64 xnx = x_or_minus_x >> (u32)63U; + return xnx - (u64)1U; +} + +static __always_inline u64 gte_mask(u64 a, u64 b) +{ + u64 x = a; + u64 y = b; + u64 x_xor_y = x ^ y; + u64 x_sub_y = x - y; + u64 x_sub_y_xor_y = x_sub_y ^ y; + u64 q = x_xor_y | x_sub_y_xor_y; + u64 x_xor_q = x ^ q; + u64 x_xor_q_ = x_xor_q >> (u32)63U; + return x_xor_q_ - (u64)1U; +} + +/* Computes the addition of four-element f1 with value in f2 + * and returns the carry (if any) */ +static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2) +{ + u64 carry_r; + + asm volatile( + /* Clear registers to propagate the carry bit */ + " xor %%r8d, %%r8d;" + " xor %%r9d, %%r9d;" + " xor %%r10d, %%r10d;" + " xor %%r11d, %%r11d;" + " xor %k1, %k1;" + + /* Begin addition chain */ + " addq 0(%3), %0;" + " movq %0, 0(%2);" + " adcxq 8(%3), %%r8;" + " movq %%r8, 8(%2);" + " adcxq 16(%3), %%r9;" + " movq %%r9, 16(%2);" + " adcxq 24(%3), %%r10;" + " movq %%r10, 24(%2);" + + /* Return the carry bit in a register */ + " adcx %%r11, %1;" + : "+&r"(f2), "=&r"(carry_r) + : "r"(out), "r"(f1) + : "%r8", "%r9", "%r10", "%r11", "memory", "cc"); + + return carry_r; +} + +/* Computes the field addition of two field elements */ +static inline void fadd(u64 *out, const u64 *f1, const u64 *f2) +{ + asm volatile( + /* Compute the raw addition of f1 + f2 */ + " movq 0(%0), %%r8;" + " addq 0(%2), %%r8;" + " movq 8(%0), %%r9;" + " adcxq 8(%2), %%r9;" + " movq 16(%0), %%r10;" + " adcxq 16(%2), %%r10;" + " movq 24(%0), %%r11;" + " adcxq 24(%2), %%r11;" + + /* Wrap the result back into the field */ + + /* Step 1: Compute carry*38 */ + " mov $0, %%rax;" + " mov $38, %0;" + " cmovc %0, %%rax;" + + /* Step 2: Add carry*38 to the original sum */ + " xor %%ecx, %%ecx;" + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%1);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%1);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %0, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%1);" + : "+&r"(f2) + : "r"(out), "r"(f1) + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"); +} + +/* Computes the field subtraction of two field elements */ +static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) +{ + asm volatile( + /* Compute the raw subtraction of f1-f2 */ + " movq 0(%1), %%r8;" + " subq 0(%2), %%r8;" + " movq 8(%1), %%r9;" + " sbbq 8(%2), %%r9;" + " movq 16(%1), %%r10;" + " sbbq 16(%2), %%r10;" + " movq 24(%1), %%r11;" + " sbbq 24(%2), %%r11;" + + /* Wrap the result back into the field */ + + /* Step 1: Compute carry*38 */ + " mov $0, %%rax;" + " mov $38, %%rcx;" + " cmovc %%rcx, %%rax;" + + /* Step 2: Subtract carry*38 from the original difference */ + " sub %%rax, %%r8;" + " sbb $0, %%r9;" + " sbb $0, %%r10;" + " sbb $0, %%r11;" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rcx, %%rax;" + " sub %%rax, %%r8;" + + /* Store the result */ + " movq %%r8, 0(%0);" + " movq %%r9, 8(%0);" + " movq %%r10, 16(%0);" + " movq %%r11, 24(%0);" + : + : "r"(out), "r"(f1), "r"(f2) + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"); +} + +/* Computes a field multiplication: out <- f1 * f2 + * Uses the 8-element buffer tmp for intermediate results */ +static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) +{ + asm volatile( + + /* Compute the raw multiplication: tmp <- src1 * src2 */ + + /* Compute src1[0] * src2 */ + " movq 0(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " movq %%r8, 0(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 8(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + + /* Compute src1[1] * src2 */ + " movq 8(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 8(%2), %%r8;" + " movq %%r8, 8(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 16(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[2] * src2 */ + " movq 16(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 16(%2), %%r8;" + " movq %%r8, 16(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 24(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[3] * src2 */ + " movq 24(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 24(%2), %%r8;" + " movq %%r8, 24(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 32(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " movq %%rbx, 40(%2);" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 48(%2);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 56(%2);" + + /* Line up pointers */ + " mov %2, %0;" + " mov %3, %2;" + + /* Wrap the result back into the field */ + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 32(%0), %%r8, %%r13;" + " xor %k1, %k1;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%0), %%r11;" + " adcx %1, %%rax;" + " adox %1, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %1, %%r9;" + " movq %%r9, 8(%2);" + " adcx %1, %%r10;" + " movq %%r10, 16(%2);" + " adcx %1, %%r11;" + " movq %%r11, 24(%2);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%2);" + : "+&r"(f1), "+&r"(f2), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", + "%r14", "memory", "cc"); +} + +/* Computes two field multiplications: + * out[0] <- f1[0] * f2[0] + * out[1] <- f1[1] * f2[1] + * Uses the 16-element buffer tmp for intermediate results: */ +static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) +{ + asm volatile( + + /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */ + + /* Compute src1[0] * src2 */ + " movq 0(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " movq %%r8, 0(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 8(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + + /* Compute src1[1] * src2 */ + " movq 8(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 8(%2), %%r8;" + " movq %%r8, 8(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 16(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[2] * src2 */ + " movq 16(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 16(%2), %%r8;" + " movq %%r8, 16(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 24(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[3] * src2 */ + " movq 24(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 24(%2), %%r8;" + " movq %%r8, 24(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 32(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " movq %%rbx, 40(%2);" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 48(%2);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 56(%2);" + + /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */ + + /* Compute src1[0] * src2 */ + " movq 32(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " movq %%r8, 64(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 72(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + + /* Compute src1[1] * src2 */ + " movq 40(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 72(%2), %%r8;" + " movq %%r8, 72(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 80(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[2] * src2 */ + " movq 48(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 80(%2), %%r8;" + " movq %%r8, 80(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 88(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[3] * src2 */ + " movq 56(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 88(%2), %%r8;" + " movq %%r8, 88(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 96(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " movq %%rbx, 104(%2);" + " mov $0, %%r8;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 112(%2);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 120(%2);" + + /* Line up pointers */ + " mov %2, %0;" + " mov %3, %2;" + + /* Wrap the results back into the field */ + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 32(%0), %%r8, %%r13;" + " xor %k1, %k1;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%0), %%r11;" + " adcx %1, %%rax;" + " adox %1, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %1, %%r9;" + " movq %%r9, 8(%2);" + " adcx %1, %%r10;" + " movq %%r10, 16(%2);" + " adcx %1, %%r11;" + " movq %%r11, 24(%2);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%2);" + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 96(%0), %%r8, %%r13;" + " xor %k1, %k1;" + " adoxq 64(%0), %%r8;" + " mulxq 104(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 72(%0), %%r9;" + " mulxq 112(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 80(%0), %%r10;" + " mulxq 120(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 88(%0), %%r11;" + " adcx %1, %%rax;" + " adox %1, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %1, %%r9;" + " movq %%r9, 40(%2);" + " adcx %1, %%r10;" + " movq %%r10, 48(%2);" + " adcx %1, %%r11;" + " movq %%r11, 56(%2);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 32(%2);" + : "+&r"(f1), "+&r"(f2), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", + "%r14", "memory", "cc"); +} + +/* Computes the field multiplication of four-element f1 with value in f2 + * Requires f2 to be smaller than 2^17 */ +static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2) +{ + register u64 f2_r asm("rdx") = f2; + + asm volatile( + /* Compute the raw multiplication of f1*f2 */ + " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */ + " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */ + " add %%rcx, %%r9;" + " mov $0, %%rcx;" + " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */ + " adcx %%rbx, %%r10;" + " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */ + " adcx %%r13, %%r11;" + " adcx %%rcx, %%rax;" + + /* Wrap the result back into the field */ + + /* Step 1: Compute carry*38 */ + " mov $38, %%rdx;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%1);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%1);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%1);" + : "+&r"(f2_r) + : "r"(out), "r"(f1) + : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13", + "memory", "cc"); +} + +/* Computes p1 <- bit ? p2 : p1 in constant time */ +static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2) +{ + asm volatile( + /* Transfer bit into CF flag */ + " add $18446744073709551615, %0;" + + /* cswap p1[0], p2[0] */ + " movq 0(%1), %%r8;" + " movq 0(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 0(%1);" + " movq %%r9, 0(%2);" + + /* cswap p1[1], p2[1] */ + " movq 8(%1), %%r8;" + " movq 8(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 8(%1);" + " movq %%r9, 8(%2);" + + /* cswap p1[2], p2[2] */ + " movq 16(%1), %%r8;" + " movq 16(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 16(%1);" + " movq %%r9, 16(%2);" + + /* cswap p1[3], p2[3] */ + " movq 24(%1), %%r8;" + " movq 24(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 24(%1);" + " movq %%r9, 24(%2);" + + /* cswap p1[4], p2[4] */ + " movq 32(%1), %%r8;" + " movq 32(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 32(%1);" + " movq %%r9, 32(%2);" + + /* cswap p1[5], p2[5] */ + " movq 40(%1), %%r8;" + " movq 40(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 40(%1);" + " movq %%r9, 40(%2);" + + /* cswap p1[6], p2[6] */ + " movq 48(%1), %%r8;" + " movq 48(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 48(%1);" + " movq %%r9, 48(%2);" + + /* cswap p1[7], p2[7] */ + " movq 56(%1), %%r8;" + " movq 56(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 56(%1);" + " movq %%r9, 56(%2);" + : "+&r"(bit) + : "r"(p1), "r"(p2) + : "%r8", "%r9", "%r10", "memory", "cc"); +} + +/* Computes the square of a field element: out <- f * f + * Uses the 8-element buffer tmp for intermediate results */ +static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) +{ + asm volatile( + /* Compute the raw multiplication: tmp <- f * f */ + + /* Step 1: Compute all partial products */ + " movq 0(%0), %%rdx;" /* f[0] */ + " mulxq 8(%0), %%r8, %%r14;" + " xor %%r15d, %%r15d;" /* f[1]*f[0] */ + " mulxq 16(%0), %%r9, %%r10;" + " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 24(%0), %%rax, %%rcx;" + " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 24(%0), %%rdx;" /* f[3] */ + " mulxq 8(%0), %%r11, %%rbx;" + " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 16(%0), %%rax, %%r13;" + " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 8(%0), %%rdx;" + " adcx %%r15, %%r13;" /* f1 */ + " mulxq 16(%0), %%rax, %%rcx;" + " mov $0, %%r14;" /* f[2]*f[1] */ + + /* Step 2: Compute two parallel carry chains */ + " xor %%r15d, %%r15d;" + " adox %%rax, %%r10;" + " adcx %%r8, %%r8;" + " adox %%rcx, %%r11;" + " adcx %%r9, %%r9;" + " adox %%r15, %%rbx;" + " adcx %%r10, %%r10;" + " adox %%r15, %%r13;" + " adcx %%r11, %%r11;" + " adox %%r15, %%r14;" + " adcx %%rbx, %%rbx;" + " adcx %%r13, %%r13;" + " adcx %%r14, %%r14;" + + /* Step 3: Compute intermediate squares */ + " movq 0(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ + " movq %%rax, 0(%1);" + " add %%rcx, %%r8;" + " movq %%r8, 8(%1);" + " movq 8(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ + " adcx %%rax, %%r9;" + " movq %%r9, 16(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 24(%1);" + " movq 16(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ + " adcx %%rax, %%r11;" + " movq %%r11, 32(%1);" + " adcx %%rcx, %%rbx;" + " movq %%rbx, 40(%1);" + " movq 24(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ + " adcx %%rax, %%r13;" + " movq %%r13, 48(%1);" + " adcx %%rcx, %%r14;" + " movq %%r14, 56(%1);" + + /* Line up pointers */ + " mov %1, %0;" + " mov %2, %1;" + + /* Wrap the result back into the field */ + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 32(%0), %%r8, %%r13;" + " xor %%ecx, %%ecx;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%0), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%1);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%1);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%1);" + : "+&r"(f), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", + "%r13", "%r14", "%r15", "memory", "cc"); +} + +/* Computes two field squarings: + * out[0] <- f[0] * f[0] + * out[1] <- f[1] * f[1] + * Uses the 16-element buffer tmp for intermediate results */ +static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) +{ + asm volatile( + /* Step 1: Compute all partial products */ + " movq 0(%0), %%rdx;" /* f[0] */ + " mulxq 8(%0), %%r8, %%r14;" + " xor %%r15d, %%r15d;" /* f[1]*f[0] */ + " mulxq 16(%0), %%r9, %%r10;" + " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 24(%0), %%rax, %%rcx;" + " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 24(%0), %%rdx;" /* f[3] */ + " mulxq 8(%0), %%r11, %%rbx;" + " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 16(%0), %%rax, %%r13;" + " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 8(%0), %%rdx;" + " adcx %%r15, %%r13;" /* f1 */ + " mulxq 16(%0), %%rax, %%rcx;" + " mov $0, %%r14;" /* f[2]*f[1] */ + + /* Step 2: Compute two parallel carry chains */ + " xor %%r15d, %%r15d;" + " adox %%rax, %%r10;" + " adcx %%r8, %%r8;" + " adox %%rcx, %%r11;" + " adcx %%r9, %%r9;" + " adox %%r15, %%rbx;" + " adcx %%r10, %%r10;" + " adox %%r15, %%r13;" + " adcx %%r11, %%r11;" + " adox %%r15, %%r14;" + " adcx %%rbx, %%rbx;" + " adcx %%r13, %%r13;" + " adcx %%r14, %%r14;" + + /* Step 3: Compute intermediate squares */ + " movq 0(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ + " movq %%rax, 0(%1);" + " add %%rcx, %%r8;" + " movq %%r8, 8(%1);" + " movq 8(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ + " adcx %%rax, %%r9;" + " movq %%r9, 16(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 24(%1);" + " movq 16(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ + " adcx %%rax, %%r11;" + " movq %%r11, 32(%1);" + " adcx %%rcx, %%rbx;" + " movq %%rbx, 40(%1);" + " movq 24(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ + " adcx %%rax, %%r13;" + " movq %%r13, 48(%1);" + " adcx %%rcx, %%r14;" + " movq %%r14, 56(%1);" + + /* Step 1: Compute all partial products */ + " movq 32(%0), %%rdx;" /* f[0] */ + " mulxq 40(%0), %%r8, %%r14;" + " xor %%r15d, %%r15d;" /* f[1]*f[0] */ + " mulxq 48(%0), %%r9, %%r10;" + " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 56(%0), %%rax, %%rcx;" + " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 56(%0), %%rdx;" /* f[3] */ + " mulxq 40(%0), %%r11, %%rbx;" + " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 48(%0), %%rax, %%r13;" + " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 40(%0), %%rdx;" + " adcx %%r15, %%r13;" /* f1 */ + " mulxq 48(%0), %%rax, %%rcx;" + " mov $0, %%r14;" /* f[2]*f[1] */ + + /* Step 2: Compute two parallel carry chains */ + " xor %%r15d, %%r15d;" + " adox %%rax, %%r10;" + " adcx %%r8, %%r8;" + " adox %%rcx, %%r11;" + " adcx %%r9, %%r9;" + " adox %%r15, %%rbx;" + " adcx %%r10, %%r10;" + " adox %%r15, %%r13;" + " adcx %%r11, %%r11;" + " adox %%r15, %%r14;" + " adcx %%rbx, %%rbx;" + " adcx %%r13, %%r13;" + " adcx %%r14, %%r14;" + + /* Step 3: Compute intermediate squares */ + " movq 32(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ + " movq %%rax, 64(%1);" + " add %%rcx, %%r8;" + " movq %%r8, 72(%1);" + " movq 40(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ + " adcx %%rax, %%r9;" + " movq %%r9, 80(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 88(%1);" + " movq 48(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ + " adcx %%rax, %%r11;" + " movq %%r11, 96(%1);" + " adcx %%rcx, %%rbx;" + " movq %%rbx, 104(%1);" + " movq 56(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ + " adcx %%rax, %%r13;" + " movq %%r13, 112(%1);" + " adcx %%rcx, %%r14;" + " movq %%r14, 120(%1);" + + /* Line up pointers */ + " mov %1, %0;" + " mov %2, %1;" + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 32(%0), %%r8, %%r13;" + " xor %%ecx, %%ecx;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%0), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%1);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%1);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%1);" + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 96(%0), %%r8, %%r13;" + " xor %%ecx, %%ecx;" + " adoxq 64(%0), %%r8;" + " mulxq 104(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 72(%0), %%r9;" + " mulxq 112(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 80(%0), %%r10;" + " mulxq 120(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 88(%0), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 40(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 48(%1);" + " adcx %%rcx, %%r11;" + " movq %%r11, 56(%1);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 32(%1);" + : "+&r"(f), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", + "%r13", "%r14", "%r15", "memory", "cc"); +} + +static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2) +{ + u64 *nq = p01_tmp1; + u64 *nq_p1 = p01_tmp1 + (u32)8U; + u64 *tmp1 = p01_tmp1 + (u32)16U; + u64 *x1 = q; + u64 *x2 = nq; + u64 *z2 = nq + (u32)4U; + u64 *z3 = nq_p1 + (u32)4U; + u64 *a = tmp1; + u64 *b = tmp1 + (u32)4U; + u64 *ab = tmp1; + u64 *dc = tmp1 + (u32)8U; + u64 *x3; + u64 *z31; + u64 *d0; + u64 *c0; + u64 *a1; + u64 *b1; + u64 *d; + u64 *c; + u64 *ab1; + u64 *dc1; + fadd(a, x2, z2); + fsub(b, x2, z2); + x3 = nq_p1; + z31 = nq_p1 + (u32)4U; + d0 = dc; + c0 = dc + (u32)4U; + fadd(c0, x3, z31); + fsub(d0, x3, z31); + fmul2(dc, dc, ab, tmp2); + fadd(x3, d0, c0); + fsub(z31, d0, c0); + a1 = tmp1; + b1 = tmp1 + (u32)4U; + d = tmp1 + (u32)8U; + c = tmp1 + (u32)12U; + ab1 = tmp1; + dc1 = tmp1 + (u32)8U; + fsqr2(dc1, ab1, tmp2); + fsqr2(nq_p1, nq_p1, tmp2); + a1[0U] = c[0U]; + a1[1U] = c[1U]; + a1[2U] = c[2U]; + a1[3U] = c[3U]; + fsub(c, d, c); + fmul_scalar(b1, c, (u64)121665U); + fadd(b1, b1, d); + fmul2(nq, dc1, ab1, tmp2); + fmul(z3, z3, x1, tmp2); +} + +static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2) +{ + u64 *x2 = nq; + u64 *z2 = nq + (u32)4U; + u64 *a = tmp1; + u64 *b = tmp1 + (u32)4U; + u64 *d = tmp1 + (u32)8U; + u64 *c = tmp1 + (u32)12U; + u64 *ab = tmp1; + u64 *dc = tmp1 + (u32)8U; + fadd(a, x2, z2); + fsub(b, x2, z2); + fsqr2(dc, ab, tmp2); + a[0U] = c[0U]; + a[1U] = c[1U]; + a[2U] = c[2U]; + a[3U] = c[3U]; + fsub(c, d, c); + fmul_scalar(b, c, (u64)121665U); + fadd(b, b, d); + fmul2(nq, dc, ab, tmp2); +} + +static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1) +{ + u64 tmp2[16U] = { 0U }; + u64 p01_tmp1_swap[33U] = { 0U }; + u64 *p0 = p01_tmp1_swap; + u64 *p01 = p01_tmp1_swap; + u64 *p03 = p01; + u64 *p11 = p01 + (u32)8U; + u64 *x0; + u64 *z0; + u64 *p01_tmp1; + u64 *p01_tmp11; + u64 *nq10; + u64 *nq_p11; + u64 *swap1; + u64 sw0; + u64 *nq1; + u64 *tmp1; + memcpy(p11, init1, (u32)8U * sizeof(init1[0U])); + x0 = p03; + z0 = p03 + (u32)4U; + x0[0U] = (u64)1U; + x0[1U] = (u64)0U; + x0[2U] = (u64)0U; + x0[3U] = (u64)0U; + z0[0U] = (u64)0U; + z0[1U] = (u64)0U; + z0[2U] = (u64)0U; + z0[3U] = (u64)0U; + p01_tmp1 = p01_tmp1_swap; + p01_tmp11 = p01_tmp1_swap; + nq10 = p01_tmp1_swap; + nq_p11 = p01_tmp1_swap + (u32)8U; + swap1 = p01_tmp1_swap + (u32)32U; + cswap2((u64)1U, nq10, nq_p11); + point_add_and_double(init1, p01_tmp11, tmp2); + swap1[0U] = (u64)1U; + { + u32 i; + for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) { + u64 *p01_tmp12 = p01_tmp1_swap; + u64 *swap2 = p01_tmp1_swap + (u32)32U; + u64 *nq2 = p01_tmp12; + u64 *nq_p12 = p01_tmp12 + (u32)8U; + u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U); + u64 sw = swap2[0U] ^ bit; + cswap2(sw, nq2, nq_p12); + point_add_and_double(init1, p01_tmp12, tmp2); + swap2[0U] = bit; + } + } + sw0 = swap1[0U]; + cswap2(sw0, nq10, nq_p11); + nq1 = p01_tmp1; + tmp1 = p01_tmp1 + (u32)16U; + point_double(nq1, tmp1, tmp2); + point_double(nq1, tmp1, tmp2); + point_double(nq1, tmp1, tmp2); + memcpy(out, p0, (u32)8U * sizeof(p0[0U])); + + memzero_explicit(tmp2, sizeof(tmp2)); + memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap)); +} + +static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1) +{ + u32 i; + fsqr(o, inp, tmp); + for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U) + fsqr(o, o, tmp); +} + +static void finv(u64 *o, const u64 *i, u64 *tmp) +{ + u64 t1[16U] = { 0U }; + u64 *a0 = t1; + u64 *b = t1 + (u32)4U; + u64 *c = t1 + (u32)8U; + u64 *t00 = t1 + (u32)12U; + u64 *tmp1 = tmp; + u64 *a; + u64 *t0; + fsquare_times(a0, i, tmp1, (u32)1U); + fsquare_times(t00, a0, tmp1, (u32)2U); + fmul(b, t00, i, tmp); + fmul(a0, b, a0, tmp); + fsquare_times(t00, a0, tmp1, (u32)1U); + fmul(b, t00, b, tmp); + fsquare_times(t00, b, tmp1, (u32)5U); + fmul(b, t00, b, tmp); + fsquare_times(t00, b, tmp1, (u32)10U); + fmul(c, t00, b, tmp); + fsquare_times(t00, c, tmp1, (u32)20U); + fmul(t00, t00, c, tmp); + fsquare_times(t00, t00, tmp1, (u32)10U); + fmul(b, t00, b, tmp); + fsquare_times(t00, b, tmp1, (u32)50U); + fmul(c, t00, b, tmp); + fsquare_times(t00, c, tmp1, (u32)100U); + fmul(t00, t00, c, tmp); + fsquare_times(t00, t00, tmp1, (u32)50U); + fmul(t00, t00, b, tmp); + fsquare_times(t00, t00, tmp1, (u32)5U); + a = t1; + t0 = t1 + (u32)12U; + fmul(o, t0, a, tmp); +} + +static void store_felem(u64 *b, u64 *f) +{ + u64 f30 = f[3U]; + u64 top_bit0 = f30 >> (u32)63U; + u64 f31; + u64 top_bit; + u64 f0; + u64 f1; + u64 f2; + u64 f3; + u64 m0; + u64 m1; + u64 m2; + u64 m3; + u64 mask; + u64 f0_; + u64 f1_; + u64 f2_; + u64 f3_; + u64 o0; + u64 o1; + u64 o2; + u64 o3; + f[3U] = f30 & (u64)0x7fffffffffffffffU; + add_scalar(f, f, (u64)19U * top_bit0); + f31 = f[3U]; + top_bit = f31 >> (u32)63U; + f[3U] = f31 & (u64)0x7fffffffffffffffU; + add_scalar(f, f, (u64)19U * top_bit); + f0 = f[0U]; + f1 = f[1U]; + f2 = f[2U]; + f3 = f[3U]; + m0 = gte_mask(f0, (u64)0xffffffffffffffedU); + m1 = eq_mask(f1, (u64)0xffffffffffffffffU); + m2 = eq_mask(f2, (u64)0xffffffffffffffffU); + m3 = eq_mask(f3, (u64)0x7fffffffffffffffU); + mask = ((m0 & m1) & m2) & m3; + f0_ = f0 - (mask & (u64)0xffffffffffffffedU); + f1_ = f1 - (mask & (u64)0xffffffffffffffffU); + f2_ = f2 - (mask & (u64)0xffffffffffffffffU); + f3_ = f3 - (mask & (u64)0x7fffffffffffffffU); + o0 = f0_; + o1 = f1_; + o2 = f2_; + o3 = f3_; + b[0U] = o0; + b[1U] = o1; + b[2U] = o2; + b[3U] = o3; +} + +static void encode_point(u8 *o, const u64 *i) +{ + const u64 *x = i; + const u64 *z = i + (u32)4U; + u64 tmp[4U] = { 0U }; + u64 tmp_w[16U] = { 0U }; + finv(tmp, z, tmp_w); + fmul(tmp, tmp, x, tmp_w); + store_felem((u64 *)o, tmp); +} + +static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub) +{ + u64 init1[8U] = { 0U }; + u64 tmp[4U] = { 0U }; + u64 tmp3; + u64 *x; + u64 *z; + { + u32 i; + for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) { + u64 *os = tmp; + const u8 *bj = pub + i * (u32)8U; + u64 u = *(u64 *)bj; + u64 r = u; + u64 x0 = r; + os[i] = x0; + } + } + tmp3 = tmp[3U]; + tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU; + x = init1; + z = init1 + (u32)4U; + z[0U] = (u64)1U; + z[1U] = (u64)0U; + z[2U] = (u64)0U; + z[3U] = (u64)0U; + x[0U] = tmp[0U]; + x[1U] = tmp[1U]; + x[2U] = tmp[2U]; + x[3U] = tmp[3U]; + montgomery_ladder(init1, priv, init1); + encode_point(out, init1); +} + +/* The below constants were generated using this sage script: + * + * #!/usr/bin/env sage + * import sys + * from sage.all import * + * def limbs(n): + * n = int(n) + * l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64) + * return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l + * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0]) + * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0] + * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s)) + * print("static const u64 table_ladder[] = {") + * p = ec.lift_x(9) + * for i in range(252): + * l = (p[0] + p[2]) / (p[0] - p[2]) + * print(("\t%s" + ("," if i != 251 else "")) % limbs(l)) + * p = p * 2 + * print("};") + * + */ + +static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL }; + +static const u64 table_ladder[] = { + 0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL, + 0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL, + 0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL, + 0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL, + 0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL, + 0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL, + 0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL, + 0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL, + 0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL, + 0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL, + 0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL, + 0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL, + 0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL, + 0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL, + 0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL, + 0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL, + 0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL, + 0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL, + 0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL, + 0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL, + 0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL, + 0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL, + 0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL, + 0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL, + 0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL, + 0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL, + 0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL, + 0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL, + 0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL, + 0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL, + 0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL, + 0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL, + 0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL, + 0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL, + 0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL, + 0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL, + 0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL, + 0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL, + 0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL, + 0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL, + 0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL, + 0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL, + 0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL, + 0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL, + 0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL, + 0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL, + 0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL, + 0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL, + 0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL, + 0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL, + 0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL, + 0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL, + 0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL, + 0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL, + 0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL, + 0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL, + 0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL, + 0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL, + 0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL, + 0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL, + 0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL, + 0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL, + 0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL, + 0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL, + 0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL, + 0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL, + 0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL, + 0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL, + 0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL, + 0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL, + 0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL, + 0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL, + 0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL, + 0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL, + 0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL, + 0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL, + 0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL, + 0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL, + 0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL, + 0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL, + 0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL, + 0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL, + 0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL, + 0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL, + 0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL, + 0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL, + 0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL, + 0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL, + 0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL, + 0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL, + 0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL, + 0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL, + 0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL, + 0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL, + 0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL, + 0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL, + 0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL, + 0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL, + 0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL, + 0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL, + 0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL, + 0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL, + 0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL, + 0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL, + 0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL, + 0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL, + 0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL, + 0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL, + 0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL, + 0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL, + 0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL, + 0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL, + 0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL, + 0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL, + 0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL, + 0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL, + 0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL, + 0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL, + 0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL, + 0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL, + 0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL, + 0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL, + 0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL, + 0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL, + 0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL, + 0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL, + 0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL, + 0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL, + 0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL, + 0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL, + 0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL, + 0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL, + 0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL, + 0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL, + 0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL, + 0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL, + 0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL, + 0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL, + 0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL, + 0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL, + 0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL, + 0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL, + 0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL, + 0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL, + 0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL, + 0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL, + 0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL, + 0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL, + 0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL, + 0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL, + 0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL, + 0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL, + 0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL, + 0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL, + 0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL, + 0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL, + 0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL, + 0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL, + 0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL, + 0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL, + 0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL, + 0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL, + 0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL, + 0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL, + 0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL, + 0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL, + 0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL, + 0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL, + 0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL, + 0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL, + 0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL, + 0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL, + 0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL, + 0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL, + 0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL, + 0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL, + 0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL, + 0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL, + 0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL, + 0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL, + 0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL, + 0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL, + 0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL, + 0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL, + 0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL, + 0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL, + 0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL, + 0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL, + 0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL, + 0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL, + 0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL, + 0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL, + 0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL, + 0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL, + 0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL, + 0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL, + 0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL, + 0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL, + 0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL, + 0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL, + 0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL, + 0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL, + 0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL, + 0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL, + 0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL, + 0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL, + 0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL, + 0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL, + 0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL, + 0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL, + 0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL, + 0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL, + 0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL, + 0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL, + 0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL, + 0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL, + 0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL, + 0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL, + 0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL, + 0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL, + 0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL, + 0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL, + 0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL, + 0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL, + 0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL, + 0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL, + 0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL, + 0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL, + 0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL, + 0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL, + 0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL, + 0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL, + 0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL, + 0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL, + 0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL, + 0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL, + 0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL, + 0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL, + 0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL, + 0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL, + 0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL, + 0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL, + 0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL, + 0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL, + 0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL, + 0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL, + 0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL, + 0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL, + 0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL, + 0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL, + 0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL, + 0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL +}; + +static void curve25519_ever64_base(u8 *out, const u8 *priv) +{ + u64 swap = 1; + int i, j, k; + u64 tmp[16 + 32 + 4]; + u64 *x1 = &tmp[0]; + u64 *z1 = &tmp[4]; + u64 *x2 = &tmp[8]; + u64 *z2 = &tmp[12]; + u64 *xz1 = &tmp[0]; + u64 *xz2 = &tmp[8]; + u64 *a = &tmp[0 + 16]; + u64 *b = &tmp[4 + 16]; + u64 *c = &tmp[8 + 16]; + u64 *ab = &tmp[0 + 16]; + u64 *abcd = &tmp[0 + 16]; + u64 *ef = &tmp[16 + 16]; + u64 *efgh = &tmp[16 + 16]; + u64 *key = &tmp[0 + 16 + 32]; + + memcpy(key, priv, 32); + ((u8 *)key)[0] &= 248; + ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64; + + x1[0] = 1, x1[1] = x1[2] = x1[3] = 0; + z1[0] = 1, z1[1] = z1[2] = z1[3] = 0; + z2[0] = 1, z2[1] = z2[2] = z2[3] = 0; + memcpy(x2, p_minus_s, sizeof(p_minus_s)); + + j = 3; + for (i = 0; i < 4; ++i) { + while (j < (const int[]){ 64, 64, 64, 63 }[i]) { + u64 bit = (key[i] >> j) & 1; + k = (64 * i + j - 3); + swap = swap ^ bit; + cswap2(swap, xz1, xz2); + swap = bit; + fsub(b, x1, z1); + fadd(a, x1, z1); + fmul(c, &table_ladder[4 * k], b, ef); + fsub(b, a, c); + fadd(a, a, c); + fsqr2(ab, ab, efgh); + fmul2(xz1, xz2, ab, efgh); + ++j; + } + j = 0; + } + + point_double(xz1, abcd, efgh); + point_double(xz1, abcd, efgh); + point_double(xz1, abcd, efgh); + encode_point(out, xz1); + + memzero_explicit(tmp, sizeof(tmp)); +} + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx); + +static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&curve25519_use_bmi2_adx)) + curve25519_ever64(mypublic, secret, basepoint); + else + curve25519_generic(mypublic, secret, basepoint); +} + +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&curve25519_use_bmi2_adx)) + curve25519_ever64_base(pub, secret); + else + curve25519_generic(pub, secret, curve25519_base_point); +} + +#define curve25519_mod_init_arch curve25519_mod_init_arch +static void curve25519_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) + static_branch_enable(&curve25519_use_bmi2_adx); +} -- cgit v1.2.3 From 0b6cb344829b4b8605a3f6e930b207d47dee1d12 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 22 Jul 2025 09:34:35 +0200 Subject: media: v4l2-core: v4l2-dv-timings: support DRM IFs Add support for DRM (Dynamic Range and Mastering) InfoFrames. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-dv-timings.c | 4 ++++ include/media/v4l2-dv-timings.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-dv-timings.c b/drivers/media/v4l2-core/v4l2-dv-timings.c index 7710cb26bea0..346d1b0e10ce 100644 --- a/drivers/media/v4l2-core/v4l2-dv-timings.c +++ b/drivers/media/v4l2-core/v4l2-dv-timings.c @@ -1226,6 +1226,7 @@ DEBUGFS_FOPS(avi, V4L2_DEBUGFS_IF_AVI); DEBUGFS_FOPS(audio, V4L2_DEBUGFS_IF_AUDIO); DEBUGFS_FOPS(spd, V4L2_DEBUGFS_IF_SPD); DEBUGFS_FOPS(hdmi, V4L2_DEBUGFS_IF_HDMI); +DEBUGFS_FOPS(drm, V4L2_DEBUGFS_IF_DRM); struct v4l2_debugfs_if *v4l2_debugfs_if_alloc(struct dentry *root, u32 if_types, void *priv, @@ -1255,6 +1256,9 @@ struct v4l2_debugfs_if *v4l2_debugfs_if_alloc(struct dentry *root, u32 if_types, if (if_types & V4L2_DEBUGFS_IF_HDMI) debugfs_create_file("hdmi", 0400, infoframes->if_dir, infoframes, &infoframe_hdmi_fops); + if (if_types & V4L2_DEBUGFS_IF_DRM) + debugfs_create_file("hdr_drm", 0400, infoframes->if_dir, + infoframes, &infoframe_drm_fops); return infoframes; } EXPORT_SYMBOL_GPL(v4l2_debugfs_if_alloc); diff --git a/include/media/v4l2-dv-timings.h b/include/media/v4l2-dv-timings.h index 714075c72f77..2b42e5d81f9e 100644 --- a/include/media/v4l2-dv-timings.h +++ b/include/media/v4l2-dv-timings.h @@ -275,6 +275,7 @@ int v4l2_phys_addr_validate(u16 phys_addr, u16 *parent, u16 *port); #define V4L2_DEBUGFS_IF_AUDIO BIT(1) #define V4L2_DEBUGFS_IF_SPD BIT(2) #define V4L2_DEBUGFS_IF_HDMI BIT(3) +#define V4L2_DEBUGFS_IF_DRM BIT(4) typedef ssize_t (*v4l2_debugfs_if_read_t)(u32 type, void *priv, struct file *filp, char __user *ubuf, -- cgit v1.2.3 From 43bd82eb33b2ac33232724a8ddb9e07cde492328 Mon Sep 17 00:00:00 2001 From: Denzeel Oliva Date: Thu, 4 Sep 2025 14:07:11 +0000 Subject: dt-bindings: clock: exynos990: Add PERIC0 and PERIC1 clock units Add clock management unit bindings for PERIC0 and PERIC1 blocks which provide clocks for USI, I2C and UART peripherals. Signed-off-by: Denzeel Oliva Signed-off-by: Krzysztof Kozlowski --- .../bindings/clock/samsung,exynos990-clock.yaml | 24 +++ include/dt-bindings/clock/samsung,exynos990.h | 176 +++++++++++++++++++++ 2 files changed, 200 insertions(+) (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos990-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos990-clock.yaml index c15cc1752b02..5cd2d80b8ed6 100644 --- a/Documentation/devicetree/bindings/clock/samsung,exynos990-clock.yaml +++ b/Documentation/devicetree/bindings/clock/samsung,exynos990-clock.yaml @@ -30,6 +30,8 @@ description: | properties: compatible: enum: + - samsung,exynos990-cmu-peric1 + - samsung,exynos990-cmu-peric0 - samsung,exynos990-cmu-hsi0 - samsung,exynos990-cmu-peris - samsung,exynos990-cmu-top @@ -56,6 +58,28 @@ required: - reg allOf: + - if: + properties: + compatible: + contains: + enum: + - samsung,exynos990-cmu-peric1 + - samsung,exynos990-cmu-peric0 + + then: + properties: + clocks: + items: + - description: External reference clock (26 MHz) + - description: Connectivity Peripheral 0/1 bus clock (from CMU_TOP) + - description: Connectivity Peripheral 0/1 IP clock (from CMU_TOP) + + clock-names: + items: + - const: oscclk + - const: bus + - const: ip + - if: properties: compatible: diff --git a/include/dt-bindings/clock/samsung,exynos990.h b/include/dt-bindings/clock/samsung,exynos990.h index c60f15503d5b..47540307cb52 100644 --- a/include/dt-bindings/clock/samsung,exynos990.h +++ b/include/dt-bindings/clock/samsung,exynos990.h @@ -238,6 +238,182 @@ #define CLK_GOUT_HSI0_XIU_D_HSI0_ACLK 22 #define CLK_GOUT_HSI0_LHS_ACEL_D_HSI0_CLK 23 +/* CMU_PERIC0 */ +#define CLK_MOUT_PERIC0_BUS_USER 1 +#define CLK_MOUT_PERIC0_UART_DBG 2 +#define CLK_MOUT_PERIC0_USI00_USI_USER 3 +#define CLK_MOUT_PERIC0_USI01_USI_USER 4 +#define CLK_MOUT_PERIC0_USI02_USI_USER 5 +#define CLK_MOUT_PERIC0_USI03_USI_USER 6 +#define CLK_MOUT_PERIC0_USI04_USI_USER 7 +#define CLK_MOUT_PERIC0_USI05_USI_USER 8 +#define CLK_MOUT_PERIC0_USI13_USI_USER 9 +#define CLK_MOUT_PERIC0_USI14_USI_USER 10 +#define CLK_MOUT_PERIC0_USI15_USI_USER 11 +#define CLK_MOUT_PERIC0_USI_I2C_USER 12 +#define CLK_DOUT_PERIC0_UART_DBG 13 +#define CLK_DOUT_PERIC0_USI00_USI 14 +#define CLK_DOUT_PERIC0_USI01_USI 15 +#define CLK_DOUT_PERIC0_USI02_USI 16 +#define CLK_DOUT_PERIC0_USI03_USI 17 +#define CLK_DOUT_PERIC0_USI04_USI 18 +#define CLK_DOUT_PERIC0_USI05_USI 19 +#define CLK_DOUT_PERIC0_USI13_USI 20 +#define CLK_DOUT_PERIC0_USI14_USI 21 +#define CLK_DOUT_PERIC0_USI15_USI 22 +#define CLK_DOUT_PERIC0_USI_I2C 23 +#define CLK_GOUT_PERIC0_CMU_PCLK 24 +#define CLK_GOUT_PERIC0_OSCCLK_CLK 25 +#define CLK_GOUT_PERIC0_D_TZPC_PCLK 26 +#define CLK_GOUT_PERIC0_GPIO_PCLK 27 +#define CLK_GOUT_PERIC0_LHM_AXI_P_CLK 28 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_10 29 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_11 30 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_12 31 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_13 32 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_14 33 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_15 34 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_4 35 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_5 36 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_6 37 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_7 38 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_8 39 +#define CLK_GOUT_PERIC0_TOP0_IPCLK_9 40 +#define CLK_GOUT_PERIC0_TOP0_PCLK_10 41 +#define CLK_GOUT_PERIC0_TOP0_PCLK_11 42 +#define CLK_GOUT_PERIC0_TOP0_PCLK_12 43 +#define CLK_GOUT_PERIC0_TOP0_PCLK_13 44 +#define CLK_GOUT_PERIC0_TOP0_PCLK_14 45 +#define CLK_GOUT_PERIC0_TOP0_PCLK_15 46 +#define CLK_GOUT_PERIC0_TOP0_PCLK_4 47 +#define CLK_GOUT_PERIC0_TOP0_PCLK_5 48 +#define CLK_GOUT_PERIC0_TOP0_PCLK_6 49 +#define CLK_GOUT_PERIC0_TOP0_PCLK_7 50 +#define CLK_GOUT_PERIC0_TOP0_PCLK_8 51 +#define CLK_GOUT_PERIC0_TOP0_PCLK_9 52 +#define CLK_GOUT_PERIC0_TOP1_IPCLK_0 53 +#define CLK_GOUT_PERIC0_TOP1_IPCLK_3 54 +#define CLK_GOUT_PERIC0_TOP1_IPCLK_4 55 +#define CLK_GOUT_PERIC0_TOP1_IPCLK_5 56 +#define CLK_GOUT_PERIC0_TOP1_IPCLK_6 57 +#define CLK_GOUT_PERIC0_TOP1_IPCLK_7 58 +#define CLK_GOUT_PERIC0_TOP1_IPCLK_8 59 +#define CLK_GOUT_PERIC0_TOP1_PCLK_0 60 +#define CLK_GOUT_PERIC0_TOP1_PCLK_15 61 +#define CLK_GOUT_PERIC0_TOP1_PCLK_3 62 +#define CLK_GOUT_PERIC0_TOP1_PCLK_4 63 +#define CLK_GOUT_PERIC0_TOP1_PCLK_5 64 +#define CLK_GOUT_PERIC0_TOP1_PCLK_6 65 +#define CLK_GOUT_PERIC0_TOP1_PCLK_7 66 +#define CLK_GOUT_PERIC0_TOP1_PCLK_8 67 +#define CLK_GOUT_PERIC0_BUSP_CLK 68 +#define CLK_GOUT_PERIC0_UART_DBG_CLK 69 +#define CLK_GOUT_PERIC0_USI00_USI_CLK 70 +#define CLK_GOUT_PERIC0_USI01_USI_CLK 71 +#define CLK_GOUT_PERIC0_USI02_USI_CLK 72 +#define CLK_GOUT_PERIC0_USI03_USI_CLK 73 +#define CLK_GOUT_PERIC0_USI04_USI_CLK 74 +#define CLK_GOUT_PERIC0_USI05_USI_CLK 75 +#define CLK_GOUT_PERIC0_USI13_USI_CLK 76 +#define CLK_GOUT_PERIC0_USI14_USI_CLK 77 +#define CLK_GOUT_PERIC0_USI15_USI_CLK 78 +#define CLK_GOUT_PERIC0_USI_I2C_CLK 79 +#define CLK_GOUT_PERIC0_SYSREG_PCLK 80 + +/* CMU_PERIC1 */ +#define CLK_MOUT_PERIC1_BUS_USER 1 +#define CLK_MOUT_PERIC1_UART_BT_USER 2 +#define CLK_MOUT_PERIC1_USI06_USI_USER 3 +#define CLK_MOUT_PERIC1_USI07_USI_USER 4 +#define CLK_MOUT_PERIC1_USI08_USI_USER 5 +#define CLK_MOUT_PERIC1_USI09_USI_USER 6 +#define CLK_MOUT_PERIC1_USI10_USI_USER 7 +#define CLK_MOUT_PERIC1_USI11_USI_USER 8 +#define CLK_MOUT_PERIC1_USI12_USI_USER 9 +#define CLK_MOUT_PERIC1_USI18_USI_USER 10 +#define CLK_MOUT_PERIC1_USI16_USI_USER 11 +#define CLK_MOUT_PERIC1_USI17_USI_USER 12 +#define CLK_MOUT_PERIC1_USI_I2C_USER 13 +#define CLK_DOUT_PERIC1_UART_BT 14 +#define CLK_DOUT_PERIC1_USI06_USI 15 +#define CLK_DOUT_PERIC1_USI07_USI 16 +#define CLK_DOUT_PERIC1_USI08_USI 17 +#define CLK_DOUT_PERIC1_USI18_USI 18 +#define CLK_DOUT_PERIC1_USI12_USI 19 +#define CLK_DOUT_PERIC1_USI09_USI 20 +#define CLK_DOUT_PERIC1_USI10_USI 21 +#define CLK_DOUT_PERIC1_USI11_USI 22 +#define CLK_DOUT_PERIC1_USI16_USI 23 +#define CLK_DOUT_PERIC1_USI17_USI 24 +#define CLK_DOUT_PERIC1_USI_I2C 25 +#define CLK_GOUT_PERIC1_CMU_PCLK 26 +#define CLK_GOUT_PERIC1_UART_BT_CLK 27 +#define CLK_GOUT_PERIC1_USI12_USI_CLK 28 +#define CLK_GOUT_PERIC1_USI18_USI_CLK 29 +#define CLK_GOUT_PERIC1_D_TZPC_PCLK 30 +#define CLK_GOUT_PERIC1_GPIO_PCLK 31 +#define CLK_GOUT_PERIC1_LHM_AXI_P_CSIS_CLK 32 +#define CLK_GOUT_PERIC1_LHM_AXI_P_CLK 33 +#define CLK_GOUT_PERIC1_TOP0_IPCLK_10 34 +#define CLK_GOUT_PERIC1_TOP0_IPCLK_11 35 +#define CLK_GOUT_PERIC1_TOP0_IPCLK_12 36 +#define CLK_GOUT_PERIC1_TOP0_IPCLK_13 37 +#define CLK_GOUT_PERIC1_TOP0_IPCLK_14 38 +#define CLK_GOUT_PERIC1_TOP0_IPCLK_15 39 +#define CLK_GOUT_PERIC1_TOP0_IPCLK_4 40 +#define CLK_GOUT_PERIC1_TOP0_PCLK_10 41 +#define CLK_GOUT_PERIC1_TOP0_PCLK_11 42 +#define CLK_GOUT_PERIC1_TOP0_PCLK_12 43 +#define CLK_GOUT_PERIC1_TOP0_PCLK_13 44 +#define CLK_GOUT_PERIC1_TOP0_PCLK_14 45 +#define CLK_GOUT_PERIC1_TOP0_PCLK_15 46 +#define CLK_GOUT_PERIC1_TOP0_PCLK_4 47 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_0 48 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_1 49 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_10 50 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_12 51 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_13 52 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_14 53 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_15 54 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_2 55 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_3 56 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_4 57 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_5 58 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_6 59 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_7 60 +#define CLK_GOUT_PERIC1_TOP1_IPCLK_9 61 +#define CLK_GOUT_PERIC1_TOP1_PCLK_0 62 +#define CLK_GOUT_PERIC1_TOP1_PCLK_1 63 +#define CLK_GOUT_PERIC1_TOP1_PCLK_10 64 +#define CLK_GOUT_PERIC1_TOP1_PCLK_12 65 +#define CLK_GOUT_PERIC1_TOP1_PCLK_13 66 +#define CLK_GOUT_PERIC1_TOP1_PCLK_14 67 +#define CLK_GOUT_PERIC1_TOP1_PCLK_15 68 +#define CLK_GOUT_PERIC1_TOP1_PCLK_2 69 +#define CLK_GOUT_PERIC1_TOP1_PCLK_3 70 +#define CLK_GOUT_PERIC1_TOP1_PCLK_4 71 +#define CLK_GOUT_PERIC1_TOP1_PCLK_5 72 +#define CLK_GOUT_PERIC1_TOP1_PCLK_6 73 +#define CLK_GOUT_PERIC1_TOP1_PCLK_7 74 +#define CLK_GOUT_PERIC1_TOP1_PCLK_9 75 +#define CLK_GOUT_PERIC1_BUSP_CLK 76 +#define CLK_GOUT_PERIC1_OSCCLK_CLK 77 +#define CLK_GOUT_PERIC1_USI06_USI_CLK 78 +#define CLK_GOUT_PERIC1_USI07_USI_CLK 79 +#define CLK_GOUT_PERIC1_USI08_USI_CLK 80 +#define CLK_GOUT_PERIC1_USI09_USI_CLK 81 +#define CLK_GOUT_PERIC1_USI10_USI_CLK 82 +#define CLK_GOUT_PERIC1_USI11_USI_CLK 83 +#define CLK_GOUT_PERIC1_USI16_USI_CLK 84 +#define CLK_GOUT_PERIC1_USI17_USI_CLK 85 +#define CLK_GOUT_PERIC1_USI_I2C_CLK 86 +#define CLK_GOUT_PERIC1_SYSREG_PCLK 87 +#define CLK_GOUT_PERIC1_USI16_I3C_PCLK 88 +#define CLK_GOUT_PERIC1_USI16_I3C_SCLK 89 +#define CLK_GOUT_PERIC1_USI17_I3C_PCLK 90 +#define CLK_GOUT_PERIC1_USI17_I3C_SCLK 91 +#define CLK_GOUT_PERIC1_XIU_P_ACLK 92 + /* CMU_PERIS */ #define CLK_MOUT_PERIS_BUS_USER 1 #define CLK_MOUT_PERIS_CLK_PERIS_GIC 2 -- cgit v1.2.3 From 0bcd01f757bc06471c82a137eafee281ef1b6e38 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 28 Aug 2024 21:56:57 -0700 Subject: hwmon: Introduce 64-bit energy attribute support Many chips require 64-bit variables to display the accumulated energy, even more so since the energy units are micro-Joule. Add new sensor type "energy64" to support reporting the chip energy as 64-bit values. Changing the entire hardware monitoring API is not feasible, and it is only really necessary to support reading 64-bit values for the "energyX_input" attribute. For this reason, keep the API as-is and use type casts on both ends to pass 64-bit pointers when reading the accumulated energy. On the write side (which is only useful for the energyX_enable attribute), keep passing the written value as long. Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- Documentation/hwmon/hwmon-kernel-api.rst | 3 +++ drivers/hwmon/hwmon.c | 14 ++++++++++---- include/linux/hwmon.h | 1 + include/trace/events/hwmon.h | 10 +++++----- 4 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/Documentation/hwmon/hwmon-kernel-api.rst b/Documentation/hwmon/hwmon-kernel-api.rst index e47fc757e63e..037b69c23cb5 100644 --- a/Documentation/hwmon/hwmon-kernel-api.rst +++ b/Documentation/hwmon/hwmon-kernel-api.rst @@ -159,6 +159,7 @@ It contains following fields: hwmon_curr Current sensor hwmon_power Power sensor hwmon_energy Energy sensor + hwmon_energy64 Energy sensor, reported as 64-bit signed value hwmon_humidity Humidity sensor hwmon_fan Fan speed sensor hwmon_pwm PWM control @@ -288,6 +289,8 @@ Parameters: The sensor channel number. val: Pointer to attribute value. + For hwmon_energy64, `'val`' is passed as `long *` but needs + a typecast to `s64 *`. Return value: 0 on success, a negative error number otherwise. diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 1688c210888a..2e17f3a4c59b 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -426,18 +426,22 @@ static ssize_t hwmon_attr_show(struct device *dev, struct device_attribute *devattr, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + s64 val64; long val; int ret; ret = hattr->ops->read(dev, hattr->type, hattr->attr, hattr->index, - &val); + (hattr->type == hwmon_energy64) ? (long *)&val64 : &val); if (ret < 0) return ret; + if (hattr->type != hwmon_energy64) + val64 = val; + trace_hwmon_attr_show(hattr->index + hwmon_attr_base(hattr->type), - hattr->name, val); + hattr->name, val64); - return sprintf(buf, "%ld\n", val); + return sprintf(buf, "%lld\n", val64); } static ssize_t hwmon_attr_show_string(struct device *dev, @@ -478,7 +482,7 @@ static ssize_t hwmon_attr_store(struct device *dev, return ret; trace_hwmon_attr_store(hattr->index + hwmon_attr_base(hattr->type), - hattr->name, val); + hattr->name, (s64)val); return count; } @@ -734,6 +738,7 @@ static const char * const *__templates[] = { [hwmon_curr] = hwmon_curr_attr_templates, [hwmon_power] = hwmon_power_attr_templates, [hwmon_energy] = hwmon_energy_attr_templates, + [hwmon_energy64] = hwmon_energy_attr_templates, [hwmon_humidity] = hwmon_humidity_attr_templates, [hwmon_fan] = hwmon_fan_attr_templates, [hwmon_pwm] = hwmon_pwm_attr_templates, @@ -747,6 +752,7 @@ static const int __templates_size[] = { [hwmon_curr] = ARRAY_SIZE(hwmon_curr_attr_templates), [hwmon_power] = ARRAY_SIZE(hwmon_power_attr_templates), [hwmon_energy] = ARRAY_SIZE(hwmon_energy_attr_templates), + [hwmon_energy64] = ARRAY_SIZE(hwmon_energy_attr_templates), [hwmon_humidity] = ARRAY_SIZE(hwmon_humidity_attr_templates), [hwmon_fan] = ARRAY_SIZE(hwmon_fan_attr_templates), [hwmon_pwm] = ARRAY_SIZE(hwmon_pwm_attr_templates), diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 3a63dff62d03..886fc90b2d25 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -24,6 +24,7 @@ enum hwmon_sensor_types { hwmon_curr, hwmon_power, hwmon_energy, + hwmon_energy64, hwmon_humidity, hwmon_fan, hwmon_pwm, diff --git a/include/trace/events/hwmon.h b/include/trace/events/hwmon.h index d1ff560cd9b5..3865098f21f1 100644 --- a/include/trace/events/hwmon.h +++ b/include/trace/events/hwmon.h @@ -9,14 +9,14 @@ DECLARE_EVENT_CLASS(hwmon_attr_class, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val), TP_STRUCT__entry( __field(int, index) __string(attr_name, attr_name) - __field(long, val) + __field(long long, val) ), TP_fast_assign( @@ -25,20 +25,20 @@ DECLARE_EVENT_CLASS(hwmon_attr_class, __entry->val = val; ), - TP_printk("index=%d, attr_name=%s, val=%ld", + TP_printk("index=%d, attr_name=%s, val=%lld", __entry->index, __get_str(attr_name), __entry->val) ); DEFINE_EVENT(hwmon_attr_class, hwmon_attr_show, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val) ); DEFINE_EVENT(hwmon_attr_class, hwmon_attr_store, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val) ); -- cgit v1.2.3 From 2c92e2fbe9e22cefdae87d8a0d654691ee4c1957 Mon Sep 17 00:00:00 2001 From: Joris Verhaegen Date: Fri, 5 Sep 2025 10:12:54 +0100 Subject: ALSA: compress_offload: Add 64-bit safe timestamp infrastructure The copied_total field in struct snd_compr_tstamp is a 32-bit value that can overflow on long-running high-bitrate streams, leading to incorrect calculations for buffer availablility. This patch adds a 64-bit safe timestamping mechanism. A new UAPI struct, snd_compr_tstamp64, is added which uses 64-bit types for byte counters. The relevant ops structures across the ASoC and core compress code are updated to use this new struct. ASoC drivers are updated to use u64 counters. Internal timestamps being u64 now, a compatibility function is added to convert the 64-bit timestamp back to the 32-bit format for legacy ioctl callers. Reviewed-by: Miller Liang Tested-by: Joris Verhaegen Signed-off-by: Joris Verhaegen Reviewed-by: Srinivas Kandagatla Reviewed-by: Charles Keepax Acked-by: Mark Brown Acked-by: Vinod Koul Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250905091301.2711705-2-verhaegen@google.com --- include/sound/compress_driver.h | 2 +- include/sound/soc-component.h | 4 +- include/sound/soc-dai.h | 7 +-- include/uapi/sound/compress_offload.h | 19 +++++++++ sound/core/compress_offload.c | 52 +++++++++++++++-------- sound/soc/codecs/wm_adsp.c | 4 +- sound/soc/codecs/wm_adsp.h | 2 +- sound/soc/intel/atom/sst-mfld-platform-compress.c | 12 +++--- sound/soc/intel/atom/sst-mfld-platform.h | 2 +- sound/soc/intel/atom/sst/sst_drv_interface.c | 9 ++-- sound/soc/intel/avs/probes.c | 2 +- sound/soc/qcom/qdsp6/q6apm-dai.c | 26 +++++++----- sound/soc/qcom/qdsp6/q6asm-dai.c | 26 +++++++----- sound/soc/soc-component.c | 2 +- sound/soc/soc-compress.c | 2 +- sound/soc/soc-dai.c | 2 +- sound/soc/sof/amd/acp-probes.c | 2 +- sound/soc/sof/compress.c | 2 +- sound/soc/sof/intel/hda-probes.c | 2 +- sound/soc/sof/sof-client-probes.c | 2 +- sound/soc/sof/sof-client-probes.h | 4 +- sound/soc/sprd/sprd-pcm-compress.c | 6 +-- sound/soc/sprd/sprd-pcm-dma.h | 4 +- sound/soc/uniphier/aio-compress.c | 2 +- 24 files changed, 125 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/include/sound/compress_driver.h b/include/sound/compress_driver.h index b55c9eeb2b54..9e3d801e45ec 100644 --- a/include/sound/compress_driver.h +++ b/include/sound/compress_driver.h @@ -161,7 +161,7 @@ struct snd_compr_ops { struct snd_compr_metadata *metadata); int (*trigger)(struct snd_compr_stream *stream, int cmd); int (*pointer)(struct snd_compr_stream *stream, - struct snd_compr_tstamp *tstamp); + struct snd_compr_tstamp64 *tstamp); int (*copy)(struct snd_compr_stream *stream, char __user *buf, size_t count); int (*mmap)(struct snd_compr_stream *stream, diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 2caa807c6249..cdb536c4ab2b 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -47,7 +47,7 @@ struct snd_compress_ops { struct snd_compr_stream *stream, int cmd); int (*pointer)(struct snd_soc_component *component, struct snd_compr_stream *stream, - struct snd_compr_tstamp *tstamp); + struct snd_compr_tstamp64 *tstamp); int (*copy)(struct snd_soc_component *component, struct snd_compr_stream *stream, char __user *buf, size_t count); @@ -498,7 +498,7 @@ int snd_soc_component_compr_get_codec_caps(struct snd_compr_stream *cstream, struct snd_compr_codec_caps *codec); int snd_soc_component_compr_ack(struct snd_compr_stream *cstream, size_t bytes); int snd_soc_component_compr_pointer(struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp); + struct snd_compr_tstamp64 *tstamp); int snd_soc_component_compr_copy(struct snd_compr_stream *cstream, char __user *buf, size_t count); int snd_soc_component_compr_set_metadata(struct snd_compr_stream *cstream, diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 166c29557e9d..224396927aef 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -256,7 +256,7 @@ int snd_soc_dai_compr_ack(struct snd_soc_dai *dai, size_t bytes); int snd_soc_dai_compr_pointer(struct snd_soc_dai *dai, struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp); + struct snd_compr_tstamp64 *tstamp); int snd_soc_dai_compr_set_metadata(struct snd_soc_dai *dai, struct snd_compr_stream *cstream, struct snd_compr_metadata *metadata); @@ -383,8 +383,9 @@ struct snd_soc_cdai_ops { struct snd_compr_metadata *, struct snd_soc_dai *); int (*trigger)(struct snd_compr_stream *, int, struct snd_soc_dai *); - int (*pointer)(struct snd_compr_stream *, - struct snd_compr_tstamp *, struct snd_soc_dai *); + int (*pointer)(struct snd_compr_stream *stream, + struct snd_compr_tstamp64 *tstamp, + struct snd_soc_dai *dai); int (*ack)(struct snd_compr_stream *, size_t, struct snd_soc_dai *); }; diff --git a/include/uapi/sound/compress_offload.h b/include/uapi/sound/compress_offload.h index d62eb93af0ed..abd0ea3f86ee 100644 --- a/include/uapi/sound/compress_offload.h +++ b/include/uapi/sound/compress_offload.h @@ -56,6 +56,25 @@ struct snd_compr_tstamp { __u32 sampling_rate; } __attribute__((packed, aligned(4))); +/** + * struct snd_compr_tstamp64 - timestamp descriptor with fields in 64 bit + * @byte_offset: Byte offset in ring buffer to DSP + * @copied_total: Total number of bytes copied from/to ring buffer to/by DSP + * @pcm_frames: Frames decoded or encoded by DSP. This field will evolve by + * large steps and should only be used to monitor encoding/decoding + * progress. It shall not be used for timing estimates. + * @pcm_io_frames: Frames rendered or received by DSP into a mixer or an audio + * output/input. This field should be used for A/V sync or time estimates. + * @sampling_rate: sampling rate of audio + */ +struct snd_compr_tstamp64 { + __u32 byte_offset; + __u64 copied_total; + __u64 pcm_frames; + __u64 pcm_io_frames; + __u32 sampling_rate; +} __attribute__((packed, aligned(4))); + /** * struct snd_compr_avail - avail descriptor * @avail: Number of bytes available in ring buffer for writing/reading diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index a66f258cafaa..d3164aa07158 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -176,14 +176,25 @@ static int snd_compr_free(struct inode *inode, struct file *f) return 0; } +static void +snd_compr_tstamp32_from_64(struct snd_compr_tstamp *tstamp32, + const struct snd_compr_tstamp64 *tstamp64) +{ + tstamp32->byte_offset = tstamp64->byte_offset; + tstamp32->copied_total = (u32)tstamp64->copied_total; + tstamp32->pcm_frames = (u32)tstamp64->pcm_frames; + tstamp32->pcm_io_frames = (u32)tstamp64->pcm_io_frames; + tstamp32->sampling_rate = tstamp64->sampling_rate; +} + static int snd_compr_update_tstamp(struct snd_compr_stream *stream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { if (!stream->ops->pointer) return -ENOTSUPP; stream->ops->pointer(stream, tstamp); - pr_debug("dsp consumed till %d total %d bytes\n", - tstamp->byte_offset, tstamp->copied_total); + pr_debug("dsp consumed till %u total %llu bytes\n", tstamp->byte_offset, + tstamp->copied_total); if (stream->direction == SND_COMPRESS_PLAYBACK) stream->runtime->total_bytes_transferred = tstamp->copied_total; else @@ -194,8 +205,11 @@ static int snd_compr_update_tstamp(struct snd_compr_stream *stream, static size_t snd_compr_calc_avail(struct snd_compr_stream *stream, struct snd_compr_avail *avail) { + struct snd_compr_tstamp64 tstamp64 = { 0 }; + memset(avail, 0, sizeof(*avail)); - snd_compr_update_tstamp(stream, &avail->tstamp); + snd_compr_update_tstamp(stream, &tstamp64); + snd_compr_tstamp32_from_64(&avail->tstamp, &tstamp64); /* Still need to return avail even if tstamp can't be filled in */ if (stream->runtime->total_bytes_available == 0 && @@ -204,9 +218,9 @@ static size_t snd_compr_calc_avail(struct snd_compr_stream *stream, pr_debug("detected init and someone forgot to do a write\n"); return stream->runtime->buffer_size; } - pr_debug("app wrote %lld, DSP consumed %lld\n", - stream->runtime->total_bytes_available, - stream->runtime->total_bytes_transferred); + pr_debug("app wrote %llu, DSP consumed %llu\n", + stream->runtime->total_bytes_available, + stream->runtime->total_bytes_transferred); if (stream->runtime->total_bytes_available == stream->runtime->total_bytes_transferred) { if (stream->direction == SND_COMPRESS_PLAYBACK) { @@ -223,7 +237,7 @@ static size_t snd_compr_calc_avail(struct snd_compr_stream *stream, if (stream->direction == SND_COMPRESS_PLAYBACK) avail->avail = stream->runtime->buffer_size - avail->avail; - pr_debug("ret avail as %lld\n", avail->avail); + pr_debug("ret avail as %llu\n", avail->avail); return avail->avail; } @@ -274,8 +288,7 @@ static int snd_compr_write_data(struct snd_compr_stream *stream, (app_pointer * runtime->buffer_size); dstn = runtime->buffer + app_pointer; - pr_debug("copying %ld at %lld\n", - (unsigned long)count, app_pointer); + pr_debug("copying %lu at %llu\n", (unsigned long)count, app_pointer); if (count < runtime->buffer_size - app_pointer) { if (copy_from_user(dstn, buf, count)) return -EFAULT; @@ -318,7 +331,7 @@ static ssize_t snd_compr_write(struct file *f, const char __user *buf, } avail = snd_compr_get_avail(stream); - pr_debug("avail returned %ld\n", (unsigned long)avail); + pr_debug("avail returned %lu\n", (unsigned long)avail); /* calculate how much we can write to buffer */ if (avail > count) avail = count; @@ -374,7 +387,7 @@ static ssize_t snd_compr_read(struct file *f, char __user *buf, } avail = snd_compr_get_avail(stream); - pr_debug("avail returned %ld\n", (unsigned long)avail); + pr_debug("avail returned %lu\n", (unsigned long)avail); /* calculate how much we can read from buffer */ if (avail > count) avail = count; @@ -443,7 +456,7 @@ static __poll_t snd_compr_poll(struct file *f, poll_table *wait) #endif avail = snd_compr_get_avail(stream); - pr_debug("avail is %ld\n", (unsigned long)avail); + pr_debug("avail is %lu\n", (unsigned long)avail); /* check if we have at least one fragment to fill */ switch (runtime->state) { case SNDRV_PCM_STATE_DRAINING: @@ -726,13 +739,18 @@ snd_compr_set_metadata(struct snd_compr_stream *stream, unsigned long arg) static inline int snd_compr_tstamp(struct snd_compr_stream *stream, unsigned long arg) { - struct snd_compr_tstamp tstamp = {0}; + struct snd_compr_tstamp64 tstamp64 = { 0 }; + struct snd_compr_tstamp tstamp32 = { 0 }; int ret; - ret = snd_compr_update_tstamp(stream, &tstamp); - if (ret == 0) + ret = snd_compr_update_tstamp(stream, &tstamp64); + if (ret == 0) { + snd_compr_tstamp32_from_64(&tstamp32, &tstamp64); ret = copy_to_user((struct snd_compr_tstamp __user *)arg, - &tstamp, sizeof(tstamp)) ? -EFAULT : 0; + &tstamp32, sizeof(tstamp32)) ? + -EFAULT : + 0; + } return ret; } diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index 8a1d5cc75d6c..f197034fd594 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -173,7 +173,7 @@ struct wm_adsp_compr { struct snd_compressed_buffer size; u32 *raw_buf; - unsigned int copied_total; + u64 copied_total; unsigned int sample_rate; @@ -1860,7 +1860,7 @@ static int wm_adsp_buffer_reenable_irq(struct wm_adsp_compr_buf *buf) int wm_adsp_compr_pointer(struct snd_soc_component *component, struct snd_compr_stream *stream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { struct wm_adsp_compr *compr = stream->runtime->private_data; struct wm_adsp *dsp = compr->dsp; diff --git a/sound/soc/codecs/wm_adsp.h b/sound/soc/codecs/wm_adsp.h index 25210d404bf1..8035fda71f8d 100644 --- a/sound/soc/codecs/wm_adsp.h +++ b/sound/soc/codecs/wm_adsp.h @@ -131,7 +131,7 @@ int wm_adsp_compr_trigger(struct snd_soc_component *component, int wm_adsp_compr_handle_irq(struct wm_adsp *dsp); int wm_adsp_compr_pointer(struct snd_soc_component *component, struct snd_compr_stream *stream, - struct snd_compr_tstamp *tstamp); + struct snd_compr_tstamp64 *tstamp); int wm_adsp_compr_copy(struct snd_soc_component *component, struct snd_compr_stream *stream, char __user *buf, size_t count); diff --git a/sound/soc/intel/atom/sst-mfld-platform-compress.c b/sound/soc/intel/atom/sst-mfld-platform-compress.c index 89c9c5ad6b21..9dfb0a814b94 100644 --- a/sound/soc/intel/atom/sst-mfld-platform-compress.c +++ b/sound/soc/intel/atom/sst-mfld-platform-compress.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "sst-mfld-platform.h" /* compress stream operations */ @@ -202,15 +203,16 @@ static int sst_platform_compr_trigger(struct snd_soc_component *component, static int sst_platform_compr_pointer(struct snd_soc_component *component, struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { struct sst_runtime_stream *stream; + u64 temp_copied_total = tstamp->copied_total; - stream = cstream->runtime->private_data; + stream = cstream->runtime->private_data; stream->compr_ops->tstamp(sst->dev, stream->id, tstamp); - tstamp->byte_offset = tstamp->copied_total % - (u32)cstream->runtime->buffer_size; - pr_debug("calc bytes offset/copied bytes as %d\n", tstamp->byte_offset); + tstamp->byte_offset = + do_div(temp_copied_total, cstream->runtime->buffer_size); + pr_debug("calc bytes offset/copied bytes as %u\n", tstamp->byte_offset); return 0; } diff --git a/sound/soc/intel/atom/sst-mfld-platform.h b/sound/soc/intel/atom/sst-mfld-platform.h index 8b5777d3229a..a0e33f7f01c5 100644 --- a/sound/soc/intel/atom/sst-mfld-platform.h +++ b/sound/soc/intel/atom/sst-mfld-platform.h @@ -105,7 +105,7 @@ struct compress_sst_ops { int (*stream_pause_release)(struct device *dev, unsigned int str_id); int (*tstamp)(struct device *dev, unsigned int str_id, - struct snd_compr_tstamp *tstamp); + struct snd_compr_tstamp64 *tstamp); int (*ack)(struct device *dev, unsigned int str_id, unsigned long bytes); int (*close)(struct device *dev, unsigned int str_id); diff --git a/sound/soc/intel/atom/sst/sst_drv_interface.c b/sound/soc/intel/atom/sst/sst_drv_interface.c index 8bb27f86eb65..2646c4632ca1 100644 --- a/sound/soc/intel/atom/sst/sst_drv_interface.c +++ b/sound/soc/intel/atom/sst/sst_drv_interface.c @@ -326,7 +326,7 @@ static int sst_cdev_stream_partial_drain(struct device *dev, } static int sst_cdev_tstamp(struct device *dev, unsigned int str_id, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { struct snd_sst_tstamp fw_tstamp = {0,}; struct stream_info *stream; @@ -349,10 +349,11 @@ static int sst_cdev_tstamp(struct device *dev, unsigned int str_id, (u64)stream->num_ch * SST_GET_BYTES_PER_SAMPLE(24)); tstamp->sampling_rate = fw_tstamp.sampling_frequency; - dev_dbg(dev, "PCM = %u\n", tstamp->pcm_io_frames); - dev_dbg(dev, "Ptr Query on strid = %d copied_total %d, decodec %d\n", + dev_dbg(dev, "PCM = %llu\n", tstamp->pcm_io_frames); + dev_dbg(dev, + "Ptr Query on strid = %d copied_total %llu, decodec %llu\n", str_id, tstamp->copied_total, tstamp->pcm_frames); - dev_dbg(dev, "rendered %d\n", tstamp->pcm_io_frames); + dev_dbg(dev, "rendered %llu\n", tstamp->pcm_io_frames); return 0; } diff --git a/sound/soc/intel/avs/probes.c b/sound/soc/intel/avs/probes.c index a42736b9aa55..b5b4b0754b71 100644 --- a/sound/soc/intel/avs/probes.c +++ b/sound/soc/intel/avs/probes.c @@ -213,7 +213,7 @@ static int avs_probe_compr_trigger(struct snd_compr_stream *cstream, int cmd, } static int avs_probe_compr_pointer(struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp, struct snd_soc_dai *dai) + struct snd_compr_tstamp64 *tstamp, struct snd_soc_dai *dai) { struct hdac_ext_stream *host_stream = avs_compr_get_host_stream(cstream); struct snd_soc_pcm_stream *pstream; diff --git a/sound/soc/qcom/qdsp6/q6apm-dai.c b/sound/soc/qcom/qdsp6/q6apm-dai.c index 2cd522108221..09da26f712a6 100644 --- a/sound/soc/qcom/qdsp6/q6apm-dai.c +++ b/sound/soc/qcom/qdsp6/q6apm-dai.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -65,9 +66,9 @@ struct q6apm_dai_rtd { unsigned int pcm_size; unsigned int pcm_count; unsigned int periods; - unsigned int bytes_sent; - unsigned int bytes_received; - unsigned int copied_total; + uint64_t bytes_sent; + uint64_t bytes_received; + uint64_t copied_total; uint16_t bits_per_sample; snd_pcm_uframes_t queue_ptr; bool next_track; @@ -575,15 +576,17 @@ static int q6apm_dai_compr_get_codec_caps(struct snd_soc_component *component, static int q6apm_dai_compr_pointer(struct snd_soc_component *component, struct snd_compr_stream *stream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { struct snd_compr_runtime *runtime = stream->runtime; struct q6apm_dai_rtd *prtd = runtime->private_data; unsigned long flags; + uint64_t temp_copied_total; spin_lock_irqsave(&prtd->lock, flags); tstamp->copied_total = prtd->copied_total; - tstamp->byte_offset = prtd->copied_total % prtd->pcm_size; + temp_copied_total = tstamp->copied_total; + tstamp->byte_offset = do_div(temp_copied_total, prtd->pcm_size); spin_unlock_irqrestore(&prtd->lock, flags); return 0; @@ -760,21 +763,24 @@ static int q6apm_compr_copy(struct snd_soc_component *component, size_t copy; u32 wflags = 0; u32 app_pointer; - u32 bytes_received; + uint64_t bytes_received; + uint64_t temp_bytes_received; uint32_t bytes_to_write; - int avail, bytes_in_flight = 0; + uint64_t avail, bytes_in_flight = 0; bytes_received = prtd->bytes_received; + temp_bytes_received = bytes_received; /** * Make sure that next track data pointer is aligned at 32 bit boundary * This is a Mandatory requirement from DSP data buffers alignment */ - if (prtd->next_track) + if (prtd->next_track) { bytes_received = ALIGN(prtd->bytes_received, prtd->pcm_count); + temp_bytes_received = bytes_received; + } - app_pointer = bytes_received/prtd->pcm_size; - app_pointer = bytes_received - (app_pointer * prtd->pcm_size); + app_pointer = do_div(temp_bytes_received, prtd->pcm_size); dstn = prtd->dma_buffer.area + app_pointer; if (count < prtd->pcm_size - app_pointer) { diff --git a/sound/soc/qcom/qdsp6/q6asm-dai.c b/sound/soc/qcom/qdsp6/q6asm-dai.c index a400c9a31fea..b616ce316d2f 100644 --- a/sound/soc/qcom/qdsp6/q6asm-dai.c +++ b/sound/soc/qcom/qdsp6/q6asm-dai.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -59,9 +60,9 @@ struct q6asm_dai_rtd { unsigned int pcm_count; unsigned int pcm_irq_pos; /* IRQ position */ unsigned int periods; - unsigned int bytes_sent; - unsigned int bytes_received; - unsigned int copied_total; + uint64_t bytes_sent; + uint64_t bytes_received; + uint64_t copied_total; uint16_t bits_per_sample; uint16_t source; /* Encoding source bit mask */ struct audio_client *audio_client; @@ -1026,16 +1027,18 @@ static int q6asm_dai_compr_trigger(struct snd_soc_component *component, static int q6asm_dai_compr_pointer(struct snd_soc_component *component, struct snd_compr_stream *stream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { struct snd_compr_runtime *runtime = stream->runtime; struct q6asm_dai_rtd *prtd = runtime->private_data; unsigned long flags; + uint64_t temp_copied_total; spin_lock_irqsave(&prtd->lock, flags); tstamp->copied_total = prtd->copied_total; - tstamp->byte_offset = prtd->copied_total % prtd->pcm_size; + temp_copied_total = tstamp->copied_total; + tstamp->byte_offset = do_div(temp_copied_total, prtd->pcm_size); spin_unlock_irqrestore(&prtd->lock, flags); @@ -1050,23 +1053,26 @@ static int q6asm_compr_copy(struct snd_soc_component *component, struct q6asm_dai_rtd *prtd = runtime->private_data; unsigned long flags; u32 wflags = 0; - int avail, bytes_in_flight = 0; + uint64_t avail, bytes_in_flight = 0; void *dstn; size_t copy; u32 app_pointer; - u32 bytes_received; + uint64_t bytes_received; + uint64_t temp_bytes_received; bytes_received = prtd->bytes_received; + temp_bytes_received = bytes_received; /** * Make sure that next track data pointer is aligned at 32 bit boundary * This is a Mandatory requirement from DSP data buffers alignment */ - if (prtd->next_track) + if (prtd->next_track) { bytes_received = ALIGN(prtd->bytes_received, prtd->pcm_count); + temp_bytes_received = bytes_received; + } - app_pointer = bytes_received/prtd->pcm_size; - app_pointer = bytes_received - (app_pointer * prtd->pcm_size); + app_pointer = do_div(temp_bytes_received, prtd->pcm_size); dstn = prtd->dma_buffer.area + app_pointer; if (count < prtd->pcm_size - app_pointer) { diff --git a/sound/soc/soc-component.c b/sound/soc/soc-component.c index 65c495094024..c815fd1b3fd1 100644 --- a/sound/soc/soc-component.c +++ b/sound/soc/soc-component.c @@ -637,7 +637,7 @@ int snd_soc_component_compr_ack(struct snd_compr_stream *cstream, size_t bytes) EXPORT_SYMBOL_GPL(snd_soc_component_compr_ack); int snd_soc_component_compr_pointer(struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { struct snd_soc_pcm_runtime *rtd = cstream->private_data; struct snd_soc_component *component; diff --git a/sound/soc/soc-compress.c b/sound/soc/soc-compress.c index 01d1d6bee28c..7b81dffc6a93 100644 --- a/sound/soc/soc-compress.c +++ b/sound/soc/soc-compress.c @@ -457,7 +457,7 @@ err: } static int soc_compr_pointer(struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { struct snd_soc_pcm_runtime *rtd = cstream->private_data; int ret; diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 32f46a38682b..f231b4174b5f 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -774,7 +774,7 @@ EXPORT_SYMBOL_GPL(snd_soc_dai_compr_ack); int snd_soc_dai_compr_pointer(struct snd_soc_dai *dai, struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { int ret = 0; diff --git a/sound/soc/sof/amd/acp-probes.c b/sound/soc/sof/amd/acp-probes.c index 0d0f8ec4aed8..ce51ed108a47 100644 --- a/sound/soc/sof/amd/acp-probes.c +++ b/sound/soc/sof/amd/acp-probes.c @@ -108,7 +108,7 @@ static int acp_probes_compr_trigger(struct sof_client_dev *cdev, static int acp_probes_compr_pointer(struct sof_client_dev *cdev, struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp, + struct snd_compr_tstamp64 *tstamp, struct snd_soc_dai *dai) { struct acp_dsp_stream *stream = cstream->runtime->private_data; diff --git a/sound/soc/sof/compress.c b/sound/soc/sof/compress.c index d7b044f33d79..90b932ae3bab 100644 --- a/sound/soc/sof/compress.c +++ b/sound/soc/sof/compress.c @@ -361,7 +361,7 @@ static int sof_compr_copy(struct snd_soc_component *component, static int sof_compr_pointer(struct snd_soc_component *component, struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { struct snd_sof_pcm *spcm; struct snd_soc_pcm_runtime *rtd = cstream->private_data; diff --git a/sound/soc/sof/intel/hda-probes.c b/sound/soc/sof/intel/hda-probes.c index c645346c2c84..b06933cebc45 100644 --- a/sound/soc/sof/intel/hda-probes.c +++ b/sound/soc/sof/intel/hda-probes.c @@ -112,7 +112,7 @@ static int hda_probes_compr_trigger(struct sof_client_dev *cdev, static int hda_probes_compr_pointer(struct sof_client_dev *cdev, struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp, + struct snd_compr_tstamp64 *tstamp, struct snd_soc_dai *dai) { struct hdac_ext_stream *hext_stream = hda_compr_get_stream(cstream); diff --git a/sound/soc/sof/sof-client-probes.c b/sound/soc/sof/sof-client-probes.c index 663c0d3c314c..1353e911501a 100644 --- a/sound/soc/sof/sof-client-probes.c +++ b/sound/soc/sof/sof-client-probes.c @@ -137,7 +137,7 @@ static int sof_probes_compr_trigger(struct snd_compr_stream *cstream, int cmd, } static int sof_probes_compr_pointer(struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp, + struct snd_compr_tstamp64 *tstamp, struct snd_soc_dai *dai) { struct snd_soc_card *card = snd_soc_component_get_drvdata(dai->component); diff --git a/sound/soc/sof/sof-client-probes.h b/sound/soc/sof/sof-client-probes.h index da04d65b8d99..8713b69cda4b 100644 --- a/sound/soc/sof/sof-client-probes.h +++ b/sound/soc/sof/sof-client-probes.h @@ -4,7 +4,7 @@ #define __SOF_CLIENT_PROBES_H struct snd_compr_stream; -struct snd_compr_tstamp; +struct snd_compr_tstamp64; struct snd_compr_params; struct sof_client_dev; struct snd_soc_dai; @@ -24,7 +24,7 @@ struct sof_probes_host_ops { int (*trigger)(struct sof_client_dev *cdev, struct snd_compr_stream *cstream, int cmd, struct snd_soc_dai *dai); int (*pointer)(struct sof_client_dev *cdev, struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp, + struct snd_compr_tstamp64 *tstamp, struct snd_soc_dai *dai); }; diff --git a/sound/soc/sprd/sprd-pcm-compress.c b/sound/soc/sprd/sprd-pcm-compress.c index 57bd1a0728ac..4b6ebfa5b033 100644 --- a/sound/soc/sprd/sprd-pcm-compress.c +++ b/sound/soc/sprd/sprd-pcm-compress.c @@ -85,9 +85,9 @@ struct sprd_compr_stream { int info_size; /* Data size copied to IRAM buffer */ - int copied_total; + u64 copied_total; /* Total received data size from userspace */ - int received_total; + u64 received_total; /* Stage 0 IRAM buffer received data size */ int received_stage0; /* Stage 1 DDR buffer received data size */ @@ -513,7 +513,7 @@ static int sprd_platform_compr_trigger(struct snd_soc_component *component, static int sprd_platform_compr_pointer(struct snd_soc_component *component, struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { struct snd_compr_runtime *runtime = cstream->runtime; struct sprd_compr_stream *stream = runtime->private_data; diff --git a/sound/soc/sprd/sprd-pcm-dma.h b/sound/soc/sprd/sprd-pcm-dma.h index be5e385f5e42..c5935a1367e6 100644 --- a/sound/soc/sprd/sprd-pcm-dma.h +++ b/sound/soc/sprd/sprd-pcm-dma.h @@ -19,7 +19,7 @@ struct sprd_compr_playinfo { int total_time; int current_time; int total_data_length; - int current_data_offset; + u64 current_data_offset; }; struct sprd_compr_params { @@ -46,7 +46,7 @@ struct sprd_compr_ops { int (*stop)(int str_id); int (*pause)(int str_id); int (*pause_release)(int str_id); - int (*drain)(int received_total); + int (*drain)(u64 received_total); int (*set_params)(int str_id, struct sprd_compr_params *params); }; diff --git a/sound/soc/uniphier/aio-compress.c b/sound/soc/uniphier/aio-compress.c index 4a19d4908ffd..b18af98a552b 100644 --- a/sound/soc/uniphier/aio-compress.c +++ b/sound/soc/uniphier/aio-compress.c @@ -249,7 +249,7 @@ static int uniphier_aio_compr_trigger(struct snd_soc_component *component, static int uniphier_aio_compr_pointer(struct snd_soc_component *component, struct snd_compr_stream *cstream, - struct snd_compr_tstamp *tstamp) + struct snd_compr_tstamp64 *tstamp) { struct snd_soc_pcm_runtime *rtd = cstream->private_data; struct snd_compr_runtime *runtime = cstream->runtime; -- cgit v1.2.3 From f20a53974f79619d0ef6c9f17bb8693499fb6ebb Mon Sep 17 00:00:00 2001 From: Joris Verhaegen Date: Fri, 5 Sep 2025 10:12:55 +0100 Subject: ALSA: compress_offload: Add SNDRV_COMPRESS_TSTAMP64 ioctl The previous patch introduced the internal infrastructure for handling 64-bit timestamps. This patch exposes this capability to user-space. Define the new ioctl command SNDRV_COMPRESS_TSTAMP64, which allows applications to fetch the overflow-safe struct snd_compr_tstamp64. The ioctl dispatch table is updated to handle the new command by calling a new snd_compr_tstamp64 handler, while the legacy path is renamed to snd_compr_tstamp32 for clarity. This patch bumps the SNDRV_COMPRESS_VERSION to 0.4.0. Reviewed-by: Miller Liang Tested-by: Joris Verhaegen Signed-off-by: Joris Verhaegen Reviewed-by: Charles Keepax Acked-by: Mark Brown Acked-by: Vinod Koul Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250905091301.2711705-3-verhaegen@google.com --- include/uapi/sound/compress_offload.h | 5 +++-- sound/core/compress_offload.c | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/sound/compress_offload.h b/include/uapi/sound/compress_offload.h index abd0ea3f86ee..70b8921601f9 100644 --- a/include/uapi/sound/compress_offload.h +++ b/include/uapi/sound/compress_offload.h @@ -13,8 +13,7 @@ #include #include - -#define SNDRV_COMPRESS_VERSION SNDRV_PROTOCOL_VERSION(0, 3, 0) +#define SNDRV_COMPRESS_VERSION SNDRV_PROTOCOL_VERSION(0, 4, 0) /** * struct snd_compressed_buffer - compressed buffer * @fragment_size: size of buffer fragment in bytes @@ -208,6 +207,7 @@ struct snd_compr_task_status { * Note: only codec params can be changed runtime and stream params cant be * SNDRV_COMPRESS_GET_PARAMS: Query codec params * SNDRV_COMPRESS_TSTAMP: get the current timestamp value + * SNDRV_COMPRESS_TSTAMP64: get the current timestamp value in 64 bit format * SNDRV_COMPRESS_AVAIL: get the current buffer avail value. * This also queries the tstamp properties * SNDRV_COMPRESS_PAUSE: Pause the running stream @@ -230,6 +230,7 @@ struct snd_compr_task_status { struct snd_compr_metadata) #define SNDRV_COMPRESS_TSTAMP _IOR('C', 0x20, struct snd_compr_tstamp) #define SNDRV_COMPRESS_AVAIL _IOR('C', 0x21, struct snd_compr_avail) +#define SNDRV_COMPRESS_TSTAMP64 _IOR('C', 0x22, struct snd_compr_tstamp64) #define SNDRV_COMPRESS_PAUSE _IO('C', 0x30) #define SNDRV_COMPRESS_RESUME _IO('C', 0x31) #define SNDRV_COMPRESS_START _IO('C', 0x32) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index d3164aa07158..445220fdb6a0 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -736,18 +736,23 @@ snd_compr_set_metadata(struct snd_compr_stream *stream, unsigned long arg) return retval; } -static inline int -snd_compr_tstamp(struct snd_compr_stream *stream, unsigned long arg) +static inline int snd_compr_tstamp(struct snd_compr_stream *stream, + unsigned long arg, bool is_32bit) { struct snd_compr_tstamp64 tstamp64 = { 0 }; struct snd_compr_tstamp tstamp32 = { 0 }; + const void *copy_from = &tstamp64; + size_t copy_size = sizeof(tstamp64); int ret; ret = snd_compr_update_tstamp(stream, &tstamp64); if (ret == 0) { - snd_compr_tstamp32_from_64(&tstamp32, &tstamp64); - ret = copy_to_user((struct snd_compr_tstamp __user *)arg, - &tstamp32, sizeof(tstamp32)) ? + if (is_32bit) { + snd_compr_tstamp32_from_64(&tstamp32, &tstamp64); + copy_from = &tstamp32; + copy_size = sizeof(tstamp32); + } + ret = copy_to_user((void __user *)arg, copy_from, copy_size) ? -EFAULT : 0; } @@ -1327,7 +1332,9 @@ static long snd_compr_ioctl(struct file *f, unsigned int cmd, unsigned long arg) switch (cmd) { case SNDRV_COMPRESS_TSTAMP: - return snd_compr_tstamp(stream, arg); + return snd_compr_tstamp(stream, arg, true); + case SNDRV_COMPRESS_TSTAMP64: + return snd_compr_tstamp(stream, arg, false); case SNDRV_COMPRESS_AVAIL: return snd_compr_ioctl_avail(stream, arg); case SNDRV_COMPRESS_PAUSE: -- cgit v1.2.3 From 86eec88c5bddf9a57bfebe701d9c7a4d439aed9b Mon Sep 17 00:00:00 2001 From: Joris Verhaegen Date: Fri, 5 Sep 2025 10:12:56 +0100 Subject: ALSA: compress_offload: Add SNDRV_COMPRESS_AVAIL64 ioctl The previous patch introduced a 64-bit timestamp ioctl (SNDRV_COMPRESS_TSTAMP64). To provide a consistent API, this patch adds a corresponding 64-bit version of the SNDRV_COMPRESS_AVAIL ioctl. A new struct snd_compr_avail64 is added to the UAPI, which includes the 64-bit timestamp. The existing ioctl implementation is refactored to handle both the 32-bit and 64-bit variants. Reviewed-by: Miller Liang Tested-by: Joris Verhaegen Signed-off-by: Joris Verhaegen Acked-by: Vinod Koul Reviewed-by: Charles Keepax Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250905091301.2711705-4-verhaegen@google.com --- include/uapi/sound/compress_offload.h | 11 +++++++++ sound/core/compress_offload.c | 43 +++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/uapi/sound/compress_offload.h b/include/uapi/sound/compress_offload.h index 70b8921601f9..26f756cc2e62 100644 --- a/include/uapi/sound/compress_offload.h +++ b/include/uapi/sound/compress_offload.h @@ -84,6 +84,16 @@ struct snd_compr_avail { struct snd_compr_tstamp tstamp; } __attribute__((packed, aligned(4))); +/** + * struct snd_compr_avail64 - avail descriptor with tstamp in 64 bit format + * @avail: Number of bytes available in ring buffer for writing/reading + * @tstamp: timestamp information + */ +struct snd_compr_avail64 { + __u64 avail; + struct snd_compr_tstamp64 tstamp; +} __attribute__((packed, aligned(4))); + enum snd_compr_direction { SND_COMPRESS_PLAYBACK = 0, SND_COMPRESS_CAPTURE, @@ -231,6 +241,7 @@ struct snd_compr_task_status { #define SNDRV_COMPRESS_TSTAMP _IOR('C', 0x20, struct snd_compr_tstamp) #define SNDRV_COMPRESS_AVAIL _IOR('C', 0x21, struct snd_compr_avail) #define SNDRV_COMPRESS_TSTAMP64 _IOR('C', 0x22, struct snd_compr_tstamp64) +#define SNDRV_COMPRESS_AVAIL64 _IOR('C', 0x23, struct snd_compr_avail64) #define SNDRV_COMPRESS_PAUSE _IO('C', 0x30) #define SNDRV_COMPRESS_RESUME _IO('C', 0x31) #define SNDRV_COMPRESS_START _IO('C', 0x32) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 445220fdb6a0..da514fef45bc 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -203,13 +203,10 @@ static int snd_compr_update_tstamp(struct snd_compr_stream *stream, } static size_t snd_compr_calc_avail(struct snd_compr_stream *stream, - struct snd_compr_avail *avail) + struct snd_compr_avail64 *avail) { - struct snd_compr_tstamp64 tstamp64 = { 0 }; - memset(avail, 0, sizeof(*avail)); - snd_compr_update_tstamp(stream, &tstamp64); - snd_compr_tstamp32_from_64(&avail->tstamp, &tstamp64); + snd_compr_update_tstamp(stream, &avail->tstamp); /* Still need to return avail even if tstamp can't be filled in */ if (stream->runtime->total_bytes_available == 0 && @@ -237,28 +234,43 @@ static size_t snd_compr_calc_avail(struct snd_compr_stream *stream, if (stream->direction == SND_COMPRESS_PLAYBACK) avail->avail = stream->runtime->buffer_size - avail->avail; - pr_debug("ret avail as %llu\n", avail->avail); + pr_debug("ret avail as %zu\n", (size_t)avail->avail); return avail->avail; } static inline size_t snd_compr_get_avail(struct snd_compr_stream *stream) { - struct snd_compr_avail avail; + struct snd_compr_avail64 avail; return snd_compr_calc_avail(stream, &avail); } -static int -snd_compr_ioctl_avail(struct snd_compr_stream *stream, unsigned long arg) +static void snd_compr_avail32_from_64(struct snd_compr_avail *avail32, + const struct snd_compr_avail64 *avail64) { - struct snd_compr_avail ioctl_avail; + avail32->avail = avail64->avail; + snd_compr_tstamp32_from_64(&avail32->tstamp, &avail64->tstamp); +} + +static int snd_compr_ioctl_avail(struct snd_compr_stream *stream, + unsigned long arg, bool is_32bit) +{ + struct snd_compr_avail64 ioctl_avail64; + struct snd_compr_avail ioctl_avail32; size_t avail; + const void *copy_from = &ioctl_avail64; + size_t copy_size = sizeof(ioctl_avail64); if (stream->direction == SND_COMPRESS_ACCEL) return -EBADFD; - avail = snd_compr_calc_avail(stream, &ioctl_avail); - ioctl_avail.avail = avail; + avail = snd_compr_calc_avail(stream, &ioctl_avail64); + ioctl_avail64.avail = avail; + if (is_32bit) { + snd_compr_avail32_from_64(&ioctl_avail32, &ioctl_avail64); + copy_from = &ioctl_avail32; + copy_size = sizeof(ioctl_avail32); + } switch (stream->runtime->state) { case SNDRV_PCM_STATE_OPEN: @@ -269,8 +281,7 @@ snd_compr_ioctl_avail(struct snd_compr_stream *stream, unsigned long arg) break; } - if (copy_to_user((__u64 __user *)arg, - &ioctl_avail, sizeof(ioctl_avail))) + if (copy_to_user((__u64 __user *)arg, copy_from, copy_size)) return -EFAULT; return 0; } @@ -1336,7 +1347,9 @@ static long snd_compr_ioctl(struct file *f, unsigned int cmd, unsigned long arg) case SNDRV_COMPRESS_TSTAMP64: return snd_compr_tstamp(stream, arg, false); case SNDRV_COMPRESS_AVAIL: - return snd_compr_ioctl_avail(stream, arg); + return snd_compr_ioctl_avail(stream, arg, true); + case SNDRV_COMPRESS_AVAIL64: + return snd_compr_ioctl_avail(stream, arg, false); case SNDRV_COMPRESS_PAUSE: return snd_compr_pause(stream); case SNDRV_COMPRESS_RESUME: -- cgit v1.2.3 From d364d2ad07873dc4991b2a631a8536597272418b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:11 +0200 Subject: devres: provide devm_kmemdup_const() Provide a function similar to devm_strdup_const() but for copying blocks of memory that are likely to be placed in .rodata. Reviewed-by: Andy Shevchenko Acked-by: Greg Kroah-Hartman Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/base/devres.c | 21 +++++++++++++++++++++ include/linux/device/devres.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/drivers/base/devres.c b/drivers/base/devres.c index ff55e1bcfa30..c948c88d3956 100644 --- a/drivers/base/devres.c +++ b/drivers/base/devres.c @@ -1117,6 +1117,27 @@ void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL_GPL(devm_kmemdup); +/** + * devm_kmemdup_const - conditionally duplicate and manage a region of memory + * + * @dev: Device this memory belongs to + * @src: memory region to duplicate + * @len: memory region length, + * @gfp: GFP mask to use + * + * Return: source address if it is in .rodata or the return value of kmemdup() + * to which the function falls back otherwise. + */ +const void * +devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp) +{ + if (is_kernel_rodata((unsigned long)src)) + return src; + + return devm_kmemdup(dev, src, len, gfp); +} +EXPORT_SYMBOL_GPL(devm_kmemdup_const); + struct pages_devres { unsigned long addr; unsigned int order; diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h index ae696d10faff..8c5f57e0d613 100644 --- a/include/linux/device/devres.h +++ b/include/linux/device/devres.h @@ -80,6 +80,8 @@ void devm_kfree(struct device *dev, const void *p); void * __realloc_size(3) devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); +const void * +devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp); static inline void *devm_kmemdup_array(struct device *dev, const void *src, size_t n, size_t size, gfp_t flags) { -- cgit v1.2.3 From 11aa02d6a9c222260490f952d041dec6d7f16a92 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:22 +0200 Subject: pinctrl: allow to mark pin functions as requestable GPIOs The name of the pin function has no real meaning to pinctrl core and is there only for human readability of device properties. Some pins are muxed as GPIOs but for "strict" pinmuxers it's impossible to request them as GPIOs if they're bound to a devide - even if their function name explicitly says "gpio". Add a new field to struct pinfunction that allows to pass additional flags to pinctrl core. While we could go with a boolean "is_gpio" field, a flags field is more future-proof. If the PINFUNCTION_FLAG_GPIO is set for a given function, the pin muxed to it can be requested as GPIO even on strict pin controllers. Add a new callback to struct pinmux_ops - function_is_gpio() - that allows pinmux core to inspect a function and see if it's a GPIO one. Provide a generic implementation of this callback. Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/pinmux.c | 46 ++++++++++++++++++++++++++++++++++++++--- drivers/pinctrl/pinmux.h | 3 +++ include/linux/pinctrl/pinctrl.h | 14 +++++++++++++ include/linux/pinctrl/pinmux.h | 2 ++ 4 files changed, 62 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c index 07ec93f09334..3a8dd184ba3d 100644 --- a/drivers/pinctrl/pinmux.c +++ b/drivers/pinctrl/pinmux.c @@ -89,13 +89,20 @@ bool pinmux_can_be_used_for_gpio(struct pinctrl_dev *pctldev, unsigned int pin) { struct pin_desc *desc = pin_desc_get(pctldev, pin); const struct pinmux_ops *ops = pctldev->desc->pmxops; + const struct pinctrl_setting_mux *mux_setting; + bool func_is_gpio = false; /* Can't inspect pin, assume it can be used */ if (!desc || !ops) return true; + mux_setting = desc->mux_setting; + guard(mutex)(&desc->mux_lock); - if (ops->strict && desc->mux_usecount) + if (mux_setting && ops->function_is_gpio) + func_is_gpio = ops->function_is_gpio(pctldev, mux_setting->func); + + if (ops->strict && desc->mux_usecount && !func_is_gpio) return false; return !(ops->strict && !!desc->gpio_owner); @@ -116,7 +123,9 @@ static int pin_request(struct pinctrl_dev *pctldev, { struct pin_desc *desc; const struct pinmux_ops *ops = pctldev->desc->pmxops; + const struct pinctrl_setting_mux *mux_setting; int status = -EINVAL; + bool gpio_ok = false; desc = pin_desc_get(pctldev, pin); if (desc == NULL) { @@ -126,11 +135,21 @@ static int pin_request(struct pinctrl_dev *pctldev, goto out; } + mux_setting = desc->mux_setting; + dev_dbg(pctldev->dev, "request pin %d (%s) for %s\n", pin, desc->name, owner); scoped_guard(mutex, &desc->mux_lock) { - if ((!gpio_range || ops->strict) && + if (mux_setting) { + if (ops->function_is_gpio) + gpio_ok = ops->function_is_gpio(pctldev, + mux_setting->func); + } else { + gpio_ok = true; + } + + if ((!gpio_range || ops->strict) && !gpio_ok && desc->mux_usecount && strcmp(desc->mux_owner, owner)) { dev_err(pctldev->dev, "pin %s already requested by %s; cannot claim for %s\n", @@ -138,7 +157,7 @@ static int pin_request(struct pinctrl_dev *pctldev, goto out; } - if ((gpio_range || ops->strict) && desc->gpio_owner) { + if ((gpio_range || ops->strict) && !gpio_ok && desc->gpio_owner) { dev_err(pctldev->dev, "pin %s already requested by %s; cannot claim for %s\n", desc->name, desc->gpio_owner, owner); @@ -861,6 +880,27 @@ pinmux_generic_get_function(struct pinctrl_dev *pctldev, unsigned int selector) } EXPORT_SYMBOL_GPL(pinmux_generic_get_function); +/** + * pinmux_generic_function_is_gpio() - returns true if given function is a GPIO + * @pctldev: pin controller device + * @selector: function number + * + * Returns: + * True if given function is a GPIO, false otherwise. + */ +bool pinmux_generic_function_is_gpio(struct pinctrl_dev *pctldev, + unsigned int selector) +{ + struct function_desc *function; + + function = radix_tree_lookup(&pctldev->pin_function_tree, selector); + if (!function) + return false; + + return function->func->flags & PINFUNCTION_FLAG_GPIO; +} +EXPORT_SYMBOL_GPL(pinmux_generic_function_is_gpio); + /** * pinmux_generic_add_function() - adds a function group * @pctldev: pin controller device diff --git a/drivers/pinctrl/pinmux.h b/drivers/pinctrl/pinmux.h index 653684290666..4e826c1a5246 100644 --- a/drivers/pinctrl/pinmux.h +++ b/drivers/pinctrl/pinmux.h @@ -169,6 +169,9 @@ int pinmux_generic_remove_function(struct pinctrl_dev *pctldev, void pinmux_generic_free_functions(struct pinctrl_dev *pctldev); +bool pinmux_generic_function_is_gpio(struct pinctrl_dev *pctldev, + unsigned int selector); + #else static inline void pinmux_generic_free_functions(struct pinctrl_dev *pctldev) diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index d138e1815645..1a8084e29405 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -11,6 +11,7 @@ #ifndef __LINUX_PINCTRL_PINCTRL_H #define __LINUX_PINCTRL_PINCTRL_H +#include #include struct device; @@ -206,16 +207,20 @@ extern int pinctrl_get_group_pins(struct pinctrl_dev *pctldev, const char *pin_group, const unsigned int **pins, unsigned int *num_pins); +#define PINFUNCTION_FLAG_GPIO BIT(0) + /** * struct pinfunction - Description about a function * @name: Name of the function * @groups: An array of groups for this function * @ngroups: Number of groups in @groups + * @flags: Additional pin function flags */ struct pinfunction { const char *name; const char * const *groups; size_t ngroups; + unsigned long flags; }; /* Convenience macro to define a single named pinfunction */ @@ -226,6 +231,15 @@ struct pinfunction { .ngroups = (_ngroups), \ } +/* Same as PINCTRL_PINFUNCTION() but for the GPIO category of functions */ +#define PINCTRL_GPIO_PINFUNCTION(_name, _groups, _ngroups) \ +(struct pinfunction) { \ + .name = (_name), \ + .groups = (_groups), \ + .ngroups = (_ngroups), \ + .flags = PINFUNCTION_FLAG_GPIO, \ + } + #if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_PINCTRL) extern struct pinctrl_dev *of_pinctrl_get(struct device_node *np); #else diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index d6f7b58d6ad0..6db6c3e1ccc2 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -66,6 +66,8 @@ struct pinmux_ops { unsigned int selector, const char * const **groups, unsigned int *num_groups); + bool (*function_is_gpio) (struct pinctrl_dev *pctldev, + unsigned int selector); int (*set_mux) (struct pinctrl_dev *pctldev, unsigned int func_selector, unsigned int group_selector); int (*gpio_request_enable) (struct pinctrl_dev *pctldev, -- cgit v1.2.3 From 203a83112e097a501fbe12722b6342787497efe0 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 5 Sep 2025 11:21:50 +0200 Subject: pinctrl: generic: rename PIN_CONFIG_OUTPUT to LEVEL This generic pin config property is confusingly named so let's rename it to make things clearer. There are already drivers in the tree that use PIN_CONFIG_OUTPUT to *read* the value of an output driven pin, which is a big semantic confusion for the head: are we then reading the setting of the output or the actual value/level that is put out on the pin? We already have PIN_CONFIG_OUTPUT_ENABLE that turns on driver buffers for output, so this can by logical conclusion only drive the voltage level if it should be any different. But if we read the pin, are we then reading the *setting* of the output value or the *actual* value we can see on the line? If the pin has not first been set into output mode with PIN_CONFIG_OUTPUT_ENABLE, but is instead in some input mode or tristate, what will reading this property actually return? Reading the current users reading this property it is clear that what we read is the logical level of the pin as 0 or 1 depending on if it is low or high. Rename it to PIN_CONFIG_LEVEL so it is crystal clear that we set or read the voltage level of the pin and nothing else. Acked-by: Sudeep Holla Signed-off-by: Linus Walleij --- Documentation/driver-api/pin-control.rst | 4 ++-- drivers/gpio/gpio-rockchip.c | 2 +- drivers/pinctrl/bcm/pinctrl-bcm2835.c | 6 +++--- drivers/pinctrl/cirrus/pinctrl-madera-core.c | 4 ++-- drivers/pinctrl/mediatek/pinctrl-airoha.c | 4 ++-- drivers/pinctrl/mediatek/pinctrl-moore.c | 2 +- drivers/pinctrl/mediatek/pinctrl-mtk-common.c | 2 +- drivers/pinctrl/mediatek/pinctrl-paris.c | 4 ++-- drivers/pinctrl/meson/pinctrl-amlogic-a4.c | 6 +++--- drivers/pinctrl/meson/pinctrl-meson.c | 6 +++--- drivers/pinctrl/nomadik/pinctrl-abx500.c | 6 +++--- drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c | 6 +++--- drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c | 6 +++--- drivers/pinctrl/pinconf-generic.c | 6 +++--- drivers/pinctrl/pinctrl-at91-pio4.c | 2 +- drivers/pinctrl/pinctrl-aw9523.c | 6 +++--- drivers/pinctrl/pinctrl-cy8c95x0.c | 2 +- drivers/pinctrl/pinctrl-ingenic.c | 4 ++-- drivers/pinctrl/pinctrl-k210.c | 2 +- drivers/pinctrl/pinctrl-microchip-sgpio.c | 4 ++-- drivers/pinctrl/pinctrl-ocelot.c | 4 ++-- drivers/pinctrl/pinctrl-pic32.c | 4 ++-- drivers/pinctrl/pinctrl-rk805.c | 4 ++-- drivers/pinctrl/pinctrl-rockchip.c | 6 +++--- drivers/pinctrl/pinctrl-rp1.c | 2 +- drivers/pinctrl/pinctrl-scmi.c | 2 +- drivers/pinctrl/pinctrl-stmfx.c | 4 ++-- drivers/pinctrl/pinctrl-sx150x.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-lpass-lpi.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-msm.c | 6 +++--- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-spmi-mpp.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c | 4 ++-- drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c | 4 ++-- drivers/pinctrl/renesas/pinctrl-rza1.c | 2 +- drivers/pinctrl/stm32/pinctrl-stm32.c | 2 +- drivers/pinctrl/sunplus/sppctl.c | 4 ++-- include/linux/pinctrl/pinconf-generic.h | 12 ++++++++---- sound/hda/codecs/side-codecs/cirrus_scodec_test.c | 2 +- 39 files changed, 91 insertions(+), 87 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/pin-control.rst b/Documentation/driver-api/pin-control.rst index 27ea1236307e..8208924e513e 100644 --- a/Documentation/driver-api/pin-control.rst +++ b/Documentation/driver-api/pin-control.rst @@ -863,7 +863,7 @@ has to be handled by the ```` interface. Instead view thi a certain pin config setting. Look in e.g. ```` and you find this in the documentation: - PIN_CONFIG_OUTPUT: + PIN_CONFIG_LEVEL: this will configure the pin in output, use argument 1 to indicate high level, argument 0 to indicate low level. @@ -897,7 +897,7 @@ And your machine configuration may look like this: }; static unsigned long uart_sleep_mode[] = { - PIN_CONF_PACKED(PIN_CONFIG_OUTPUT, 0), + PIN_CONF_PACKED(PIN_CONFIG_LEVEL, 0), }; static struct pinctrl_map pinmap[] __initdata = { diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c index bcfc323a8315..47174eb3ba76 100644 --- a/drivers/gpio/gpio-rockchip.c +++ b/drivers/gpio/gpio-rockchip.c @@ -769,7 +769,7 @@ static int rockchip_gpio_probe(struct platform_device *pdev) list_del(&cfg->head); switch (cfg->param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = rockchip_gpio_direction_output(&bank->gpio_chip, cfg->pin, cfg->arg); if (ret) dev_warn(dev, "setting output pin %u to %u failed\n", cfg->pin, diff --git a/drivers/pinctrl/bcm/pinctrl-bcm2835.c b/drivers/pinctrl/bcm/pinctrl-bcm2835.c index 7dbf079739bc..c165674c5b4d 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm2835.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm2835.c @@ -1023,7 +1023,7 @@ static int bcm2835_pinconf_get(struct pinctrl_dev *pctldev, /* No way to read back bias config in HW */ switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (fsel != BCM2835_FSEL_GPIO_OUT) return -EINVAL; @@ -1091,7 +1091,7 @@ static int bcm2835_pinconf_set(struct pinctrl_dev *pctldev, break; /* Set output-high or output-low */ - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bcm2835_gpio_set_bit(pc, arg ? GPSET0 : GPCLR0, pin); break; @@ -1202,7 +1202,7 @@ static int bcm2711_pinconf_set(struct pinctrl_dev *pctldev, break; /* Set output-high or output-low */ - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bcm2835_gpio_set_bit(pc, arg ? GPSET0 : GPCLR0, pin); break; diff --git a/drivers/pinctrl/cirrus/pinctrl-madera-core.c b/drivers/pinctrl/cirrus/pinctrl-madera-core.c index d19ef13224cc..1d9481b17091 100644 --- a/drivers/pinctrl/cirrus/pinctrl-madera-core.c +++ b/drivers/pinctrl/cirrus/pinctrl-madera-core.c @@ -804,7 +804,7 @@ static int madera_pin_conf_get(struct pinctrl_dev *pctldev, unsigned int pin, if (conf[0] & MADERA_GP1_IP_CFG_MASK) result = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if ((conf[1] & MADERA_GP1_DIR_MASK) && (conf[0] & MADERA_GP1_LVL_MASK)) result = 1; @@ -902,7 +902,7 @@ static int madera_pin_conf_set(struct pinctrl_dev *pctldev, unsigned int pin, mask[1] |= MADERA_GP1_DIR_MASK; conf[1] |= MADERA_GP1_DIR; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: val = pinconf_to_config_argument(*configs); mask[0] |= MADERA_GP1_LVL_MASK; if (val) diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c index 8fb3b65a1b77..c1e736786f5c 100644 --- a/drivers/pinctrl/mediatek/pinctrl-airoha.c +++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c @@ -2765,7 +2765,7 @@ static int airoha_pinconf_set(struct pinctrl_dev *pctrl_dev, break; case PIN_CONFIG_OUTPUT_ENABLE: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: { + case PIN_CONFIG_LEVEL: { bool input = param == PIN_CONFIG_INPUT_ENABLE; int err; @@ -2774,7 +2774,7 @@ static int airoha_pinconf_set(struct pinctrl_dev *pctrl_dev, if (err) return err; - if (param == PIN_CONFIG_OUTPUT) { + if (param == PIN_CONFIG_LEVEL) { err = airoha_pinconf_set_pin_value(pctrl_dev, pin, !!arg); if (err) diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.c b/drivers/pinctrl/mediatek/pinctrl-moore.c index 11dc525eb3a2..70f608347a5f 100644 --- a/drivers/pinctrl/mediatek/pinctrl-moore.c +++ b/drivers/pinctrl/mediatek/pinctrl-moore.c @@ -332,7 +332,7 @@ static int mtk_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, goto err; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_DIR, MTK_OUTPUT); if (err) diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c index d10306024111..d6a46fe0cda8 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c +++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c @@ -384,7 +384,7 @@ static int mtk_pconf_parse_conf(struct pinctrl_dev *pctldev, mtk_pmx_gpio_set_direction(pctldev, NULL, pin, true); ret = mtk_pconf_set_ies_smt(pctl, pin, arg, param); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: mtk_gpio_set(pctl->chip, pin, arg); ret = mtk_pmx_gpio_set_direction(pctldev, NULL, pin, false); break; diff --git a/drivers/pinctrl/mediatek/pinctrl-paris.c b/drivers/pinctrl/mediatek/pinctrl-paris.c index 3e714554789d..6bf37d8085fa 100644 --- a/drivers/pinctrl/mediatek/pinctrl-paris.c +++ b/drivers/pinctrl/mediatek/pinctrl-paris.c @@ -169,7 +169,7 @@ static int mtk_pinconf_get(struct pinctrl_dev *pctldev, if (!ret) err = -EINVAL; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_get_value(hw, desc, PINCTRL_PIN_REG_DIR, &ret); if (err) break; @@ -292,7 +292,7 @@ static int mtk_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, /* regard all non-zero value as enable */ err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_SR, !!arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_DO, arg); if (err) diff --git a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c index e34e984c2b38..d2b15f133564 100644 --- a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c +++ b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c @@ -422,7 +422,7 @@ static int aml_pinconf_get(struct pinctrl_dev *pcdev, unsigned int pin, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = aml_pinconf_get_output(info, pin); if (ret <= 0) return -EINVAL; @@ -568,7 +568,7 @@ static int aml_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, switch (param) { case PIN_CONFIG_DRIVE_STRENGTH_UA: case PIN_CONFIG_OUTPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pinconf_to_config_argument(configs[i]); break; @@ -592,7 +592,7 @@ static int aml_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: ret = aml_pinconf_set_output(info, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = aml_pinconf_set_output_drive(info, pin, arg); break; default: diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c index 277e9c40490d..18295b15ecd9 100644 --- a/drivers/pinctrl/meson/pinctrl-meson.c +++ b/drivers/pinctrl/meson/pinctrl-meson.c @@ -360,7 +360,7 @@ static int meson_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, switch (param) { case PIN_CONFIG_DRIVE_STRENGTH_UA: case PIN_CONFIG_OUTPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pinconf_to_config_argument(configs[i]); break; @@ -384,7 +384,7 @@ static int meson_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: ret = meson_pinconf_set_output(pc, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = meson_pinconf_set_output_drive(pc, pin, arg); break; default: @@ -502,7 +502,7 @@ static int meson_pinconf_get(struct pinctrl_dev *pcdev, unsigned int pin, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = meson_pinconf_get_output(pc, pin); if (ret <= 0) return -EINVAL; diff --git a/drivers/pinctrl/nomadik/pinctrl-abx500.c b/drivers/pinctrl/nomadik/pinctrl-abx500.c index 7b5f94d8cb23..fc7ebeda8440 100644 --- a/drivers/pinctrl/nomadik/pinctrl-abx500.c +++ b/drivers/pinctrl/nomadik/pinctrl-abx500.c @@ -860,8 +860,8 @@ static int abx500_pin_config_set(struct pinctrl_dev *pctldev, dev_dbg(chip->parent, "pin %d [%#lx]: %s %s\n", pin, configs[i], - (param == PIN_CONFIG_OUTPUT) ? "output " : "input", - (param == PIN_CONFIG_OUTPUT) ? + (param == PIN_CONFIG_LEVEL) ? "output " : "input", + (param == PIN_CONFIG_LEVEL) ? str_high_low(argument) : (argument ? "pull up" : "pull down")); @@ -907,7 +907,7 @@ static int abx500_pin_config_set(struct pinctrl_dev *pctldev, ret = abx500_gpio_direction_input(chip, offset); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = abx500_gpio_direction_output(chip, offset, argument); break; diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c index c2ca71ebb973..7f45c2897c3f 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c @@ -1700,13 +1700,13 @@ static int npcm7xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, else if (param == PIN_CONFIG_BIAS_PULL_DOWN) rc = (!pu && pd); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: ie = ioread32(bank->base + NPCM7XX_GP_N_IEM) & pinmask; oe = ioread32(bank->base + NPCM7XX_GP_N_OE) & pinmask; if (param == PIN_CONFIG_INPUT_ENABLE) rc = (ie && !oe); - else if (param == PIN_CONFIG_OUTPUT) + else if (param == PIN_CONFIG_LEVEL) rc = (!ie && oe); break; case PIN_CONFIG_DRIVE_PUSH_PULL: @@ -1765,7 +1765,7 @@ static int npcm7xx_config_set_one(struct npcm7xx_pinctrl *npcm, iowrite32(gpio, bank->base + NPCM7XX_GP_N_OEC); bank->direction_input(&bank->chip.gc, pin % bank->chip.gc.ngpio); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bank->direction_output(&bank->chip.gc, pin % bank->chip.gc.ngpio, arg); iowrite32(gpio, bank->base + NPCM7XX_GP_N_OES); break; diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c index 0f155a685bba..920dd2077925 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c @@ -2187,13 +2187,13 @@ static int npcm8xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, else if (param == PIN_CONFIG_BIAS_PULL_DOWN) rc = !pu && pd; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: ie = ioread32(bank->base + NPCM8XX_GP_N_IEM) & pinmask; oe = ioread32(bank->base + NPCM8XX_GP_N_OE) & pinmask; if (param == PIN_CONFIG_INPUT_ENABLE) rc = (ie && !oe); - else if (param == PIN_CONFIG_OUTPUT) + else if (param == PIN_CONFIG_LEVEL) rc = (!ie && oe); break; case PIN_CONFIG_DRIVE_PUSH_PULL: @@ -2251,7 +2251,7 @@ static int npcm8xx_config_set_one(struct npcm8xx_pinctrl *npcm, iowrite32(gpio, bank->base + NPCM8XX_GP_N_OEC); bank->direction_input(&bank->chip.gc, pin % bank->chip.gc.ngpio); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bank->direction_output(&bank->chip.gc, pin % bank->chip.gc.ngpio, arg); iowrite32(gpio, bank->base + NPCM8XX_GP_N_OES); break; diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index d67838afb085..5de6ff62c69b 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -48,7 +48,7 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT_ENABLE, "input schmitt enabled", NULL, false), PCONFDUMP(PIN_CONFIG_MODE_LOW_POWER, "pin low power", "mode", true), PCONFDUMP(PIN_CONFIG_OUTPUT_ENABLE, "output enabled", NULL, false), - PCONFDUMP(PIN_CONFIG_OUTPUT, "pin output", "level", true), + PCONFDUMP(PIN_CONFIG_LEVEL, "pin output", "level", true), PCONFDUMP(PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, "output impedance", "ohms", true), PCONFDUMP(PIN_CONFIG_POWER_SOURCE, "pin power source", "selector", true), PCONFDUMP(PIN_CONFIG_SLEEP_HARDWARE_STATE, "sleep hardware state", NULL, false), @@ -183,9 +183,9 @@ static const struct pinconf_generic_params dt_params[] = { { "low-power-enable", PIN_CONFIG_MODE_LOW_POWER, 1 }, { "output-disable", PIN_CONFIG_OUTPUT_ENABLE, 0 }, { "output-enable", PIN_CONFIG_OUTPUT_ENABLE, 1 }, - { "output-high", PIN_CONFIG_OUTPUT, 1, }, + { "output-high", PIN_CONFIG_LEVEL, 1, }, { "output-impedance-ohms", PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, 0 }, - { "output-low", PIN_CONFIG_OUTPUT, 0, }, + { "output-low", PIN_CONFIG_LEVEL, 0, }, { "power-source", PIN_CONFIG_POWER_SOURCE, 0 }, { "sleep-hardware-state", PIN_CONFIG_SLEEP_HARDWARE_STATE, 0 }, { "slew-rate", PIN_CONFIG_SLEW_RATE, 0 }, diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c index 35ea3414cb96..ec5351fc282e 100644 --- a/drivers/pinctrl/pinctrl-at91-pio4.c +++ b/drivers/pinctrl/pinctrl-at91-pio4.c @@ -862,7 +862,7 @@ static int atmel_conf_pin_config_group_set(struct pinctrl_dev *pctldev, conf |= ATMEL_PIO_IFSCEN_MASK; } break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: conf |= ATMEL_PIO_DIR_MASK; bank = ATMEL_PIO_BANK(pin_id); pin = ATMEL_PIO_LINE(pin_id); diff --git a/drivers/pinctrl/pinctrl-aw9523.c b/drivers/pinctrl/pinctrl-aw9523.c index 890b83fddea3..479553a79216 100644 --- a/drivers/pinctrl/pinctrl-aw9523.c +++ b/drivers/pinctrl/pinctrl-aw9523.c @@ -215,7 +215,7 @@ static int aw9523_pcfg_param_to_reg(enum pin_config_param pcp, int pin, u8 *r) case PIN_CONFIG_OUTPUT_ENABLE: reg = AW9523_REG_CONF_STATE(pin); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: reg = AW9523_REG_OUT_STATE(pin); break; default: @@ -249,7 +249,7 @@ static int aw9523_pconf_get(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: val &= BIT(regbit); break; case PIN_CONFIG_BIAS_PULL_DOWN: @@ -301,7 +301,7 @@ static int aw9523_pconf_set(struct pinctrl_dev *pctldev, unsigned int pin, goto end; switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* First, enable pin output */ rc = regmap_update_bits(awi->regmap, AW9523_REG_CONF_STATE(pin), diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index cf7f80497fde..a4b04bf6d081 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -808,7 +808,7 @@ static int cy8c95x0_gpio_get_pincfg(struct cy8c95x0_pinctrl *chip, case PIN_CONFIG_MODE_PWM: reg = CY8C95X0_SELPWM; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: reg = CY8C95X0_OUTPUT; break; case PIN_CONFIG_OUTPUT_ENABLE: diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index e5b24fab12e1..c7f14546de05 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -4264,7 +4264,7 @@ static int ingenic_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_BIAS_PULL_UP: case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_INPUT_SCHMITT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_SLEW_RATE: continue; default: @@ -4305,7 +4305,7 @@ static int ingenic_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, ingenic_set_schmitt_trigger(jzpc, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = pinctrl_gpio_direction_output(jzpc->gc, pin - jzpc->gc->base); if (ret) diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c index 66c04120c29d..ddd6d6bfd513 100644 --- a/drivers/pinctrl/pinctrl-k210.c +++ b/drivers/pinctrl/pinctrl-k210.c @@ -551,7 +551,7 @@ static int k210_pinconf_set_param(struct pinctrl_dev *pctldev, else val &= ~K210_PC_ST; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: k210_pinmux_set_pin_function(pctldev, pin, K210_PCF_CONSTANT); val = readl(&pdata->fpioa->pins[pin]); val |= K210_PC_MODE_OUT; diff --git a/drivers/pinctrl/pinctrl-microchip-sgpio.c b/drivers/pinctrl/pinctrl-microchip-sgpio.c index 069cc665a402..b6363f3cdce9 100644 --- a/drivers/pinctrl/pinctrl-microchip-sgpio.c +++ b/drivers/pinctrl/pinctrl-microchip-sgpio.c @@ -371,7 +371,7 @@ static int sgpio_pinconf_get(struct pinctrl_dev *pctldev, val = !bank->is_input; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (bank->is_input) return -EINVAL; val = sgpio_output_get(priv, &addr); @@ -402,7 +402,7 @@ static int sgpio_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, arg = pinconf_to_config_argument(configs[cfg]); switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (bank->is_input) return -EINVAL; err = sgpio_output_set(priv, &addr, arg); diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c index b82bf83fed25..70da3f37567a 100644 --- a/drivers/pinctrl/pinctrl-ocelot.c +++ b/drivers/pinctrl/pinctrl-ocelot.c @@ -1656,7 +1656,7 @@ static int ocelot_pinconf_get(struct pinctrl_dev *pctldev, return err; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = regmap_read(info->map, REG(OCELOT_GPIO_OUT, info, pin), &val); if (err) @@ -1735,7 +1735,7 @@ static int ocelot_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: p = pin % 32; if (arg) regmap_write(info->map, diff --git a/drivers/pinctrl/pinctrl-pic32.c b/drivers/pinctrl/pinctrl-pic32.c index 37c2bf752154..e8b481e87c77 100644 --- a/drivers/pinctrl/pinctrl-pic32.c +++ b/drivers/pinctrl/pinctrl-pic32.c @@ -1905,7 +1905,7 @@ static int pic32_pinconf_get(struct pinctrl_dev *pctldev, unsigned pin, case PIN_CONFIG_INPUT_ENABLE: arg = !!(readl(bank->reg_base + TRIS_REG) & mask); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = !(readl(bank->reg_base + TRIS_REG) & mask); break; default: @@ -1960,7 +1960,7 @@ static int pic32_pinconf_set(struct pinctrl_dev *pctldev, unsigned pin, case PIN_CONFIG_INPUT_ENABLE: pic32_gpio_direction_input(&bank->gpio_chip, offset); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pic32_gpio_direction_output(&bank->gpio_chip, offset, arg); break; diff --git a/drivers/pinctrl/pinctrl-rk805.c b/drivers/pinctrl/pinctrl-rk805.c index 3acf770316c1..22f576337faa 100644 --- a/drivers/pinctrl/pinctrl-rk805.c +++ b/drivers/pinctrl/pinctrl-rk805.c @@ -541,7 +541,7 @@ static int rk805_pinconf_get(struct pinctrl_dev *pctldev, u32 arg = 0; switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: arg = rk805_gpio_get(&pci->gpio_chip, pin); break; @@ -568,7 +568,7 @@ static int rk805_pinconf_set(struct pinctrl_dev *pctldev, arg = pinconf_to_config_argument(configs[i]); switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rk805_gpio_set(&pci->gpio_chip, pin, arg); rk805_pmx_gpio_set_direction(pctldev, NULL, pin, false); break; diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 930c454e0cec..7a68a6237649 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -3272,7 +3272,7 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, param = pinconf_to_config_param(configs[i]); arg = pinconf_to_config_argument(configs[i]); - if (param == PIN_CONFIG_OUTPUT || param == PIN_CONFIG_INPUT_ENABLE) { + if (param == PIN_CONFIG_LEVEL || param == PIN_CONFIG_INPUT_ENABLE) { /* * Check for gpio driver not being probed yet. * The lock makes sure that either gpio-probe has completed @@ -3313,7 +3313,7 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, if (rc) return rc; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rc = rockchip_set_mux(bank, pin - bank->pin_base, RK_FUNC_GPIO); if (rc != RK_FUNC_GPIO) @@ -3392,7 +3392,7 @@ static int rockchip_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rc = rockchip_get_mux(bank, pin - bank->pin_base); if (rc != RK_FUNC_GPIO) return -EINVAL; diff --git a/drivers/pinctrl/pinctrl-rp1.c b/drivers/pinctrl/pinctrl-rp1.c index 605105aa0cbc..ffc2f0b460a6 100644 --- a/drivers/pinctrl/pinctrl-rp1.c +++ b/drivers/pinctrl/pinctrl-rp1.c @@ -1440,7 +1440,7 @@ static int rp1_pinconf_set(struct pinctrl_dev *pctldev, unsigned int offset, rp1_output_enable(pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rp1_set_value(pin, arg); rp1_set_dir(pin, RP1_DIR_OUTPUT); rp1_set_fsel(pin, RP1_FSEL_GPIO); diff --git a/drivers/pinctrl/pinctrl-scmi.c b/drivers/pinctrl/pinctrl-scmi.c index 383681041e4c..d14528b9aa31 100644 --- a/drivers/pinctrl/pinctrl-scmi.c +++ b/drivers/pinctrl/pinctrl-scmi.c @@ -253,7 +253,7 @@ static int pinctrl_scmi_map_pinconf_type(enum pin_config_param param, case PIN_CONFIG_MODE_LOW_POWER: *type = SCMI_PIN_LOW_POWER_MODE; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: *type = SCMI_PIN_OUTPUT_VALUE; break; case PIN_CONFIG_OUTPUT_ENABLE: diff --git a/drivers/pinctrl/pinctrl-stmfx.c b/drivers/pinctrl/pinctrl-stmfx.c index c89b99003b71..03ee13844b50 100644 --- a/drivers/pinctrl/pinctrl-stmfx.c +++ b/drivers/pinctrl/pinctrl-stmfx.c @@ -267,7 +267,7 @@ static int stmfx_pinconf_get(struct pinctrl_dev *pctldev, if ((!dir && !type) || (dir && type)) arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (dir) return -EINVAL; @@ -334,7 +334,7 @@ static int stmfx_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, if (ret) return ret; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = stmfx_gpio_direction_output(&pctl->gpio_chip, pin, arg); if (ret) diff --git a/drivers/pinctrl/pinctrl-sx150x.c b/drivers/pinctrl/pinctrl-sx150x.c index b613568b42b7..1d6760ffe809 100644 --- a/drivers/pinctrl/pinctrl-sx150x.c +++ b/drivers/pinctrl/pinctrl-sx150x.c @@ -611,7 +611,7 @@ static int sx150x_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, if (sx150x_pin_is_oscio(pctl, pin)) { switch (param) { case PIN_CONFIG_DRIVE_PUSH_PULL: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = regmap_read(pctl->regmap, pctl->data->pri.x789.reg_clock, &data); @@ -705,7 +705,7 @@ static int sx150x_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, } break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = sx150x_gpio_get_direction(&pctl->gpio, pin); if (ret < 0) return ret; @@ -744,7 +744,7 @@ static int sx150x_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, arg = pinconf_to_config_argument(configs[i]); if (sx150x_pin_is_oscio(pctl, pin)) { - if (param == PIN_CONFIG_OUTPUT) { + if (param == PIN_CONFIG_LEVEL) { ret = sx150x_gpio_direction_output(&pctl->gpio, pin, arg); if (ret < 0) @@ -816,7 +816,7 @@ static int sx150x_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = sx150x_gpio_direction_output(&pctl->gpio, pin, arg); if (ret < 0) diff --git a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c index 54c77e0b96e9..28eb92a2b249 100644 --- a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c +++ b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c @@ -174,7 +174,7 @@ static int lpi_config_get(struct pinctrl_dev *pctldev, arg = 1; break; case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (is_out) arg = 1; break; @@ -252,7 +252,7 @@ static int lpi_config_set(struct pinctrl_dev *pctldev, unsigned int group, case PIN_CONFIG_INPUT_ENABLE: output_enabled = false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: output_enabled = true; value = arg; break; @@ -314,7 +314,7 @@ static int lpi_gpio_direction_output(struct gpio_chip *chip, struct lpi_pinctrl *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return lpi_config_set(state->ctrl, pin, &config, 1); } @@ -332,7 +332,7 @@ static int lpi_gpio_set(struct gpio_chip *chip, unsigned int pin, int value) struct lpi_pinctrl *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return lpi_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 1751d838ce95..67525d542c5b 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -295,7 +295,7 @@ static int msm_config_reg(struct msm_pinctrl *pctrl, *bit = g->drv_bit; *mask = 7; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: case PIN_CONFIG_OUTPUT_ENABLE: *bit = g->oe_bit; @@ -385,7 +385,7 @@ static int msm_config_group_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_DRIVE_STRENGTH: arg = msm_regval_to_drive(arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* Pin is not output */ if (!arg) return -EINVAL; @@ -464,7 +464,7 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev, else arg = (arg / 2) - 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* set output value */ raw_spin_lock_irqsave(&pctrl->lock, flags); val = msm_readl_io(pctrl, g); diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index b7b15874e488..485b68cc93f8 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -438,7 +438,7 @@ static int pmic_gpio_config_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_OUTPUT_ENABLE: arg = pad->output_enabled; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pad->out_value; break; case PMIC_GPIO_CONF_PULL_UP: @@ -530,7 +530,7 @@ static int pmic_gpio_config_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: pad->output_enabled = arg ? true : false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pad->output_enabled = true; pad->out_value = arg; break; @@ -737,7 +737,7 @@ static int pmic_gpio_direction_output(struct gpio_chip *chip, struct pmic_gpio_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return pmic_gpio_config_set(state->ctrl, pin, &config, 1); } @@ -769,7 +769,7 @@ static int pmic_gpio_set(struct gpio_chip *chip, unsigned int pin, int value) struct pmic_gpio_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return pmic_gpio_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c b/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c index 22d76b1013a3..64f8024a865c 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c @@ -370,7 +370,7 @@ static int pmic_mpp_config_get(struct pinctrl_dev *pctldev, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pad->out_value; break; case PMIC_MPP_CONF_DTEST_SELECTOR: @@ -447,7 +447,7 @@ static int pmic_mpp_config_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_INPUT_ENABLE: pad->input_enabled = arg ? true : false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pad->output_enabled = true; pad->out_value = arg; break; @@ -576,7 +576,7 @@ static int pmic_mpp_direction_output(struct gpio_chip *chip, struct pmic_mpp_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return pmic_mpp_config_set(state->ctrl, pin, &config, 1); } @@ -605,7 +605,7 @@ static int pmic_mpp_set(struct gpio_chip *chip, unsigned int pin, int value) struct pmic_mpp_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return pmic_mpp_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c index fb37b1c1acb4..5c966d51eda7 100644 --- a/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c @@ -282,7 +282,7 @@ static int pm8xxx_pin_config_get(struct pinctrl_dev *pctldev, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (pin->mode & PM8XXX_GPIO_MODE_OUTPUT) arg = pin->output_value; else @@ -364,7 +364,7 @@ static int pm8xxx_pin_config_set(struct pinctrl_dev *pctldev, pin->mode = PM8XXX_GPIO_MODE_INPUT; banks |= BIT(0) | BIT(1); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pin->mode = PM8XXX_GPIO_MODE_OUTPUT; pin->output_value = !!arg; banks |= BIT(0) | BIT(1); diff --git a/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c b/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c index 6103849af042..7970fa6e1557 100644 --- a/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c +++ b/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c @@ -337,7 +337,7 @@ static int pm8xxx_pin_config_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_INPUT_ENABLE: arg = pin->input; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pin->output_value; break; case PIN_CONFIG_POWER_SOURCE: @@ -392,7 +392,7 @@ static int pm8xxx_pin_config_set(struct pinctrl_dev *pctldev, case PIN_CONFIG_INPUT_ENABLE: pin->input = true; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pin->output = true; pin->output_value = !!arg; break; diff --git a/drivers/pinctrl/renesas/pinctrl-rza1.c b/drivers/pinctrl/renesas/pinctrl-rza1.c index 70f22e0ef307..f24e5915cbe4 100644 --- a/drivers/pinctrl/renesas/pinctrl-rza1.c +++ b/drivers/pinctrl/renesas/pinctrl-rza1.c @@ -933,7 +933,7 @@ static int rza1_parse_pinmux_node(struct rza1_pinctrl *rza1_pctl, case PIN_CONFIG_INPUT_ENABLE: pinmux_flags |= MUX_FLAGS_SWIO_INPUT; break; - case PIN_CONFIG_OUTPUT: /* for DT backwards compatibility */ + case PIN_CONFIG_LEVEL: /* for DT backwards compatibility */ case PIN_CONFIG_OUTPUT_ENABLE: pinmux_flags |= MUX_FLAGS_SWIO_OUTPUT; break; diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 823c8fe758e2..3ebb468de830 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -1236,7 +1236,7 @@ static int stm32_pconf_parse_conf(struct pinctrl_dev *pctldev, case PIN_CONFIG_BIAS_PULL_DOWN: ret = stm32_pconf_set_bias(bank, offset, 2); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: __stm32_gpio_set(bank, offset, arg); ret = stm32_pmx_gpio_set_direction(pctldev, range, pin, false); break; diff --git a/drivers/pinctrl/sunplus/sppctl.c b/drivers/pinctrl/sunplus/sppctl.c index 3e924aa86cc2..fabe7efaa837 100644 --- a/drivers/pinctrl/sunplus/sppctl.c +++ b/drivers/pinctrl/sunplus/sppctl.c @@ -488,7 +488,7 @@ static int sppctl_gpio_set_config(struct gpio_chip *chip, unsigned int offset, case PIN_CONFIG_INPUT_ENABLE: break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: return sppctl_gpio_direction_output(chip, offset, 0); case PIN_CONFIG_PERSIST_STATE: @@ -580,7 +580,7 @@ static int sppctl_pin_config_get(struct pinctrl_dev *pctldev, unsigned int pin, arg = 0; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (!sppctl_first_get(&pctl->spp_gchip->chip, pin)) return -EINVAL; if (!sppctl_master_get(&pctl->spp_gchip->chip, pin)) diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 1bcf071b860e..d9245ecec71d 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -88,9 +88,13 @@ struct pinctrl_map; * passed in the argument on a custom form, else just use argument 1 * to indicate low power mode, argument 0 turns low power mode off. * @PIN_CONFIG_MODE_PWM: this will configure the pin for PWM - * @PIN_CONFIG_OUTPUT: this will configure the pin as an output and drive a - * value on the line. Use argument 1 to indicate high level, argument 0 to - * indicate low level. (Please see Documentation/driver-api/pin-control.rst, + * @PIN_CONFIG_LEVEL: setting this will configure the pin as an output and + * drive a value on the line. Use argument 1 to indicate high level, + * argument 0 to indicate low level. Conversely the value of the line + * can be read using this parameter, if and only if that value can be + * represented as a binary 0 or 1 where 0 indicate a low voltage level + * and 1 indicate a high voltage level. + * (Please see Documentation/driver-api/pin-control.rst, * section "GPIO mode pitfalls" for a discussion around this parameter.) * @PIN_CONFIG_OUTPUT_ENABLE: this will enable the pin's output mode * without driving a value there. For most platforms this reduces to @@ -137,7 +141,7 @@ enum pin_config_param { PIN_CONFIG_INPUT_SCHMITT_UV, PIN_CONFIG_MODE_LOW_POWER, PIN_CONFIG_MODE_PWM, - PIN_CONFIG_OUTPUT, + PIN_CONFIG_LEVEL, PIN_CONFIG_OUTPUT_ENABLE, PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, PIN_CONFIG_PERSIST_STATE, diff --git a/sound/hda/codecs/side-codecs/cirrus_scodec_test.c b/sound/hda/codecs/side-codecs/cirrus_scodec_test.c index 9ba14c09c07f..3cca750857b6 100644 --- a/sound/hda/codecs/side-codecs/cirrus_scodec_test.c +++ b/sound/hda/codecs/side-codecs/cirrus_scodec_test.c @@ -69,7 +69,7 @@ static int cirrus_scodec_test_gpio_set_config(struct gpio_chip *gc, unsigned long config) { switch (pinconf_to_config_param(config)) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_OUTPUT_ENABLE: return -EOPNOTSUPP; default: -- cgit v1.2.3 From 3ad2a7b9b15d5072139a20be84adb36776eb6c9b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 5 Jun 2025 16:23:57 -0700 Subject: hwmon: Serialize accesses in hwmon core Implement locking in the hardware monitoring core for drivers using the _with_info() API functions. Most hardware monitoring drivers need to support locking to protect against parallel accesses from userspace. With older API functions, such locking had to be implemented in the driver code since sysfs attributes were created by the driver. However, the _with_info() API creates sysfs attributes in the hardware monitoring core. This makes it easy to move the locking primitives into that code. This has the benefit of simplifying driver code while at the same time reducing the risk of incomplete of bad locking implementations in hardware monitoring drivers. While this means that all accesses are forced to be synchronized, this has little if any practical impact since accesses are expected to be low frequency and are typically synchronized from userspace anyway since only a single process is accessing the data. On top of that, many drivers use regmap, which also has its own locking scheme and already serializes accesses. Signed-off-by: Guenter Roeck --- Documentation/hwmon/hwmon-kernel-api.rst | 10 ++++++++ drivers/hwmon/hwmon.c | 42 ++++++++++++++++++++++++++------ include/linux/hwmon.h | 3 +++ 3 files changed, 48 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/Documentation/hwmon/hwmon-kernel-api.rst b/Documentation/hwmon/hwmon-kernel-api.rst index 037b69c23cb5..1d7f1397a827 100644 --- a/Documentation/hwmon/hwmon-kernel-api.rst +++ b/Documentation/hwmon/hwmon-kernel-api.rst @@ -42,6 +42,9 @@ register/unregister functions:: char *devm_hwmon_sanitize_name(struct device *dev, const char *name); + void hwmon_lock(struct device *dev); + void hwmon_unlock(struct device *dev); + hwmon_device_register_with_info registers a hardware monitoring device. It creates the standard sysfs attributes in the hardware monitoring core, letting the driver focus on reading from and writing to the chip instead @@ -79,6 +82,13 @@ devm_hwmon_sanitize_name is the resource managed version of hwmon_sanitize_name; the memory will be freed automatically on device removal. +When using ``[devm_]hwmon_device_register_with_info()`` to register the +hardware monitoring device, accesses using the associated access functions +are serialised by the hardware monitoring core. If a driver needs locking +for other functions such as interrupt handlers or for attributes which are +fully implemented in the driver, hwmon_lock() and hwmon_unlock() can be used +to ensure that calls to those functions are serialized. + Using devm_hwmon_device_register_with_info() -------------------------------------------- diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 2e17f3a4c59b..0b4bdcd33c7b 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ struct hwmon_device { const char *label; struct device dev; const struct hwmon_chip_info *chip; + struct mutex lock; struct list_head tzdata; struct attribute_group group; const struct attribute_group **groups; @@ -165,6 +167,8 @@ static int hwmon_thermal_get_temp(struct thermal_zone_device *tz, int *temp) int ret; long t; + guard(mutex)(&hwdev->lock); + ret = hwdev->chip->ops->read(tdata->dev, hwmon_temp, hwmon_temp_input, tdata->index, &t); if (ret < 0) @@ -193,6 +197,8 @@ static int hwmon_thermal_set_trips(struct thermal_zone_device *tz, int low, int if (!info[i]) return 0; + guard(mutex)(&hwdev->lock); + if (info[i]->config[tdata->index] & HWMON_T_MIN) { err = chip->ops->write(tdata->dev, hwmon_temp, hwmon_temp_min, tdata->index, low); @@ -330,8 +336,6 @@ static int hwmon_attr_base(enum hwmon_sensor_types type) * attached to an i2c client device. */ -static DEFINE_MUTEX(hwmon_pec_mutex); - static int hwmon_match_device(struct device *dev, const void *data) { return dev->class == &hwmon_class; @@ -362,17 +366,16 @@ static ssize_t pec_store(struct device *dev, struct device_attribute *devattr, if (!hdev) return -ENODEV; - mutex_lock(&hwmon_pec_mutex); - /* * If there is no write function, we assume that chip specific * handling is not required. */ hwdev = to_hwmon_device(hdev); + guard(mutex)(&hwdev->lock); if (hwdev->chip->ops->write) { err = hwdev->chip->ops->write(hdev, hwmon_chip, hwmon_chip_pec, 0, val); if (err && err != -EOPNOTSUPP) - goto unlock; + goto put; } if (!val) @@ -381,8 +384,7 @@ static ssize_t pec_store(struct device *dev, struct device_attribute *devattr, client->flags |= I2C_CLIENT_PEC; err = count; -unlock: - mutex_unlock(&hwmon_pec_mutex); +put: put_device(hdev); return err; @@ -426,10 +428,13 @@ static ssize_t hwmon_attr_show(struct device *dev, struct device_attribute *devattr, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); s64 val64; long val; int ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->read(dev, hattr->type, hattr->attr, hattr->index, (hattr->type == hwmon_energy64) ? (long *)&val64 : &val); if (ret < 0) @@ -449,10 +454,13 @@ static ssize_t hwmon_attr_show_string(struct device *dev, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); enum hwmon_sensor_types type = hattr->type; const char *s; int ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->read_string(dev, hattr->type, hattr->attr, hattr->index, &s); if (ret < 0) @@ -469,6 +477,7 @@ static ssize_t hwmon_attr_store(struct device *dev, const char *buf, size_t count) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); long val; int ret; @@ -476,6 +485,8 @@ static ssize_t hwmon_attr_store(struct device *dev, if (ret < 0) return ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->write(dev, hattr->type, hattr->attr, hattr->index, val); if (ret < 0) @@ -791,6 +802,22 @@ int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, } EXPORT_SYMBOL_GPL(hwmon_notify_event); +void hwmon_lock(struct device *dev) +{ + struct hwmon_device *hwdev = to_hwmon_device(dev); + + mutex_lock(&hwdev->lock); +} +EXPORT_SYMBOL_GPL(hwmon_lock); + +void hwmon_unlock(struct device *dev) +{ + struct hwmon_device *hwdev = to_hwmon_device(dev); + + mutex_unlock(&hwdev->lock); +} +EXPORT_SYMBOL_GPL(hwmon_unlock); + static int hwmon_num_channel_attrs(const struct hwmon_channel_info *info) { int i, n; @@ -951,6 +978,7 @@ __hwmon_device_register(struct device *dev, const char *name, void *drvdata, tdev = tdev->parent; hdev->of_node = tdev ? tdev->of_node : NULL; hwdev->chip = chip; + mutex_init(&hwdev->lock); dev_set_drvdata(hdev, drvdata); dev_set_name(hdev, HWMON_ID_FORMAT, id); err = device_register(hdev); diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 886fc90b2d25..301a83afbd66 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -492,6 +492,9 @@ int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, char *hwmon_sanitize_name(const char *name); char *devm_hwmon_sanitize_name(struct device *dev, const char *name); +void hwmon_lock(struct device *dev); +void hwmon_unlock(struct device *dev); + /** * hwmon_is_bad_char - Is the char invalid in a hwmon name * @ch: the char to be considered -- cgit v1.2.3 From ca734f54b346db170be20ff3382bc3da8eb65904 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 17 Aug 2025 14:53:15 -0700 Subject: Input: pxa27x-keypad - drop support for platform data There are no in-kernel users of pxa27x_keypad_platform_data in the kernel, and the driver supports configuration via device tree, so drop support of static platform data and move properties parsing from OF-specific methods to generic ones. Link: https://lore.kernel.org/r/20250817215316.1872689-3-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/pxa27x_keypad.c | 378 +++++++++------------------- include/linux/platform_data/keypad-pxa27x.h | 73 ------ 2 files changed, 121 insertions(+), 330 deletions(-) delete mode 100644 include/linux/platform_data/keypad-pxa27x.h (limited to 'include') diff --git a/drivers/input/keyboard/pxa27x_keypad.c b/drivers/input/keyboard/pxa27x_keypad.c index e6f6adb56de4..4519eecb317b 100644 --- a/drivers/input/keyboard/pxa27x_keypad.c +++ b/drivers/input/keyboard/pxa27x_keypad.c @@ -21,13 +21,13 @@ #include #include #include +#include #include #include #include #include #include -#include /* * Keypad Controller registers */ @@ -100,54 +100,69 @@ #define keypad_readl(off) __raw_readl(keypad->mmio_base + (off)) #define keypad_writel(off, v) __raw_writel((v), keypad->mmio_base + (off)) +#define MAX_MATRIX_KEY_ROWS 8 +#define MAX_MATRIX_KEY_COLS 8 +#define MAX_DIRECT_KEY_NUM 8 +#define MAX_ROTARY_ENCODERS 2 + #define MAX_MATRIX_KEY_NUM (MAX_MATRIX_KEY_ROWS * MAX_MATRIX_KEY_COLS) #define MAX_KEYPAD_KEYS (MAX_MATRIX_KEY_NUM + MAX_DIRECT_KEY_NUM) -struct pxa27x_keypad { - const struct pxa27x_keypad_platform_data *pdata; +struct pxa27x_keypad_rotary { + unsigned short *key_codes; + int rel_code; + bool enabled; +}; +struct pxa27x_keypad { struct clk *clk; struct input_dev *input_dev; void __iomem *mmio_base; int irq; - unsigned short keycodes[MAX_KEYPAD_KEYS]; - int rotary_rel_code[2]; - + unsigned int matrix_key_rows; + unsigned int matrix_key_cols; unsigned int row_shift; + unsigned int direct_key_num; + unsigned int direct_key_mask; + bool direct_key_low_active; + + /* key debounce interval */ + unsigned int debounce_interval; + + unsigned short keycodes[MAX_KEYPAD_KEYS]; + /* state row bits of each column scan */ u32 matrix_key_state[MAX_MATRIX_KEY_COLS]; u32 direct_key_state; - unsigned int direct_key_mask; + struct pxa27x_keypad_rotary rotary[MAX_ROTARY_ENCODERS]; }; -#ifdef CONFIG_OF -static int pxa27x_keypad_matrix_key_parse_dt(struct pxa27x_keypad *keypad, - struct pxa27x_keypad_platform_data *pdata) +static int pxa27x_keypad_matrix_key_parse(struct pxa27x_keypad *keypad) { struct input_dev *input_dev = keypad->input_dev; struct device *dev = input_dev->dev.parent; - u32 rows, cols; int error; - error = matrix_keypad_parse_properties(dev, &rows, &cols); + error = matrix_keypad_parse_properties(dev, &keypad->matrix_key_rows, + &keypad->matrix_key_cols); if (error) return error; - if (rows > MAX_MATRIX_KEY_ROWS || cols > MAX_MATRIX_KEY_COLS) { + if (keypad->matrix_key_rows > MAX_MATRIX_KEY_ROWS || + keypad->matrix_key_cols > MAX_MATRIX_KEY_COLS) { dev_err(dev, "rows or cols exceeds maximum value\n"); return -EINVAL; } - pdata->matrix_key_rows = rows; - pdata->matrix_key_cols = cols; + keypad->row_shift = get_count_order(keypad->matrix_key_cols); error = matrix_keypad_build_keymap(NULL, NULL, - pdata->matrix_key_rows, - pdata->matrix_key_cols, + keypad->matrix_key_rows, + keypad->matrix_key_cols, keypad->keycodes, input_dev); if (error) return error; @@ -155,20 +170,17 @@ static int pxa27x_keypad_matrix_key_parse_dt(struct pxa27x_keypad *keypad, return 0; } -static int pxa27x_keypad_direct_key_parse_dt(struct pxa27x_keypad *keypad, - struct pxa27x_keypad_platform_data *pdata) +static int pxa27x_keypad_direct_key_parse(struct pxa27x_keypad *keypad) { struct input_dev *input_dev = keypad->input_dev; struct device *dev = input_dev->dev.parent; - struct device_node *np = dev->of_node; - const __be16 *prop; unsigned short code; - unsigned int proplen, size; + int count; int i; int error; - error = of_property_read_u32(np, "marvell,direct-key-count", - &pdata->direct_key_num); + error = device_property_read_u32(dev, "marvell,direct-key-count", + &keypad->direct_key_num); if (error) { /* * If do not have marvel,direct-key-count defined, @@ -177,151 +189,121 @@ static int pxa27x_keypad_direct_key_parse_dt(struct pxa27x_keypad *keypad, return error == -EINVAL ? 0 : error; } - error = of_property_read_u32(np, "marvell,direct-key-mask", - &pdata->direct_key_mask); + error = device_property_read_u32(dev, "marvell,direct-key-mask", + &keypad->direct_key_mask); if (error) { if (error != -EINVAL) return error; /* * If marvell,direct-key-mask is not defined, driver will use - * default value. Default value is set when configure the keypad. + * a default value based on number of direct keys set up. + * The default value is calculated in pxa27x_keypad_config(). */ - pdata->direct_key_mask = 0; + keypad->direct_key_mask = 0; } - pdata->direct_key_low_active = of_property_read_bool(np, - "marvell,direct-key-low-active"); - - prop = of_get_property(np, "marvell,direct-key-map", &proplen); - if (!prop) - return -EINVAL; + keypad->direct_key_low_active = + device_property_read_bool(dev, "marvell,direct-key-low-active"); - if (proplen % sizeof(u16)) + count = device_property_count_u16(dev, "marvell,direct-key-map"); + if (count <= 0 || count > MAX_DIRECT_KEY_NUM) return -EINVAL; - size = proplen / sizeof(u16); - - /* Only MAX_DIRECT_KEY_NUM is accepted.*/ - if (size > MAX_DIRECT_KEY_NUM) - return -EINVAL; + error = device_property_read_u16_array(dev, "marvell,direct-key-map", + &keypad->keycodes[MAX_MATRIX_KEY_NUM], + count); - for (i = 0; i < size; i++) { - code = be16_to_cpup(prop + i); - keypad->keycodes[MAX_MATRIX_KEY_NUM + i] = code; + for (i = 0; i < count; i++) { + code = keypad->keycodes[MAX_MATRIX_KEY_NUM + i]; __set_bit(code, input_dev->keybit); } return 0; } -static int pxa27x_keypad_rotary_parse_dt(struct pxa27x_keypad *keypad, - struct pxa27x_keypad_platform_data *pdata) +static int pxa27x_keypad_rotary_parse(struct pxa27x_keypad *keypad) { - const __be32 *prop; - int i, relkey_ret; - unsigned int code, proplen; - const char *rotaryname[2] = { - "marvell,rotary0", "marvell,rotary1"}; - const char relkeyname[] = {"marvell,rotary-rel-key"}; + static const char * const rotaryname[] = { "marvell,rotary0", "marvell,rotary1" }; struct input_dev *input_dev = keypad->input_dev; struct device *dev = input_dev->dev.parent; - struct device_node *np = dev->of_node; - - relkey_ret = of_property_read_u32(np, relkeyname, &code); - /* if can read correct rotary key-code, we do not need this. */ - if (relkey_ret == 0) { - unsigned short relcode; + struct pxa27x_keypad_rotary *encoder; + unsigned int code; + int i; + int error; - /* rotary0 taks lower half, rotary1 taks upper half. */ - relcode = code & 0xffff; - pdata->rotary0_rel_code = (code & 0xffff); - __set_bit(relcode, input_dev->relbit); + error = device_property_read_u32(dev, "marvell,rotary-rel-key", &code); + if (!error) { + for (i = 0; i < MAX_ROTARY_ENCODERS; i++, code >>= 16) { + encoder = &keypad->rotary[i]; + encoder->enabled = true; + encoder->rel_code = code & 0xffff; + input_set_capability(input_dev, EV_REL, encoder->rel_code); + } - relcode = code >> 16; - pdata->rotary1_rel_code = relcode; - __set_bit(relcode, input_dev->relbit); + return 0; } - for (i = 0; i < 2; i++) { - prop = of_get_property(np, rotaryname[i], &proplen); + for (i = 0; i < MAX_ROTARY_ENCODERS; i++) { + encoder = &keypad->rotary[i]; + /* * If the prop is not set, it means keypad does not need * initialize the rotaryX. */ - if (!prop) + if (!device_property_present(dev, rotaryname[i])) continue; - code = be32_to_cpup(prop); + error = device_property_read_u32(dev, rotaryname[i], &code); + if (error) + return error; + /* * Not all up/down key code are valid. * Now we depends on direct-rel-code. */ - if ((!(code & 0xffff) || !(code >> 16)) && relkey_ret) { - return relkey_ret; - } else { - unsigned int n = MAX_MATRIX_KEY_NUM + (i << 1); - unsigned short keycode; - - keycode = code & 0xffff; - keypad->keycodes[n] = keycode; - __set_bit(keycode, input_dev->keybit); - - keycode = code >> 16; - keypad->keycodes[n + 1] = keycode; - __set_bit(keycode, input_dev->keybit); - - if (i == 0) - pdata->rotary0_rel_code = -1; - else - pdata->rotary1_rel_code = -1; - } - if (i == 0) - pdata->enable_rotary0 = 1; - else - pdata->enable_rotary1 = 1; - } + if (!(code & 0xffff) || !(code >> 16)) + return -EINVAL; + + encoder->enabled = true; + encoder->rel_code = -1; + encoder->key_codes = &keypad->keycodes[MAX_MATRIX_KEY_NUM + i * 2]; + encoder->key_codes[0] = code & 0xffff; + encoder->key_codes[1] = code >> 16; - keypad->rotary_rel_code[0] = pdata->rotary0_rel_code; - keypad->rotary_rel_code[1] = pdata->rotary1_rel_code; + input_set_capability(input_dev, EV_KEY, encoder->key_codes[0]); + input_set_capability(input_dev, EV_KEY, encoder->key_codes[1]); + } return 0; } -static int pxa27x_keypad_build_keycode_from_dt(struct pxa27x_keypad *keypad) +static int pxa27x_keypad_parse_properties(struct pxa27x_keypad *keypad) { struct input_dev *input_dev = keypad->input_dev; struct device *dev = input_dev->dev.parent; - struct device_node *np = dev->of_node; - struct pxa27x_keypad_platform_data *pdata; int error; - pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) { - dev_err(dev, "failed to allocate memory for pdata\n"); - return -ENOMEM; - } - - error = pxa27x_keypad_matrix_key_parse_dt(keypad, pdata); + error = pxa27x_keypad_matrix_key_parse(keypad); if (error) { dev_err(dev, "failed to parse matrix key\n"); return error; } - error = pxa27x_keypad_direct_key_parse_dt(keypad, pdata); + error = pxa27x_keypad_direct_key_parse(keypad); if (error) { dev_err(dev, "failed to parse direct key\n"); return error; } - error = pxa27x_keypad_rotary_parse_dt(keypad, pdata); + error = pxa27x_keypad_rotary_parse(keypad); if (error) { dev_err(dev, "failed to parse rotary key\n"); return error; } - error = of_property_read_u32(np, "marvell,debounce-interval", - &pdata->debounce_interval); + error = device_property_read_u32(dev, "marvell,debounce-interval", + &keypad->debounce_interval); if (error) { dev_err(dev, "failed to parse debounce-interval\n"); return error; @@ -333,91 +315,11 @@ static int pxa27x_keypad_build_keycode_from_dt(struct pxa27x_keypad *keypad) */ input_dev->keycodemax = ARRAY_SIZE(keypad->keycodes); - keypad->pdata = pdata; - return 0; -} - -#else - -static int pxa27x_keypad_build_keycode_from_dt(struct pxa27x_keypad *keypad) -{ - dev_info(keypad->input_dev->dev.parent, "missing platform data\n"); - - return -EINVAL; -} - -#endif - -static int pxa27x_keypad_build_keycode(struct pxa27x_keypad *keypad) -{ - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; - struct input_dev *input_dev = keypad->input_dev; - unsigned short keycode; - int i; - int error; - - error = matrix_keypad_build_keymap(pdata->matrix_keymap_data, NULL, - pdata->matrix_key_rows, - pdata->matrix_key_cols, - keypad->keycodes, input_dev); - if (error) - return error; - - /* - * The keycodes may not only include matrix keys but also the direct - * or rotary keys. - */ - input_dev->keycodemax = ARRAY_SIZE(keypad->keycodes); - - /* For direct keys. */ - for (i = 0; i < pdata->direct_key_num; i++) { - keycode = pdata->direct_key_map[i]; - keypad->keycodes[MAX_MATRIX_KEY_NUM + i] = keycode; - __set_bit(keycode, input_dev->keybit); - } - - if (pdata->enable_rotary0) { - if (pdata->rotary0_up_key && pdata->rotary0_down_key) { - keycode = pdata->rotary0_up_key; - keypad->keycodes[MAX_MATRIX_KEY_NUM + 0] = keycode; - __set_bit(keycode, input_dev->keybit); - - keycode = pdata->rotary0_down_key; - keypad->keycodes[MAX_MATRIX_KEY_NUM + 1] = keycode; - __set_bit(keycode, input_dev->keybit); - - keypad->rotary_rel_code[0] = -1; - } else { - keypad->rotary_rel_code[0] = pdata->rotary0_rel_code; - __set_bit(pdata->rotary0_rel_code, input_dev->relbit); - } - } - - if (pdata->enable_rotary1) { - if (pdata->rotary1_up_key && pdata->rotary1_down_key) { - keycode = pdata->rotary1_up_key; - keypad->keycodes[MAX_MATRIX_KEY_NUM + 2] = keycode; - __set_bit(keycode, input_dev->keybit); - - keycode = pdata->rotary1_down_key; - keypad->keycodes[MAX_MATRIX_KEY_NUM + 3] = keycode; - __set_bit(keycode, input_dev->keybit); - - keypad->rotary_rel_code[1] = -1; - } else { - keypad->rotary_rel_code[1] = pdata->rotary1_rel_code; - __set_bit(pdata->rotary1_rel_code, input_dev->relbit); - } - } - - __clear_bit(KEY_RESERVED, input_dev->keybit); - return 0; } static void pxa27x_keypad_scan_matrix(struct pxa27x_keypad *keypad) { - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; struct input_dev *input_dev = keypad->input_dev; int row, col, num_keys_pressed = 0; u32 new_state[MAX_MATRIX_KEY_COLS]; @@ -435,8 +337,8 @@ static void pxa27x_keypad_scan_matrix(struct pxa27x_keypad *keypad) row = KPAS_RP(kpas); /* if invalid row/col, treat as no key pressed */ - if (col >= pdata->matrix_key_cols || - row >= pdata->matrix_key_rows) + if (col >= keypad->matrix_key_cols || + row >= keypad->matrix_key_rows) goto scan; new_state[col] = BIT(row); @@ -459,7 +361,7 @@ static void pxa27x_keypad_scan_matrix(struct pxa27x_keypad *keypad) new_state[7] = (kpasmkp3 >> 16) & KPASMKP_MKC_MASK; } scan: - for (col = 0; col < pdata->matrix_key_cols; col++) { + for (col = 0; col < keypad->matrix_key_cols; col++) { u32 bits_changed; int code; @@ -467,7 +369,7 @@ scan: if (bits_changed == 0) continue; - for (row = 0; row < pdata->matrix_key_rows; row++) { + for (row = 0; row < keypad->matrix_key_rows; row++) { if ((bits_changed & BIT(row)) == 0) continue; @@ -496,14 +398,16 @@ static inline int rotary_delta(u32 kprec) static void report_rotary_event(struct pxa27x_keypad *keypad, int r, int delta) { + struct pxa27x_keypad_rotary *encoder = &keypad->rotary[r]; struct input_dev *dev = keypad->input_dev; - if (delta == 0) + if (!encoder->enabled || delta == 0) return; - if (keypad->rotary_rel_code[r] == -1) { - int code = MAX_MATRIX_KEY_NUM + 2 * r + (delta > 0 ? 0 : 1); - unsigned char keycode = keypad->keycodes[code]; + if (encoder->rel_code == -1) { + int idx = delta > 0 ? 0 : 1; + int code = MAX_MATRIX_KEY_NUM + 2 * r + idx; + unsigned char keycode = encoder->key_codes[idx]; /* simulate a press-n-release */ input_event(dev, EV_MSC, MSC_SCAN, code); @@ -513,30 +417,28 @@ static void report_rotary_event(struct pxa27x_keypad *keypad, int r, int delta) input_report_key(dev, keycode, 0); input_sync(dev); } else { - input_report_rel(dev, keypad->rotary_rel_code[r], delta); + input_report_rel(dev, encoder->rel_code, delta); input_sync(dev); } } static void pxa27x_keypad_scan_rotary(struct pxa27x_keypad *keypad) { - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; u32 kprec; + int i; /* read and reset to default count value */ kprec = keypad_readl(KPREC); keypad_writel(KPREC, DEFAULT_KPREC); - if (pdata->enable_rotary0) + for (i = 0; i < MAX_ROTARY_ENCODERS; i++) { report_rotary_event(keypad, 0, rotary_delta(kprec)); - - if (pdata->enable_rotary1) - report_rotary_event(keypad, 1, rotary_delta(kprec >> 16)); + kprec >>= 16; + } } static void pxa27x_keypad_scan_direct(struct pxa27x_keypad *keypad) { - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; struct input_dev *input_dev = keypad->input_dev; unsigned int new_state; u32 kpdk, bits_changed; @@ -544,14 +446,14 @@ static void pxa27x_keypad_scan_direct(struct pxa27x_keypad *keypad) kpdk = keypad_readl(KPDK); - if (pdata->enable_rotary0 || pdata->enable_rotary1) + if (keypad->rotary[0].enabled || keypad->rotary[1].enabled) pxa27x_keypad_scan_rotary(keypad); /* * The KPDR_DK only output the key pin level, so it relates to board, * and low level may be active. */ - if (pdata->direct_key_low_active) + if (keypad->direct_key_low_active) new_state = ~KPDK_DK(kpdk) & keypad->direct_key_mask; else new_state = KPDK_DK(kpdk) & keypad->direct_key_mask; @@ -561,7 +463,7 @@ static void pxa27x_keypad_scan_direct(struct pxa27x_keypad *keypad) if (bits_changed == 0) return; - for (i = 0; i < pdata->direct_key_num; i++) { + for (i = 0; i < keypad->direct_key_num; i++) { if (bits_changed & BIT(i)) { int code = MAX_MATRIX_KEY_NUM + i; @@ -574,21 +476,11 @@ static void pxa27x_keypad_scan_direct(struct pxa27x_keypad *keypad) keypad->direct_key_state = new_state; } -static void clear_wakeup_event(struct pxa27x_keypad *keypad) -{ - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; - - if (pdata->clear_wakeup_event) - (pdata->clear_wakeup_event)(); -} - static irqreturn_t pxa27x_keypad_irq_handler(int irq, void *dev_id) { struct pxa27x_keypad *keypad = dev_id; unsigned long kpc = keypad_readl(KPC); - clear_wakeup_event(keypad); - if (kpc & KPC_DI) pxa27x_keypad_scan_direct(keypad); @@ -600,7 +492,6 @@ static irqreturn_t pxa27x_keypad_irq_handler(int irq, void *dev_id) static void pxa27x_keypad_config(struct pxa27x_keypad *keypad) { - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; unsigned int mask = 0, direct_key_num = 0; unsigned long kpc = 0; @@ -608,35 +499,33 @@ static void pxa27x_keypad_config(struct pxa27x_keypad *keypad) keypad_readl(KPC); /* enable matrix keys with automatic scan */ - if (pdata->matrix_key_rows && pdata->matrix_key_cols) { + if (keypad->matrix_key_rows && keypad->matrix_key_cols) { kpc |= KPC_ASACT | KPC_MIE | KPC_ME | KPC_MS_ALL; - kpc |= KPC_MKRN(pdata->matrix_key_rows) | - KPC_MKCN(pdata->matrix_key_cols); + kpc |= KPC_MKRN(keypad->matrix_key_rows) | + KPC_MKCN(keypad->matrix_key_cols); } /* enable rotary key, debounce interval same as direct keys */ - if (pdata->enable_rotary0) { + if (keypad->rotary[0].enabled) { mask |= 0x03; direct_key_num = 2; kpc |= KPC_REE0; } - if (pdata->enable_rotary1) { + if (keypad->rotary[1].enabled) { mask |= 0x0c; direct_key_num = 4; kpc |= KPC_REE1; } - if (pdata->direct_key_num > direct_key_num) - direct_key_num = pdata->direct_key_num; + if (keypad->direct_key_num > direct_key_num) + direct_key_num = keypad->direct_key_num; /* * Direct keys usage may not start from KP_DKIN0, check the platfrom * mask data to config the specific. */ - if (pdata->direct_key_mask) - keypad->direct_key_mask = pdata->direct_key_mask; - else + if (!keypad->direct_key_mask) keypad->direct_key_mask = GENMASK(direct_key_num - 1, 0) & ~mask; /* enable direct key */ @@ -645,7 +534,7 @@ static void pxa27x_keypad_config(struct pxa27x_keypad *keypad) keypad_writel(KPC, kpc | KPC_RE_ZERO_DEB); keypad_writel(KPREC, DEFAULT_KPREC); - keypad_writel(KPKDI, pdata->debounce_interval); + keypad_writel(KPKDI, keypad->debounce_interval); } static int pxa27x_keypad_open(struct input_dev *dev) @@ -719,19 +608,12 @@ static int pxa27x_keypad_resume(struct device *dev) static DEFINE_SIMPLE_DEV_PM_OPS(pxa27x_keypad_pm_ops, pxa27x_keypad_suspend, pxa27x_keypad_resume); - static int pxa27x_keypad_probe(struct platform_device *pdev) { - const struct pxa27x_keypad_platform_data *pdata = - dev_get_platdata(&pdev->dev); - struct device_node *np = pdev->dev.of_node; struct pxa27x_keypad *keypad; struct input_dev *input_dev; - int irq, error; - - /* Driver need build keycode from device tree or pdata */ - if (!np && !pdata) - return -EINVAL; + int irq; + int error; irq = platform_get_irq(pdev, 0); if (irq < 0) @@ -746,7 +628,6 @@ static int pxa27x_keypad_probe(struct platform_device *pdev) if (!input_dev) return -ENOMEM; - keypad->pdata = pdata; keypad->input_dev = input_dev; keypad->irq = irq; @@ -775,29 +656,12 @@ static int pxa27x_keypad_probe(struct platform_device *pdev) input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); input_set_capability(input_dev, EV_MSC, MSC_SCAN); - if (pdata) { - error = pxa27x_keypad_build_keycode(keypad); - } else { - error = pxa27x_keypad_build_keycode_from_dt(keypad); - /* - * Data that we get from DT resides in dynamically - * allocated memory so we need to update our pdata - * pointer. - */ - pdata = keypad->pdata; - } + error = pxa27x_keypad_parse_properties(keypad); if (error) { - dev_err(&pdev->dev, "failed to build keycode\n"); + dev_err(&pdev->dev, "failed to parse keypad properties\n"); return error; } - keypad->row_shift = get_count_order(pdata->matrix_key_cols); - - if ((pdata->enable_rotary0 && keypad->rotary_rel_code[0] != -1) || - (pdata->enable_rotary1 && keypad->rotary_rel_code[1] != -1)) { - input_dev->evbit[0] |= BIT_MASK(EV_REL); - } - error = devm_request_irq(&pdev->dev, irq, pxa27x_keypad_irq_handler, 0, pdev->name, keypad); if (error) { diff --git a/include/linux/platform_data/keypad-pxa27x.h b/include/linux/platform_data/keypad-pxa27x.h deleted file mode 100644 index a376442b9935..000000000000 --- a/include/linux/platform_data/keypad-pxa27x.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_ARCH_PXA27x_KEYPAD_H -#define __ASM_ARCH_PXA27x_KEYPAD_H - -#include -#include - -#define MAX_MATRIX_KEY_ROWS (8) -#define MAX_MATRIX_KEY_COLS (8) -#define MATRIX_ROW_SHIFT (3) -#define MAX_DIRECT_KEY_NUM (8) - -/* pxa3xx keypad platform specific parameters - * - * NOTE: - * 1. direct_key_num indicates the number of keys in the direct keypad - * _plus_ the number of rotary-encoder sensor inputs, this can be - * left as 0 if only rotary encoders are enabled, the driver will - * automatically calculate this - * - * 2. direct_key_map is the key code map for the direct keys, if rotary - * encoder(s) are enabled, direct key 0/1(2/3) will be ignored - * - * 3. rotary can be either interpreted as a relative input event (e.g. - * REL_WHEEL/REL_HWHEEL) or specific keys (e.g. UP/DOWN/LEFT/RIGHT) - * - * 4. matrix key and direct key will use the same debounce_interval by - * default, which should be sufficient in most cases - * - * pxa168 keypad platform specific parameter - * - * NOTE: - * clear_wakeup_event callback is a workaround required to clear the - * keypad interrupt. The keypad wake must be cleared in addition to - * reading the MI/DI bits in the KPC register. - */ -struct pxa27x_keypad_platform_data { - - /* code map for the matrix keys */ - const struct matrix_keymap_data *matrix_keymap_data; - unsigned int matrix_key_rows; - unsigned int matrix_key_cols; - - /* direct keys */ - int direct_key_num; - unsigned int direct_key_map[MAX_DIRECT_KEY_NUM]; - /* the key output may be low active */ - int direct_key_low_active; - /* give board a chance to choose the start direct key */ - unsigned int direct_key_mask; - - /* rotary encoders 0 */ - int enable_rotary0; - int rotary0_rel_code; - int rotary0_up_key; - int rotary0_down_key; - - /* rotary encoders 1 */ - int enable_rotary1; - int rotary1_rel_code; - int rotary1_up_key; - int rotary1_down_key; - - /* key debounce interval */ - unsigned int debounce_interval; - - /* clear wakeup event requirement for pxa168 */ - void (*clear_wakeup_event)(void); -}; - -extern void pxa_set_keypad_info(struct pxa27x_keypad_platform_data *info); - -#endif /* __ASM_ARCH_PXA27x_KEYPAD_H */ -- cgit v1.2.3 From 890ba82a60cb7a6584968a4ac15546f434afa40b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 17 Aug 2025 14:39:06 -0700 Subject: Input: spear-keyboard - drop support for platform data There are no in-kernel users of spear kbd_platform_data in the kernel, and the driver supports configuration via device tree, so drop support of static platform data and move properties parsing from OF-specific methods to generic ones. Link: https://lore.kernel.org/r/vppjxui76im26uamznx7evm5lmbe3d6v3oxsa7mqyytykh4zm6@nhlf33v3hp6g Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/spear-keyboard.c | 71 +++--------- include/linux/platform_data/keyboard-spear.h | 164 --------------------------- 2 files changed, 15 insertions(+), 220 deletions(-) delete mode 100644 include/linux/platform_data/keyboard-spear.h (limited to 'include') diff --git a/drivers/input/keyboard/spear-keyboard.c b/drivers/input/keyboard/spear-keyboard.c index 2fae337562a2..53f3ac64c980 100644 --- a/drivers/input/keyboard/spear-keyboard.c +++ b/drivers/input/keyboard/spear-keyboard.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,7 +23,6 @@ #include #include #include -#include /* Keyboard Registers */ #define MODE_CTL_REG 0x00 @@ -56,13 +56,12 @@ struct spear_kbd { void __iomem *io_base; struct clk *clk; unsigned int irq; - unsigned int mode; - unsigned int suspended_rate; + u32 mode; + u32 suspended_rate; + u32 mode_ctl_reg; unsigned short last_key; unsigned short keycodes[NUM_ROWS * NUM_COLS]; - bool rep; bool irq_wake_enabled; - u32 mode_ctl_reg; }; static irqreturn_t spear_kbd_interrupt(int irq, void *dev_id) @@ -143,46 +142,8 @@ static void spear_kbd_close(struct input_dev *dev) kbd->last_key = KEY_RESERVED; } -#ifdef CONFIG_OF -static int spear_kbd_parse_dt(struct platform_device *pdev, - struct spear_kbd *kbd) -{ - struct device_node *np = pdev->dev.of_node; - int error; - u32 val, suspended_rate; - - if (!np) { - dev_err(&pdev->dev, "Missing DT data\n"); - return -EINVAL; - } - - if (of_property_read_bool(np, "autorepeat")) - kbd->rep = true; - - if (of_property_read_u32(np, "suspended_rate", &suspended_rate)) - kbd->suspended_rate = suspended_rate; - - error = of_property_read_u32(np, "st,mode", &val); - if (error) { - dev_err(&pdev->dev, "DT: Invalid or missing mode\n"); - return error; - } - - kbd->mode = val; - return 0; -} -#else -static inline int spear_kbd_parse_dt(struct platform_device *pdev, - struct spear_kbd *kbd) -{ - return -ENOSYS; -} -#endif - static int spear_kbd_probe(struct platform_device *pdev) { - struct kbd_platform_data *pdata = dev_get_platdata(&pdev->dev); - const struct matrix_keymap_data *keymap = pdata ? pdata->keymap : NULL; struct spear_kbd *kbd; struct input_dev *input_dev; int irq; @@ -198,6 +159,14 @@ static int spear_kbd_probe(struct platform_device *pdev) return -ENOMEM; } + error = device_property_read_u32(&pdev->dev, "st,mode", &kbd->mode); + if (error) { + dev_err(&pdev->dev, "Invalid or missing mode\n"); + return error; + } + + device_property_read_u32(&pdev->dev, "suspended_rate", &kbd->suspended_rate); + input_dev = devm_input_allocate_device(&pdev->dev); if (!input_dev) { dev_err(&pdev->dev, "unable to allocate input device\n"); @@ -207,16 +176,6 @@ static int spear_kbd_probe(struct platform_device *pdev) kbd->input = input_dev; kbd->irq = irq; - if (!pdata) { - error = spear_kbd_parse_dt(pdev, kbd); - if (error) - return error; - } else { - kbd->mode = pdata->mode; - kbd->rep = pdata->rep; - kbd->suspended_rate = pdata->suspended_rate; - } - kbd->io_base = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); if (IS_ERR(kbd->io_base)) return PTR_ERR(kbd->io_base); @@ -234,21 +193,21 @@ static int spear_kbd_probe(struct platform_device *pdev) input_dev->open = spear_kbd_open; input_dev->close = spear_kbd_close; - error = matrix_keypad_build_keymap(keymap, NULL, NUM_ROWS, NUM_COLS, + error = matrix_keypad_build_keymap(NULL, NULL, NUM_ROWS, NUM_COLS, kbd->keycodes, input_dev); if (error) { dev_err(&pdev->dev, "Failed to build keymap\n"); return error; } - if (kbd->rep) + if (device_property_read_bool(&pdev->dev, "autorepeat")) __set_bit(EV_REP, input_dev->evbit); input_set_capability(input_dev, EV_MSC, MSC_SCAN); input_set_drvdata(input_dev, kbd); error = devm_request_irq(&pdev->dev, irq, spear_kbd_interrupt, 0, - "keyboard", kbd); + "keyboard", kbd); if (error) { dev_err(&pdev->dev, "request_irq failed\n"); return error; diff --git a/include/linux/platform_data/keyboard-spear.h b/include/linux/platform_data/keyboard-spear.h deleted file mode 100644 index 5e3ff653900c..000000000000 --- a/include/linux/platform_data/keyboard-spear.h +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (C) 2010 ST Microelectronics - * Rajeev Kumar - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. - */ - -#ifndef __PLAT_KEYBOARD_H -#define __PLAT_KEYBOARD_H - -#include -#include -#include -#include - -#define DECLARE_9x9_KEYMAP(_name) \ -int _name[] = { \ - KEY(0, 0, KEY_ESC), \ - KEY(0, 1, KEY_1), \ - KEY(0, 2, KEY_2), \ - KEY(0, 3, KEY_3), \ - KEY(0, 4, KEY_4), \ - KEY(0, 5, KEY_5), \ - KEY(0, 6, KEY_6), \ - KEY(0, 7, KEY_7), \ - KEY(0, 8, KEY_8), \ - KEY(1, 0, KEY_9), \ - KEY(1, 1, KEY_MINUS), \ - KEY(1, 2, KEY_EQUAL), \ - KEY(1, 3, KEY_BACKSPACE), \ - KEY(1, 4, KEY_TAB), \ - KEY(1, 5, KEY_Q), \ - KEY(1, 6, KEY_W), \ - KEY(1, 7, KEY_E), \ - KEY(1, 8, KEY_R), \ - KEY(2, 0, KEY_T), \ - KEY(2, 1, KEY_Y), \ - KEY(2, 2, KEY_U), \ - KEY(2, 3, KEY_I), \ - KEY(2, 4, KEY_O), \ - KEY(2, 5, KEY_P), \ - KEY(2, 6, KEY_LEFTBRACE), \ - KEY(2, 7, KEY_RIGHTBRACE), \ - KEY(2, 8, KEY_ENTER), \ - KEY(3, 0, KEY_LEFTCTRL), \ - KEY(3, 1, KEY_A), \ - KEY(3, 2, KEY_S), \ - KEY(3, 3, KEY_D), \ - KEY(3, 4, KEY_F), \ - KEY(3, 5, KEY_G), \ - KEY(3, 6, KEY_H), \ - KEY(3, 7, KEY_J), \ - KEY(3, 8, KEY_K), \ - KEY(4, 0, KEY_L), \ - KEY(4, 1, KEY_SEMICOLON), \ - KEY(4, 2, KEY_APOSTROPHE), \ - KEY(4, 3, KEY_GRAVE), \ - KEY(4, 4, KEY_LEFTSHIFT), \ - KEY(4, 5, KEY_BACKSLASH), \ - KEY(4, 6, KEY_Z), \ - KEY(4, 7, KEY_X), \ - KEY(4, 8, KEY_C), \ - KEY(5, 0, KEY_V), \ - KEY(5, 1, KEY_B), \ - KEY(5, 2, KEY_N), \ - KEY(5, 3, KEY_M), \ - KEY(5, 4, KEY_COMMA), \ - KEY(5, 5, KEY_DOT), \ - KEY(5, 6, KEY_SLASH), \ - KEY(5, 7, KEY_RIGHTSHIFT), \ - KEY(5, 8, KEY_KPASTERISK), \ - KEY(6, 0, KEY_LEFTALT), \ - KEY(6, 1, KEY_SPACE), \ - KEY(6, 2, KEY_CAPSLOCK), \ - KEY(6, 3, KEY_F1), \ - KEY(6, 4, KEY_F2), \ - KEY(6, 5, KEY_F3), \ - KEY(6, 6, KEY_F4), \ - KEY(6, 7, KEY_F5), \ - KEY(6, 8, KEY_F6), \ - KEY(7, 0, KEY_F7), \ - KEY(7, 1, KEY_F8), \ - KEY(7, 2, KEY_F9), \ - KEY(7, 3, KEY_F10), \ - KEY(7, 4, KEY_NUMLOCK), \ - KEY(7, 5, KEY_SCROLLLOCK), \ - KEY(7, 6, KEY_KP7), \ - KEY(7, 7, KEY_KP8), \ - KEY(7, 8, KEY_KP9), \ - KEY(8, 0, KEY_KPMINUS), \ - KEY(8, 1, KEY_KP4), \ - KEY(8, 2, KEY_KP5), \ - KEY(8, 3, KEY_KP6), \ - KEY(8, 4, KEY_KPPLUS), \ - KEY(8, 5, KEY_KP1), \ - KEY(8, 6, KEY_KP2), \ - KEY(8, 7, KEY_KP3), \ - KEY(8, 8, KEY_KP0), \ -} - -#define DECLARE_6x6_KEYMAP(_name) \ -int _name[] = { \ - KEY(0, 0, KEY_RESERVED), \ - KEY(0, 1, KEY_1), \ - KEY(0, 2, KEY_2), \ - KEY(0, 3, KEY_3), \ - KEY(0, 4, KEY_4), \ - KEY(0, 5, KEY_5), \ - KEY(1, 0, KEY_Q), \ - KEY(1, 1, KEY_W), \ - KEY(1, 2, KEY_E), \ - KEY(1, 3, KEY_R), \ - KEY(1, 4, KEY_T), \ - KEY(1, 5, KEY_Y), \ - KEY(2, 0, KEY_D), \ - KEY(2, 1, KEY_F), \ - KEY(2, 2, KEY_G), \ - KEY(2, 3, KEY_H), \ - KEY(2, 4, KEY_J), \ - KEY(2, 5, KEY_K), \ - KEY(3, 0, KEY_B), \ - KEY(3, 1, KEY_N), \ - KEY(3, 2, KEY_M), \ - KEY(3, 3, KEY_COMMA), \ - KEY(3, 4, KEY_DOT), \ - KEY(3, 5, KEY_SLASH), \ - KEY(4, 0, KEY_F6), \ - KEY(4, 1, KEY_F7), \ - KEY(4, 2, KEY_F8), \ - KEY(4, 3, KEY_F9), \ - KEY(4, 4, KEY_F10), \ - KEY(4, 5, KEY_NUMLOCK), \ - KEY(5, 0, KEY_KP2), \ - KEY(5, 1, KEY_KP3), \ - KEY(5, 2, KEY_KP0), \ - KEY(5, 3, KEY_KPDOT), \ - KEY(5, 4, KEY_RO), \ - KEY(5, 5, KEY_ZENKAKUHANKAKU), \ -} - -#define KEYPAD_9x9 0 -#define KEYPAD_6x6 1 -#define KEYPAD_2x2 2 - -/** - * struct kbd_platform_data - spear keyboard platform data - * keymap: pointer to keymap data (table and size) - * rep: enables key autorepeat - * mode: choose keyboard support(9x9, 6x6, 2x2) - * suspended_rate: rate at which keyboard would operate in suspended mode - * - * This structure is supposed to be used by platform code to supply - * keymaps to drivers that implement keyboards. - */ -struct kbd_platform_data { - const struct matrix_keymap_data *keymap; - bool rep; - unsigned int mode; - unsigned int suspended_rate; -}; - -#endif /* __PLAT_KEYBOARD_H */ -- cgit v1.2.3 From ad0d05dbddc1bf86e92220fea873176de6b12f78 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 30 Aug 2025 10:18:21 +0800 Subject: blk-mq: Defer freeing of tags page_list to SRCU callback Tag iterators can race with the freeing of the request pages(tags->page_list), potentially leading to use-after-free issues. Defer the freeing of the page list and the tags structure itself until after an SRCU grace period has passed. This ensures that any concurrent tag iterators have completed before the memory is released. With this way, we can replace the big tags->lock in tags iterator code path with srcu for solving the issue. This is achieved by: - Adding a new `srcu_struct tags_srcu` to `blk_mq_tag_set` to protect tag map iteration. - Adding an `rcu_head` to `struct blk_mq_tags` to be used with `call_srcu`. - Moving the page list freeing logic and the `kfree(tags)` call into a new callback function, `blk_mq_free_tags_callback`. - In `blk_mq_free_tags`, invoking `call_srcu` to schedule the new callback for deferred execution. The read-side protection for the tag iterators will be added in a subsequent patch. Reviewed-by: Hannes Reinecke Reviewed-by: Yu Kuai Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 24 +++++++++++++++++++++++- block/blk-mq.c | 26 +++++++++++++------------- include/linux/blk-mq.h | 2 ++ 3 files changed, 38 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f09a4cbe486f..3c2ec6e86d54 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -8,6 +8,9 @@ */ #include #include +#include +#include +#include #include #include "blk.h" @@ -576,11 +579,30 @@ out_free_tags: return NULL; } +static void blk_mq_free_tags_callback(struct rcu_head *head) +{ + struct blk_mq_tags *tags = container_of(head, struct blk_mq_tags, + rcu_head); + struct page *page; + + while (!list_empty(&tags->page_list)) { + page = list_first_entry(&tags->page_list, struct page, lru); + list_del_init(&page->lru); + /* + * Remove kmemleak object previously allocated in + * blk_mq_alloc_rqs(). + */ + kmemleak_free(page_address(page)); + __free_pages(page, page->private); + } + kfree(tags); +} + void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); - kfree(tags); + call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback); } int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-mq.c b/block/blk-mq.c index 5efa0712aac7..e1b44173029c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3454,7 +3454,6 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; - struct page *page; if (list_empty(&tags->page_list)) return; @@ -3478,17 +3477,10 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } blk_mq_clear_rq_mapping(drv_tags, tags); - - while (!list_empty(&tags->page_list)) { - page = list_first_entry(&tags->page_list, struct page, lru); - list_del_init(&page->lru); - /* - * Remove kmemleak object previously allocated in - * blk_mq_alloc_rqs(). - */ - kmemleak_free(page_address(page)); - __free_pages(page, page->private); - } + /* + * Free request pages in SRCU callback, which is called from + * blk_mq_free_tags(). + */ } void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) @@ -4834,6 +4826,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_srcu; } + ret = init_srcu_struct(&set->tags_srcu); + if (ret) + goto out_cleanup_srcu; init_rwsem(&set->update_nr_hwq_lock); @@ -4842,7 +4837,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) - goto out_cleanup_srcu; + goto out_cleanup_tags_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, @@ -4871,6 +4866,8 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_tags_srcu: + cleanup_srcu_struct(&set->tags_srcu); out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); @@ -4916,6 +4913,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + + srcu_barrier(&set->tags_srcu); + cleanup_srcu_struct(&set->tags_srcu); if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2a5a828f19a0..1325ceeb743a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -531,6 +531,7 @@ struct blk_mq_tag_set { struct mutex tag_list_lock; struct list_head tag_list; struct srcu_struct *srcu; + struct srcu_struct tags_srcu; struct rw_semaphore update_nr_hwq_lock; }; @@ -767,6 +768,7 @@ struct blk_mq_tags { * request pool */ spinlock_t lock; + struct rcu_head rcu_head; }; static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, -- cgit v1.2.3 From c265ae75f900cea4e415230a77b5d152377627dd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Sep 2025 00:03:00 +0100 Subject: io_uring: introduce io_uring querying There are many parameters users might want to query about io_uring like available request types or the ring sizes. This patch introduces an interface for such slow path queries. It was written with several requirements in mind: - Can be used with or without an io_uring instance. Asking for supported setup flags before creating an instance as well as qeurying info about an already created ring are valid use cases. - Should be moderately fast. For example, users might use it to periodically retrieve ring attributes at runtime. As a consequence, it should be able to query multiple attributes in a single syscall. - Backward and forward compatible. - Should be reasobably easy to use. - Reduce the kernel code size for introducing new query types. It's implemented as a new registration opcode IORING_REGISTER_QUERY. The user passes one or more query strutctures linked together, each represented by struct io_uring_query_hdr. The header stores common control fields needed for processing and points to query type specific information. The header contains - The query type - The result field, which on return contains the error code for the query - Pointer to the query type specific information - The size of the query structure. The kernel will only populate up to the size, which helps with backward compatibility. The kernel can also reduce the size, so if the current kernel is older than the inteface the user tries to use, it'll get only the supported bits. - next_entry field is used to chain multiple queries. Apart from common registeration syscall failures, it can only immediately return an error code in case when the headers are incorrect or any other addresses and invalid. That usually mean that the userspace doesn't use the API right and should be corrected. All query type specific errors are returned in the header's result field. As an example, the patch adds a single query type for now, i.e. IO_URING_QUERY_OPCODES, which tells what register / request / etc. opcodes are supported, but there are particular plans to extend it. Note: there is a request probing interface via IORING_REGISTER_PROBE, but it's a mess. It requires the user to create a ring first, it only works for requests, and requires dynamic allocations. Reviewed-by: Martin K. Petersen Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 ++ include/uapi/linux/io_uring/query.h | 41 ++++++++++++++++ io_uring/Makefile | 2 +- io_uring/query.c | 93 +++++++++++++++++++++++++++++++++++++ io_uring/query.h | 9 ++++ io_uring/register.c | 6 +++ 6 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/io_uring/query.h create mode 100644 io_uring/query.c create mode 100644 io_uring/query.h (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 04ebff33d0e6..1ce17c535944 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -686,6 +686,9 @@ enum io_uring_register_op { IORING_REGISTER_MEM_REGION = 34, + /* query various aspects of io_uring, see linux/io_uring/query.h */ + IORING_REGISTER_QUERY = 35, + /* this goes last */ IORING_REGISTER_LAST, diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h new file mode 100644 index 000000000000..5d754322a27c --- /dev/null +++ b/include/uapi/linux/io_uring/query.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ +/* + * Header file for the io_uring query interface. + */ +#ifndef LINUX_IO_URING_QUERY_H +#define LINUX_IO_URING_QUERY_H + +#include + +struct io_uring_query_hdr { + __u64 next_entry; + __u64 query_data; + __u32 query_op; + __u32 size; + __s32 result; + __u32 __resv[3]; +}; + +enum { + IO_URING_QUERY_OPCODES = 0, + + __IO_URING_QUERY_MAX, +}; + +/* Doesn't require a ring */ +struct io_uring_query_opcode { + /* The number of supported IORING_OP_* opcodes */ + __u32 nr_request_opcodes; + /* The number of supported IORING_[UN]REGISTER_* opcodes */ + __u32 nr_register_opcodes; + /* Bitmask of all supported IORING_FEAT_* flags */ + __u64 feature_flags; + /* Bitmask of all supported IORING_SETUP_* flags */ + __u64 ring_setup_flags; + /* Bitmask of all supported IORING_ENTER_** flags */ + __u64 enter_flags; + /* Bitmask of all supported IOSQE_* flags */ + __u64 sqe_flags; +}; + +#endif diff --git a/io_uring/Makefile b/io_uring/Makefile index b3f1bd492804..bc4e4a3fa0a5 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ sync.o msg_ring.o advise.o openclose.o \ statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ - memmap.o alloc_cache.o + memmap.o alloc_cache.o query.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o diff --git a/io_uring/query.c b/io_uring/query.c new file mode 100644 index 000000000000..9eed0f371956 --- /dev/null +++ b/io_uring/query.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "linux/io_uring/query.h" + +#include "query.h" +#include "io_uring.h" + +#define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode)) + +static ssize_t io_query_ops(void *data) +{ + struct io_uring_query_opcode *e = data; + + BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE); + + e->nr_request_opcodes = IORING_OP_LAST; + e->nr_register_opcodes = IORING_REGISTER_LAST; + e->feature_flags = IORING_FEAT_FLAGS; + e->ring_setup_flags = IORING_SETUP_FLAGS; + e->enter_flags = IORING_ENTER_FLAGS; + e->sqe_flags = SQE_VALID_FLAGS; + return sizeof(*e); +} + +static int io_handle_query_entry(struct io_ring_ctx *ctx, + void *data, void __user *uhdr, + u64 *next_entry) +{ + struct io_uring_query_hdr hdr; + size_t usize, res_size = 0; + ssize_t ret = -EINVAL; + void __user *udata; + + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) + return -EFAULT; + usize = hdr.size; + hdr.size = min(hdr.size, IO_MAX_QUERY_SIZE); + udata = u64_to_user_ptr(hdr.query_data); + + if (hdr.query_op >= __IO_URING_QUERY_MAX) { + ret = -EOPNOTSUPP; + goto out; + } + if (!mem_is_zero(hdr.__resv, sizeof(hdr.__resv)) || hdr.result || !hdr.size) + goto out; + if (copy_from_user(data, udata, hdr.size)) + return -EFAULT; + + switch (hdr.query_op) { + case IO_URING_QUERY_OPCODES: + ret = io_query_ops(data); + break; + } + + if (ret >= 0) { + if (WARN_ON_ONCE(ret > IO_MAX_QUERY_SIZE)) + return -EFAULT; + res_size = ret; + ret = 0; + } +out: + hdr.result = ret; + hdr.size = min_t(size_t, usize, res_size); + + if (copy_struct_to_user(udata, usize, data, hdr.size, NULL)) + return -EFAULT; + if (copy_to_user(uhdr, &hdr, sizeof(hdr))) + return -EFAULT; + *next_entry = hdr.next_entry; + return 0; +} + +int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + char entry_buffer[IO_MAX_QUERY_SIZE]; + void __user *uhdr = arg; + int ret; + + memset(entry_buffer, 0, sizeof(entry_buffer)); + + if (nr_args) + return -EINVAL; + + while (uhdr) { + u64 next_hdr; + + ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr); + if (ret) + return ret; + uhdr = u64_to_user_ptr(next_hdr); + } + return 0; +} diff --git a/io_uring/query.h b/io_uring/query.h new file mode 100644 index 000000000000..171d47ccaaba --- /dev/null +++ b/io_uring/query.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IORING_QUERY_H +#define IORING_QUERY_H + +#include + +int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); + +#endif diff --git a/io_uring/register.c b/io_uring/register.c index 50ce87928be0..9c31a8afb83d 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -31,6 +31,7 @@ #include "msg_ring.h" #include "memmap.h" #include "zcrx.h" +#include "query.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -832,6 +833,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_mem_region(ctx, arg); break; + case IORING_REGISTER_QUERY: + ret = io_query(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; @@ -901,6 +905,8 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg, switch (opcode) { case IORING_REGISTER_SEND_MSG_RING: return io_uring_register_send_msg_ring(arg, nr_args); + case IORING_REGISTER_QUERY: + return io_query(NULL, arg, nr_args); } return -EINVAL; } -- cgit v1.2.3 From 473efbc3ca29f725abfb7b323798df596ef41f3e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Sep 2025 08:18:15 -0600 Subject: io_uring/uring_cmd: fix __io_uring_cmd_do_in_task !CONFIG_IO_URING typo A manual application of this patch resulted in a typo for the stub function __io_uring_cmd_do_in_task(), for the case where CONFIG_IO_URING isn't true. Fix that up. Reported-by: Klara Modin Fixes: df3a7762ee24 ("io_uring/uring_cmd: add io_uring_cmd_tw_t type alias") Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 50dd6a53cb5e..1350af846ddd 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -109,8 +109,7 @@ static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_tw_t task_work_cb, - unsigned flags) + io_uring_cmd_tw_t task_work_cb, unsigned flags) { } static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, -- cgit v1.2.3 From 34c605fe53d49886d2741223b12950a33bdf2acf Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 26 Aug 2025 16:56:06 +0200 Subject: xen: rework xen_pv_domain() Rework xen_pv_domain() to no longer use the xen_domain_type variable, but the artificial X86_FEATURE_XENPV cpu feature. On non-x86 architectures xen_pv_domain() can be defined as "0". This has the advantage that a kernel not built with CONFIG_XEN_PV will be smaller due to dead code elimination. Set the X86_FEATURE_XENPV feature very early, as xen_pv_domain() is used rather early, too. Reviewed-by: Jason Andryuk Signed-off-by: Juergen Gross Message-ID: <20250826145608.10352-2-jgross@suse.com> --- arch/x86/xen/enlighten_pv.c | 2 +- include/xen/xen.h | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 26bbaf4b7330..4806cc28d7ca 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -382,7 +382,6 @@ static bool __init xen_check_xsave(void) static void __init xen_init_capabilities(void) { - setup_force_cpu_cap(X86_FEATURE_XENPV); setup_clear_cpu_cap(X86_FEATURE_DCA); setup_clear_cpu_cap(X86_FEATURE_APERFMPERF); setup_clear_cpu_cap(X86_FEATURE_MTRR); @@ -1402,6 +1401,7 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) JMP32_INSN_SIZE); xen_domain_type = XEN_PV_DOMAIN; + setup_force_cpu_cap(X86_FEATURE_XENPV); xen_start_flags = xen_start_info->flags; /* Interrupts are guaranteed to be off initially. */ early_boot_irqs_disabled = true; diff --git a/include/xen/xen.h b/include/xen/xen.h index a1e5b3f18d69..61854e3f2837 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -22,8 +22,15 @@ extern bool xen_pvh; #define xen_pvh 0 #endif +#ifdef CONFIG_X86 +#include + +#define xen_pv_domain() (cpu_feature_enabled(X86_FEATURE_XENPV)) +#else +#define xen_pv_domain() 0 +#endif + #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) #define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) #define xen_pvh_domain() (xen_pvh) -- cgit v1.2.3 From 0f4283123fe1e6016296048d0fdcfce615047a13 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 26 Aug 2025 16:56:07 +0200 Subject: xen: replace XENFEAT_auto_translated_physmap with xen_pv_domain() Instead of testing the XENFEAT_auto_translated_physmap feature, just use !xen_pv_domain() which is equivalent. This has the advantage that a kernel not built with CONFIG_XEN_PV will be smaller due to dead code elimination. Reviewed-by: Jason Andryuk Signed-off-by: Juergen Gross Message-ID: <20250826145608.10352-3-jgross@suse.com> --- arch/x86/include/asm/xen/page.h | 14 +++++++------- arch/x86/xen/mmu.c | 2 +- arch/x86/xen/p2m.c | 4 ++-- drivers/xen/balloon.c | 4 ++-- drivers/xen/gntdev.c | 2 +- drivers/xen/grant-table.c | 6 +++--- drivers/xen/privcmd.c | 14 ++++++-------- drivers/xen/unpopulated-alloc.c | 4 ++-- drivers/xen/xenbus/xenbus_client.c | 2 +- include/xen/grant_table.h | 4 ++-- include/xen/mem-reservation.h | 4 ++-- include/xen/xen-ops.h | 7 ++++--- 12 files changed, 33 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 85e63d58c074..59f642a94b9d 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -12,9 +12,9 @@ #include #include +#include #include #include -#include /* Xen machine address */ typedef struct xmaddr { @@ -162,7 +162,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) * pfn_to_mfn. This will have to be removed when we figured * out which call. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return pfn; mfn = __pfn_to_mfn(pfn); @@ -175,7 +175,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) static inline int phys_to_machine_mapping_valid(unsigned long pfn) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return 1; return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY; @@ -210,7 +210,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) * gfn_to_pfn. This will have to be removed when we figure * out which call. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return mfn; pfn = mfn_to_pfn_no_overrides(mfn); @@ -242,7 +242,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) /* Pseudo-physical <-> Guest conversion */ static inline unsigned long pfn_to_gfn(unsigned long pfn) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return pfn; else return pfn_to_mfn(pfn); @@ -250,7 +250,7 @@ static inline unsigned long pfn_to_gfn(unsigned long pfn) static inline unsigned long gfn_to_pfn(unsigned long gfn) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return gfn; else return mfn_to_pfn(gfn); @@ -284,7 +284,7 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn) { unsigned long pfn; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return mfn; pfn = mfn_to_pfn(mfn); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c4c479373249..3be45bf4bc79 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int nr, struct page **pages) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return xen_xlate_unmap_gfn_range(vma, nr, pages); if (!pages) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 56914e21e303..2dd12b61a230 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -686,7 +686,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, int i, ret = 0; pte_t *pte; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return 0; if (kmap_ops) { @@ -769,7 +769,7 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, { int i, ret = 0; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return 0; for (i = 0; i < count; i++) { diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 2de37dcd7556..49c3f9926394 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -302,7 +302,7 @@ static enum bp_state reserve_additional_memory(void) * are not restored since this region is now known not to * conflict with any devices. */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (xen_pv_domain()) { unsigned long pfn, i; pfn = PFN_DOWN(resource->start); @@ -626,7 +626,7 @@ int xen_alloc_ballooned_pages(unsigned int nr_pages, struct page **pages) */ BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (xen_pv_domain()) { ret = xen_alloc_p2m_entry(page_to_pfn(page)); if (ret < 0) goto out_undo; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 1f2160765618..74491967f2ae 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1183,7 +1183,7 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); + use_ptemod = xen_pv_domain(); err = misc_register(&gntdev_miscdev); if (err != 0) { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 04a6b470b15d..478d2ad725ac 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1449,7 +1449,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_feature(XENFEAT_auto_translated_physmap)) { + if (!xen_pv_domain()) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; @@ -1570,7 +1570,7 @@ static int gnttab_setup(void) if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { + if (!xen_pv_domain() && gnttab_shared.addr == NULL) { gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { pr_warn("gnttab share frames is not mapped!\n"); @@ -1588,7 +1588,7 @@ int gnttab_resume(void) int gnttab_suspend(void) { - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) gnttab_interface->unmap_frames(); return 0; } diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 13a10f3294a8..f52a457b302d 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -271,7 +271,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) struct mmap_gfn_state state; /* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return -ENOSYS; if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) @@ -353,7 +353,7 @@ static int mmap_batch_fn(void *data, int nr, void *state) struct page **cur_pages = NULL; int ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) cur_pages = &pages[st->index]; BUG_ON(nr < 0); @@ -535,7 +535,7 @@ static long privcmd_ioctl_mmap_batch( ret = -EINVAL; goto out_unlock; } - if (xen_feature(XENFEAT_auto_translated_physmap)) { + if (!xen_pv_domain()) { ret = alloc_empty_pages(vma, nr_pages); if (ret < 0) goto out_unlock; @@ -779,8 +779,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file, goto out; } - if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && - xen_feature(XENFEAT_auto_translated_physmap)) { + if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) { unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE); struct page **pages; unsigned int i; @@ -811,8 +810,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file, if (rc) goto out; - if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && - xen_feature(XENFEAT_auto_translated_physmap)) { + if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) { rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT); } else { unsigned int domid = @@ -1591,7 +1589,7 @@ static void privcmd_close(struct vm_area_struct *vma) int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT; int rc; - if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) + if (xen_pv_domain() || !numpgs || !pages) return; rc = xen_unmap_domain_gfn_range(vma, numgfns, pages); diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c index a39f2d36dd9c..d6fc2aefe264 100644 --- a/drivers/xen/unpopulated-alloc.c +++ b/drivers/xen/unpopulated-alloc.c @@ -105,7 +105,7 @@ static int fill_list(unsigned int nr_pages) * are not restored since this region is now known not to * conflict with any devices. */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (xen_pv_domain()) { xen_pfn_t pfn = PFN_DOWN(res->start); for (i = 0; i < alloc_pages; i++) { @@ -184,7 +184,7 @@ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages) pages[i] = pg; #ifdef CONFIG_XEN_HAVE_PVMMU - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (xen_pv_domain()) { ret = xen_alloc_p2m_entry(page_to_pfn(pg)); if (ret < 0) { unsigned int j; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index e73ec225d4a6..2dc874fb5506 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -955,7 +955,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { #ifdef CONFIG_XEN_PV - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) ring_ops = &ring_ops_pv; else #endif diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index e279be353e3f..69ac6d80a006 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -164,7 +164,7 @@ gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr, { if (flags & GNTMAP_contains_pte) map->host_addr = addr; - else if (xen_feature(XENFEAT_auto_translated_physmap)) + else if (!xen_pv_domain()) map->host_addr = __pa(addr); else map->host_addr = addr; @@ -181,7 +181,7 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, { if (flags & GNTMAP_contains_pte) unmap->host_addr = addr; - else if (xen_feature(XENFEAT_auto_translated_physmap)) + else if (!xen_pv_domain()) unmap->host_addr = __pa(addr); else unmap->host_addr = addr; diff --git a/include/xen/mem-reservation.h b/include/xen/mem-reservation.h index a2ab516fcd2c..3cbe3df0dfd4 100644 --- a/include/xen/mem-reservation.h +++ b/include/xen/mem-reservation.h @@ -39,7 +39,7 @@ static inline void xenmem_reservation_va_mapping_update(unsigned long count, xen_pfn_t *frames) { #ifdef CONFIG_XEN_HAVE_PVMMU - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) __xenmem_reservation_va_mapping_update(count, pages, frames); #endif } @@ -48,7 +48,7 @@ static inline void xenmem_reservation_va_mapping_reset(unsigned long count, struct page **pages) { #ifdef CONFIG_XEN_HAVE_PVMMU - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) __xenmem_reservation_va_mapping_reset(count, pages); #endif } diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 9e2a769b0d96..496e6013c689 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -116,7 +117,7 @@ static inline int xen_remap_domain_gfn_array(struct vm_area_struct *vma, unsigned int domid, struct page **pages) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, prot, domid, pages); @@ -150,7 +151,7 @@ static inline int xen_remap_domain_mfn_array(struct vm_area_struct *vma, int nr, int *err_ptr, pgprot_t prot, unsigned int domid) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return -EOPNOTSUPP; return xen_remap_pfn(vma, addr, mfn, nr, err_ptr, prot, domid, @@ -175,7 +176,7 @@ static inline int xen_remap_domain_gfn_range(struct vm_area_struct *vma, pgprot_t prot, unsigned int domid, struct page **pages) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return -EOPNOTSUPP; return xen_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false); -- cgit v1.2.3 From 9cf93a8fa9513c6d3cc65bdd50e05c1355cef322 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 14:04:24 -0500 Subject: ipmi: Allow an SMI sender to return an error Getting ready for handling when a BMC is non-responsive or broken, allow the sender operation to fail in an SMI. If it was a user-generated message it will return the error. The powernv code was already doing this internally, but the way it was written could result in deep stack descent if there were a lot of messages queued. Have its send return an error in this case. Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_ipmb.c | 4 ++-- drivers/char/ipmi/ipmi_msghandler.c | 16 +++++++++++++--- drivers/char/ipmi/ipmi_powernv.c | 17 +++++++++-------- drivers/char/ipmi/ipmi_si_intf.c | 8 +++----- drivers/char/ipmi/ipmi_ssif.c | 4 ++-- include/linux/ipmi_smi.h | 10 ++++++---- 6 files changed, 35 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/char/ipmi/ipmi_ipmb.c b/drivers/char/ipmi/ipmi_ipmb.c index 6a4f279c7c1f..3a51e58b2487 100644 --- a/drivers/char/ipmi/ipmi_ipmb.c +++ b/drivers/char/ipmi/ipmi_ipmb.c @@ -404,8 +404,7 @@ static void ipmi_ipmb_shutdown(void *send_info) ipmi_ipmb_stop_thread(iidev); } -static void ipmi_ipmb_sender(void *send_info, - struct ipmi_smi_msg *msg) +static int ipmi_ipmb_sender(void *send_info, struct ipmi_smi_msg *msg) { struct ipmi_ipmb_dev *iidev = send_info; unsigned long flags; @@ -417,6 +416,7 @@ static void ipmi_ipmb_sender(void *send_info, spin_unlock_irqrestore(&iidev->lock, flags); up(&iidev->wake_thread); + return IPMI_CC_NO_ERROR; } static void ipmi_ipmb_request_events(void *send_info) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index c6a5673b6f01..a739c8822c91 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -4803,6 +4803,7 @@ static void smi_work(struct work_struct *t) int run_to_completion = READ_ONCE(intf->run_to_completion); struct ipmi_smi_msg *newmsg = NULL; struct ipmi_recv_msg *msg, *msg2; + int cc; /* * Start the next message if available. @@ -4811,7 +4812,7 @@ static void smi_work(struct work_struct *t) * because the lower layer is allowed to hold locks while calling * message delivery. */ - +restart: if (!run_to_completion) spin_lock_irqsave(&intf->xmit_msgs_lock, flags); if (intf->curr_msg == NULL && !intf->in_shutdown) { @@ -4832,8 +4833,17 @@ static void smi_work(struct work_struct *t) if (!run_to_completion) spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags); - if (newmsg) - intf->handlers->sender(intf->send_info, newmsg); + if (newmsg) { + cc = intf->handlers->sender(intf->send_info, newmsg); + if (cc) { + if (newmsg->user_data) + deliver_err_response(intf, + newmsg->user_data, cc); + else + ipmi_free_smi_msg(newmsg); + goto restart; + } + } handle_new_recv_msgs(intf); diff --git a/drivers/char/ipmi/ipmi_powernv.c b/drivers/char/ipmi/ipmi_powernv.c index 4a2efafcd1f8..52a1130defe5 100644 --- a/drivers/char/ipmi/ipmi_powernv.c +++ b/drivers/char/ipmi/ipmi_powernv.c @@ -51,7 +51,7 @@ static void send_error_reply(struct ipmi_smi_powernv *smi, ipmi_smi_msg_received(smi->intf, msg); } -static void ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) +static int ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) { struct ipmi_smi_powernv *smi = send_info; struct opal_ipmi_msg *opal_msg; @@ -93,18 +93,19 @@ static void ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) smi->interface_id, opal_msg, size); rc = opal_ipmi_send(smi->interface_id, opal_msg, size); pr_devel("%s: -> %d\n", __func__, rc); - - if (!rc) { - smi->cur_msg = msg; - spin_unlock_irqrestore(&smi->msg_lock, flags); - return; + if (rc) { + comp = IPMI_ERR_UNSPECIFIED; + goto err_unlock; } - comp = IPMI_ERR_UNSPECIFIED; + smi->cur_msg = msg; + spin_unlock_irqrestore(&smi->msg_lock, flags); + return IPMI_CC_NO_ERROR; + err_unlock: spin_unlock_irqrestore(&smi->msg_lock, flags); err: - send_error_reply(smi, msg, comp); + return comp; } static int ipmi_powernv_recv(struct ipmi_smi_powernv *smi) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 7b2ba318c547..88d62ef71b1b 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -809,8 +809,6 @@ restart: * this if there is not yet an upper layer to handle anything. */ if (si_sm_result == SI_SM_ATTN || smi_info->got_attn) { - unsigned char msg[2]; - if (smi_info->si_state != SI_NORMAL) { /* * We got an ATTN, but we are doing something else. @@ -907,8 +905,7 @@ static void flush_messages(void *send_info) } } -static void sender(void *send_info, - struct ipmi_smi_msg *msg) +static int sender(void *send_info, struct ipmi_smi_msg *msg) { struct smi_info *smi_info = send_info; unsigned long flags; @@ -921,7 +918,7 @@ static void sender(void *send_info, * layer will call flush_messages to clear it out. */ smi_info->waiting_msg = msg; - return; + return IPMI_CC_NO_ERROR; } spin_lock_irqsave(&smi_info->si_lock, flags); @@ -936,6 +933,7 @@ static void sender(void *send_info, smi_info->waiting_msg = msg; check_start_timer_thread(smi_info); spin_unlock_irqrestore(&smi_info->si_lock, flags); + return IPMI_CC_NO_ERROR; } static void set_run_to_completion(void *send_info, bool i_run_to_completion) diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 1bc42830444d..1b63f7d2fcda 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -1068,8 +1068,7 @@ static void start_next_msg(struct ssif_info *ssif_info, unsigned long *flags) } } -static void sender(void *send_info, - struct ipmi_smi_msg *msg) +static int sender(void *send_info, struct ipmi_smi_msg *msg) { struct ssif_info *ssif_info = send_info; unsigned long oflags, *flags; @@ -1089,6 +1088,7 @@ static void sender(void *send_info, msg->data[0], msg->data[1], (long long)t.tv_sec, (long)t.tv_nsec / NSEC_PER_USEC); } + return IPMI_CC_NO_ERROR; } static int get_smi_info(void *send_info, struct ipmi_smi_info *data) diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 5d69820d8b02..c2d975bbff60 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -109,8 +109,8 @@ struct ipmi_smi_msg { enum ipmi_smi_msg_type type; - long msgid; - void *user_data; + long msgid; + void *user_data; int data_size; unsigned char data[IPMI_MAX_MSG_LENGTH]; @@ -168,9 +168,11 @@ struct ipmi_smi_handlers { * are held when this is run. Message are delivered one at * a time by the message handler, a new message will not be * delivered until the previous message is returned. + * + * This can return an error if the SMI is not in a state where it + * can send a message. */ - void (*sender)(void *send_info, - struct ipmi_smi_msg *msg); + int (*sender)(void *send_info, struct ipmi_smi_msg *msg); /* * Called by the upper layer to request that we try to get -- cgit v1.2.3 From 3bc54ab3b9790ca92f197e9822e486665daa321c Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 14:09:11 -0500 Subject: ipmi: Rename "user_data" to "recv_msg" in an SMI message It's only used to hold the corresponding receive message, so fix the name to make that clear and the type so nothing else can be accidentally assigned to it. Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 26 +++++++++++++------------- include/linux/ipmi_smi.h | 3 ++- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index a739c8822c91..a0b67a35a5f0 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -1959,7 +1959,7 @@ static int i_ipmi_req_sysintf(struct ipmi_smi *intf, smi_msg->data[0] = (msg->netfn << 2) | (smi_addr->lun & 0x3); smi_msg->data[1] = msg->cmd; smi_msg->msgid = msgid; - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; if (msg->data_len > 0) memcpy(&smi_msg->data[2], msg->data, msg->data_len); smi_msg->data_size = msg->data_len + 2; @@ -2040,7 +2040,7 @@ static int i_ipmi_req_ipmb(struct ipmi_smi *intf, * Save the receive message so we can use it * to deliver the response. */ - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; } else { mutex_lock(&intf->seq_lock); @@ -2153,7 +2153,7 @@ static int i_ipmi_req_ipmb_direct(struct ipmi_smi *intf, memcpy(smi_msg->data + 4, msg->data, msg->data_len); smi_msg->data_size = msg->data_len + 4; - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; return 0; } @@ -2216,7 +2216,7 @@ static int i_ipmi_req_lan(struct ipmi_smi *intf, * Save the receive message so we can use it * to deliver the response. */ - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; } else { mutex_lock(&intf->seq_lock); @@ -4066,7 +4066,7 @@ static int handle_ipmb_direct_rcv_rsp(struct ipmi_smi *intf, struct ipmi_recv_msg *recv_msg; struct ipmi_ipmb_direct_addr *daddr; - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; if (recv_msg == NULL) { dev_warn(intf->si_dev, "IPMI direct message received with no owner. This could be because of a malformed message, or because of a hardware error. Contact your hardware vendor for assistance.\n"); @@ -4489,7 +4489,7 @@ static int handle_bmc_rsp(struct ipmi_smi *intf, struct ipmi_recv_msg *recv_msg; struct ipmi_system_interface_addr *smi_addr; - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; if (recv_msg == NULL) { dev_warn(intf->si_dev, "IPMI SMI message received with no owner. This could be because of a malformed message, or because of a hardware error. Contact your hardware vendor for assistance.\n"); @@ -4563,14 +4563,14 @@ return_unspecified: } else if ((msg->data_size >= 2) && (msg->data[0] == (IPMI_NETFN_APP_REQUEST << 2)) && (msg->data[1] == IPMI_SEND_MSG_CMD) - && (msg->user_data == NULL)) { + && (msg->recv_msg == NULL)) { if (intf->in_shutdown || intf->run_to_completion) goto out; /* * This is the local response to a command send, start - * the timer for these. The user_data will not be + * the timer for these. The recv_msg will not be * NULL if this is a response send, and we will let * response sends just go through. */ @@ -4630,7 +4630,7 @@ return_unspecified: requeue = handle_ipmb_direct_rcv_rsp(intf, msg); } else if ((msg->rsp[0] == ((IPMI_NETFN_APP_REQUEST|1) << 2)) && (msg->rsp[1] == IPMI_SEND_MSG_CMD) - && (msg->user_data != NULL)) { + && (msg->recv_msg != NULL)) { /* * It's a response to a response we sent. For this we * deliver a send message response to the user. @@ -4647,7 +4647,7 @@ return_unspecified: cc = msg->rsp[2]; process_response_response: - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; requeue = 0; if (!recv_msg) @@ -4836,9 +4836,9 @@ restart: if (newmsg) { cc = intf->handlers->sender(intf->send_info, newmsg); if (cc) { - if (newmsg->user_data) + if (newmsg->recv_msg) deliver_err_response(intf, - newmsg->user_data, cc); + newmsg->recv_msg, cc); else ipmi_free_smi_msg(newmsg); goto restart; @@ -5185,7 +5185,7 @@ struct ipmi_smi_msg *ipmi_alloc_smi_msg(void) rv = kmalloc(sizeof(struct ipmi_smi_msg), GFP_ATOMIC); if (rv) { rv->done = free_smi_msg; - rv->user_data = NULL; + rv->recv_msg = NULL; rv->type = IPMI_SMI_MSG_TYPE_NORMAL; atomic_inc(&smi_msg_inuse_count); } diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index c2d975bbff60..892e2d656e1e 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -110,7 +110,8 @@ struct ipmi_smi_msg { enum ipmi_smi_msg_type type; long msgid; - void *user_data; + /* Response to this message, will be NULL if not from a user request. */ + struct ipmi_recv_msg *recv_msg; int data_size; unsigned char data[IPMI_MAX_MSG_LENGTH]; -- cgit v1.2.3 From 06aba2126b414248a34b13584f22a78787c95450 Mon Sep 17 00:00:00 2001 From: Alex Tran Date: Mon, 1 Sep 2025 11:40:08 -0700 Subject: ASoC: codecs: tlv320dac33: Remove unused struct tlv320dac33_platform_data and header file tlv320dac33-plat.h Remove the tlv320dac33_platform_data struct and header file tlv320dac33-plat.h as they are not used anywhere in the kernel or outside this driver. Signed-off-by: Alex Tran Message-ID: <20250901184008.1249535-3-alex.t.tran@gmail.com> Signed-off-by: Mark Brown --- include/sound/tlv320dac33-plat.h | 21 --------------------- sound/soc/codecs/tlv320dac33.c | 12 ------------ 2 files changed, 33 deletions(-) delete mode 100644 include/sound/tlv320dac33-plat.h (limited to 'include') diff --git a/include/sound/tlv320dac33-plat.h b/include/sound/tlv320dac33-plat.h deleted file mode 100644 index 7a7249a896e3..000000000000 --- a/include/sound/tlv320dac33-plat.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Platform header for Texas Instruments TLV320DAC33 codec driver - * - * Author: Peter Ujfalusi - * - * Copyright: (C) 2009 Nokia Corporation - */ - -#ifndef __TLV320DAC33_PLAT_H -#define __TLV320DAC33_PLAT_H - -struct tlv320dac33_platform_data { - int power_gpio; - int mode1_latency; /* latency caused by the i2c writes in us */ - int auto_fifo_config; /* FIFO config based on the period size */ - int keep_bclk; /* Keep the BCLK running in FIFO modes */ - u8 burst_bclkdiv; -}; - -#endif /* __TLV320DAC33_PLAT_H */ diff --git a/sound/soc/codecs/tlv320dac33.c b/sound/soc/codecs/tlv320dac33.c index 423b9264a205..36b3de75a3f4 100644 --- a/sound/soc/codecs/tlv320dac33.c +++ b/sound/soc/codecs/tlv320dac33.c @@ -24,7 +24,6 @@ #include #include -#include #include "tlv320dac33.h" /* @@ -1462,16 +1461,9 @@ static struct snd_soc_dai_driver dac33_dai = { static int dac33_i2c_probe(struct i2c_client *client) { - struct tlv320dac33_platform_data *pdata; struct tlv320dac33_priv *dac33; int ret, i; - if (client->dev.platform_data == NULL) { - dev_err(&client->dev, "Platform data not set\n"); - return -ENODEV; - } - pdata = client->dev.platform_data; - dac33 = devm_kzalloc(&client->dev, sizeof(struct tlv320dac33_priv), GFP_KERNEL); if (dac33 == NULL) @@ -1488,10 +1480,6 @@ static int dac33_i2c_probe(struct i2c_client *client) i2c_set_clientdata(client, dac33); - dac33->power_gpio = pdata->power_gpio; - dac33->burst_bclkdiv = pdata->burst_bclkdiv; - dac33->keep_bclk = pdata->keep_bclk; - dac33->mode1_latency = pdata->mode1_latency; if (!dac33->mode1_latency) dac33->mode1_latency = 10000; /* 10ms */ dac33->irq = client->irq; -- cgit v1.2.3 From e465ad7ef57aa1ec4122fd5b34c182d59629cb91 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Mon, 11 Aug 2025 08:48:08 -0400 Subject: clk: ti: dpll: convert from round_rate() to determine_rate() The round_rate() clk ops is deprecated, so migrate this driver from round_rate() to determine_rate(). Part of these changes were done using the Coccinelle semantic patch on the cover letter of this series, and the rest of the changes were manually done. omap4_dpll_regm4xen_round_rate() is now only called by omap4_dpll_regm4xen_determine_rate(), so let's merge that functionality into one function. This is needed for another cleanup to completely remove the round_rate() clk ops from the clk core. Tested-by: Anddreas Kemnade # OMAP3 GTA04, OMAP4 Panda Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: Brian Masney --- drivers/clk/ti/clkt_dpll.c | 28 +++++++-------- drivers/clk/ti/clock.h | 6 +--- drivers/clk/ti/dpll.c | 4 +-- drivers/clk/ti/dpll3xxx.c | 7 ++-- drivers/clk/ti/dpll44xx.c | 89 ++++++++++++++++------------------------------ include/linux/clk/ti.h | 8 ++--- 6 files changed, 57 insertions(+), 85 deletions(-) (limited to 'include') diff --git a/drivers/clk/ti/clkt_dpll.c b/drivers/clk/ti/clkt_dpll.c index a8c0fc20f791..2ecd66968af4 100644 --- a/drivers/clk/ti/clkt_dpll.c +++ b/drivers/clk/ti/clkt_dpll.c @@ -268,10 +268,9 @@ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk) /* DPLL rate rounding code */ /** - * omap2_dpll_round_rate - round a target rate for an OMAP DPLL + * omap2_dpll_determine_rate - round a target rate for an OMAP DPLL * @hw: struct clk_hw containing the struct clk * for a DPLL - * @target_rate: desired DPLL clock rate - * @parent_rate: parent's DPLL clock rate + * @req: rate request * * Given a DPLL and a desired target rate, round the target rate to a * possible, programmable rate for this DPLL. Attempts to select the @@ -280,8 +279,7 @@ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk) * (expensive) function again. Returns -EINVAL if the target rate * cannot be rounded, or the rounded rate upon success. */ -long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, - unsigned long *parent_rate) +int omap2_dpll_determine_rate(struct clk_hw *hw, struct clk_rate_request *req) { struct clk_hw_omap *clk = to_clk_hw_omap(hw); int m, n, r, scaled_max_m; @@ -299,15 +297,15 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, dd = clk->dpll_data; - if (dd->max_rate && target_rate > dd->max_rate) - target_rate = dd->max_rate; + if (dd->max_rate && req->rate > dd->max_rate) + req->rate = dd->max_rate; ref_rate = clk_hw_get_rate(dd->clk_ref); clk_name = clk_hw_get_name(hw); pr_debug("clock: %s: starting DPLL round_rate, target rate %lu\n", - clk_name, target_rate); + clk_name, req->rate); - scaled_rt_rp = target_rate / (ref_rate / DPLL_SCALE_FACTOR); + scaled_rt_rp = req->rate / (ref_rate / DPLL_SCALE_FACTOR); scaled_max_m = dd->max_multiplier * DPLL_SCALE_FACTOR; dd->last_rounded_rate = 0; @@ -332,7 +330,7 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, if (m > scaled_max_m) break; - r = _dpll_test_mult(&m, n, &new_rate, target_rate, + r = _dpll_test_mult(&m, n, &new_rate, req->rate, ref_rate); /* m can't be set low enough for this n - try with a larger n */ @@ -340,7 +338,7 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, continue; /* skip rates above our target rate */ - delta = target_rate - new_rate; + delta = req->rate - new_rate; if (delta < 0) continue; @@ -359,13 +357,15 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, if (prev_min_delta == LONG_MAX) { pr_debug("clock: %s: cannot round to rate %lu\n", - clk_name, target_rate); + clk_name, req->rate); return -EINVAL; } dd->last_rounded_m = min_delta_m; dd->last_rounded_n = min_delta_n; - dd->last_rounded_rate = target_rate - prev_min_delta; + dd->last_rounded_rate = req->rate - prev_min_delta; - return dd->last_rounded_rate; + req->rate = dd->last_rounded_rate; + + return 0; } diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h index 2de7acea1ea0..d5e24fe4ae3a 100644 --- a/drivers/clk/ti/clock.h +++ b/drivers/clk/ti/clock.h @@ -273,8 +273,7 @@ int omap3_noncore_dpll_set_rate_and_parent(struct clk_hw *hw, u8 index); int omap3_noncore_dpll_determine_rate(struct clk_hw *hw, struct clk_rate_request *req); -long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, - unsigned long *parent_rate); +int omap2_dpll_determine_rate(struct clk_hw *hw, struct clk_rate_request *req); unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw, unsigned long parent_rate); @@ -296,9 +295,6 @@ void omap3_clk_lock_dpll5(void); unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw, unsigned long parent_rate); -long omap4_dpll_regm4xen_round_rate(struct clk_hw *hw, - unsigned long target_rate, - unsigned long *parent_rate); int omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, struct clk_rate_request *req); int omap2_clk_for_each(int (*fn)(struct clk_hw_omap *hw)); diff --git a/drivers/clk/ti/dpll.c b/drivers/clk/ti/dpll.c index 1f55554e0d73..971adafd9a8b 100644 --- a/drivers/clk/ti/dpll.c +++ b/drivers/clk/ti/dpll.c @@ -77,7 +77,7 @@ const struct clk_hw_omap_ops clkhwops_omap3_dpll = {}; static const struct clk_ops omap2_dpll_core_ck_ops = { .get_parent = &omap2_init_dpll_parent, .recalc_rate = &omap2_dpllcore_recalc, - .round_rate = &omap2_dpll_round_rate, + .determine_rate = &omap2_dpll_determine_rate, .set_rate = &omap2_reprogram_dpllcore, }; #else @@ -88,7 +88,7 @@ static const struct clk_ops omap2_dpll_core_ck_ops = {}; static const struct clk_ops omap3_dpll_core_ck_ops = { .get_parent = &omap2_init_dpll_parent, .recalc_rate = &omap3_dpll_recalc, - .round_rate = &omap2_dpll_round_rate, + .determine_rate = &omap2_dpll_determine_rate, }; static const struct clk_ops omap3_dpll_ck_ops = { diff --git a/drivers/clk/ti/dpll3xxx.c b/drivers/clk/ti/dpll3xxx.c index 00680486b1bd..8c51b988a04f 100644 --- a/drivers/clk/ti/dpll3xxx.c +++ b/drivers/clk/ti/dpll3xxx.c @@ -587,6 +587,7 @@ int omap3_noncore_dpll_determine_rate(struct clk_hw *hw, { struct clk_hw_omap *clk = to_clk_hw_omap(hw); struct dpll_data *dd; + int ret; if (!req->rate) return -EINVAL; @@ -599,8 +600,10 @@ int omap3_noncore_dpll_determine_rate(struct clk_hw *hw, (dd->modes & (1 << DPLL_LOW_POWER_BYPASS))) { req->best_parent_hw = dd->clk_bypass; } else { - req->rate = omap2_dpll_round_rate(hw, req->rate, - &req->best_parent_rate); + ret = omap2_dpll_determine_rate(hw, req); + if (ret != 0) + return ret; + req->best_parent_hw = dd->clk_ref; } diff --git a/drivers/clk/ti/dpll44xx.c b/drivers/clk/ti/dpll44xx.c index 3fc2cab69a3f..08ed57f181b4 100644 --- a/drivers/clk/ti/dpll44xx.c +++ b/drivers/clk/ti/dpll44xx.c @@ -133,61 +133,6 @@ unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw, return rate; } -/** - * omap4_dpll_regm4xen_round_rate - round DPLL rate, considering REGM4XEN bit - * @hw: struct hw_clk containing the struct clk * of the DPLL to round a rate for - * @target_rate: the desired rate of the DPLL - * @parent_rate: clock rate of the DPLL parent - * - * Compute the rate that would be programmed into the DPLL hardware - * for @clk if set_rate() were to be provided with the rate - * @target_rate. Takes the REGM4XEN bit into consideration, which is - * needed for the OMAP4 ABE DPLL. Returns the rounded rate (before - * M-dividers) upon success, -EINVAL if @clk is null or not a DPLL, or - * ~0 if an error occurred in omap2_dpll_round_rate(). - */ -long omap4_dpll_regm4xen_round_rate(struct clk_hw *hw, - unsigned long target_rate, - unsigned long *parent_rate) -{ - struct clk_hw_omap *clk = to_clk_hw_omap(hw); - struct dpll_data *dd; - long r; - - if (!clk || !clk->dpll_data) - return -EINVAL; - - dd = clk->dpll_data; - - dd->last_rounded_m4xen = 0; - - /* - * First try to compute the DPLL configuration for - * target rate without using the 4X multiplier. - */ - r = omap2_dpll_round_rate(hw, target_rate, NULL); - if (r != ~0) - goto out; - - /* - * If we did not find a valid DPLL configuration, try again, but - * this time see if using the 4X multiplier can help. Enabling the - * 4X multiplier is equivalent to dividing the target rate by 4. - */ - r = omap2_dpll_round_rate(hw, target_rate / OMAP4430_REGM4XEN_MULT, - NULL); - if (r == ~0) - return r; - - dd->last_rounded_rate *= OMAP4430_REGM4XEN_MULT; - dd->last_rounded_m4xen = 1; - -out: - omap4_dpll_lpmode_recalc(dd); - - return dd->last_rounded_rate; -} - /** * omap4_dpll_regm4xen_determine_rate - determine rate for a DPLL * @hw: pointer to the clock to determine rate for @@ -195,7 +140,7 @@ out: * * Determines which DPLL mode to use for reaching a desired rate. * Checks whether the DPLL shall be in bypass or locked mode, and if - * locked, calculates the M,N values for the DPLL via round-rate. + * locked, calculates the M,N values for the DPLL. * Returns 0 on success and a negative error value otherwise. */ int omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, @@ -215,8 +160,36 @@ int omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, (dd->modes & (1 << DPLL_LOW_POWER_BYPASS))) { req->best_parent_hw = dd->clk_bypass; } else { - req->rate = omap4_dpll_regm4xen_round_rate(hw, req->rate, - &req->best_parent_rate); + struct clk_rate_request tmp_req; + long r; + + clk_hw_init_rate_request(hw, &tmp_req, req->rate); + dd->last_rounded_m4xen = 0; + + /* + * First try to compute the DPLL configuration for + * target rate without using the 4X multiplier. + */ + + r = omap2_dpll_determine_rate(hw, &tmp_req); + if (r < 0) { + /* + * If we did not find a valid DPLL configuration, try again, but + * this time see if using the 4X multiplier can help. Enabling the + * 4X multiplier is equivalent to dividing the target rate by 4. + */ + tmp_req.rate /= OMAP4430_REGM4XEN_MULT; + r = omap2_dpll_determine_rate(hw, &tmp_req); + if (r < 0) + return r; + + dd->last_rounded_rate *= OMAP4430_REGM4XEN_MULT; + dd->last_rounded_m4xen = 1; + } + + omap4_dpll_lpmode_recalc(dd); + + req->rate = dd->last_rounded_rate; req->best_parent_hw = dd->clk_ref; } diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index e656f63efdce..54a3fa370004 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -34,14 +34,14 @@ struct clk_omap_reg { * @clk_ref: struct clk_hw pointer to the clock's reference clock input * @control_reg: register containing the DPLL mode bitfield * @enable_mask: mask of the DPLL mode bitfield in @control_reg - * @last_rounded_rate: cache of the last rate result of omap2_dpll_round_rate() - * @last_rounded_m: cache of the last M result of omap2_dpll_round_rate() + * @last_rounded_rate: cache of the last rate result of omap2_dpll_determine_rate() + * @last_rounded_m: cache of the last M result of omap2_dpll_determine_rate() * @last_rounded_m4xen: cache of the last M4X result of - * omap4_dpll_regm4xen_round_rate() + * omap4_dpll_regm4xen_determine_rate() * @last_rounded_lpmode: cache of the last lpmode result of * omap4_dpll_lpmode_recalc() * @max_multiplier: maximum valid non-bypass multiplier value (actual) - * @last_rounded_n: cache of the last N result of omap2_dpll_round_rate() + * @last_rounded_n: cache of the last N result of omap2_dpll_determine_rate() * @min_divider: minimum valid non-bypass divider value (actual) * @max_divider: maximum valid non-bypass divider value (actual) * @max_rate: maximum clock rate for the DPLL -- cgit v1.2.3 From 17d370a70bae277678b6ea82d71ef5892e7aaa97 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 26 Aug 2025 17:54:55 +0200 Subject: xdp, libeth: make the xdp_init_buff() micro-optimization generic Often times the compilers are not able to expand two consecutive 32-bit writes into one 64-bit on the corresponding architectures. This applies to xdp_init_buff() called for every received frame (or at least once per each 64 frames when the frag size is fixed). Move the not-so-pretty hack from libeth_xdp straight to xdp_init_buff(), but using a proper union around ::frame_sz and ::flags. The optimization is limited to LE architectures due to the structure layout. One simple example from idpf with the XDP series applied (Clang 22-git, CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE => -O2): add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-27 (-27) Function old new delta idpf_vport_splitq_napi_poll 5076 5049 -27 The perf difference with XDP_DROP is around +0.8-1% which I see as more than satisfying. Suggested-by: Simon Horman Signed-off-by: Alexander Lobakin Tested-by: Ramu R Signed-off-by: Tony Nguyen --- include/net/libeth/xdp.h | 11 +---------- include/net/xdp.h | 28 +++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index f4880b50e804..bc3507edd589 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -1274,7 +1274,6 @@ bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer * head with the Rx buffer data: data pointer, length, headroom, and * truesize/tailroom. Zeroes the flags. - * Uses faster single u64 write instead of per-field access. */ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, const struct libeth_fqe *fqe, @@ -1282,17 +1281,9 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, { const struct page *page = __netmem_to_page(fqe->netmem); -#ifdef __LIBETH_WORD_ACCESS - static_assert(offsetofend(typeof(xdp->base), flags) - - offsetof(typeof(xdp->base), frame_sz) == - sizeof(u64)); - - *(u64 *)&xdp->base.frame_sz = fqe->truesize; -#else - xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); -#endif xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, pp_page_to_nmdesc(page)->pp->p.offset, len, true); + xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); } /** diff --git a/include/net/xdp.h b/include/net/xdp.h index b40f1f96cb11..af60e11b336c 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -85,8 +85,20 @@ struct xdp_buff { void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; - u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ - u32 flags; /* supported values defined in xdp_buff_flags */ + + union { + struct { + /* frame size to deduce data_hard_end/tailroom */ + u32 frame_sz; + /* supported values defined in xdp_buff_flags */ + u32 flags; + }; + +#ifdef __LITTLE_ENDIAN + /* Used to micro-optimize xdp_init_buff(), don't use directly */ + u64 frame_sz_flags_init; +#endif + }; }; static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) @@ -118,9 +130,19 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { - xdp->frame_sz = frame_sz; xdp->rxq = rxq; + +#ifdef __LITTLE_ENDIAN + /* + * Force the compilers to initialize ::flags and assign ::frame_sz with + * one write on 64-bit LE architectures as they're often unable to do + * it themselves. + */ + xdp->frame_sz_flags_init = frame_sz; +#else + xdp->frame_sz = frame_sz; xdp->flags = 0; +#endif } static __always_inline void -- cgit v1.2.3 From 3f5952917498e7bb9d227812d4349668f62c413b Mon Sep 17 00:00:00 2001 From: Per Larsen Date: Wed, 20 Aug 2025 01:10:09 +0000 Subject: KVM: arm64: Mask response to FFA_FEATURE call The minimum size and alignment boundary for FFA_RXTX_MAP is returned in bit[1:0]. Mask off any other bits in w2 when reading the minimum buffer size in hyp_ffa_post_init. Acked-by: Will Deacon Signed-off-by: Per Larsen Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/ffa.c | 2 +- include/linux/arm_ffa.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 96109aa99c48..a8ec1a94f3f8 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -739,7 +739,7 @@ static int hyp_ffa_post_init(void) if (res.a0 != FFA_SUCCESS) return -EOPNOTSUPP; - switch (res.a2) { + switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) { case FFA_FEAT_RXTX_MIN_SZ_4K: min_rxtx_sz = SZ_4K; break; diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index e1634897e159..cd7ee4df9045 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -128,6 +128,7 @@ #define FFA_FEAT_RXTX_MIN_SZ_4K 0 #define FFA_FEAT_RXTX_MIN_SZ_64K 1 #define FFA_FEAT_RXTX_MIN_SZ_16K 2 +#define FFA_FEAT_RXTX_MIN_SZ_MASK GENMASK(1, 0) /* FFA Bus/Device/Driver related */ struct ffa_device { -- cgit v1.2.3 From 6606c8c7e81886565f5cbdb0c0ce82e280c2b229 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:43:58 -0700 Subject: bitops: Add __attribute_const__ to generic ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute__const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to generic implementations of ffs(), __ffs(), fls(), and __fls() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested with x86_64 defconfig using GCC 14.2.0, which should validate the implementations when used by ARM, ARM64, LoongArch, Microblaze, NIOS2, and SPARC32 architectures. Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-2-kees@kernel.org Signed-off-by: Kees Cook --- include/asm-generic/bitops/__ffs.h | 2 +- include/asm-generic/bitops/__fls.h | 2 +- include/asm-generic/bitops/builtin-__ffs.h | 2 +- include/asm-generic/bitops/builtin-__fls.h | 2 +- include/asm-generic/bitops/builtin-fls.h | 2 +- include/asm-generic/bitops/ffs.h | 2 +- include/asm-generic/bitops/fls.h | 2 +- include/asm-generic/bitops/fls64.h | 4 ++-- include/linux/bitops.h | 2 +- lib/clz_ctz.c | 8 ++++---- 10 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bitops/__ffs.h b/include/asm-generic/bitops/__ffs.h index 2d08c750c8a7..3a899c626fdc 100644 --- a/include/asm-generic/bitops/__ffs.h +++ b/include/asm-generic/bitops/__ffs.h @@ -10,7 +10,7 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned int generic___ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned int generic___ffs(unsigned long word) { unsigned int num = 0; diff --git a/include/asm-generic/bitops/__fls.h b/include/asm-generic/bitops/__fls.h index e974ec932ec1..35f33780ca6c 100644 --- a/include/asm-generic/bitops/__fls.h +++ b/include/asm-generic/bitops/__fls.h @@ -10,7 +10,7 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned int generic___fls(unsigned long word) +static __always_inline __attribute_const__ unsigned int generic___fls(unsigned long word) { unsigned int num = BITS_PER_LONG - 1; diff --git a/include/asm-generic/bitops/builtin-__ffs.h b/include/asm-generic/bitops/builtin-__ffs.h index cf4b3d33bf96..d3c3f567045d 100644 --- a/include/asm-generic/bitops/builtin-__ffs.h +++ b/include/asm-generic/bitops/builtin-__ffs.h @@ -8,7 +8,7 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned int __ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned int __ffs(unsigned long word) { return __builtin_ctzl(word); } diff --git a/include/asm-generic/bitops/builtin-__fls.h b/include/asm-generic/bitops/builtin-__fls.h index 6d72fc8a5259..7770c4f1bfcd 100644 --- a/include/asm-generic/bitops/builtin-__fls.h +++ b/include/asm-generic/bitops/builtin-__fls.h @@ -8,7 +8,7 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned int __fls(unsigned long word) +static __always_inline __attribute_const__ unsigned int __fls(unsigned long word) { return (sizeof(word) * 8) - 1 - __builtin_clzl(word); } diff --git a/include/asm-generic/bitops/builtin-fls.h b/include/asm-generic/bitops/builtin-fls.h index c8455cc28841..be707da8c7cd 100644 --- a/include/asm-generic/bitops/builtin-fls.h +++ b/include/asm-generic/bitops/builtin-fls.h @@ -9,7 +9,7 @@ * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline __attribute_const__ int fls(unsigned int x) { return x ? sizeof(x) * 8 - __builtin_clz(x) : 0; } diff --git a/include/asm-generic/bitops/ffs.h b/include/asm-generic/bitops/ffs.h index 4c43f242daeb..5ff2b7fbda6d 100644 --- a/include/asm-generic/bitops/ffs.h +++ b/include/asm-generic/bitops/ffs.h @@ -10,7 +10,7 @@ * the libc and compiler builtin ffs routines, therefore * differs in spirit from ffz (man ffs). */ -static inline int generic_ffs(int x) +static inline __attribute_const__ int generic_ffs(int x) { int r = 1; diff --git a/include/asm-generic/bitops/fls.h b/include/asm-generic/bitops/fls.h index 26f3ce1dd6e4..8eed3437edb9 100644 --- a/include/asm-generic/bitops/fls.h +++ b/include/asm-generic/bitops/fls.h @@ -10,7 +10,7 @@ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int generic_fls(unsigned int x) +static __always_inline __attribute_const__ int generic_fls(unsigned int x) { int r = 32; diff --git a/include/asm-generic/bitops/fls64.h b/include/asm-generic/bitops/fls64.h index 866f2b2304ff..b5f58dd261a3 100644 --- a/include/asm-generic/bitops/fls64.h +++ b/include/asm-generic/bitops/fls64.h @@ -16,7 +16,7 @@ * at position 64. */ #if BITS_PER_LONG == 32 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { __u32 h = x >> 32; if (h) @@ -24,7 +24,7 @@ static __always_inline int fls64(__u64 x) return fls(x); } #elif BITS_PER_LONG == 64 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { if (x == 0) return 0; diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 9be2d50da09a..ea7898cc5903 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -267,7 +267,7 @@ static inline int parity8(u8 val) * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ -static inline unsigned int __ffs64(u64 word) +static inline __attribute_const__ unsigned int __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c index fb8c0c5c2bd2..8778ec44bf63 100644 --- a/lib/clz_ctz.c +++ b/lib/clz_ctz.c @@ -15,28 +15,28 @@ #include int __weak __ctzsi2(int val); -int __weak __ctzsi2(int val) +int __weak __attribute_const__ __ctzsi2(int val) { return __ffs(val); } EXPORT_SYMBOL(__ctzsi2); int __weak __clzsi2(int val); -int __weak __clzsi2(int val) +int __weak __attribute_const__ __clzsi2(int val) { return 32 - fls(val); } EXPORT_SYMBOL(__clzsi2); int __weak __clzdi2(u64 val); -int __weak __clzdi2(u64 val) +int __weak __attribute_const__ __clzdi2(u64 val) { return 64 - fls64(val); } EXPORT_SYMBOL(__clzdi2); int __weak __ctzdi2(u64 val); -int __weak __ctzdi2(u64 val) +int __weak __attribute_const__ __ctzdi2(u64 val) { return __ffs64(val); } -- cgit v1.2.3 From 4cd661c248b6671914ad59e16760bb6d908dfc61 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Wed, 13 Aug 2025 11:20:57 -0700 Subject: hyperv: Add missing field to hv_output_map_device_interrupt This field is unused, but the correct structure size is needed when computing the amount of space for the output argument to reside, so that it does not cross a page boundary. Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- include/hyperv/hvhdk_mini.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index 42e7876455b5..858f6a3925b3 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -301,6 +301,7 @@ struct hv_input_map_device_interrupt { /* HV_OUTPUT_MAP_DEVICE_INTERRUPT */ struct hv_output_map_device_interrupt { struct hv_interrupt_entry interrupt_entry; + u64 ext_status_deprecated[5]; } __packed; /* HV_INPUT_UNMAP_DEVICE_INTERRUPT */ -- cgit v1.2.3 From f26c9306dff818bbf4ef545c5a5ee0eca7149922 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Tue, 2 Sep 2025 16:48:33 -0700 Subject: mshv: Add support for a new parent partition configuration Detect booting as an "L1VH" partition. This is a new scenario very similar to root partition where the mshv_root driver can be used to create and manage guest partitions. It mostly works the same as root partition, but there are some differences in how various features are handled. hv_l1vh_partition() is introduced to handle these cases. Add hv_parent_partition() which returns true for either case, replacing some hv_root_partition() checks. Signed-off-by: Nuno Das Neves Acked-by: Wei Liu Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hv_common.c | 22 +++++++++++++--------- drivers/hv/mshv_root_main.c | 26 +++++++++++++------------- include/asm-generic/mshyperv.h | 11 +++++++++++ 3 files changed, 37 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 49898d10faff..e109a620c83f 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -257,7 +257,7 @@ static void hv_kmsg_dump_register(void) static inline bool hv_output_page_exists(void) { - return hv_root_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); + return hv_parent_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); } void __init hv_get_partition_id(void) @@ -377,7 +377,7 @@ int __init hv_common_init(void) BUG_ON(!hyperv_pcpu_output_arg); } - if (hv_root_partition()) { + if (hv_parent_partition()) { hv_synic_eventring_tail = alloc_percpu(u8 *); BUG_ON(!hv_synic_eventring_tail); } @@ -531,7 +531,7 @@ int hv_common_cpu_init(unsigned int cpu) if (msr_vp_index > hv_max_vp_index) hv_max_vp_index = msr_vp_index; - if (hv_root_partition()) { + if (hv_parent_partition()) { synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); *synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT, sizeof(u8), flags); @@ -558,7 +558,7 @@ int hv_common_cpu_die(unsigned int cpu) * originally allocated memory is reused in hv_common_cpu_init(). */ - if (hv_root_partition()) { + if (hv_parent_partition()) { synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail); kfree(*synic_eventring_tail); *synic_eventring_tail = NULL; @@ -729,13 +729,17 @@ void hv_identify_partition_type(void) * the root partition setting if also a Confidential VM. */ if ((ms_hyperv.priv_high & HV_CREATE_PARTITIONS) && - (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && !(ms_hyperv.priv_high & HV_ISOLATION)) { - pr_info("Hyper-V: running as root partition\n"); - if (IS_ENABLED(CONFIG_MSHV_ROOT)) - hv_curr_partition_type = HV_PARTITION_TYPE_ROOT; - else + + if (!IS_ENABLED(CONFIG_MSHV_ROOT)) { pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n"); + } else if (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) { + pr_info("Hyper-V: running as root partition\n"); + hv_curr_partition_type = HV_PARTITION_TYPE_ROOT; + } else { + pr_info("Hyper-V: running as L1VH partition\n"); + hv_curr_partition_type = HV_PARTITION_TYPE_L1VH; + } } } diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 72df774e410a..aa20a5c96afa 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -37,12 +37,6 @@ MODULE_AUTHOR("Microsoft"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); -/* TODO move this to mshyperv.h when needed outside driver */ -static inline bool hv_parent_partition(void) -{ - return hv_root_partition(); -} - /* TODO move this to another file when debugfs code is added */ enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */ #if defined(CONFIG_X86) @@ -2074,9 +2068,13 @@ static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) /* Retrieve and stash the supported scheduler type */ static int __init mshv_retrieve_scheduler_type(struct device *dev) { - int ret; + int ret = 0; + + if (hv_l1vh_partition()) + hv_scheduler_type = HV_SCHEDULER_TYPE_CORE_SMT; + else + ret = hv_retrieve_scheduler_type(&hv_scheduler_type); - ret = hv_retrieve_scheduler_type(&hv_scheduler_type); if (ret) return ret; @@ -2203,9 +2201,6 @@ static int __init mshv_root_partition_init(struct device *dev) { int err; - if (mshv_retrieve_scheduler_type(dev)) - return -ENODEV; - err = root_scheduler_init(dev); if (err) return err; @@ -2227,7 +2222,7 @@ static int __init mshv_parent_partition_init(void) struct device *dev; union hv_hypervisor_version_info version_info; - if (!hv_root_partition() || is_kdump_kernel()) + if (!hv_parent_partition() || is_kdump_kernel()) return -ENODEV; if (hv_get_hypervisor_version(&version_info)) @@ -2264,7 +2259,12 @@ static int __init mshv_parent_partition_init(void) mshv_cpuhp_online = ret; - ret = mshv_root_partition_init(dev); + ret = mshv_retrieve_scheduler_type(dev); + if (ret) + goto remove_cpu_state; + + if (hv_root_partition()) + ret = mshv_root_partition_init(dev); if (ret) goto remove_cpu_state; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index a729b77983fa..dbd4c2f3aee3 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -31,6 +31,7 @@ enum hv_partition_type { HV_PARTITION_TYPE_GUEST, HV_PARTITION_TYPE_ROOT, + HV_PARTITION_TYPE_L1VH, }; struct ms_hyperv_info { @@ -354,12 +355,22 @@ static inline bool hv_root_partition(void) { return hv_curr_partition_type == HV_PARTITION_TYPE_ROOT; } +static inline bool hv_l1vh_partition(void) +{ + return hv_curr_partition_type == HV_PARTITION_TYPE_L1VH; +} +static inline bool hv_parent_partition(void) +{ + return hv_root_partition() || hv_l1vh_partition(); +} int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); #else /* CONFIG_MSHV_ROOT */ static inline bool hv_root_partition(void) { return false; } +static inline bool hv_l1vh_partition(void) { return false; } +static inline bool hv_parent_partition(void) { return false; } static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) { return -EOPNOTSUPP; -- cgit v1.2.3 From 2d0ddbb65cef99aab241378b0f4ff2d6ea8c3a5a Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 31 Aug 2025 09:04:06 -0700 Subject: Drivers: hv: Simplify data structures for VMBus channel close message struct vmbus_close_msg is used for sending the VMBus channel close message. It contains a struct vmbus_channel_msginfo, which has a flex array member at the end. The latter's presence in the middle of struct vmbus_close_msg causes warnings when built with -Wflex-array-member-not-at-end. But the struct vmbus_channel_msginfo is unused because the Hyper-V host does not send a response to the channel close message. So remove the struct vmbus_channel_msginfo. Then, since the only remaining field is struct vmbus_channel_close_channel, also remove the containing struct vmbus_close_msg and directly use struct vmbus_channel_close_channel. Besides eliminating unnecessary complexity, these changes resolve the -Wflex-array-member-not-at-end warnings. Signed-off-by: Michael Kelley Reviewed-by: Tianyu Lan Signed-off-by: Wei Liu --- drivers/hv/channel.c | 2 +- include/linux/hyperv.h | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 7c7c66e0dc3f..162d6aeece7b 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -925,7 +925,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) /* Send a closing message */ - msg = &channel->close_msg.msg; + msg = &channel->close_msg; msg->header.msgtype = CHANNELMSG_CLOSECHANNEL; msg->child_relid = channel->offermsg.child_relid; diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index a59c5c3e95fb..59826c89171c 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -707,11 +707,6 @@ struct vmbus_channel_msginfo { unsigned char msg[]; }; -struct vmbus_close_msg { - struct vmbus_channel_msginfo info; - struct vmbus_channel_close_channel msg; -}; - enum vmbus_device_type { HV_IDE = 0, HV_SCSI, @@ -800,7 +795,7 @@ struct vmbus_channel { struct hv_ring_buffer_info outbound; /* send to parent */ struct hv_ring_buffer_info inbound; /* receive from parent */ - struct vmbus_close_msg close_msg; + struct vmbus_channel_close_channel close_msg; /* Statistics */ u64 interrupts; /* Host to Guest interrupts */ -- cgit v1.2.3 From ceac1fb2290d230eb83aff3761058c559440de13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:06 +0000 Subject: ipv6: snmp: do not use SNMP_MIB_SENTINEL anymore Use ARRAY_SIZE(), so that we know the limit at compile time. Following patch needs this preliminary change. Signed-off-by: Eric Dumazet Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250905165813.1470708-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 24 ++++++++++++++++++++++++ net/ipv6/proc.c | 43 ++++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 6dbd2bf8fa9c..a1624e8db1ab 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -338,6 +338,19 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } +#define snmp_get_cpu_field64_batch_cnt(buff64, stats_list, cnt, \ + mib_statistic, offset) \ +{ \ + int i, c; \ + for_each_possible_cpu(c) { \ + for (i = 0; i < cnt; i++) \ + buff64[i] += snmp_get_cpu_field64( \ + mib_statistic, \ + c, stats_list[i].entry, \ + offset); \ + } \ +} + #define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ { \ int i, c; \ @@ -349,6 +362,17 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } +#define snmp_get_cpu_field_batch_cnt(buff, stats_list, cnt, mib_statistic) \ +{ \ + int i, c; \ + for_each_possible_cpu(c) { \ + for (i = 0; i < cnt; i++) \ + buff[i] += snmp_get_cpu_field( \ + mib_statistic, \ + c, stats_list[i].entry); \ + } \ +} + static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) { u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index e96f14a36834..92ed04729c2f 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -85,7 +85,6 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_icmp6_list[] = { @@ -96,7 +95,6 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udp6_list[] = { @@ -109,7 +107,6 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udplite6_list[] = { @@ -121,7 +118,6 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) @@ -182,35 +178,37 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) */ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, - const struct snmp_mib *itemlist) + const struct snmp_mib *itemlist, + int cnt) { unsigned long buff[SNMP_MIB_MAX]; int i; if (pcpumib) { - memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); + memset(buff, 0, sizeof(unsigned long) * cnt); - snmp_get_cpu_field_batch(buff, itemlist, pcpumib); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, itemlist, cnt, pcpumib); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, buff[i]); } else { - for (i = 0; itemlist[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, - const struct snmp_mib *itemlist, size_t syncpoff) + const struct snmp_mib *itemlist, + int cnt, size_t syncpoff) { u64 buff64[SNMP_MIB_MAX]; int i; - memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX); + memset(buff64, 0, sizeof(u64) * cnt); - snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field64_batch_cnt(buff64, itemlist, cnt, mib, syncpoff); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } @@ -219,14 +217,19 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) struct net *net = (struct net *)seq->private; snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, - NULL, snmp6_icmp6_list); + NULL, snmp6_icmp6_list, + ARRAY_SIZE(snmp6_icmp6_list)); snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); snmp6_seq_show_item(seq, net->mib.udp_stats_in6, - NULL, snmp6_udp6_list); + NULL, snmp6_udp6_list, + ARRAY_SIZE(snmp6_udp6_list)); snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, - NULL, snmp6_udplite6_list); + NULL, snmp6_udplite6_list, + ARRAY_SIZE(snmp6_udplite6_list)); return 0; } @@ -236,9 +239,11 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); snmp6_seq_show_item64(seq, idev->stats.ipv6, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, - snmp6_icmp6_list); + snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list)); snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } -- cgit v1.2.3 From 20d3d26815441d03a9a15114729faaa54957baba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:13 +0000 Subject: net: snmp: remove SNMP_MIB_SENTINEL No more user of SNMP_MIB_SENTINEL, we can remove it. Also remove snmp_get_cpu_field[64]_batch() helpers. Signed-off-by: Eric Dumazet Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250905165813.1470708-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 23 ----------------------- include/net/snmp.h | 5 ----- 2 files changed, 28 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index a1624e8db1ab..380afb691c41 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -326,18 +326,6 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } #endif -#define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \ -{ \ - int i, c; \ - for_each_possible_cpu(c) { \ - for (i = 0; stats_list[i].name; i++) \ - buff64[i] += snmp_get_cpu_field64( \ - mib_statistic, \ - c, stats_list[i].entry, \ - offset); \ - } \ -} - #define snmp_get_cpu_field64_batch_cnt(buff64, stats_list, cnt, \ mib_statistic, offset) \ { \ @@ -351,17 +339,6 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } -#define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ -{ \ - int i, c; \ - for_each_possible_cpu(c) { \ - for (i = 0; stats_list[i].name; i++) \ - buff[i] += snmp_get_cpu_field( \ - mib_statistic, \ - c, stats_list[i].entry); \ - } \ -} - #define snmp_get_cpu_field_batch_cnt(buff, stats_list, cnt, mib_statistic) \ { \ int i, c; \ diff --git a/include/net/snmp.h b/include/net/snmp.h index 4cb4326dfebe..584e70742e9b 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -36,11 +36,6 @@ struct snmp_mib { .entry = _entry, \ } -#define SNMP_MIB_SENTINEL { \ - .name = NULL, \ - .entry = 0, \ -} - /* * We use unsigned longs for most mibs but u64 for ipstats. */ -- cgit v1.2.3 From 7ceb69ca82b1456a66783a1472d6e677e00065a1 Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Sat, 30 Aug 2025 14:14:58 +0800 Subject: ASoC: tas2781: Add tas2118, tas2x20, tas5825 support Add tas2020, tas2118, tas2120, tas2320, tas2570, tas2572, tas5825 tas5827 support in tas2781 driver. Tas2118, tas2x20, tas257x have no on-chip DSP, tas582x have on-chip DSP but have no calibration required stereo smart amplifier. Signed-off-by: Baojun Xu Acked-by: Mark Brown Signed-off-by: Takashi Iwai --- include/sound/tas2781.h | 14 +- include/sound/tas2x20-tlv.h | 259 ++++++++++++++++++++++++++++++++++ sound/soc/codecs/tas2781-comlib-i2c.c | 2 + sound/soc/codecs/tas2781-i2c.c | 184 +++++++++++++++++------- 4 files changed, 410 insertions(+), 49 deletions(-) create mode 100644 include/sound/tas2x20-tlv.h (limited to 'include') diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h index f0aefc04a957..ddd997ac3216 100644 --- a/include/sound/tas2781.h +++ b/include/sound/tas2781.h @@ -51,7 +51,9 @@ /* Software Reset, compatble with new device (TAS5825). */ #define TASDEVICE_REG_SWRESET TASDEVICE_REG(0x0, 0x0, 0x01) -#define TASDEVICE_REG_SWRESET_RESET (BIT(0) | BIT(4)) +#define TASDEVICE_REG_SWRESET_RESET BIT(0) + +#define TAS5825_REG_SWRESET_RESET (BIT(0) | BIT(4)) /* Checksum */ #define TASDEVICE_CHECKSUM_REG TASDEVICE_REG(0x0, 0x0, 0x7e) @@ -110,8 +112,17 @@ #define TAS2781_RUNTIME_RE_REG TASDEVICE_REG(0x64, 0x63, 0x44) enum audio_device { + TAS2020, + TAS2118, + TAS2120, + TAS2320, TAS2563, + TAS2570, + TAS2572, TAS2781, + TAS5825, + TAS5827, + TAS_OTHERS, }; enum dspbin_type { @@ -194,6 +205,7 @@ struct tasdevice_priv { unsigned char coef_binaryname[64]; unsigned char rca_binaryname[64]; unsigned char dev_name[32]; + const unsigned char (*dvc_tlv_table)[4]; const char *name_prefix; unsigned char ndev; unsigned int dspbin_typ; diff --git a/include/sound/tas2x20-tlv.h b/include/sound/tas2x20-tlv.h new file mode 100644 index 000000000000..6e6bcec4a0a1 --- /dev/null +++ b/include/sound/tas2x20-tlv.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// +// ALSA SoC Texas Instruments TAS2x20/TAS2118 Audio Smart Amplifier +// +// Copyright (C) 2025 Texas Instruments Incorporated +// https://www.ti.com +// +// The TAS2x20/TAS2118 hda driver implements for one, two, or even multiple +// TAS2x20/TAS2118 chips. +// +// Author: Baojun Xu +// + +#ifndef __TAS2X20_TLV_H__ +#define __TAS2X20_TLV_H__ + +#define TAS2X20_DVC_LEVEL TASDEVICE_REG(0x0, 0x2, 0x0c) +#define TAS2X20_AMP_LEVEL TASDEVICE_REG(0x0, 0x0, 0x07) + +static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2x20_dvc_tlv, 1650, 50, 0); +static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2x20_amp_tlv, 2100, 50, 0); + +/* pow(10, db/20) * pow(2,22) */ +static const __maybe_unused unsigned char tas2x20_dvc_table[][4] = { + { 0X00, 0X00, 0X0D, 0X00 }, /* -110.0db */ + { 0X00, 0X00, 0X0E, 0X00 }, /* -109.5db */ + { 0X00, 0X00, 0X0E, 0X00 }, /* -109.0db */ + { 0X00, 0X00, 0X0F, 0X00 }, /* -108.5db */ + { 0X00, 0X00, 0X10, 0X00 }, /* -108.0db */ + { 0X00, 0X00, 0X11, 0X00 }, /* -107.5db */ + { 0X00, 0X00, 0X12, 0X00 }, /* -107.0db */ + { 0X00, 0X00, 0X13, 0X00 }, /* -106.5db */ + { 0X00, 0X00, 0X15, 0X00 }, /* -106.0db */ + { 0X00, 0X00, 0X16, 0X00 }, /* -105.5db */ + { 0X00, 0X00, 0X17, 0X00 }, /* -105.0db */ + { 0X00, 0X00, 0X18, 0X00 }, /* -104.5db */ + { 0X00, 0X00, 0X1A, 0X00 }, /* -104.0db */ + { 0X00, 0X00, 0X1C, 0X00 }, /* -103.5db */ + { 0X00, 0X00, 0X1D, 0X00 }, /* -103.0db */ + { 0X00, 0X00, 0X1F, 0X00 }, /* -102.5db */ + { 0X00, 0X00, 0X21, 0X00 }, /* -102.0db */ + { 0X00, 0X00, 0X23, 0X00 }, /* -101.5db */ + { 0X00, 0X00, 0X25, 0X00 }, /* -101.0db */ + { 0X00, 0X00, 0X27, 0X00 }, /* -100.5db */ + { 0X00, 0X00, 0X29, 0X00 }, /* -100.0db */ + { 0X00, 0X00, 0X2C, 0X00 }, /* -99.5db */ + { 0X00, 0X00, 0X2F, 0X00 }, /* -99.0db */ + { 0X00, 0X00, 0X31, 0X00 }, /* -98.5db */ + { 0X00, 0X00, 0X34, 0X00 }, /* -98.0db */ + { 0X00, 0X00, 0X37, 0X00 }, /* -97.5db */ + { 0X00, 0X00, 0X3B, 0X00 }, /* -97.0db */ + { 0X00, 0X00, 0X3E, 0X00 }, /* -96.5db */ + { 0X00, 0X00, 0X42, 0X00 }, /* -96.0db */ + { 0X00, 0X00, 0X46, 0X00 }, /* -95.5db */ + { 0X00, 0X00, 0X4A, 0X00 }, /* -95.0db */ + { 0X00, 0X00, 0X4F, 0X00 }, /* -94.5db */ + { 0X00, 0X00, 0X53, 0X00 }, /* -94.0db */ + { 0X00, 0X00, 0X58, 0X00 }, /* -93.5db */ + { 0X00, 0X00, 0X5D, 0X00 }, /* -93.0db */ + { 0X00, 0X00, 0X63, 0X00 }, /* -92.5db */ + { 0X00, 0X00, 0X69, 0X00 }, /* -92.0db */ + { 0X00, 0X00, 0X6F, 0X00 }, /* -91.5db */ + { 0X00, 0X00, 0X76, 0X00 }, /* -91.0db */ + { 0X00, 0X00, 0X7D, 0X00 }, /* -90.5db */ + { 0X00, 0X00, 0X84, 0X00 }, /* -90.0db */ + { 0X00, 0X00, 0X8C, 0X00 }, /* -89.5db */ + { 0X00, 0X00, 0X94, 0X00 }, /* -89.0db */ + { 0X00, 0X00, 0X9D, 0X00 }, /* -88.5db */ + { 0X00, 0X00, 0XA6, 0X00 }, /* -88.0db */ + { 0X00, 0X00, 0XB0, 0X00 }, /* -87.5db */ + { 0X00, 0X00, 0XBB, 0X00 }, /* -87.0db */ + { 0X00, 0X00, 0XC6, 0X00 }, /* -86.5db */ + { 0X00, 0X00, 0XD2, 0X00 }, /* -86.0db */ + { 0X00, 0X00, 0XDE, 0X00 }, /* -85.5db */ + { 0X00, 0X00, 0XEB, 0X00 }, /* -85.0db */ + { 0X00, 0X00, 0XF9, 0X00 }, /* -84.5db */ + { 0X00, 0X01, 0X08, 0X00 }, /* -84.0db */ + { 0X00, 0X01, 0X18, 0X00 }, /* -83.5db */ + { 0X00, 0X01, 0X28, 0X00 }, /* -83.0db */ + { 0X00, 0X01, 0X3A, 0X00 }, /* -82.5db */ + { 0X00, 0X01, 0X4D, 0X00 }, /* -82.0db */ + { 0X00, 0X01, 0X60, 0X00 }, /* -81.5db */ + { 0X00, 0X01, 0X75, 0X00 }, /* -81.0db */ + { 0X00, 0X01, 0X8B, 0X00 }, /* -80.5db */ + { 0X00, 0X01, 0XA3, 0X00 }, /* -80.0db */ + { 0X00, 0X01, 0XBC, 0X00 }, /* -79.5db */ + { 0X00, 0X01, 0XD6, 0X00 }, /* -79.0db */ + { 0X00, 0X01, 0XF2, 0X00 }, /* -78.5db */ + { 0X00, 0X02, 0X10, 0X00 }, /* -78.0db */ + { 0X00, 0X02, 0X2F, 0X00 }, /* -77.5db */ + { 0X00, 0X02, 0X50, 0X00 }, /* -77.0db */ + { 0X00, 0X02, 0X73, 0X00 }, /* -76.5db */ + { 0X00, 0X02, 0X98, 0X00 }, /* -76.0db */ + { 0X00, 0X02, 0XC0, 0X00 }, /* -75.5db */ + { 0X00, 0X02, 0XE9, 0X00 }, /* -75.0db */ + { 0X00, 0X03, 0X16, 0X00 }, /* -74.5db */ + { 0X00, 0X03, 0X44, 0X00 }, /* -74.0db */ + { 0X00, 0X03, 0X76, 0X00 }, /* -73.5db */ + { 0X00, 0X03, 0XAA, 0X00 }, /* -73.0db */ + { 0X00, 0X03, 0XE2, 0X00 }, /* -72.5db */ + { 0X00, 0X04, 0X1D, 0X00 }, /* -72.0db */ + { 0X00, 0X04, 0X5B, 0X00 }, /* -71.5db */ + { 0X00, 0X04, 0X9E, 0X00 }, /* -71.0db */ + { 0X00, 0X04, 0XE4, 0X00 }, /* -70.5db */ + { 0X00, 0X05, 0X2E, 0X00 }, /* -70.0db */ + { 0X00, 0X05, 0X7C, 0X00 }, /* -69.5db */ + { 0X00, 0X05, 0XD0, 0X00 }, /* -69.0db */ + { 0X00, 0X06, 0X28, 0X00 }, /* -68.5db */ + { 0X00, 0X06, 0X85, 0X00 }, /* -68.0db */ + { 0X00, 0X06, 0XE8, 0X00 }, /* -67.5db */ + { 0X00, 0X07, 0X51, 0X00 }, /* -67.0db */ + { 0X00, 0X07, 0XC0, 0X00 }, /* -66.5db */ + { 0X00, 0X08, 0X36, 0X00 }, /* -66.0db */ + { 0X00, 0X08, 0XB2, 0X00 }, /* -65.5db */ + { 0X00, 0X09, 0X36, 0X00 }, /* -65.0db */ + { 0X00, 0X09, 0XC2, 0X00 }, /* -64.5db */ + { 0X00, 0X0A, 0X56, 0X00 }, /* -64.0db */ + { 0X00, 0X0A, 0XF3, 0X00 }, /* -63.5db */ + { 0X00, 0X0B, 0X99, 0X00 }, /* -63.0db */ + { 0X00, 0X0C, 0X49, 0X00 }, /* -62.5db */ + { 0X00, 0X0D, 0X03, 0X00 }, /* -62.0db */ + { 0X00, 0X0D, 0XC9, 0X00 }, /* -61.5db */ + { 0X00, 0X0E, 0X9A, 0X00 }, /* -61.0db */ + { 0X00, 0X0F, 0X77, 0X00 }, /* -60.5db */ + { 0X00, 0X10, 0X62, 0X00 }, /* -60.0db */ + { 0X00, 0X11, 0X5A, 0X00 }, /* -59.5db */ + { 0X00, 0X12, 0X62, 0X00 }, /* -59.0db */ + { 0X00, 0X13, 0X78, 0X00 }, /* -58.5db */ + { 0X00, 0X14, 0XA0, 0X00 }, /* -58.0db */ + { 0X00, 0X15, 0XD9, 0X00 }, /* -57.5db */ + { 0X00, 0X17, 0X24, 0X00 }, /* -57.0db */ + { 0X00, 0X18, 0X83, 0X00 }, /* -56.5db */ + { 0X00, 0X19, 0XF7, 0X00 }, /* -56.0db */ + { 0X00, 0X1B, 0X81, 0X00 }, /* -55.5db */ + { 0X00, 0X1D, 0X22, 0X00 }, /* -55.0db */ + { 0X00, 0X1E, 0XDC, 0X00 }, /* -54.5db */ + { 0X00, 0X20, 0XB0, 0X00 }, /* -54.0db */ + { 0X00, 0X22, 0XA0, 0X00 }, /* -53.5db */ + { 0X00, 0X24, 0XAD, 0X00 }, /* -53.0db */ + { 0X00, 0X26, 0XDA, 0X00 }, /* -52.5db */ + { 0X00, 0X29, 0X27, 0X00 }, /* -52.0db */ + { 0X00, 0X2B, 0X97, 0X00 }, /* -51.5db */ + { 0X00, 0X2E, 0X2D, 0X00 }, /* -51.0db */ + { 0X00, 0X30, 0XE9, 0X00 }, /* -50.5db */ + { 0X00, 0X33, 0XCF, 0X00 }, /* -50.0db */ + { 0X00, 0X36, 0XE1, 0X00 }, /* -49.5db */ + { 0X00, 0X3A, 0X21, 0X00 }, /* -49.0db */ + { 0X00, 0X3D, 0X93, 0X00 }, /* -48.5db */ + { 0X00, 0X41, 0X39, 0X00 }, /* -48.0db */ + { 0X00, 0X45, 0X17, 0X00 }, /* -47.5db */ + { 0X00, 0X49, 0X2F, 0X00 }, /* -47.0db */ + { 0X00, 0X4D, 0X85, 0X00 }, /* -46.5db */ + { 0X00, 0X52, 0X1D, 0X00 }, /* -46.0db */ + { 0X00, 0X56, 0XFA, 0X00 }, /* -45.5db */ + { 0X00, 0X5C, 0X22, 0X00 }, /* -45.0db */ + { 0X00, 0X61, 0X97, 0X00 }, /* -44.5db */ + { 0X00, 0X67, 0X60, 0X00 }, /* -44.0db */ + { 0X00, 0X6D, 0X80, 0X00 }, /* -43.5db */ + { 0X00, 0X73, 0XFD, 0X00 }, /* -43.0db */ + { 0X00, 0X7A, 0XDC, 0X00 }, /* -42.5db */ + { 0X00, 0X82, 0X24, 0X00 }, /* -42.0db */ + { 0X00, 0X89, 0XDA, 0X00 }, /* -41.5db */ + { 0X00, 0X92, 0X05, 0X00 }, /* -41.0db */ + { 0X00, 0X9A, 0XAC, 0X00 }, /* -40.5db */ + { 0X00, 0XA3, 0XD7, 0X00 }, /* -40.0db */ + { 0X00, 0XAD, 0X8C, 0X00 }, /* -39.5db */ + { 0X00, 0XB7, 0XD4, 0X00 }, /* -39.0db */ + { 0X00, 0XC2, 0XB9, 0X00 }, /* -38.5db */ + { 0X00, 0XCE, 0X43, 0X00 }, /* -38.0db */ + { 0X00, 0XDA, 0X7B, 0X00 }, /* -37.5db */ + { 0X00, 0XE7, 0X6E, 0X00 }, /* -37.0db */ + { 0X00, 0XF5, 0X24, 0X00 }, /* -36.5db */ + { 0X01, 0X03, 0XAB, 0X00 }, /* -36.0db */ + { 0X01, 0X13, 0X0E, 0X00 }, /* -35.5db */ + { 0X01, 0X23, 0X5A, 0X00 }, /* -35.0db */ + { 0X01, 0X34, 0X9D, 0X00 }, /* -34.5db */ + { 0X01, 0X46, 0XE7, 0X00 }, /* -34.0db */ + { 0X01, 0X5A, 0X46, 0X00 }, /* -33.5db */ + { 0X01, 0X6E, 0XCA, 0X00 }, /* -33.0db */ + { 0X01, 0X84, 0X86, 0X00 }, /* -32.5db */ + { 0X01, 0X9B, 0X8C, 0X00 }, /* -32.0db */ + { 0X01, 0XB3, 0XEE, 0X00 }, /* -31.5db */ + { 0X01, 0XCD, 0XC3, 0X00 }, /* -31.0db */ + { 0X01, 0XE9, 0X20, 0X00 }, /* -30.5db */ + { 0X02, 0X06, 0X1B, 0X00 }, /* -30.0db */ + { 0X02, 0X24, 0XCE, 0X00 }, /* -29.5db */ + { 0X02, 0X45, 0X53, 0X00 }, /* -29.0db */ + { 0X02, 0X67, 0XC5, 0X00 }, /* -28.5db */ + { 0X02, 0X8C, 0X42, 0X00 }, /* -28.0db */ + { 0X02, 0XB2, 0XE8, 0X00 }, /* -27.5db */ + { 0X02, 0XDB, 0XD8, 0X00 }, /* -27.0db */ + { 0X03, 0X07, 0X36, 0X00 }, /* -26.5db */ + { 0X03, 0X35, 0X25, 0X00 }, /* -26.0db */ + { 0X03, 0X65, 0XCD, 0X00 }, /* -25.5db */ + { 0X03, 0X99, 0X57, 0X00 }, /* -25.0db */ + { 0X03, 0XCF, 0XEE, 0X00 }, /* -24.5db */ + { 0X04, 0X09, 0XC2, 0X00 }, /* -24.0db */ + { 0X04, 0X47, 0X03, 0X00 }, /* -23.5db */ + { 0X04, 0X87, 0XE5, 0X00 }, /* -23.0db */ + { 0X04, 0XCC, 0XA0, 0X00 }, /* -22.5db */ + { 0X05, 0X15, 0X6D, 0X00 }, /* -22.0db */ + { 0X05, 0X62, 0X8A, 0X00 }, /* -21.5db */ + { 0X05, 0XB4, 0X39, 0X00 }, /* -21.0db */ + { 0X06, 0X0A, 0XBF, 0X00 }, /* -20.5db */ + { 0X06, 0X66, 0X66, 0X00 }, /* -20.0db */ + { 0X06, 0XC7, 0X7B, 0X00 }, /* -19.5db */ + { 0X07, 0X2E, 0X50, 0X00 }, /* -19.0db */ + { 0X07, 0X9B, 0X3D, 0X00 }, /* -18.5db */ + { 0X08, 0X0E, 0X9F, 0X00 }, /* -18.0db */ + { 0X08, 0X88, 0XD7, 0X00 }, /* -17.5db */ + { 0X09, 0X0A, 0X4D, 0X00 }, /* -17.0db */ + { 0X09, 0X93, 0X6E, 0X00 }, /* -16.5db */ + { 0X0A, 0X24, 0XB0, 0X00 }, /* -16.0db */ + { 0X0A, 0XBE, 0X8D, 0X00 }, /* -15.5db */ + { 0X0B, 0X61, 0X88, 0X00 }, /* -15.0db */ + { 0X0C, 0X0E, 0X2B, 0X00 }, /* -14.5db */ + { 0X0C, 0XC5, 0X09, 0X00 }, /* -14.0db */ + { 0X0D, 0X86, 0XBD, 0X00 }, /* -13.5db */ + { 0X0E, 0X53, 0XEB, 0X00 }, /* -13.0db */ + { 0X0F, 0X2D, 0X42, 0X00 }, /* -12.5db */ + { 0X10, 0X13, 0X79, 0X00 }, /* -12.0db */ + { 0X11, 0X07, 0X54, 0X00 }, /* -11.5db */ + { 0X12, 0X09, 0XA3, 0X00 }, /* -11.0db */ + { 0X13, 0X1B, 0X40, 0X00 }, /* -10.5db */ + { 0X14, 0X3D, 0X13, 0X00 }, /* -10.0db */ + { 0X15, 0X70, 0X12, 0X00 }, /* -9.5db */ + { 0X16, 0XB5, 0X43, 0X00 }, /* -9.0db */ + { 0X18, 0X0D, 0XB8, 0X00 }, /* -8.5db */ + { 0X19, 0X7A, 0X96, 0X00 }, /* -8.0db */ + { 0X1A, 0XFD, 0X13, 0X00 }, /* -7.5db */ + { 0X1C, 0X96, 0X76, 0X00 }, /* -7.0db */ + { 0X1E, 0X48, 0X1C, 0X00 }, /* -6.5db */ + { 0X20, 0X13, 0X73, 0X00 }, /* -6.0db */ + { 0X21, 0XFA, 0X02, 0X00 }, /* -5.5db */ + { 0X23, 0XFD, 0X66, 0X00 }, /* -5.0db */ + { 0X26, 0X1F, 0X54, 0X00 }, /* -4.5db */ + { 0X28, 0X61, 0X9A, 0X00 }, /* -4.0db */ + { 0X2A, 0XC6, 0X25, 0X00 }, /* -3.5db */ + { 0X2D, 0X4E, 0XFB, 0X00 }, /* -3.0db */ + { 0X2F, 0XFE, 0X44, 0X00 }, /* -2.5db */ + { 0X32, 0XD6, 0X46, 0X00 }, /* -2.0db */ + { 0X35, 0XD9, 0X6B, 0X00 }, /* -1.5db */ + { 0X39, 0X0A, 0X41, 0X00 }, /* -1.0db */ + { 0X3C, 0X6B, 0X7E, 0X00 }, /* -0.5db */ + { 0X40, 0X00, 0X00, 0X00 }, /* 0.0db */ + { 0X43, 0XCA, 0XD0, 0X00 }, /* 0.5db */ + { 0X47, 0XCF, 0X26, 0X00 }, /* 1.0db */ + { 0X4C, 0X10, 0X6B, 0X00 }, /* 1.5db */ + { 0X50, 0X92, 0X3B, 0X00 }, /* 2.0db */ + { 0X55, 0X58, 0X6A, 0X00 }, /* 2.5db */ + { 0X5A, 0X67, 0X03, 0X00 }, /* 3.0db */ + { 0X5F, 0XC2, 0X53, 0X00 }, /* 3.5db */ + { 0X65, 0X6E, 0XE3, 0X00 }, /* 4.0db */ + { 0X6B, 0X71, 0X86, 0X00 }, /* 4.5db */ + { 0X71, 0XCF, 0X54, 0X00 }, /* 5.0db */ + { 0X78, 0X8D, 0XB4, 0X00 }, /* 5.5db */ + { 0X7F, 0XB2, 0X61, 0X00 }, /* 6.0db */ +}; +#endif diff --git a/sound/soc/codecs/tas2781-comlib-i2c.c b/sound/soc/codecs/tas2781-comlib-i2c.c index c078bb0a8437..b3fd7350143b 100644 --- a/sound/soc/codecs/tas2781-comlib-i2c.c +++ b/sound/soc/codecs/tas2781-comlib-i2c.c @@ -320,6 +320,8 @@ void tasdevice_reset(struct tasdevice_priv *tas_dev) for (i = 0; i < tas_dev->ndev; i++) { ret = tasdevice_dev_write(tas_dev, i, TASDEVICE_REG_SWRESET, + tas_dev->chip_id >= TAS5825 ? + TAS5825_REG_SWRESET_RESET : TASDEVICE_REG_SWRESET_RESET); if (ret < 0) dev_err(tas_dev->dev, diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c index 0e09d794516f..0b8e3638c2ac 100644 --- a/sound/soc/codecs/tas2781-i2c.c +++ b/sound/soc/codecs/tas2781-i2c.c @@ -30,8 +30,10 @@ #include #include #include +#include #include #include +#include #include #define X2563_CL_STT_VAL(xreg, xval) \ @@ -98,16 +100,32 @@ static const struct bulk_reg_val tas2781_cali_start_reg[] = { }; static const struct i2c_device_id tasdevice_id[] = { + { "tas2020", TAS2020 }, + { "tas2118", TAS2118 }, + { "tas2120", TAS2120 }, + { "tas2320", TAS2320 }, { "tas2563", TAS2563 }, + { "tas2570", TAS2570 }, + { "tas2572", TAS2572 }, { "tas2781", TAS2781 }, + { "tas5825", TAS5825 }, + { "tas5827", TAS5827 }, {} }; MODULE_DEVICE_TABLE(i2c, tasdevice_id); #ifdef CONFIG_OF static const struct of_device_id tasdevice_of_match[] = { + { .compatible = "ti,tas2020" }, + { .compatible = "ti,tas2118" }, + { .compatible = "ti,tas2120" }, + { .compatible = "ti,tas2320" }, { .compatible = "ti,tas2563" }, + { .compatible = "ti,tas2570" }, + { .compatible = "ti,tas2572" }, { .compatible = "ti,tas2781" }, + { .compatible = "ti,tas5825" }, + { .compatible = "ti,tas5827" }, {}, }; MODULE_DEVICE_TABLE(of, tasdevice_of_match); @@ -797,7 +815,7 @@ static int tasdev_nop_get( return 0; } -static int tas2563_digital_gain_get( +static int tasdevice_digital_gain_get( struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { @@ -823,15 +841,15 @@ static int tas2563_digital_gain_get( while (r > 1 + l) { mid = (l + r) / 2; - ar_mid = get_unaligned_be32(tas2563_dvc_table[mid]); + ar_mid = get_unaligned_be32(tas_dev->dvc_tlv_table[mid]); if (target < ar_mid) r = mid; else l = mid; } - ar_l = get_unaligned_be32(tas2563_dvc_table[l]); - ar_r = get_unaligned_be32(tas2563_dvc_table[r]); + ar_l = get_unaligned_be32(tas_dev->dvc_tlv_table[l]); + ar_r = get_unaligned_be32(tas_dev->dvc_tlv_table[r]); /* find out the member same as or closer to the current volume */ ucontrol->value.integer.value[0] = @@ -841,7 +859,7 @@ out: return 0; } -static int tas2563_digital_gain_put( +static int tasdevice_digital_gain_put( struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { @@ -867,7 +885,7 @@ static int tas2563_digital_gain_put( } volrd = get_unaligned_be32(&data[0]); - volwr = get_unaligned_be32(tas2563_dvc_table[vol]); + volwr = get_unaligned_be32(tas_dev->dvc_tlv_table[vol]); if (volrd == volwr) { rc = 0; @@ -876,7 +894,7 @@ static int tas2563_digital_gain_put( for (i = 0; i < tas_dev->ndev; i++) { ret = tasdevice_dev_bulk_write(tas_dev, i, reg, - (unsigned char *)tas2563_dvc_table[vol], 4); + (unsigned char *)tas_dev->dvc_tlv_table[vol], 4); if (ret) { dev_err(tas_dev->dev, "%s, set digital vol error in dev %d\n", @@ -892,11 +910,6 @@ out: return rc; } -static const struct snd_kcontrol_new tasdevice_snd_controls[] = { - SOC_SINGLE_BOOL_EXT("Speaker Force Firmware Load", 0, - tasdev_force_fwload_get, tasdev_force_fwload_put), -}; - static const struct snd_kcontrol_new tasdevice_cali_controls[] = { SOC_SINGLE_EXT("Calibration Stop", SND_SOC_NOPM, 0, 1, 0, tasdev_nop_get, tasdev_calib_stop_put), @@ -907,6 +920,16 @@ static const struct snd_kcontrol_new tasdevice_cali_controls[] = { SND_SOC_BYTES_EXT("Amp XMA2 Data", 6, tasdev_XMA2_data_get, NULL), }; +static const struct snd_kcontrol_new tas2x20_snd_controls[] = { + SOC_SINGLE_RANGE_EXT_TLV("Speaker Analog Volume", TAS2X20_AMP_LEVEL, + 0, 0, 42, 1, tas2781_amp_getvol, + tas2781_amp_putvol, tas2x20_amp_tlv), + SOC_SINGLE_RANGE_EXT_TLV("Speaker Digital Volume", TAS2X20_DVC_LEVEL, + 0, 0, ARRAY_SIZE(tas2x20_dvc_table) - 1, 0, + tasdevice_digital_gain_get, tasdevice_digital_gain_put, + tas2x20_dvc_tlv), +}; + static const struct snd_kcontrol_new tas2781_snd_controls[] = { SOC_SINGLE_RANGE_EXT_TLV("Speaker Analog Volume", TAS2781_AMP_LEVEL, 1, 0, 20, 0, tas2781_amp_getvol, @@ -916,6 +939,15 @@ static const struct snd_kcontrol_new tas2781_snd_controls[] = { tas2781_digital_putvol, tas2781_dvc_tlv), }; +static const struct snd_kcontrol_new tas5825_snd_controls[] = { + SOC_SINGLE_RANGE_EXT_TLV("Speaker Analog Volume", TAS5825_AMP_LEVEL, + 0, 0, 31, 1, tas2781_amp_getvol, + tas2781_amp_putvol, tas5825_amp_tlv), + SOC_SINGLE_RANGE_EXT_TLV("Speaker Digital Volume", TAS5825_DVC_LEVEL, + 0, 0, 254, 1, tas2781_amp_getvol, + tas2781_amp_putvol, tas5825_dvc_tlv), +}; + static const struct snd_kcontrol_new tas2781_cali_controls[] = { SND_SOC_BYTES_EXT("Amp Latch Data", 3, tas2781_latch_reg_get, NULL), }; @@ -923,7 +955,7 @@ static const struct snd_kcontrol_new tas2781_cali_controls[] = { static const struct snd_kcontrol_new tas2563_snd_controls[] = { SOC_SINGLE_RANGE_EXT_TLV("Speaker Digital Volume", TAS2563_DVC_LVL, 0, 0, ARRAY_SIZE(tas2563_dvc_table) - 1, 0, - tas2563_digital_gain_get, tas2563_digital_gain_put, + tasdevice_digital_gain_get, tasdevice_digital_gain_put, tas2563_dvc_tlv), }; @@ -968,8 +1000,8 @@ static int tasdevice_info_chip_id(struct snd_kcontrol *kcontrol, { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; - uinfo->value.integer.min = TAS2563; - uinfo->value.integer.max = TAS2781; + uinfo->value.integer.min = TAS2020; + uinfo->value.integer.max = TAS_OTHERS; return 0; } @@ -1168,9 +1200,9 @@ static int tasdevice_active_num_put(struct snd_kcontrol *kcontrol, static int tasdevice_dsp_create_ctrls(struct tasdevice_priv *tas_priv) { struct snd_kcontrol_new *dsp_ctrls; - char *active_dev_num, *chip_id; + char *active_dev_num, *chip_id, *fw_load; char *conf_name, *prog_name; - int nr_controls = 4; + int nr_controls = 5; int mix_index = 0; /* Alloc kcontrol via devm_kzalloc, which don't manually @@ -1228,6 +1260,19 @@ static int tasdevice_dsp_create_ctrls(struct tasdevice_priv *tas_priv) dsp_ctrls[mix_index].get = tasdevice_get_chip_id; mix_index++; + fw_load = devm_kstrdup(tas_priv->dev, "Speaker Force Firmware Load", + GFP_KERNEL); + if (!fw_load) + return -ENOMEM; + + dsp_ctrls[mix_index].name = fw_load; + dsp_ctrls[mix_index].iface = SNDRV_CTL_ELEM_IFACE_MIXER; + dsp_ctrls[mix_index].info = snd_soc_info_bool_ext; + dsp_ctrls[mix_index].put = tasdev_force_fwload_put; + dsp_ctrls[mix_index].get = tasdev_force_fwload_get; + dsp_ctrls[mix_index].private_value = 0UL; + mix_index++; + return snd_soc_add_component_controls(tas_priv->codec, dsp_ctrls, nr_controls < mix_index ? nr_controls : mix_index); } @@ -1587,6 +1632,16 @@ static void tasdevice_fw_ready(const struct firmware *fmw, * failing to load DSP firmware is NOT an error. */ tas_priv->fw_state = TASDEVICE_RCA_FW_OK; + /* There is no DSP firmware required for TAS2118/2X20/257X. */ + switch (tas_priv->chip_id) { + case TAS2020: + case TAS2118: + case TAS2120: + case TAS2320: + case TAS2570: + case TAS2572: + goto out; + } if (tas_priv->name_prefix) scnprintf(tas_priv->coef_binaryname, 64, "%s-%s_coef.bin", tas_priv->name_prefix, tas_priv->dev_name); @@ -1608,34 +1663,37 @@ static void tasdevice_fw_ready(const struct firmware *fmw, dev_err(tas_priv->dev, "dsp controls error\n"); goto out; } - - ret = tasdevice_create_cali_ctrls(tas_priv); - if (ret) { - dev_err(tas_priv->dev, "cali controls error\n"); - goto out; - } - tas_priv->fw_state = TASDEVICE_DSP_FW_ALL_OK; - /* If calibrated data occurs error, dsp will still works with default - * calibrated data inside algo. - */ - for (i = 0; i < tas_priv->ndev; i++) { - if (tas_priv->name_prefix) - scnprintf(tas_priv->cal_binaryname[i], 64, - "%s-%s_cal_0x%02x.bin", tas_priv->name_prefix, - tas_priv->dev_name, - tas_priv->tasdevice[i].dev_addr); - else - scnprintf(tas_priv->cal_binaryname[i], 64, - "%s_cal_0x%02x.bin", tas_priv->dev_name, - tas_priv->tasdevice[i].dev_addr); - ret = tas2781_load_calibration(tas_priv, - tas_priv->cal_binaryname[i], i); - if (ret != 0) - dev_err(tas_priv->dev, - "%s: load %s error, default will effect\n", - __func__, tas_priv->cal_binaryname[i]); + /* There is no calibration required for TAS5825/TAS5827. */ + if (tas_priv->chip_id < TAS5825) { + ret = tasdevice_create_cali_ctrls(tas_priv); + if (ret) { + dev_err(tas_priv->dev, "cali controls error\n"); + goto out; + } + /* If calibrated data occurs error, dsp will still works + * with default calibrated data inside algo. + */ + for (i = 0; i < tas_priv->ndev; i++) { + if (tas_priv->name_prefix) + scnprintf(tas_priv->cal_binaryname[i], 64, + "%s-%s_cal_0x%02x.bin", + tas_priv->name_prefix, + tas_priv->dev_name, + tas_priv->tasdevice[i].dev_addr); + else + scnprintf(tas_priv->cal_binaryname[i], 64, + "%s_cal_0x%02x.bin", + tas_priv->dev_name, + tas_priv->tasdevice[i].dev_addr); + ret = tas2781_load_calibration(tas_priv, + tas_priv->cal_binaryname[i], i); + if (ret != 0) + dev_err(tas_priv->dev, + "%s: load %s error, keep default.\n", + __func__, tas_priv->cal_binaryname[i]); + } } tasdevice_prmg_load(tas_priv, 0); @@ -1653,8 +1711,14 @@ static void tasdevice_fw_ready(const struct firmware *fmw, #endif out: if (tas_priv->fw_state == TASDEVICE_RCA_FW_OK) { - /* If DSP FW fail, DSP kcontrol won't be created. */ - tasdevice_dsp_remove(tas_priv); + switch (tas_priv->chip_id) { + case TAS2563: + case TAS2781: + case TAS5825: + case TAS5827: + /* If DSP FW fail, DSP kcontrol won't be created. */ + tasdevice_dsp_remove(tas_priv); + } } mutex_unlock(&tas_priv->codec_lock); release_firmware(fmw); @@ -1798,13 +1862,30 @@ static int tasdevice_codec_probe(struct snd_soc_component *codec) int rc; switch (tas_priv->chip_id) { + case TAS2020: + case TAS2118: + case TAS2120: + case TAS2320: + case TAS2570: + case TAS2572: + p = (struct snd_kcontrol_new *)tas2x20_snd_controls; + size = ARRAY_SIZE(tas2x20_snd_controls); + tas_priv->dvc_tlv_table = tas2x20_dvc_table; + break; case TAS2781: p = (struct snd_kcontrol_new *)tas2781_snd_controls; size = ARRAY_SIZE(tas2781_snd_controls); break; + case TAS5825: + case TAS5827: + p = (struct snd_kcontrol_new *)tas5825_snd_controls; + size = ARRAY_SIZE(tas5825_snd_controls); + break; default: p = (struct snd_kcontrol_new *)tas2563_snd_controls; size = ARRAY_SIZE(tas2563_snd_controls); + tas_priv->dvc_tlv_table = tas2563_dvc_table; + break; } rc = snd_soc_add_component_controls(codec, p, size); @@ -1844,8 +1925,6 @@ static const struct snd_soc_component_driver soc_codec_driver_tasdevice = { .probe = tasdevice_codec_probe, .remove = tasdevice_codec_remove, - .controls = tasdevice_snd_controls, - .num_controls = ARRAY_SIZE(tasdevice_snd_controls), .dapm_widgets = tasdevice_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(tasdevice_dapm_widgets), .dapm_routes = tasdevice_audio_map, @@ -1961,7 +2040,16 @@ static void tasdevice_i2c_remove(struct i2c_client *client) #ifdef CONFIG_ACPI static const struct acpi_device_id tasdevice_acpi_match[] = { - { "TAS2781", TAS2781 }, + { "TXNW2020", TAS2020 }, + { "TXNW2118", TAS2118 }, + { "TXNW2120", TAS2120 }, + { "TXNW2320", TAS2320 }, + { "TXNW2563", TAS2563 }, + { "TXNW2570", TAS2570 }, + { "TXNW2572", TAS2572 }, + { "TXNW2781", TAS2781 }, + { "TXNW5825", TAS5825 }, + { "TXNW5827", TAS5827 }, {}, }; -- cgit v1.2.3 From faf23f54d366467bb449a7b2c39b382db9f92e80 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 12 Aug 2025 17:17:06 +0300 Subject: ptp: Add ioctl commands to expose raw cycle counter values Introduce two new ioctl commands, PTP_SYS_OFFSET_PRECISE_CYCLES and PTP_SYS_OFFSET_EXTENDED_CYCLES, to allow user space to access the raw free-running cycle counter from PTP devices. These ioctls are variants of the existing PRECISE and EXTENDED offset queries, but instead of returning device time in realtime, they return the raw cycle counter value. Signed-off-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Acked-by: Richard Cochran Link: https://patch.msgid.link/1755008228-88881-2-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_chardev.c | 34 ++++++++++++++++++++++++++-------- include/uapi/linux/ptp_clock.h | 4 ++++ 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 4ca5a464a46a..e9719f365aab 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -285,17 +285,21 @@ static long ptp_enable_pps(struct ptp_clock *ptp, bool enable) return ops->enable(ops, &req, enable); } -static long ptp_sys_offset_precise(struct ptp_clock *ptp, void __user *arg) +typedef int (*ptp_crosststamp_fn)(struct ptp_clock_info *, + struct system_device_crosststamp *); + +static long ptp_sys_offset_precise(struct ptp_clock *ptp, void __user *arg, + ptp_crosststamp_fn crosststamp_fn) { struct ptp_sys_offset_precise precise_offset; struct system_device_crosststamp xtstamp; struct timespec64 ts; int err; - if (!ptp->info->getcrosststamp) + if (!crosststamp_fn) return -EOPNOTSUPP; - err = ptp->info->getcrosststamp(ptp->info, &xtstamp); + err = crosststamp_fn(ptp->info, &xtstamp); if (err) return err; @@ -313,12 +317,17 @@ static long ptp_sys_offset_precise(struct ptp_clock *ptp, void __user *arg) return copy_to_user(arg, &precise_offset, sizeof(precise_offset)) ? -EFAULT : 0; } -static long ptp_sys_offset_extended(struct ptp_clock *ptp, void __user *arg) +typedef int (*ptp_gettimex_fn)(struct ptp_clock_info *, + struct timespec64 *, + struct ptp_system_timestamp *); + +static long ptp_sys_offset_extended(struct ptp_clock *ptp, void __user *arg, + ptp_gettimex_fn gettimex_fn) { struct ptp_sys_offset_extended *extoff __free(kfree) = NULL; struct ptp_system_timestamp sts; - if (!ptp->info->gettimex64) + if (!gettimex_fn) return -EOPNOTSUPP; extoff = memdup_user(arg, sizeof(*extoff)); @@ -346,7 +355,7 @@ static long ptp_sys_offset_extended(struct ptp_clock *ptp, void __user *arg) struct timespec64 ts; int err; - err = ptp->info->gettimex64(ptp->info, &ts, &sts); + err = gettimex_fn(ptp->info, &ts, &sts); if (err) return err; @@ -497,11 +506,13 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, case PTP_SYS_OFFSET_PRECISE: case PTP_SYS_OFFSET_PRECISE2: - return ptp_sys_offset_precise(ptp, argptr); + return ptp_sys_offset_precise(ptp, argptr, + ptp->info->getcrosststamp); case PTP_SYS_OFFSET_EXTENDED: case PTP_SYS_OFFSET_EXTENDED2: - return ptp_sys_offset_extended(ptp, argptr); + return ptp_sys_offset_extended(ptp, argptr, + ptp->info->gettimex64); case PTP_SYS_OFFSET: case PTP_SYS_OFFSET2: @@ -523,6 +534,13 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, case PTP_MASK_EN_SINGLE: return ptp_mask_en_single(pccontext->private_clkdata, argptr); + case PTP_SYS_OFFSET_PRECISE_CYCLES: + return ptp_sys_offset_precise(ptp, argptr, + ptp->info->getcrosscycles); + + case PTP_SYS_OFFSET_EXTENDED_CYCLES: + return ptp_sys_offset_extended(ptp, argptr, + ptp->info->getcyclesx64); default: return -ENOTTY; } diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 18eefa6d93d6..65f187b5f0d0 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -245,6 +245,10 @@ struct ptp_pin_desc { _IOWR(PTP_CLK_MAGIC, 18, struct ptp_sys_offset_extended) #define PTP_MASK_CLEAR_ALL _IO(PTP_CLK_MAGIC, 19) #define PTP_MASK_EN_SINGLE _IOW(PTP_CLK_MAGIC, 20, unsigned int) +#define PTP_SYS_OFFSET_PRECISE_CYCLES \ + _IOWR(PTP_CLK_MAGIC, 21, struct ptp_sys_offset_precise) +#define PTP_SYS_OFFSET_EXTENDED_CYCLES \ + _IOWR(PTP_CLK_MAGIC, 22, struct ptp_sys_offset_extended) struct ptp_extts_event { struct ptp_clock_time t; /* Time event occurred. */ -- cgit v1.2.3 From ff97bc38be343e4530e2f140b40cbdce2e09152f Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 3 Sep 2025 10:30:00 +0300 Subject: net/mlx5: Add RS FEC histogram infrastructure Define the Ports Phy Histogram Configuration Register (PPHCR) to expose RS-FEC histogram bin ranges, and expose a new counter group in the Ports Performance Counters Register (PPCNT) to report the corresponding histogram values. Co-developed-by: Yael Chemla Signed-off-by: Yael Chemla Signed-off-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1756884600-520195-1-git-send-email-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 1 + include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 72a83666e67f..d7f46a8fbfa1 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1525,6 +1525,7 @@ enum { MLX5_PHYSICAL_LAYER_RECOVERY_GROUP = 0x1a, MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP = 0x21, + MLX5_RS_FEC_HISTOGRAM_GROUP = 0x23, }; enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c5fbfb85749..c0858af0e854 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -130,6 +130,7 @@ enum { MLX5_REG_PDDR = 0x5031, MLX5_REG_PMLP = 0x5002, MLX5_REG_PPLM = 0x5023, + MLX5_REG_PPHCR = 0x503E, MLX5_REG_PCAM = 0x507f, MLX5_REG_NODE_DESC = 0x6001, MLX5_REG_HOST_ENDIANNESS = 0x7004, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e9f14a0c7f4f..097b1b7ada63 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4901,6 +4901,11 @@ union mlx5_ifc_field_select_802_1_r_roce_auto_bits { u8 reserved_at_0[0x20]; }; +struct mlx5_ifc_rs_histogram_cntrs_bits { + u8 hist[16][0x40]; + u8 reserved_at_400[0x2c0]; +}; + union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits eth_802_3_cntrs_grp_data_layout; struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits eth_2863_cntrs_grp_data_layout; @@ -4915,6 +4920,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; struct mlx5_ifc_phys_layer_statistical_cntrs_bits phys_layer_statistical_cntrs; struct mlx5_ifc_phys_layer_recovery_cntrs_bits phys_layer_recovery_cntrs; + struct mlx5_ifc_rs_histogram_cntrs_bits rs_histogram_cntrs; u8 reserved_at_0[0x7c0]; }; @@ -11738,6 +11744,28 @@ struct mlx5_ifc_mtctr_reg_bits { u8 second_clock_timestamp[0x40]; }; +struct mlx5_ifc_bin_range_layout_bits { + u8 reserved_at_0[0xa]; + u8 high_val[0x6]; + u8 reserved_at_10[0xa]; + u8 low_val[0x6]; +}; + +struct mlx5_ifc_pphcr_reg_bits { + u8 active_hist_type[0x4]; + u8 reserved_at_4[0x4]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x8]; + u8 num_of_bins[0x8]; + u8 reserved_at_30[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_bin_range_layout_bits bin_range[16]; +}; + union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_bufferx_reg_bits bufferx_reg; struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; @@ -11804,6 +11832,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mtmp_reg_bits mtmp_reg; struct mlx5_ifc_mtptm_reg_bits mtptm_reg; struct mlx5_ifc_mtctr_reg_bits mtctr_reg; + struct mlx5_ifc_pphcr_reg_bits pphcr_reg; u8 reserved_at_0[0x60e0]; }; -- cgit v1.2.3 From 6b6dc81ee7e8ca87c71a533e1d69cf96a4f1e986 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 2 Sep 2025 06:44:59 +0000 Subject: bonding: add support for per-port LACP actor priority Introduce a new netlink attribute 'actor_port_prio' to allow setting the LACP actor port priority on a per-slave basis. This extends the existing bonding infrastructure to support more granular control over LACP negotiations. The priority value is embedded in LACPDU packets and will be used by subsequent patches to influence aggregator selection policies. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250902064501.360822-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/bonding.rst | 9 +++++++++ drivers/net/bonding/bond_3ad.c | 4 ++++ drivers/net/bonding/bond_netlink.c | 16 ++++++++++++++++ drivers/net/bonding/bond_options.c | 36 ++++++++++++++++++++++++++++++++++++ include/net/bond_3ad.h | 1 + include/net/bond_options.h | 1 + include/uapi/linux/if_link.h | 1 + 7 files changed, 68 insertions(+) (limited to 'include') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index a2b42ae719d2..345b60992446 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -193,6 +193,15 @@ ad_actor_sys_prio This parameter has effect only in 802.3ad mode and is available through SysFs interface. +actor_port_prio + + In an AD system, this specifies the port priority. The allowed range + is 1 - 65535. If the value is not specified, it takes 255 as the + default value. + + This parameter has effect only in 802.3ad mode and is available through + netlink interface. + ad_actor_system In an AD system, this specifies the mac-address for the actor in diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 4edc8e6b6b64..67ca78923b04 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -436,6 +436,7 @@ static void __ad_actor_update_port(struct port *port) port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr; port->actor_system_priority = BOND_AD_INFO(bond).system.sys_priority; + port->actor_port_priority = SLAVE_AD_INFO(port->slave)->port_priority; } /* Conversions */ @@ -2209,6 +2210,9 @@ void bond_3ad_bind_slave(struct slave *slave) ad_initialize_port(port, &bond->params); + /* Port priority is initialized. Update it to slave's ad info */ + SLAVE_AD_INFO(slave)->port_priority = port->actor_port_priority; + port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; /* key is determined according to the link speed, duplex and diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index e573b34a1bbc..ba71d95a82d2 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -28,6 +28,7 @@ static size_t bond_get_slave_size(const struct net_device *bond_dev, nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */ nla_total_size(sizeof(s32)) + /* IFLA_BOND_SLAVE_PRIO */ + nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_ACTOR_PORT_PRIO */ 0; } @@ -77,6 +78,10 @@ static int bond_fill_slave_info(struct sk_buff *skb, ad_port->partner_oper.port_state)) goto nla_put_failure; } + + if (nla_put_u16(skb, IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, + SLAVE_AD_INFO(slave)->port_priority)) + goto nla_put_failure; } return 0; @@ -130,6 +135,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { [IFLA_BOND_SLAVE_QUEUE_ID] = { .type = NLA_U16 }, [IFLA_BOND_SLAVE_PRIO] = { .type = NLA_S32 }, + [IFLA_BOND_SLAVE_ACTOR_PORT_PRIO] = { .type = NLA_U16 }, }; static int bond_validate(struct nlattr *tb[], struct nlattr *data[], @@ -180,6 +186,16 @@ static int bond_slave_changelink(struct net_device *bond_dev, return err; } + if (data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO]) { + u16 ad_prio = nla_get_u16(data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO]); + + bond_opt_slave_initval(&newval, &slave_dev, ad_prio); + err = __bond_opt_set(bond, BOND_OPT_ACTOR_PORT_PRIO, &newval, + data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO], extack); + if (err) + return err; + } + return 0; } diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index c0a5eb8766b5..897022272334 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -79,6 +79,8 @@ static int bond_option_tlb_dynamic_lb_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, const struct bond_opt_value *newval); +static int bond_option_actor_port_prio_set(struct bonding *bond, + const struct bond_opt_value *newval); static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_user_port_key_set(struct bonding *bond, @@ -222,6 +224,13 @@ static const struct bond_opt_value bond_ad_actor_sys_prio_tbl[] = { { NULL, -1, 0}, }; +static const struct bond_opt_value bond_actor_port_prio_tbl[] = { + { "minval", 0, BOND_VALFLAG_MIN}, + { "maxval", 65535, BOND_VALFLAG_MAX}, + { "default", 255, BOND_VALFLAG_DEFAULT}, + { NULL, -1, 0}, +}; + static const struct bond_opt_value bond_ad_user_port_key_tbl[] = { { "minval", 0, BOND_VALFLAG_MIN | BOND_VALFLAG_DEFAULT}, { "maxval", 1023, BOND_VALFLAG_MAX}, @@ -483,6 +492,13 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .values = bond_ad_actor_sys_prio_tbl, .set = bond_option_ad_actor_sys_prio_set, }, + [BOND_OPT_ACTOR_PORT_PRIO] = { + .id = BOND_OPT_ACTOR_PORT_PRIO, + .name = "actor_port_prio", + .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), + .values = bond_actor_port_prio_tbl, + .set = bond_option_actor_port_prio_set, + }, [BOND_OPT_AD_ACTOR_SYSTEM] = { .id = BOND_OPT_AD_ACTOR_SYSTEM, .name = "ad_actor_system", @@ -1812,6 +1828,26 @@ static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, return 0; } +static int bond_option_actor_port_prio_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + struct slave *slave; + + slave = bond_slave_get_rtnl(newval->slave_dev); + if (!slave) { + netdev_dbg(bond->dev, "%s called on NULL slave\n", __func__); + return -ENODEV; + } + + netdev_dbg(newval->slave_dev, "Setting actor_port_prio to %llu\n", + newval->value); + + SLAVE_AD_INFO(slave)->port_priority = newval->value; + bond_3ad_update_ad_actor_settings(bond); + + return 0; +} + static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval) { diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index dba369a2cf27..e9188646e22e 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -274,6 +274,7 @@ struct ad_slave_info { struct port port; /* 802.3ad port structure */ struct bond_3ad_stats stats; u16 id; + u16 port_priority; }; static inline const char *bond_3ad_churn_desc(churn_state_t state) diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 022b122a9fb6..e6eedf23aea1 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -78,6 +78,7 @@ enum { BOND_OPT_PRIO, BOND_OPT_COUPLED_CONTROL, BOND_OPT_BROADCAST_NEIGH, + BOND_OPT_ACTOR_PORT_PRIO, BOND_OPT_LAST }; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 784ace3a519c..45f56c9f95d9 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1564,6 +1564,7 @@ enum { IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, IFLA_BOND_SLAVE_PRIO, + IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, __IFLA_BOND_SLAVE_MAX, }; -- cgit v1.2.3 From e5a6643435fa4ad1e104323ec7d3e6215e2d832c Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 2 Sep 2025 06:45:00 +0000 Subject: bonding: support aggregator selection based on port priority Add a new ad_select policy 'port_priority' that uses the per-port actor priority values (set via ad_actor_port_prio) to determine aggregator selection. This allows administrators to influence which ports are preferred for aggregation by assigning different priority values, providing more flexible load balancing control in LACP configurations. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250902064501.360822-3-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/bonding.rst | 16 ++++++++++++---- drivers/net/bonding/bond_3ad.c | 27 +++++++++++++++++++++++++++ drivers/net/bonding/bond_options.c | 9 +++++---- include/net/bond_3ad.h | 1 + 4 files changed, 45 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 345b60992446..e700bf1d095c 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -250,10 +250,18 @@ ad_select ports (slaves). Reselection occurs as described under the "bandwidth" setting, above. - The bandwidth and count selection policies permit failover of - 802.3ad aggregations when partial failure of the active aggregator - occurs. This keeps the aggregator with the highest availability - (either in bandwidth or in number of ports) active at all times. + actor_port_prio or 3 + + The active aggregator is chosen by the highest total sum of + actor port priorities across its active ports. Note this + priority is actor_port_prio, not per port prio, which is + used for primary reselect. + + The bandwidth, count and actor_port_prio selection policies permit + failover of 802.3ad aggregations when partial failure of the active + aggregator occurs. This keeps the aggregator with the highest + availability (either in bandwidth, number of ports, or total value + of port priorities) active at all times. This option was added in bonding version 3.4.0. diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 67ca78923b04..49717b7b82a2 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -747,6 +747,18 @@ static int __agg_active_ports(struct aggregator *agg) return active; } +static unsigned int __agg_ports_priority(const struct aggregator *agg) +{ + struct port *port = agg->lag_ports; + unsigned int prio = 0; + + for (; port; port = port->next_port_in_aggregator) + if (port->is_enabled) + prio += port->actor_port_priority; + + return prio; +} + /** * __get_agg_bandwidth - get the total bandwidth of an aggregator * @aggregator: the aggregator we're looking at @@ -1708,6 +1720,9 @@ static struct aggregator *ad_agg_selection_test(struct aggregator *best, * 4. Therefore, current and best both have partner replies or * both do not, so perform selection policy: * + * BOND_AD_PRIO: Select by total priority of ports. If priority + * is equal, select by count. + * * BOND_AD_COUNT: Select by count of ports. If count is equal, * select by bandwidth. * @@ -1729,6 +1744,14 @@ static struct aggregator *ad_agg_selection_test(struct aggregator *best, return best; switch (__get_agg_selection_mode(curr->lag_ports)) { + case BOND_AD_PRIO: + if (__agg_ports_priority(curr) > __agg_ports_priority(best)) + return curr; + + if (__agg_ports_priority(curr) < __agg_ports_priority(best)) + return best; + + fallthrough; case BOND_AD_COUNT: if (__agg_active_ports(curr) > __agg_active_ports(best)) return curr; @@ -1794,6 +1817,10 @@ static int agg_device_up(const struct aggregator *agg) * (slaves), and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * + * BOND_AD_PRIO: select the aggregator with highest total priority of ports + * (slaves), and reselect whenever a link state change takes place or the + * set of slaves in the bond changes. + * * FIXME: this function MUST be called with the first agg in the bond, or * __get_active_agg() won't work correctly. This function should be better * called with the bond itself, and retrieve the first agg from it. diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 897022272334..5b275cb266bc 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -162,10 +162,11 @@ static const struct bond_opt_value bond_lacp_rate_tbl[] = { }; static const struct bond_opt_value bond_ad_select_tbl[] = { - { "stable", BOND_AD_STABLE, BOND_VALFLAG_DEFAULT}, - { "bandwidth", BOND_AD_BANDWIDTH, 0}, - { "count", BOND_AD_COUNT, 0}, - { NULL, -1, 0}, + { "stable", BOND_AD_STABLE, BOND_VALFLAG_DEFAULT}, + { "bandwidth", BOND_AD_BANDWIDTH, 0}, + { "count", BOND_AD_COUNT, 0}, + { "actor_port_prio", BOND_AD_PRIO, 0}, + { NULL, -1, 0}, }; static const struct bond_opt_value bond_num_peer_notif_tbl[] = { diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index e9188646e22e..c92d4a976246 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -26,6 +26,7 @@ enum { BOND_AD_STABLE = 0, BOND_AD_BANDWIDTH = 1, BOND_AD_COUNT = 2, + BOND_AD_PRIO = 3, }; /* rx machine states(43.4.11 in the 802.3ad standard) */ -- cgit v1.2.3 From ed7240444e82aaaa2245a3cc9b040e4db894a665 Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Mon, 8 Sep 2025 09:18:11 +0800 Subject: dt-bindings: interrupt-controller: aspeed: Add AST2700 SCU IC compatibles Add compatible strings for the four SCU interrupt controller instances on the AST2700 SoC (scu-ic0 to 3), following the multi-instance model used on AST2600. Also define interrupt indices in the binding header. Signed-off-by: Ryan Chen Signed-off-by: Thomas Gleixner Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/all/20250908011812.1033858-4-ryan_chen@aspeedtech.com --- .../interrupt-controller/aspeed,ast2500-scu-ic.yaml | 6 +++++- include/dt-bindings/interrupt-controller/aspeed-scu-ic.h | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml b/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml index d5287a2bf866..d998a9d69b91 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml @@ -5,7 +5,7 @@ $id: http://devicetree.org/schemas/interrupt-controller/aspeed,ast2500-scu-ic.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Aspeed AST25XX and AST26XX SCU Interrupt Controller +title: Aspeed AST25XX, AST26XX, AST27XX SCU Interrupt Controller maintainers: - Eddie James @@ -16,6 +16,10 @@ properties: - aspeed,ast2500-scu-ic - aspeed,ast2600-scu-ic0 - aspeed,ast2600-scu-ic1 + - aspeed,ast2700-scu-ic0 + - aspeed,ast2700-scu-ic1 + - aspeed,ast2700-scu-ic2 + - aspeed,ast2700-scu-ic3 reg: maxItems: 1 diff --git a/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h b/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h index f315d5a7f5ee..7dd04424afcc 100644 --- a/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h +++ b/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h @@ -20,4 +20,18 @@ #define ASPEED_AST2600_SCU_IC1_LPC_RESET_LO_TO_HI 0 #define ASPEED_AST2600_SCU_IC1_LPC_RESET_HI_TO_LO 1 +#define ASPEED_AST2700_SCU_IC0_PCIE_PERST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC0_PCIE_PERST_HI_TO_LO 2 + +#define ASPEED_AST2700_SCU_IC1_PCIE_RCRST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC1_PCIE_RCRST_HI_TO_LO 2 + +#define ASPEED_AST2700_SCU_IC2_PCIE_PERST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC2_PCIE_PERST_HI_TO_LO 2 +#define ASPEED_AST2700_SCU_IC2_LPC_RESET_LO_TO_HI 1 +#define ASPEED_AST2700_SCU_IC2_LPC_RESET_HI_TO_LO 0 + +#define ASPEED_AST2700_SCU_IC3_LPC_RESET_LO_TO_HI 1 +#define ASPEED_AST2700_SCU_IC3_LPC_RESET_HI_TO_LO 0 + #endif /* _DT_BINDINGS_INTERRUPT_CONTROLLER_ASPEED_SCU_IC_H_ */ -- cgit v1.2.3 From cdea7cdae26995992a766443e1ea862923f2443d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 21 Aug 2025 15:28:14 +0200 Subject: hrtimer: Use hrtimer_cb_get_time() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various other helpers contain open-coded implementations of hrtimer_cb_get_time(). This prevents refactoring the implementation. Reuse the existing helper. For this to work, move hrtimer_cb_get_time() a bit up in the file and also make its argument 'const'. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-7-3ae822e5bfbd@linutronix.de --- include/linux/hrtimer.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1ef867bb8c44..e655502b14e6 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -154,14 +154,14 @@ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) return ktime_to_ns(timer->node.expires); } -static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) +static inline ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) { - return ktime_sub(timer->node.expires, timer->base->get_time()); + return timer->base->get_time(); } -static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) +static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return timer->base->get_time(); + return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) @@ -200,8 +200,7 @@ __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { - return __hrtimer_expires_remaining_adjusted(timer, - timer->base->get_time()); + return __hrtimer_expires_remaining_adjusted(timer, hrtimer_cb_get_time(timer)); } #ifdef CONFIG_TIMERFD @@ -363,7 +362,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { - return hrtimer_forward(timer, timer->base->get_time(), interval); + return hrtimer_forward(timer, hrtimer_cb_get_time(timer), interval); } /* Precise sleep: */ -- cgit v1.2.3 From 009eb5da29a91016e3ebb988e6401e79411be7a1 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 21 Aug 2025 15:28:15 +0200 Subject: hrtimer: Remove hrtimer_clock_base:: Get_time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get_time() callbacks always need to match the bases clockid. Instead of maintaining that association twice in hrtimer_bases, use a helper. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-8-3ae822e5bfbd@linutronix.de --- include/linux/hrtimer.h | 5 +---- include/linux/hrtimer_defs.h | 2 -- kernel/time/hrtimer.c | 34 +++++++++++++++++++++++++--------- kernel/time/timer_list.c | 2 -- scripts/gdb/linux/timerlist.py | 2 -- 5 files changed, 26 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index e655502b14e6..2cf1bf65b225 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -154,10 +154,7 @@ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) return ktime_to_ns(timer->node.expires); } -static inline ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) -{ - return timer->base->get_time(); -} +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 84a5045f80f3..aa49ffa130e5 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -41,7 +41,6 @@ * @seq: seqcount around __run_hrtimer * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers - * @get_time: function to retrieve the current time of the clock * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { @@ -51,7 +50,6 @@ struct hrtimer_clock_base { seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; - ktime_t (*get_time)(void); ktime_t offset; } __hrtimer_clock_base_align; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c..fedd1d793f6c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -59,6 +59,7 @@ #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) static void retrigger_next_event(void *arg); +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); /* * The timer bases: @@ -76,42 +77,34 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, }, .csd = CSD_INIT(retrigger_next_event, NULL) @@ -1253,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, base->get_time()); + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); tim = hrtimer_update_lowres(timer, tim, mode); @@ -1588,6 +1581,29 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) } } +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) +{ + switch (clock_id) { + case CLOCK_MONOTONIC: + return ktime_get(); + case CLOCK_REALTIME: + return ktime_get_real(); + case CLOCK_BOOTTIME: + return ktime_get_boottime(); + case CLOCK_TAI: + return ktime_get_clocktai(); + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return ktime_get(); + } +} + +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) +{ + return __hrtimer_cb_get_time(timer->base->clockid); +} +EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); + static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index b03d0ada6469..488e47e96e93 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -102,8 +102,6 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - - SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py index 98445671fe83..ccc24d30de80 100644 --- a/scripts/gdb/linux/timerlist.py +++ b/scripts/gdb/linux/timerlist.py @@ -56,8 +56,6 @@ def print_base(base): text += " .index: {}\n".format(base['index']) text += " .resolution: {} nsecs\n".format(constants.LX_hrtimer_resolution) - - text += " .get_time: {}\n".format(base['get_time']) if constants.LX_CONFIG_HIGH_RES_TIMERS: text += " .offset: {} nsecs\n".format(base['offset']) text += "active timers:\n" -- cgit v1.2.3 From 267b9cdee522d03f95acf7c77de91056a4e004b3 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 9 Sep 2025 12:30:35 +0100 Subject: ASoC: cs-amp-lib: Add handling for Lenovo and HP UEFI speaker ID Add handling of the Lenovo-specific and HP-specific EFI variables for speaker ID. Future Lenovo and HP models will not give the codec driver access to the speaker detect GPIO. Instead, the BIOS will read the GPIO and create an EFI variable with a value indicating the state of the GPIO. The Lenovo and HP EFI variables are both defined to have only two valid values. But the variable name, GUID and values are different. This adds a new exported function cs_amp_get_vendor_spkid(). Signed-off-by: Richard Fitzgerald Message-ID: <20250909113039.922065-3-rf@opensource.cirrus.com> Signed-off-by: Mark Brown --- include/sound/cs-amp-lib.h | 1 + sound/soc/codecs/cs-amp-lib.c | 101 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) (limited to 'include') diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h index 5459c221badf..43a87a39110c 100644 --- a/include/sound/cs-amp-lib.h +++ b/include/sound/cs-amp-lib.h @@ -49,6 +49,7 @@ int cs_amp_write_cal_coeffs(struct cs_dsp *dsp, const struct cirrus_amp_cal_data *data); int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_index, struct cirrus_amp_cal_data *out_data); +int cs_amp_get_vendor_spkid(struct device *dev); struct cs_amp_test_hooks { efi_status_t (*get_efi_variable)(efi_char16_t *name, diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index b1530e7c75e8..9b51d056d863 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -20,6 +20,14 @@ #define CIRRUS_LOGIC_CALIBRATION_EFI_GUID \ EFI_GUID(0x02f9af02, 0x7734, 0x4233, 0xb4, 0x3d, 0x93, 0xfe, 0x5a, 0xa3, 0x5d, 0xb3) +#define LENOVO_SPEAKER_ID_EFI_NAME L"SdwSpeaker" +#define LENOVO_SPEAKER_ID_EFI_GUID \ + EFI_GUID(0x48df970e, 0xe27f, 0x460a, 0xb5, 0x86, 0x77, 0x19, 0x80, 0x1d, 0x92, 0x82) + +#define HP_SPEAKER_ID_EFI_NAME L"HPSpeakerID" +#define HP_SPEAKER_ID_EFI_GUID \ + EFI_GUID(0xc49593a4, 0xd099, 0x419b, 0xa2, 0xc3, 0x67, 0xe9, 0x80, 0xe6, 0x1d, 0x1e) + static int cs_amp_write_cal_coeff(struct cs_dsp *dsp, const struct cirrus_amp_cal_controls *controls, const char *ctl_name, u32 val) @@ -114,6 +122,24 @@ static efi_status_t cs_amp_get_efi_variable(efi_char16_t *name, return EFI_NOT_FOUND; } +static int cs_amp_convert_efi_status(efi_status_t status) +{ + switch (status) { + case EFI_SUCCESS: + return 0; + case EFI_NOT_FOUND: + return -ENOENT; + case EFI_BUFFER_TOO_SMALL: + return -EFBIG; + case EFI_UNSUPPORTED: + case EFI_ACCESS_DENIED: + case EFI_SECURITY_VIOLATION: + return -EACCES; + default: + return -EIO; + } +} + static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev) { struct cirrus_amp_efi_data *efi_data; @@ -276,6 +302,81 @@ int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_ } EXPORT_SYMBOL_NS_GPL(cs_amp_get_efi_calibration_data, "SND_SOC_CS_AMP_LIB"); +struct cs_amp_spkid_efi { + efi_char16_t *name; + efi_guid_t *guid; + u8 values[2]; +}; + +static int cs_amp_get_efi_byte_spkid(struct device *dev, const struct cs_amp_spkid_efi *info) +{ + efi_status_t status; + unsigned long size; + u8 spkid; + int i, ret; + + size = sizeof(spkid); + status = cs_amp_get_efi_variable(info->name, info->guid, &size, &spkid); + ret = cs_amp_convert_efi_status(status); + if (ret < 0) + return ret; + + if (size == 0) + return -ENOENT; + + for (i = 0; i < ARRAY_SIZE(info->values); i++) { + if (info->values[i] == spkid) + return i; + } + + dev_err(dev, "EFI speaker ID bad value %#x\n", spkid); + + return -EINVAL; +} + +static const struct cs_amp_spkid_efi cs_amp_spkid_byte_types[] = { + { + .name = LENOVO_SPEAKER_ID_EFI_NAME, + .guid = &LENOVO_SPEAKER_ID_EFI_GUID, + .values = { 0xd0, 0xd1 }, + }, + { + .name = HP_SPEAKER_ID_EFI_NAME, + .guid = &HP_SPEAKER_ID_EFI_GUID, + .values = { 0x30, 0x31 }, + }, +}; + +/** + * cs_amp_get_vendor_spkid - get a speaker ID from vendor-specific storage + * @dev: pointer to struct device + * + * Known vendor-specific methods of speaker ID are checked and if one is + * found its speaker ID value is returned. + * + * Return: >=0 is a valid speaker ID. -ENOENT if a vendor-specific method + * was not found. -EACCES if the vendor-specific storage could not + * be read. Other error values indicate that the data from the + * vendor-specific storage was found but could not be understood. + */ +int cs_amp_get_vendor_spkid(struct device *dev) +{ + int i, ret; + + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE) && + !IS_ENABLED(CONFIG_SND_SOC_CS_AMP_LIB_TEST)) + return -ENOENT; + + for (i = 0; i < ARRAY_SIZE(cs_amp_spkid_byte_types); i++) { + ret = cs_amp_get_efi_byte_spkid(dev, &cs_amp_spkid_byte_types[i]); + if (ret != -ENOENT) + return ret; + } + + return -ENOENT; +} +EXPORT_SYMBOL_NS_GPL(cs_amp_get_vendor_spkid, "SND_SOC_CS_AMP_LIB"); + static const struct cs_amp_test_hooks cs_amp_test_hook_ptrs = { .get_efi_variable = cs_amp_get_efi_variable, .write_cal_coeff = cs_amp_write_cal_coeff, -- cgit v1.2.3 From 70a6f71b1a77decfc5b1db426ccbe914b58adb38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Sep 2025 12:56:38 +0200 Subject: block: add a bio_init_inline helper Just a simpler wrapper around bio_init for callers that want to initialize a bio with inline bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Yu Kuai Signed-off-by: Jens Axboe --- block/bio.c | 7 +++++-- block/blk-crypto-fallback.c | 3 +-- block/blk-map.c | 8 ++++---- drivers/md/bcache/debug.c | 3 +-- drivers/md/bcache/io.c | 3 +-- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/super.c | 2 +- drivers/md/bcache/writeback.c | 2 +- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-flakey.c | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 4 ++-- drivers/target/target_core_pscsi.c | 2 +- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/super-io.c | 2 +- fs/squashfs/block.c | 2 +- include/linux/bio.h | 5 +++++ 20 files changed, 32 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index 44c43b970387..bd8bf179981a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -462,7 +462,10 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache->nr--; put_cpu(); - bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf); + if (nr_vecs) + bio_init_inline(bio, bdev, nr_vecs, opf); + else + bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } @@ -578,7 +581,7 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { - bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); + bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); } else { bio_init(bio, bdev, NULL, 0, opf); } diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 005c9157ffb3..dbc2d8784dab 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -167,8 +167,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio = bio_kmalloc(nr_segs, GFP_NOIO); if (!bio) return NULL; - bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs, - bio_src->bi_opf); + bio_init_inline(bio, bio_src->bi_bdev, nr_segs, bio_src->bi_opf); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; diff --git a/block/blk-map.c b/block/blk-map.c index 23e5d5ebe59e..92b7e19b1a1f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -157,7 +157,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); + bio_init_inline(bio, NULL, nr_pages, req_op(rq)); if (map_data) { nr_pages = 1U << map_data->page_order; @@ -264,7 +264,7 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return NULL; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + bio_init_inline(bio, NULL, nr_vecs, req_op(rq)); } return bio; } @@ -326,7 +326,7 @@ static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op); + bio_init_inline(bio, NULL, nr_vecs, op); if (is_vmalloc_addr(data)) { bio->bi_private = data; if (!bio_add_vmalloc(bio, data, len)) { @@ -392,7 +392,7 @@ static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op); + bio_init_inline(bio, NULL, nr_pages, op); while (len) { struct page *page; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 7510d1c983a5..f327456fc4e0 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(nr_segs, GFP_NOIO); if (!check) return; - bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs, - REQ_OP_READ); + bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ); check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 020712c5203f..2386d08bf4e4 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, - meta_bucket_pages(&c->cache->sb), 0); + bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0); return bio; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7ff14bd2feb8..d50eb82ccb4f 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -615,7 +615,7 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); + bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); bio->bi_iter.bi_size = bucket_bytes(ca); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 26a6a535ec32..4fc80c6d5b31 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -79,7 +79,7 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1492c8552255..6d250e366412 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2236,7 +2236,7 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); + bio_init_inline(&ca->journal.bio, NULL, 8, 0); /* * When the cache disk is first registered, ca->sb.njournal_buckets diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 302e75f1fc4b..36dd8f14a6df 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -331,7 +331,7 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ff7595caf440..8f3a23f4b168 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1342,7 +1342,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, use_dmio(b, op, sector, n_sectors, offset, ioprio); return; } - bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op); + bio_init_inline(bio, b->c->bdev, 1, op); bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_complete; bio->bi_private = b; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index cf17fd46e255..08925aca838c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -441,7 +441,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b if (!clone) return NULL; - bio_init(clone, fc->dev->bdev, clone->bi_inline_vecs, nr_iovecs, bio->bi_opf); + bio_init_inline(clone, fc->dev->bdev, nr_iovecs, bio->bi_opf); clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); clone->bi_private = bio; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 408c26398321..bc11aaa38615 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -167,7 +167,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r1_bio->bios[j] = bio; } /* diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b60c30bfb6c7..c52ccd12d86b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -163,14 +163,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].bio = bio; if (!conf->have_replacement) continue; bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].repl_bio = bio; } /* diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index f991cf759836..db4e09042469 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -861,7 +861,7 @@ new_bio: bio = bio_kmalloc(nr_vecs, GFP_KERNEL); if (!bio) goto fail; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, + bio_init_inline(bio, NULL, nr_vecs, rw ? REQ_OP_WRITE : REQ_OP_READ); bio->bi_end_io = pscsi_bi_endio; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 590cd29f3e86..e701e6d6e321 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2084,7 +2084,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, INIT_WORK(&scrub->work, btree_node_scrub_work); - bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); + bio_init_inline(&scrub->bio, ca->disk_sb.bdev, vecs, REQ_OP_READ); bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); scrub->bio.bi_iter.bi_sector = pick.ptr.offset; scrub->bio.bi_end_io = btree_node_scrub_endio; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ddfeb0dafc9d..3dbf9faaaa4c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1634,7 +1634,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bio[i]->ca = ca; ja->bio[i]->buf_idx = i; - bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); + bio_init_inline(&ja->bio[i]->bio, NULL, nr_bvecs, 0); } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9e028dbcc3d0..ce8888d518c3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1071,7 +1071,7 @@ reread: bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) return bch_err_throw(c, ENOMEM_journal_read_bucket); - bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); + bio_init_inline(bio, ca->disk_sb.bdev, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, buf->data, sectors_read << 9); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6c2e1d647403..e3fcffb93d2b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -232,7 +232,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) if (!bio) return -BCH_ERR_ENOMEM_sb_bio_realloc; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); + bio_init_inline(bio, NULL, nr_bvecs, 0); kfree(sb->bio); sb->bio = bio; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index b69c294e3ef0..a05e3793f93a 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -231,7 +231,7 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, bio = bio_kmalloc(page_count, GFP_NOIO); if (!bio) return -ENOMEM; - bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ); + bio_init_inline(bio, sb->s_bdev, page_count, REQ_OP_READ); bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 46ffac5caab7..eb7f4fbd8aa9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -405,6 +405,11 @@ struct request_queue; void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf); +static inline void bio_init_inline(struct bio *bio, struct block_device *bdev, + unsigned short max_vecs, blk_opf_t opf) +{ + bio_init(bio, bdev, bio->bi_inline_vecs, max_vecs, opf); +} extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); void bio_chain(struct bio *, struct bio *); -- cgit v1.2.3 From d86eaa0f3c56da286853b698b45c8ce404291082 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Sep 2025 12:56:39 +0200 Subject: block: remove the bi_inline_vecs variable sized array from struct bio Bios are embedded into other structures, and at least spare is unhappy about embedding structures with variable sized arrays. There's no real need to the array anyway, we can replace it with a helper pointing to the memory just behind the bio, and with the previous cleanups there is very few site doing anything special with it. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- drivers/md/bcache/movinggc.c | 6 +++--- drivers/md/bcache/writeback.c | 6 +++--- drivers/md/dm-vdo/vio.c | 2 +- fs/bcachefs/data_update.h | 1 - fs/bcachefs/journal.c | 4 ++-- include/linux/bio.h | 2 +- include/linux/blk_types.h | 12 +++++------- 8 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index bd8bf179981a..971d96afaf8d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -617,7 +617,8 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; - return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); + return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), + gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 4fc80c6d5b31..73918e55bf04 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 36dd8f14a6df..6ba73dc1a3df 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(struct_size(io, bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e7f4153e55e3..8fc22fb14196 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -212,7 +212,7 @@ int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t return VDO_SUCCESS; bio->bi_ioprio = 0; - bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_io_vec = bio_inline_vecs(bio); bio->bi_max_vecs = vio->block_count + 1; if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", size, vio_size) != VDO_SUCCESS) diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 5e14d13568de..1b37780abfda 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -62,7 +62,6 @@ struct promote_op { struct work_struct work; struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ }; void bch2_data_update_to_text(struct printbuf *, struct data_update *); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3dbf9faaaa4c..474e0e867b68 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1627,8 +1627,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, - nr_bvecs), GFP_KERNEL); + ja->bio[i] = kzalloc(sizeof(*ja->bio[i]) + + sizeof(struct bio_vec) * nr_bvecs, GFP_KERNEL); if (!ja->bio[i]) return bch_err_throw(c, ENOMEM_dev_journal_init); diff --git a/include/linux/bio.h b/include/linux/bio.h index eb7f4fbd8aa9..27cbff5b0356 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -408,7 +408,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, static inline void bio_init_inline(struct bio *bio, struct block_device *bdev, unsigned short max_vecs, blk_opf_t opf) { - bio_init(bio, bdev, bio->bi_inline_vecs, max_vecs, opf); + bio_init(bio, bdev, bio_inline_vecs(bio), max_vecs, opf); } extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 930daff207df..bbb7893e0542 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -269,18 +269,16 @@ struct bio { struct bio_vec *bi_io_vec; /* the actual vec list */ struct bio_set *bi_pool; - - /* - * We can inline a number of vecs at the end of the bio, to avoid - * double allocations for a small number of bio_vecs. This member - * MUST obviously be kept at the very end of the bio. - */ - struct bio_vec bi_inline_vecs[]; }; #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) #define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT) +static inline struct bio_vec *bio_inline_vecs(struct bio *bio) +{ + return (struct bio_vec *)(bio + 1); +} + /* * bio flags */ -- cgit v1.2.3 From 199c9a8d26638845f509b76e3c176c27e7baafd7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Sep 2025 20:33:10 +0800 Subject: blk-mq: Document tags_srcu member in blk_mq_tag_set structure Add missing documentation for the tags_srcu member that was introduced to defer freeing of tags page_list to prevent use-after-free when iterating tags. Fixes htmldocs warning: WARNING: include/linux/blk-mq.h:536 struct member 'tags_srcu' not described in 'blk_mq_tag_set' Reported-by: Stephen Rothwell Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1325ceeb743a..b25d12545f46 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -507,6 +507,8 @@ enum hctx_type { * request_queue.tag_set_list. * @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). + * @tags_srcu: SRCU used to defer freeing of tags page_list to prevent + * use-after-free when iterating tags. * @update_nr_hwq_lock: * Synchronize updating nr_hw_queues with add/del disk & * switching elevator. -- cgit v1.2.3 From ce4c356d760f6fb22d69f0c5091542891ba6f394 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 24 Jun 2025 08:36:16 +0200 Subject: media: update Hans Verkuil's email address Replace hansverk@cisco.com by hverkuil@kernel.org. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/core/cec-core.c | 2 +- drivers/media/cec/platform/cec-gpio/cec-gpio.c | 2 +- drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c | 2 +- drivers/media/i2c/adv7604.c | 2 +- drivers/media/i2c/adv7842.c | 2 +- drivers/media/i2c/ths7303.c | 2 +- drivers/media/mc/mc-request.c | 2 +- drivers/media/pci/cobalt/cobalt-driver.c | 2 +- drivers/media/radio/radio-aimslab.c | 2 +- drivers/media/radio/radio-gemtek.c | 2 +- drivers/media/radio/radio-isa.c | 2 +- drivers/media/radio/radio-isa.h | 2 +- drivers/media/radio/radio-miropcm20.c | 2 +- drivers/media/radio/radio-rtrack2.c | 2 +- drivers/media/radio/radio-terratec.c | 2 +- drivers/media/radio/radio-zoltrix.c | 2 +- drivers/media/test-drivers/vicodec/vicodec-core.c | 2 +- include/media/i2c/ths7303.h | 2 +- include/media/media-request.h | 2 +- include/uapi/linux/v4l2-dv-timings.h | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/media/cec/core/cec-core.c b/drivers/media/cec/core/cec-core.c index e10bd588a586..d7259599029f 100644 --- a/drivers/media/cec/core/cec-core.c +++ b/drivers/media/cec/core/cec-core.c @@ -439,6 +439,6 @@ static void __exit cec_devnode_exit(void) subsys_initcall(cec_devnode_init); module_exit(cec_devnode_exit) -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_DESCRIPTION("Device node registration for cec drivers"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/cec/platform/cec-gpio/cec-gpio.c b/drivers/media/cec/platform/cec-gpio/cec-gpio.c index 3c27789d8657..842555ed42c7 100644 --- a/drivers/media/cec/platform/cec-gpio/cec-gpio.c +++ b/drivers/media/cec/platform/cec-gpio/cec-gpio.c @@ -291,6 +291,6 @@ static struct platform_driver cec_gpio_pdrv = { module_platform_driver(cec_gpio_pdrv); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("CEC GPIO driver"); diff --git a/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c b/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c index c476e8d1279e..e2eff17952ab 100644 --- a/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c +++ b/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c @@ -28,7 +28,7 @@ #include "extron-da-hd-4k-plus.h" -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_DESCRIPTION("Extron DA HD 4K PLUS HDMI CEC driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/i2c/adv7604.c b/drivers/media/i2c/adv7604.c index afed38596362..8fe7c2f72883 100644 --- a/drivers/media/i2c/adv7604.c +++ b/drivers/media/i2c/adv7604.c @@ -42,7 +42,7 @@ module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "debug level (0-2)"); MODULE_DESCRIPTION("Analog Devices ADV7604/10/11/12 video decoder driver"); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_AUTHOR("Mats Randgaard "); MODULE_LICENSE("GPL"); diff --git a/drivers/media/i2c/adv7842.c b/drivers/media/i2c/adv7842.c index 5545cd23e113..9780082db841 100644 --- a/drivers/media/i2c/adv7842.c +++ b/drivers/media/i2c/adv7842.c @@ -38,7 +38,7 @@ module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "debug level (0-2)"); MODULE_DESCRIPTION("Analog Devices ADV7842 video decoder driver"); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_AUTHOR("Martin Bugge "); MODULE_LICENSE("GPL"); diff --git a/drivers/media/i2c/ths7303.c b/drivers/media/i2c/ths7303.c index b7cedc5b3e8e..ff268ebeb4d9 100644 --- a/drivers/media/i2c/ths7303.c +++ b/drivers/media/i2c/ths7303.c @@ -7,7 +7,7 @@ * Author: Chaithrika U S * * Contributors: - * Hans Verkuil + * Hans Verkuil * Lad, Prabhakar * Martin Bugge * diff --git a/drivers/media/mc/mc-request.c b/drivers/media/mc/mc-request.c index 5edfc2791ce7..f66f728b1b43 100644 --- a/drivers/media/mc/mc-request.c +++ b/drivers/media/mc/mc-request.c @@ -6,7 +6,7 @@ * Copyright (C) 2018 Intel Corporation * Copyright (C) 2018 Google, Inc. * - * Author: Hans Verkuil + * Author: Hans Verkuil * Author: Sakari Ailus */ diff --git a/drivers/media/pci/cobalt/cobalt-driver.c b/drivers/media/pci/cobalt/cobalt-driver.c index 39e25cc53edb..b7695705fdee 100644 --- a/drivers/media/pci/cobalt/cobalt-driver.c +++ b/drivers/media/pci/cobalt/cobalt-driver.c @@ -44,7 +44,7 @@ module_param_named(ignore_err, cobalt_ignore_err, int, 0644); MODULE_PARM_DESC(ignore_err, "If set then ignore missing i2c adapters/receivers. Default: 0\n"); -MODULE_AUTHOR("Hans Verkuil & Morten Hestnes"); +MODULE_AUTHOR("Hans Verkuil & Morten Hestnes"); MODULE_DESCRIPTION("cobalt driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/radio/radio-aimslab.c b/drivers/media/radio/radio-aimslab.c index 3c8c17d64821..2c1d413e8636 100644 --- a/drivers/media/radio/radio-aimslab.c +++ b/drivers/media/radio/radio-aimslab.c @@ -4,7 +4,7 @@ * * Copyright 1997 M. Kirkwood * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab * Converted to new API by Alan Cox * Various bugfixes and enhancements by Russell Kroll diff --git a/drivers/media/radio/radio-gemtek.c b/drivers/media/radio/radio-gemtek.c index 5ca6274c45bd..a3265f1dd189 100644 --- a/drivers/media/radio/radio-gemtek.c +++ b/drivers/media/radio/radio-gemtek.c @@ -15,7 +15,7 @@ * Converted to new API by Alan Cox * Various bugfixes and enhancements by Russell Kroll * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab * * Note: this card seems to swap the left and right audio channels! diff --git a/drivers/media/radio/radio-isa.c b/drivers/media/radio/radio-isa.c index 4f87c76a2a96..1a144536ffa7 100644 --- a/drivers/media/radio/radio-isa.c +++ b/drivers/media/radio/radio-isa.c @@ -4,7 +4,7 @@ * This takes care of all the V4L2 scaffolding, allowing the ISA drivers * to concentrate on the actual hardware operation. * - * Copyright (C) 2012 Hans Verkuil + * Copyright (C) 2012 Hans Verkuil */ #include diff --git a/drivers/media/radio/radio-isa.h b/drivers/media/radio/radio-isa.h index 0f3db473da5e..62ff5c3fb5d5 100644 --- a/drivers/media/radio/radio-isa.h +++ b/drivers/media/radio/radio-isa.h @@ -4,7 +4,7 @@ * This takes care of all the V4L2 scaffolding, allowing the ISA drivers * to concentrate on the actual hardware operation. * - * Copyright (C) 2012 Hans Verkuil + * Copyright (C) 2012 Hans Verkuil */ #ifndef _RADIO_ISA_H_ diff --git a/drivers/media/radio/radio-miropcm20.c b/drivers/media/radio/radio-miropcm20.c index 27f058c5e677..67712ab3d564 100644 --- a/drivers/media/radio/radio-miropcm20.c +++ b/drivers/media/radio/radio-miropcm20.c @@ -23,7 +23,7 @@ * This code has been reintroduced and converted to use * the new V4L2 RDS API by: * - * Hans Verkuil + * Hans Verkuil */ #include diff --git a/drivers/media/radio/radio-rtrack2.c b/drivers/media/radio/radio-rtrack2.c index 16b13a63bfed..efc02069bf9d 100644 --- a/drivers/media/radio/radio-rtrack2.c +++ b/drivers/media/radio/radio-rtrack2.c @@ -7,7 +7,7 @@ * Converted to new API by Alan Cox * Various bugfixes and enhancements by Russell Kroll * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab * * Fully tested with actual hardware and the v4l2-compliance tool. diff --git a/drivers/media/radio/radio-terratec.c b/drivers/media/radio/radio-terratec.c index 720080634454..43817dd0a0fe 100644 --- a/drivers/media/radio/radio-terratec.c +++ b/drivers/media/radio/radio-terratec.c @@ -17,7 +17,7 @@ * Frequency control is done digitally -- ie out(port,encodefreq(95.8)); * Volume Control is done digitally * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab */ diff --git a/drivers/media/radio/radio-zoltrix.c b/drivers/media/radio/radio-zoltrix.c index 099b7af6a410..e043bee52384 100644 --- a/drivers/media/radio/radio-zoltrix.c +++ b/drivers/media/radio/radio-zoltrix.c @@ -30,7 +30,7 @@ * 2006-07-24 - Converted to V4L2 API * by Mauro Carvalho Chehab * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * * Note that this is the driver for the Zoltrix Radio Plus. * This driver does not work for the Zoltrix Radio Plus 108 or the diff --git a/drivers/media/test-drivers/vicodec/vicodec-core.c b/drivers/media/test-drivers/vicodec/vicodec-core.c index 174ea761d09a..a3df3a33237e 100644 --- a/drivers/media/test-drivers/vicodec/vicodec-core.c +++ b/drivers/media/test-drivers/vicodec/vicodec-core.c @@ -26,7 +26,7 @@ #include "codec-v4l2-fwht.h" MODULE_DESCRIPTION("Virtual codec device"); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_LICENSE("GPL v2"); static bool multiplanar; diff --git a/include/media/i2c/ths7303.h b/include/media/i2c/ths7303.h index fc937025cdb4..7eda467b6725 100644 --- a/include/media/i2c/ths7303.h +++ b/include/media/i2c/ths7303.h @@ -5,7 +5,7 @@ * Copyright 2013 Cisco Systems, Inc. and/or its affiliates. * * Contributors: - * Hans Verkuil + * Hans Verkuil * Lad, Prabhakar * Martin Bugge */ diff --git a/include/media/media-request.h b/include/media/media-request.h index d4ac557678a7..bb500b2f9da4 100644 --- a/include/media/media-request.h +++ b/include/media/media-request.h @@ -5,7 +5,7 @@ * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved. * Copyright (C) 2018 Intel Corporation * - * Author: Hans Verkuil + * Author: Hans Verkuil * Author: Sakari Ailus */ diff --git a/include/uapi/linux/v4l2-dv-timings.h b/include/uapi/linux/v4l2-dv-timings.h index 44a16e0e5a12..58f478f98a35 100644 --- a/include/uapi/linux/v4l2-dv-timings.h +++ b/include/uapi/linux/v4l2-dv-timings.h @@ -2,7 +2,7 @@ /* * V4L2 DV timings header. * - * Copyright (C) 2012-2016 Hans Verkuil + * Copyright (C) 2012-2016 Hans Verkuil */ #ifndef _V4L2_DV_TIMINGS_H -- cgit v1.2.3 From e8c8d961d8ad53d8d104d7474c08d8e4626c7767 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 24 Jun 2025 08:41:15 +0200 Subject: media: include: update Hans Verkuil's email address Replace hverkuil@xs4all.nl by hverkuil@kernel.org. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/dt-bindings/media/tvp5150.h | 2 +- include/linux/videodev2.h | 2 +- include/media/drv-intf/cx25840.h | 2 +- include/media/drv-intf/msp3400.h | 2 +- include/media/i2c/bt819.h | 2 +- include/media/i2c/cs5345.h | 2 +- include/media/i2c/cs53l32a.h | 2 +- include/media/i2c/m52790.h | 2 +- include/media/i2c/mt9v011.h | 2 +- include/media/i2c/saa7115.h | 2 +- include/media/i2c/saa7127.h | 2 +- include/media/i2c/tvaudio.h | 2 +- include/media/i2c/upd64031a.h | 2 +- include/media/i2c/upd64083.h | 2 +- include/media/i2c/wm8775.h | 2 +- include/media/v4l2-common.h | 2 +- include/media/v4l2-ctrls.h | 2 +- include/media/v4l2-device.h | 2 +- include/media/v4l2-subdev.h | 2 +- include/uapi/linux/ivtv.h | 2 +- include/uapi/linux/videodev2.h | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/media/tvp5150.h b/include/dt-bindings/media/tvp5150.h index dda00c038530..ba34c420c303 100644 --- a/include/dt-bindings/media/tvp5150.h +++ b/include/dt-bindings/media/tvp5150.h @@ -2,7 +2,7 @@ /* tvp5150.h - definition for tvp5150 inputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 219037f4c08d..9609cf365e8e 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -50,7 +50,7 @@ * * Author: Bill Dirks * Justin Schoeman - * Hans Verkuil + * Hans Verkuil * et al. */ #ifndef __LINUX_VIDEODEV2_H diff --git a/include/media/drv-intf/cx25840.h b/include/media/drv-intf/cx25840.h index ba69bc525382..8b455d9dd5ca 100644 --- a/include/media/drv-intf/cx25840.h +++ b/include/media/drv-intf/cx25840.h @@ -3,7 +3,7 @@ /* * cx25840.h - definition for cx25840/1/2/3 inputs * - * Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + * Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ #ifndef _CX25840_H_ diff --git a/include/media/drv-intf/msp3400.h b/include/media/drv-intf/msp3400.h index d6dfae104a6f..853258ee6bbd 100644 --- a/include/media/drv-intf/msp3400.h +++ b/include/media/drv-intf/msp3400.h @@ -2,7 +2,7 @@ /* msp3400.h - definition for msp3400 inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/bt819.h b/include/media/i2c/bt819.h index 70aa46bd5182..2277a7eb9548 100644 --- a/include/media/i2c/bt819.h +++ b/include/media/i2c/bt819.h @@ -2,7 +2,7 @@ /* bt819.h - bt819 notifications - Copyright (C) 2009 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2009 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/cs5345.h b/include/media/i2c/cs5345.h index d41e4dca3fcc..39e1cf6c1a2f 100644 --- a/include/media/i2c/cs5345.h +++ b/include/media/i2c/cs5345.h @@ -2,7 +2,7 @@ /* cs5345.h - definition for cs5345 inputs and outputs - Copyright (C) 2007 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2007 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/cs53l32a.h b/include/media/i2c/cs53l32a.h index 52ceb2f916d3..777f667855cb 100644 --- a/include/media/i2c/cs53l32a.h +++ b/include/media/i2c/cs53l32a.h @@ -2,7 +2,7 @@ /* cs53l32a.h - definition for cs53l32a inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/m52790.h b/include/media/i2c/m52790.h index 3f214fa9bc64..cedaaf215273 100644 --- a/include/media/i2c/m52790.h +++ b/include/media/i2c/m52790.h @@ -2,7 +2,7 @@ /* m52790.h - definition for m52790 inputs and outputs - Copyright (C) 2007 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2007 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/mt9v011.h b/include/media/i2c/mt9v011.h index 41c00b3e7184..552839756e64 100644 --- a/include/media/i2c/mt9v011.h +++ b/include/media/i2c/mt9v011.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* mt9v011 sensor * - * Copyright (C) 2011 Hans Verkuil + * Copyright (C) 2011 Hans Verkuil */ #ifndef __MT9V011_H__ diff --git a/include/media/i2c/saa7115.h b/include/media/i2c/saa7115.h index 0cd6080d7cb1..a607c91ef5f3 100644 --- a/include/media/i2c/saa7115.h +++ b/include/media/i2c/saa7115.h @@ -2,7 +2,7 @@ /* saa7115.h - definition for saa7111/3/4/5 inputs and frequency flags - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/saa7127.h b/include/media/i2c/saa7127.h index 53ee999e6090..c81ee1743df1 100644 --- a/include/media/i2c/saa7127.h +++ b/include/media/i2c/saa7127.h @@ -2,7 +2,7 @@ /* saa7127.h - definition for saa7126/7/8/9 inputs/outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/tvaudio.h b/include/media/i2c/tvaudio.h index 42cd3206fb6c..206f42ed4e69 100644 --- a/include/media/i2c/tvaudio.h +++ b/include/media/i2c/tvaudio.h @@ -2,7 +2,7 @@ /* tvaudio.h - definition for tvaudio inputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/upd64031a.h b/include/media/i2c/upd64031a.h index b6570abc84ef..d39b2b7f0cf3 100644 --- a/include/media/i2c/upd64031a.h +++ b/include/media/i2c/upd64031a.h @@ -2,7 +2,7 @@ /* * upd64031a - NEC Electronics Ghost Reduction input defines * - * 2006 by Hans Verkuil (hverkuil@xs4all.nl) + * 2006 by Hans Verkuil (hverkuil@kernel.org) */ #ifndef _UPD64031A_H_ diff --git a/include/media/i2c/upd64083.h b/include/media/i2c/upd64083.h index 17fb7b5201cc..72cf547c25fc 100644 --- a/include/media/i2c/upd64083.h +++ b/include/media/i2c/upd64083.h @@ -2,7 +2,7 @@ /* * upd6408x - NEC Electronics 3-Dimensional Y/C separation input defines * - * 2006 by Hans Verkuil (hverkuil@xs4all.nl) + * 2006 by Hans Verkuil (hverkuil@kernel.org) */ #ifndef _UPD64083_H_ diff --git a/include/media/i2c/wm8775.h b/include/media/i2c/wm8775.h index 6ccdeb3817ab..a02695ee3a58 100644 --- a/include/media/i2c/wm8775.h +++ b/include/media/i2c/wm8775.h @@ -2,7 +2,7 @@ /* wm8775.h - definition for wm8775 inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index d8e23991a656..594314459333 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -7,7 +7,7 @@ Each ioctl begins with VIDIOC_INT_ to clearly mark that it is an internal define, - Copyright (C) 2005 Hans Verkuil + Copyright (C) 2005 Hans Verkuil */ diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index 4a294a5c7bdd..31fc1bee3797 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -2,7 +2,7 @@ /* * V4L2 controls support header. * - * Copyright (C) 2010 Hans Verkuil + * Copyright (C) 2010 Hans Verkuil */ #ifndef _V4L2_CTRLS_H diff --git a/include/media/v4l2-device.h b/include/media/v4l2-device.h index dd897a362f36..25f69b1b8db0 100644 --- a/include/media/v4l2-device.h +++ b/include/media/v4l2-device.h @@ -2,7 +2,7 @@ /* V4L2 device support header. - Copyright (C) 2008 Hans Verkuil + Copyright (C) 2008 Hans Verkuil */ diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index 4b28086808c9..e0bb58cb6d04 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -2,7 +2,7 @@ /* * V4L2 sub-device support header. * - * Copyright (C) 2008 Hans Verkuil + * Copyright (C) 2008 Hans Verkuil */ #ifndef _V4L2_SUBDEV_H diff --git a/include/uapi/linux/ivtv.h b/include/uapi/linux/ivtv.h index e74f18642b11..c9241f7271c4 100644 --- a/include/uapi/linux/ivtv.h +++ b/include/uapi/linux/ivtv.h @@ -2,7 +2,7 @@ /* Public ivtv API header Copyright (C) 2003-2004 Kevin Thayer - Copyright (C) 2004-2007 Hans Verkuil + Copyright (C) 2004-2007 Hans Verkuil This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 64943f1a6149..becd08fdbddb 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -51,7 +51,7 @@ * * Author: Bill Dirks * Justin Schoeman - * Hans Verkuil + * Hans Verkuil * et al. */ #ifndef _UAPI__LINUX_VIDEODEV2_H -- cgit v1.2.3 From 55e3c86887ddbd1c676a06dcc9c3cceeb5380008 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 13 Aug 2025 00:45:13 +0300 Subject: media: i2c: mt9v022: Drop unused mt9v022.h header The mt9v022 driver got removed in commit e7eab49132ba ("media: staging/media/soc_camera: remove this driver"), but its platform header file got left behind. Remove it. Signed-off-by: Laurent Pinchart Signed-off-by: Sakari Ailus Reviewed-by: Mehdi Djait Signed-off-by: Hans Verkuil --- include/media/i2c/mt9v022.h | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 include/media/i2c/mt9v022.h (limited to 'include') diff --git a/include/media/i2c/mt9v022.h b/include/media/i2c/mt9v022.h deleted file mode 100644 index 6966eb538165..000000000000 --- a/include/media/i2c/mt9v022.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * mt9v022 sensor - */ - -#ifndef __MT9V022_H__ -#define __MT9V022_H__ - -struct mt9v022_platform_data { - unsigned short y_skip_top; /* Lines to skip at the top */ -}; - -#endif -- cgit v1.2.3 From b4dd3bbd2eeb60702a8251c50cadd098257c5bfe Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 13 Aug 2025 00:45:15 +0300 Subject: media: i2c: mt9v032: Drop support for platform data The last user of the mt9v032 driver through board files and platform data has long been removed. Drop support for platform data from the driver. Signed-off-by: Laurent Pinchart Signed-off-by: Sakari Ailus Reviewed-by: Mehdi Djait Signed-off-by: Hans Verkuil --- MAINTAINERS | 1 - drivers/media/i2c/Kconfig | 1 + drivers/media/i2c/mt9v032.c | 66 +++++++++++++++++++-------------------------- include/media/i2c/mt9v032.h | 12 --------- 4 files changed, 28 insertions(+), 52 deletions(-) delete mode 100644 include/media/i2c/mt9v032.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 4b095bbb14cb..89d5f1c3f918 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17273,7 +17273,6 @@ S: Maintained T: git git://linuxtv.org/media.git F: Documentation/devicetree/bindings/media/i2c/aptina,mt9v032.txt F: drivers/media/i2c/mt9v032.c -F: include/media/i2c/mt9v032.h MT9V111 APTINA CAMERA SENSOR M: Jacopo Mondi diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig index ccc143ca13aa..c976a1016e70 100644 --- a/drivers/media/i2c/Kconfig +++ b/drivers/media/i2c/Kconfig @@ -327,6 +327,7 @@ config VIDEO_MT9V011 config VIDEO_MT9V032 tristate "Micron MT9V032 sensor support" + depends on OF select REGMAP_I2C help This is a Video4Linux2 sensor driver for the Micron diff --git a/drivers/media/i2c/mt9v032.c b/drivers/media/i2c/mt9v032.c index 888f3280378d..d4359d5b92bb 100644 --- a/drivers/media/i2c/mt9v032.c +++ b/drivers/media/i2c/mt9v032.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include @@ -182,6 +181,13 @@ static const struct mt9v032_model_version mt9v032_versions[] = { { MT9V034_CHIP_ID_REV1, "MT9V024/MT9V034 rev1" }, }; +struct mt9v032_platform_data { + unsigned int clk_pol:1; + + const s64 *link_freqs; + s64 link_def_freq; +}; + struct mt9v032 { struct device *dev; @@ -207,7 +213,7 @@ struct mt9v032 { struct gpio_desc *reset_gpio; struct gpio_desc *standby_gpio; - struct mt9v032_platform_data *pdata; + struct mt9v032_platform_data pdata; const struct mt9v032_model_info *model; const struct mt9v032_model_version *version; @@ -332,7 +338,7 @@ static int __mt9v032_set_power(struct mt9v032 *mt9v032, bool on) return ret; /* Configure the pixel clock polarity */ - if (mt9v032->pdata && mt9v032->pdata->clk_pol) { + if (mt9v032->pdata.clk_pol) { ret = regmap_write(map, mt9v032->model->data->pclk_reg, MT9V032_PIXEL_CLOCK_INV_PXL_CLK); if (ret < 0) @@ -683,7 +689,7 @@ static int mt9v032_s_ctrl(struct v4l2_ctrl *ctrl) if (mt9v032->link_freq == NULL) break; - freq = mt9v032->pdata->link_freqs[mt9v032->link_freq->val]; + freq = mt9v032->pdata.link_freqs[mt9v032->link_freq->val]; *mt9v032->pixel_rate->p_new.p_s64 = freq; mt9v032->sysclk = freq; break; @@ -996,26 +1002,19 @@ static const struct regmap_config mt9v032_regmap_config = { * Driver initialization and probing */ -static struct mt9v032_platform_data *mt9v032_get_pdata(struct mt9v032 *mt9v032) +static int mt9v032_get_pdata(struct mt9v032 *mt9v032) { - struct mt9v032_platform_data *pdata = NULL; + struct mt9v032_platform_data *pdata = &mt9v032->pdata; struct v4l2_fwnode_endpoint endpoint = { .bus_type = 0 }; - struct device_node *np; + struct device_node *np __free(device_node) = NULL; struct property *prop; - if (!IS_ENABLED(CONFIG_OF) || !mt9v032->dev->of_node) - return mt9v032->dev->platform_data; - np = of_graph_get_endpoint_by_regs(mt9v032->dev->of_node, 0, -1); if (!np) - return NULL; + return -EINVAL; if (v4l2_fwnode_endpoint_parse(of_fwnode_handle(np), &endpoint) < 0) - goto done; - - pdata = devm_kzalloc(mt9v032->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) - goto done; + return -EINVAL; prop = of_find_property(np, "link-frequencies", NULL); if (prop) { @@ -1025,11 +1024,11 @@ static struct mt9v032_platform_data *mt9v032_get_pdata(struct mt9v032 *mt9v032) link_freqs = devm_kcalloc(mt9v032->dev, size, sizeof(*link_freqs), GFP_KERNEL); if (!link_freqs) - goto done; + return -EINVAL; if (of_property_read_u64_array(np, "link-frequencies", link_freqs, size) < 0) - goto done; + return -EINVAL; pdata->link_freqs = link_freqs; pdata->link_def_freq = link_freqs[0]; @@ -1038,9 +1037,7 @@ static struct mt9v032_platform_data *mt9v032_get_pdata(struct mt9v032 *mt9v032) pdata->clk_pol = !!(endpoint.bus.parallel.flags & V4L2_MBUS_PCLK_SAMPLE_RISING); -done: - of_node_put(np); - return pdata; + return 0; } static int mt9v032_probe(struct i2c_client *client) @@ -1075,8 +1072,13 @@ static int mt9v032_probe(struct i2c_client *client) return PTR_ERR(mt9v032->standby_gpio); mutex_init(&mt9v032->power_lock); - mt9v032->pdata = mt9v032_get_pdata(mt9v032); - mt9v032->model = i2c_get_match_data(client); + + ret = mt9v032_get_pdata(mt9v032); + if (ret) + return dev_err_probe(mt9v032->dev, -EINVAL, + "Failed to parse DT properties\n"); + + mt9v032->model = device_get_match_data(mt9v032->dev); v4l2_ctrl_handler_init(&mt9v032->ctrls, 11 + ARRAY_SIZE(mt9v032_aegc_controls)); @@ -1121,8 +1123,8 @@ static int mt9v032_probe(struct i2c_client *client) v4l2_ctrl_new_std(&mt9v032->ctrls, &mt9v032_ctrl_ops, V4L2_CID_PIXEL_RATE, 1, INT_MAX, 1, 1); - if (mt9v032->pdata && mt9v032->pdata->link_freqs) { - const struct mt9v032_platform_data *pdata = mt9v032->pdata; + if (mt9v032->pdata.link_freqs) { + const struct mt9v032_platform_data *pdata = &mt9v032->pdata; unsigned int def = 0; for (i = 0; pdata->link_freqs[i]; ++i) { @@ -1264,19 +1266,6 @@ static const struct mt9v032_model_info mt9v032_models[] = { }, }; -static const struct i2c_device_id mt9v032_id[] = { - { "mt9v022", (kernel_ulong_t)&mt9v032_models[MT9V032_MODEL_V022_COLOR] }, - { "mt9v022m", (kernel_ulong_t)&mt9v032_models[MT9V032_MODEL_V022_MONO] }, - { "mt9v024", (kernel_ulong_t)&mt9v032_models[MT9V032_MODEL_V024_COLOR] }, - { "mt9v024m", (kernel_ulong_t)&mt9v032_models[MT9V032_MODEL_V024_MONO] }, - { "mt9v032", (kernel_ulong_t)&mt9v032_models[MT9V032_MODEL_V032_COLOR] }, - { "mt9v032m", (kernel_ulong_t)&mt9v032_models[MT9V032_MODEL_V032_MONO] }, - { "mt9v034", (kernel_ulong_t)&mt9v032_models[MT9V032_MODEL_V034_COLOR] }, - { "mt9v034m", (kernel_ulong_t)&mt9v032_models[MT9V032_MODEL_V034_MONO] }, - { /* Sentinel */ } -}; -MODULE_DEVICE_TABLE(i2c, mt9v032_id); - static const struct of_device_id mt9v032_of_match[] = { { .compatible = "aptina,mt9v022", .data = &mt9v032_models[MT9V032_MODEL_V022_COLOR] }, { .compatible = "aptina,mt9v022m", .data = &mt9v032_models[MT9V032_MODEL_V022_MONO] }, @@ -1297,7 +1286,6 @@ static struct i2c_driver mt9v032_driver = { }, .probe = mt9v032_probe, .remove = mt9v032_remove, - .id_table = mt9v032_id, }; module_i2c_driver(mt9v032_driver); diff --git a/include/media/i2c/mt9v032.h b/include/media/i2c/mt9v032.h deleted file mode 100644 index 83a37ccfb649..000000000000 --- a/include/media/i2c/mt9v032.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _MEDIA_MT9V032_H -#define _MEDIA_MT9V032_H - -struct mt9v032_platform_data { - unsigned int clk_pol:1; - - const s64 *link_freqs; - s64 link_def_freq; -}; - -#endif -- cgit v1.2.3 From dd235b07b65e123c0fdadc2883b9c16aa4749164 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 13 Aug 2025 00:45:53 +0300 Subject: media: v4l2-common: Add legacy camera sensor clock helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recently introduced devm_v4l2_sensor_clk_get() helper aims at simplifying sensor drivers by centralizing clock handling code, as well as reducing cargo-cult and deprecated behaviour. A set of drivers implement external clock handling in a non-standard way. This can't be changed as there is a high risk of breaking existing platforms, but keeping the code as-is creates a risk of new drivers copying deprecated behaviour. To fix this, introduce a new devm_v4l2_sensor_clk_get_legacy() helper and use it in those driver. Compared to devm_v4l2_sensor_clk_get(), the new helper takes the "clock-frequency" property into account and sets the external clock rate on OF platforms, and adds the ability to specify a fixed default or fallback clock rate in case the "clock-frequency" property is not present. Signed-off-by: Laurent Pinchart Reviewed-by: Niklas Söderlund Signed-off-by: Sakari Ailus Reviewed-by: Mehdi Djait Signed-off-by: Hans Verkuil --- drivers/media/v4l2-core/v4l2-common.c | 39 +++++++++++++++++++++++++-------- include/media/v4l2-common.h | 41 ++++++++++++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index 6f8bce4d6b62..b367d479d6b3 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -708,24 +708,40 @@ int v4l2_link_freq_to_bitmap(struct device *dev, const u64 *fw_link_freqs, } EXPORT_SYMBOL_GPL(v4l2_link_freq_to_bitmap); -struct clk *devm_v4l2_sensor_clk_get(struct device *dev, const char *id) +struct clk *__devm_v4l2_sensor_clk_get(struct device *dev, const char *id, + bool legacy, bool fixed_rate, + unsigned long clk_rate) { + bool of_node = is_of_node(dev_fwnode(dev)); const char *clk_id __free(kfree) = NULL; struct clk_hw *clk_hw; struct clk *clk; - bool of_node; - u32 rate; - int ret; + u32 rate = clk_rate; + int ret = 0; clk = devm_clk_get_optional(dev, id); if (IS_ERR(clk)) return clk; - ret = device_property_read_u32(dev, "clock-frequency", &rate); - of_node = is_of_node(dev_fwnode(dev)); + /* + * If the caller didn't request a fixed rate, retrieve it from the + * clock-frequency property. -EINVAL indicates the property is absent, + * and is not a failure. Other errors, or success with a clock-frequency + * value of 0, are hard failures. + */ + if (!fixed_rate || !clk_rate) { + ret = device_property_read_u32(dev, "clock-frequency", &rate); + if ((ret && ret != -EINVAL) || (!ret && !rate)) + return ERR_PTR(-EINVAL); + } if (clk) { - if (!ret && !of_node) { + /* + * On non-OF platforms, or when legacy behaviour is requested, + * set the clock rate if a rate has been specified by the caller + * or by the clock-frequency property. + */ + if (rate && (!of_node || legacy)) { ret = clk_set_rate(clk, rate); if (ret) { dev_err(dev, "Failed to set clock rate: %u\n", @@ -736,9 +752,14 @@ struct clk *devm_v4l2_sensor_clk_get(struct device *dev, const char *id) return clk; } - if (!IS_ENABLED(CONFIG_COMMON_CLK) || of_node) + /* + * Register a dummy fixed clock on non-OF platforms or when legacy + * behaviour is requested. This required the common clock framework. + */ + if (!IS_ENABLED(CONFIG_COMMON_CLK) || (of_node && !legacy)) return ERR_PTR(-ENOENT); + /* We need a rate to create a clock. */ if (ret) return ERR_PTR(ret == -EINVAL ? -EPROBE_DEFER : ret); @@ -755,4 +776,4 @@ struct clk *devm_v4l2_sensor_clk_get(struct device *dev, const char *id) return clk_hw->clk; } -EXPORT_SYMBOL_GPL(devm_v4l2_sensor_clk_get); +EXPORT_SYMBOL_GPL(__devm_v4l2_sensor_clk_get); diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index 594314459333..5c0a7f6b5bb6 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -612,6 +612,10 @@ int v4l2_link_freq_to_bitmap(struct device *dev, const u64 *fw_link_freqs, unsigned int num_of_driver_link_freqs, unsigned long *bitmap); +struct clk *__devm_v4l2_sensor_clk_get(struct device *dev, const char *id, + bool legacy, bool fixed_rate, + unsigned long clk_rate); + /** * devm_v4l2_sensor_clk_get - lookup and obtain a reference to a clock producer * for a camera sensor @@ -644,7 +648,42 @@ int v4l2_link_freq_to_bitmap(struct device *dev, const u64 *fw_link_freqs, * * Returns a pointer to a struct clk on success or an error pointer on failure. */ -struct clk *devm_v4l2_sensor_clk_get(struct device *dev, const char *id); +static inline struct clk * +devm_v4l2_sensor_clk_get(struct device *dev, const char *id) +{ + return __devm_v4l2_sensor_clk_get(dev, id, false, false, 0); +} + +/** + * devm_v4l2_sensor_clk_get_legacy - lookup and obtain a reference to a clock + * producer for a camera sensor. + * + * @dev: device for v4l2 sensor clock "consumer" + * @id: clock consumer ID + * @fixed_rate: interpret the @clk_rate as a fixed rate or default rate + * @clk_rate: the clock rate + * + * This function behaves the same way as devm_v4l2_sensor_clk_get() except that + * it extends the behaviour on ACPI platforms to all platforms. + * + * The function also provides the ability to set the clock rate to a fixed + * frequency by setting @fixed_rate to true and specifying the fixed frequency + * in @clk_rate, or to use a default clock rate when the "clock-frequency" + * property is absent by setting @fixed_rate to false and specifying the default + * frequency in @clk_rate. Setting @fixed_rate to true and @clk_rate to 0 is an + * error. + * + * This function is meant to support legacy behaviour in existing drivers only. + * It must not be used in any new driver. + * + * Returns a pointer to a struct clk on success or an error pointer on failure. + */ +static inline struct clk * +devm_v4l2_sensor_clk_get_legacy(struct device *dev, const char *id, + bool fixed_rate, unsigned long clk_rate) +{ + return __devm_v4l2_sensor_clk_get(dev, id, true, fixed_rate, clk_rate); +} static inline u64 v4l2_buffer_get_timestamp(const struct v4l2_buffer *buf) { -- cgit v1.2.3 From eddb5ba91b289faa15117d4fc1c2fb223f3493c2 Mon Sep 17 00:00:00 2001 From: Nicolas Frattaroli Date: Fri, 30 May 2025 15:38:09 +0200 Subject: PM / devfreq: rockchip-dfi: add support for LPDDR5 The Rockchip RK3588 SoC can also support LPDDR5 memory. This type of memory needs some special case handling in the rockchip-dfi driver. Add support for it in rockchip-dfi, as well as the needed GRF register definitions. This has been tested as returning both the right cycle count and bandwidth on a LPDDR5 board where the CKR bit is 1. I couldn't test whether the values are correct on a system where CKR is 0, as I'm not savvy enough with the Rockchip tooling to know whether this can be set in the DDR init blob. Downstream has some special case handling for a hardware version where not just the control bits differ, but also the register. Since I don't know whether that hardware version is in any production silicon, it's left unimplemented for now, with an error message urging users to report if they have such a system. There is a slight change of behaviour for non-LPDDR5 systems: instead of writing 0 as the control flags to the control register and pretending everything is alright if the memory type is unknown, we now explicitly return an error. Signed-off-by: Nicolas Frattaroli Reviewed-by: Sascha Hauer Acked-by: Heiko Stuebner Signed-off-by: Chanwoo Choi Link: https://patchwork.kernel.org/project/linux-pm/patch/20250530-rk3588-dfi-improvements-v1-2-6e077c243a95@collabora.com/ --- drivers/devfreq/event/rockchip-dfi.c | 84 ++++++++++++++++++++++++++++-------- include/soc/rockchip/rk3588_grf.h | 8 +++- include/soc/rockchip/rockchip_grf.h | 1 + 3 files changed, 73 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/devfreq/event/rockchip-dfi.c b/drivers/devfreq/event/rockchip-dfi.c index 54effb635196..5a2c9badcc64 100644 --- a/drivers/devfreq/event/rockchip-dfi.c +++ b/drivers/devfreq/event/rockchip-dfi.c @@ -34,15 +34,18 @@ /* DDRMON_CTRL */ #define DDRMON_CTRL 0x04 +#define DDRMON_CTRL_LPDDR5 BIT(6) #define DDRMON_CTRL_DDR4 BIT(5) #define DDRMON_CTRL_LPDDR4 BIT(4) #define DDRMON_CTRL_HARDWARE_EN BIT(3) #define DDRMON_CTRL_LPDDR23 BIT(2) #define DDRMON_CTRL_SOFTWARE_EN BIT(1) #define DDRMON_CTRL_TIMER_CNT_EN BIT(0) -#define DDRMON_CTRL_DDR_TYPE_MASK (DDRMON_CTRL_DDR4 | \ +#define DDRMON_CTRL_DDR_TYPE_MASK (DDRMON_CTRL_LPDDR5 | \ + DDRMON_CTRL_DDR4 | \ DDRMON_CTRL_LPDDR4 | \ DDRMON_CTRL_LPDDR23) +#define DDRMON_CTRL_LP5_BANK_MODE_MASK GENMASK(8, 7) #define DDRMON_CH0_WR_NUM 0x20 #define DDRMON_CH0_RD_NUM 0x24 @@ -116,13 +119,60 @@ struct rockchip_dfi { int buswidth[DMC_MAX_CHANNELS]; int ddrmon_stride; bool ddrmon_ctrl_single; + u32 lp5_bank_mode; + bool lp5_ckr; /* true if in 4:1 command-to-data clock ratio mode */ unsigned int count_multiplier; /* number of data clocks per count */ }; +static int rockchip_dfi_ddrtype_to_ctrl(struct rockchip_dfi *dfi, u32 *ctrl, + u32 *mask) +{ + u32 ddrmon_ver; + + *mask = DDRMON_CTRL_DDR_TYPE_MASK; + + switch (dfi->ddr_type) { + case ROCKCHIP_DDRTYPE_LPDDR2: + case ROCKCHIP_DDRTYPE_LPDDR3: + *ctrl = DDRMON_CTRL_LPDDR23; + break; + case ROCKCHIP_DDRTYPE_LPDDR4: + case ROCKCHIP_DDRTYPE_LPDDR4X: + *ctrl = DDRMON_CTRL_LPDDR4; + break; + case ROCKCHIP_DDRTYPE_LPDDR5: + ddrmon_ver = readl_relaxed(dfi->regs); + if (ddrmon_ver < 0x40) { + *ctrl = DDRMON_CTRL_LPDDR5 | dfi->lp5_bank_mode; + *mask |= DDRMON_CTRL_LP5_BANK_MODE_MASK; + break; + } + + /* + * As it is unknown whether the unpleasant special case + * behaviour used by the vendor kernel is needed for any + * shipping hardware, ask users to report if they have + * some of that hardware. + */ + dev_err(&dfi->edev->dev, + "unsupported DDRMON version 0x%04X, please let linux-rockchip know!\n", + ddrmon_ver); + return -EOPNOTSUPP; + default: + dev_err(&dfi->edev->dev, "unsupported memory type 0x%X\n", + dfi->ddr_type); + return -EOPNOTSUPP; + } + + return 0; +} + static int rockchip_dfi_enable(struct rockchip_dfi *dfi) { void __iomem *dfi_regs = dfi->regs; int i, ret = 0; + u32 ctrl; + u32 ctrl_mask; mutex_lock(&dfi->mutex); @@ -136,8 +186,11 @@ static int rockchip_dfi_enable(struct rockchip_dfi *dfi) goto out; } + ret = rockchip_dfi_ddrtype_to_ctrl(dfi, &ctrl, &ctrl_mask); + if (ret) + goto out; + for (i = 0; i < dfi->max_channels; i++) { - u32 ctrl = 0; if (!(dfi->channel_mask & BIT(i))) continue; @@ -147,21 +200,7 @@ static int rockchip_dfi_enable(struct rockchip_dfi *dfi) DDRMON_CTRL_SOFTWARE_EN | DDRMON_CTRL_HARDWARE_EN), dfi_regs + i * dfi->ddrmon_stride + DDRMON_CTRL); - /* set ddr type to dfi */ - switch (dfi->ddr_type) { - case ROCKCHIP_DDRTYPE_LPDDR2: - case ROCKCHIP_DDRTYPE_LPDDR3: - ctrl = DDRMON_CTRL_LPDDR23; - break; - case ROCKCHIP_DDRTYPE_LPDDR4: - case ROCKCHIP_DDRTYPE_LPDDR4X: - ctrl = DDRMON_CTRL_LPDDR4; - break; - default: - break; - } - - writel_relaxed(HIWORD_UPDATE(ctrl, DDRMON_CTRL_DDR_TYPE_MASK), + writel_relaxed(HIWORD_UPDATE(ctrl, ctrl_mask), dfi_regs + i * dfi->ddrmon_stride + DDRMON_CTRL); /* enable count, use software mode */ @@ -652,6 +691,7 @@ static int rockchip_ddr_perf_init(struct rockchip_dfi *dfi) break; case ROCKCHIP_DDRTYPE_LPDDR4: case ROCKCHIP_DDRTYPE_LPDDR4X: + case ROCKCHIP_DDRTYPE_LPDDR5: dfi->burst_len = 16; break; } @@ -730,7 +770,7 @@ static int rk3568_dfi_init(struct rockchip_dfi *dfi) static int rk3588_dfi_init(struct rockchip_dfi *dfi) { struct regmap *regmap_pmu = dfi->regmap_pmu; - u32 reg2, reg3, reg4; + u32 reg2, reg3, reg4, reg6; regmap_read(regmap_pmu, RK3588_PMUGRF_OS_REG2, ®2); regmap_read(regmap_pmu, RK3588_PMUGRF_OS_REG3, ®3); @@ -757,6 +797,14 @@ static int rk3588_dfi_init(struct rockchip_dfi *dfi) dfi->ddrmon_stride = 0x4000; dfi->count_multiplier = 2; + if (dfi->ddr_type == ROCKCHIP_DDRTYPE_LPDDR5) { + regmap_read(regmap_pmu, RK3588_PMUGRF_OS_REG6, ®6); + dfi->lp5_bank_mode = FIELD_GET(RK3588_PMUGRF_OS_REG6_LP5_BANK_MODE, reg6) << 7; + dfi->lp5_ckr = FIELD_GET(RK3588_PMUGRF_OS_REG6_LP5_CKR, reg6); + if (dfi->lp5_ckr) + dfi->count_multiplier *= 2; + } + return 0; }; diff --git a/include/soc/rockchip/rk3588_grf.h b/include/soc/rockchip/rk3588_grf.h index 630b35a55064..02a7b2432d99 100644 --- a/include/soc/rockchip/rk3588_grf.h +++ b/include/soc/rockchip/rk3588_grf.h @@ -12,7 +12,11 @@ #define RK3588_PMUGRF_OS_REG3_DRAMTYPE_INFO_V3 GENMASK(13, 12) #define RK3588_PMUGRF_OS_REG3_SYSREG_VERSION GENMASK(31, 28) -#define RK3588_PMUGRF_OS_REG4 0x210 -#define RK3588_PMUGRF_OS_REG5 0x214 +#define RK3588_PMUGRF_OS_REG4 0x210 +#define RK3588_PMUGRF_OS_REG5 0x214 +#define RK3588_PMUGRF_OS_REG6 0x218 +#define RK3588_PMUGRF_OS_REG6_LP5_BANK_MODE GENMASK(2, 1) +/* Whether the LPDDR5 is in 2:1 (= 0) or 4:1 (= 1) CKR a.k.a. DQS mode */ +#define RK3588_PMUGRF_OS_REG6_LP5_CKR BIT(0) #endif /* __SOC_RK3588_GRF_H */ diff --git a/include/soc/rockchip/rockchip_grf.h b/include/soc/rockchip/rockchip_grf.h index e46fd72aea8d..41c7bb26fd53 100644 --- a/include/soc/rockchip/rockchip_grf.h +++ b/include/soc/rockchip/rockchip_grf.h @@ -13,6 +13,7 @@ enum { ROCKCHIP_DDRTYPE_LPDDR3 = 6, ROCKCHIP_DDRTYPE_LPDDR4 = 7, ROCKCHIP_DDRTYPE_LPDDR4X = 8, + ROCKCHIP_DDRTYPE_LPDDR5 = 9, }; #endif /* __SOC_ROCKCHIP_GRF_H */ -- cgit v1.2.3 From fec2e705729dc93de5399d8b139e4746805c3d81 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:51 -0700 Subject: block: check for valid bio while splitting We're already iterating every segment, so check these for a valid IO lengths at the same time. Individual segment lengths will not be checked on passthrough commands. The read/write command segments must be sized to the dma alignment. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- block/blk-merge.c | 21 +++++++++++++++++---- include/linux/bio.h | 4 ++-- include/linux/blkdev.h | 7 +++++++ 4 files changed, 27 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/block/blk-map.c b/block/blk-map.c index 92b7e19b1a1f..ca4f5cf00038 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -443,7 +443,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) int ret; /* check that the data layout matches the hardware restrictions */ - ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes); + ret = bio_split_io_at(bio, lim, &nr_segs, max_bytes, 0); if (ret) { /* if we would have to split the bio, copy instead */ if (ret > 0) diff --git a/block/blk-merge.c b/block/blk-merge.c index 70d704615be5..cffc0fe48d8a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -279,25 +279,30 @@ static unsigned int bio_split_alignment(struct bio *bio, } /** - * bio_split_rw_at - check if and where to split a read/write bio + * bio_split_io_at - check if and where to split a bio * @bio: [in] bio to be split * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors * @max_bytes: [in] maximum number of bytes per bio + * @len_align_mask: [in] length alignment mask for each vector * * Find out if @bio needs to be split to fit the queue limits in @lim and a * maximum size of @max_bytes. Returns a negative error number if @bio can't be * split, 0 if the bio doesn't have to be split, or a positive sector offset if * @bio needs to be split. */ -int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, unsigned max_bytes) +int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned nsegs = 0, bytes = 0; bio_for_each_bvec(bv, bio, iter) { + if (bv.bv_offset & lim->dma_alignment || + bv.bv_len & len_align_mask) + return -EINVAL; + /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. @@ -339,8 +344,16 @@ split: * Individual bvecs might not be logical block aligned. Round down the * split size so that each bio is properly block size aligned, even if * we do not use the full hardware limits. + * + * It is possible to submit a bio that can't be split into a valid io: + * there may either be too many discontiguous vectors for the max + * segments limit, or contain virtual boundary gaps without having a + * valid block sized split. A zero byte result means one of those + * conditions occured. */ bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim)); + if (!bytes) + return -EINVAL; /* * Bio splitting may cause subtle trouble such as hang when doing sync @@ -350,7 +363,7 @@ split: bio_clear_polled(bio); return bytes >> SECTOR_SHIFT; } -EXPORT_SYMBOL_GPL(bio_split_rw_at); +EXPORT_SYMBOL_GPL(bio_split_io_at); struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs) diff --git a/include/linux/bio.h b/include/linux/bio.h index 27cbff5b0356..13d1df02656a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -322,8 +322,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); -int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, unsigned max_bytes); +int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes, unsigned len_align); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7709d55adc23..9efacabaa2f7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1870,6 +1870,13 @@ bdev_atomic_write_unit_max_bytes(struct block_device *bdev) return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); } +static inline int bio_split_rw_at(struct bio *bio, + const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes) +{ + return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 743bf2e0c49c835cb7c4e4ac7d5a2610587047be Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:52 -0700 Subject: block: add size alignment to bio_iov_iter_get_pages The block layer tries to align bio vectors to the block device's logical block size. Some cases don't have a block device, or we may need to align to something larger, which we can't derive it from the queue limits. Have the caller specify what they want, or allow any length alignment if nothing was specified. Since the most common use case relies on the block device's limits, a helper function is provided. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/bio.c | 19 +++++++++++-------- block/fops.c | 6 +++--- fs/iomap/direct-io.c | 2 +- include/linux/bio.h | 9 ++++++++- include/linux/blkdev.h | 7 +++++++ 5 files changed, 30 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index 971d96afaf8d..f91dc9f32bdc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1208,7 +1208,8 @@ static unsigned int get_contig_folio_len(unsigned int *num_pages, * For a multi-segment *iter, this function only adds pages from the next * non-empty segment of the iov iterator. */ -static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) { iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; @@ -1217,7 +1218,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size; unsigned int num_pages, i = 0; - size_t offset, folio_offset, left, len; + size_t offset, folio_offset, left, len, trim; int ret = 0; /* @@ -1246,8 +1247,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - if (bio->bi_bdev) { - size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); + trim = size & len_align_mask; + if (trim) { iov_iter_revert(iter, trim); size -= trim; } @@ -1302,9 +1303,10 @@ out: } /** - * bio_iov_iter_get_pages - add user or kernel pages to a bio + * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added + * @len_align_mask: the mask to align each vector size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and @@ -1321,7 +1323,8 @@ out: * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) { int ret = 0; @@ -1337,12 +1340,12 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); do { - ret = __bio_iov_iter_get_pages(bio, iter); + ret = __bio_iov_iter_get_pages(bio, iter, len_align_mask); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); return bio->bi_vcnt ? 0 : ret; } -EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); +EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned); static void submit_bio_wait_endio(struct bio *bio) { diff --git a/block/fops.c b/block/fops.c index 82451ac8ff25..d136fb5f6b6a 100644 --- a/block/fops.c +++ b/block/fops.c @@ -78,7 +78,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; - ret = bio_iov_iter_get_pages(&bio, iter); + ret = bio_iov_iter_get_bdev_pages(&bio, iter, bdev); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; @@ -212,7 +212,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; - ret = bio_iov_iter_get_pages(bio, iter); + ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); @@ -348,7 +348,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, */ bio_iov_bvec_set(bio, iter); } else { - ret = bio_iov_iter_get_pages(bio, iter); + ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); if (unlikely(ret)) goto out_bio_put; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b84f6af2eb4c..fea23fa6a402 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -434,7 +434,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - ret = bio_iov_iter_get_pages(bio, dio->submit.iter); + ret = bio_iov_iter_get_bdev_pages(bio, dio->submit.iter, iomap->bdev); if (unlikely(ret)) { /* * We have to stop part way through an IO. We must fall diff --git a/include/linux/bio.h b/include/linux/bio.h index 13d1df02656a..a64a30131031 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -446,7 +446,14 @@ int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); +int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask); + +static inline int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + return bio_iov_iter_get_pages_aligned(bio, iter, 0); +} + void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9efacabaa2f7..44e1066f7446 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1877,6 +1877,13 @@ static inline int bio_split_rw_at(struct bio *bio, return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); } +static inline int bio_iov_iter_get_bdev_pages(struct bio *bio, + struct iov_iter *iter, struct block_device *bdev) +{ + return bio_iov_iter_get_pages_aligned(bio, iter, + bdev_logical_block_size(bdev) - 1); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 9eab1d4e0d15b633adc170c458c51e8be3b1c553 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:56 -0700 Subject: block: remove bdev_iter_is_aligned No more callers. Signed-off-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 44e1066f7446..c8cb08b2ed29 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1590,13 +1590,6 @@ static inline unsigned int bdev_dma_alignment(struct block_device *bdev) return queue_dma_alignment(bdev_get_queue(bdev)); } -static inline bool bdev_iter_is_aligned(struct block_device *bdev, - struct iov_iter *iter) -{ - return iov_iter_is_aligned(iter, bdev_dma_alignment(bdev), - bdev_logical_block_size(bdev) - 1); -} - static inline unsigned int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { -- cgit v1.2.3 From b475272f03ca5d0c437c8f899ff229b21010ec83 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:58 -0700 Subject: iov_iter: remove iov_iter_is_aligned No more callers. Signed-off-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/uio.h | 2 -- lib/iov_iter.c | 95 ----------------------------------------------------- 2 files changed, 97 deletions(-) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index 2e86c653186c..5b127043a151 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -286,8 +286,6 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); -bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f9193f952f49..2fe66a6b8789 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -784,101 +784,6 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) } EXPORT_SYMBOL(iov_iter_discard); -static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - const struct iovec *iov = iter_iov(i); - size_t size = i->count; - size_t skip = i->iov_offset; - - do { - size_t len = iov->iov_len - skip; - - if (len > size) - len = size; - if (len & len_mask) - return false; - if ((unsigned long)(iov->iov_base + skip) & addr_mask) - return false; - - iov++; - size -= len; - skip = 0; - } while (size); - - return true; -} - -static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - const struct bio_vec *bvec = i->bvec; - unsigned skip = i->iov_offset; - size_t size = i->count; - - do { - size_t len = bvec->bv_len - skip; - - if (len > size) - len = size; - if (len & len_mask) - return false; - if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) - return false; - - bvec++; - size -= len; - skip = 0; - } while (size); - - return true; -} - -/** - * iov_iter_is_aligned() - Check if the addresses and lengths of each segments - * are aligned to the parameters. - * - * @i: &struct iov_iter to restore - * @addr_mask: bit mask to check against the iov element's addresses - * @len_mask: bit mask to check against the iov element's lengths - * - * Return: false if any addresses or lengths intersect with the provided masks - */ -bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - if (likely(iter_is_ubuf(i))) { - if (i->count & len_mask) - return false; - if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) - return false; - return true; - } - - if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) - return iov_iter_aligned_iovec(i, addr_mask, len_mask); - - if (iov_iter_is_bvec(i)) - return iov_iter_aligned_bvec(i, addr_mask, len_mask); - - /* With both xarray and folioq types, we're dealing with whole folios. */ - if (iov_iter_is_xarray(i)) { - if (i->count & len_mask) - return false; - if ((i->xarray_start + i->iov_offset) & addr_mask) - return false; - } - if (iov_iter_is_folioq(i)) { - if (i->count & len_mask) - return false; - if (i->iov_offset & addr_mask) - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(iov_iter_is_aligned); - static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); -- cgit v1.2.3 From d57447ffb5fadffdba920f2fb933296fb6c5ff57 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 3 Sep 2025 12:33:17 -0700 Subject: blk-mq-dma: bring back p2p request flags We only need to consider data and metadata dma mapping types separately. The request and bio integrity payload have enough flag bits to internally track the mapping type for each. Use these so the caller doesn't need to track them, and provide separete request and integrity helpers to the common code. This will make it easier to scale new mappings, like the proposed MMIO attribute, without burdening the caller to track such things. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Leon Romanovsky Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 4 ++++ drivers/nvme/host/pci.c | 21 ++++----------------- include/linux/bio-integrity.h | 1 + include/linux/blk-integrity.h | 15 +++++++++++++++ include/linux/blk-mq-dma.h | 11 +++++++++-- include/linux/blk_types.h | 2 ++ 6 files changed, 35 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 660b5e200ccf..449950029872 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -174,6 +174,10 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, phys_to_page(vec.paddr))) { case PCI_P2PDMA_MAP_BUS_ADDR: + if (iter->iter.is_integrity) + bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA; + else + req->cmd_flags |= REQ_P2PDMA; return blk_dma_map_bus(iter, &vec); case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d8a9dee55de3..28e203b894eb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -260,12 +260,6 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, - /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ - IOD_P2P_BUS_ADDR = 1U << 3, - - /* Metadata DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ - IOD_META_P2P_BUS_ADDR = 1U << 4, - /* Metadata using non-coalesced MPTR */ IOD_SINGLE_META_SEGMENT = 1U << 5, }; @@ -737,9 +731,8 @@ static void nvme_unmap_metadata(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state, - iod->meta_total_len, - iod->flags & IOD_META_P2P_BUS_ADDR)) { + if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state, + iod->meta_total_len)) { if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) nvme_free_sgls(req, sge, &sge[1]); else @@ -766,8 +759,7 @@ static void nvme_unmap_data(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, - iod->flags & IOD_P2P_BUS_ADDR)) { + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) nvme_free_sgls(req, iod->descriptors[0], &iod->cmd.common.dptr.sgl); @@ -1043,9 +1035,6 @@ static blk_status_t nvme_map_data(struct request *req) if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) return iter.status; - if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) - iod->flags |= IOD_P2P_BUS_ADDR; - if (use_sgl == SGL_FORCED || (use_sgl == SGL_SUPPORTED && (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) @@ -1068,9 +1057,7 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) &iod->meta_dma_state, &iter)) return iter.status; - if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) - iod->flags |= IOD_META_P2P_BUS_ADDR; - else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) + if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) entries = 1; /* diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 0a25716820fe..851254f36eb3 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -13,6 +13,7 @@ enum bip_flags { BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ + BIP_P2P_DMA = 1 << 8, /* using P2P address */ }; struct bio_integrity_payload { diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 78fe2459e661..b659373788f6 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -27,6 +27,15 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); + +static inline bool blk_rq_integrity_dma_unmap(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + size_t mapped_len) +{ + return blk_dma_unmap(req, dma_dev, state, mapped_len, + bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA); +} + int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); @@ -115,6 +124,12 @@ static inline int blk_rq_map_integrity_sg(struct request *q, { return 0; } +static inline bool blk_rq_integrity_dma_unmap(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + size_t mapped_len) +{ + return false; +} static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 0f45ea110ca1..51829958d872 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -43,7 +43,7 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) } /** - * blk_rq_dma_unmap - try to DMA unmap a request + * blk_dma_unmap - try to DMA unmap a request * @req: request to unmap * @dma_dev: device to unmap from * @state: DMA IOVA state @@ -53,7 +53,7 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. */ -static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, +static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev, struct dma_iova_state *state, size_t mapped_len, bool is_p2p) { if (is_p2p) @@ -68,4 +68,11 @@ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, return !dma_need_unmap(dma_dev); } +static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, size_t mapped_len) +{ + return blk_dma_unmap(req, dma_dev, state, mapped_len, + req->cmd_flags & REQ_P2PDMA); +} + #endif /* BLK_MQ_DMA_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index bbb7893e0542..4bd098fd61cb 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -384,6 +384,7 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ + __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -416,6 +417,7 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) +#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) -- cgit v1.2.3 From ce0b015e2619ae64b7d33fb24a6b6cadcd70c317 Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Sat, 6 Sep 2025 18:29:43 -0700 Subject: devlink: Add 'total_vfs' generic device param NICs are typically configured with total_vfs=0, forcing users to rely on external tools to enable SR-IOV (a widely used and essential feature). Add total_vfs parameter to devlink for SR-IOV max VF configurability. Enables standard kernel tools to manage SR-IOV, addressing the need for flexible VF configuration. Signed-off-by: Vlad Dumitrescu Tested-by: Kamal Heib Reviewed-by: Jiri Pirko Signed-off-by: Saeed Mahameed Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250907012953.301746-2-saeed@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-params.rst | 5 +++++ include/net/devlink.h | 4 ++++ net/devlink/param.c | 5 +++++ 3 files changed, 14 insertions(+) (limited to 'include') diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst index 211b58177e12..c51da4fba7e7 100644 --- a/Documentation/networking/devlink/devlink-params.rst +++ b/Documentation/networking/devlink/devlink-params.rst @@ -143,3 +143,8 @@ own name. * - ``clock_id`` - u64 - Clock ID used by the device for registering DPLL devices and pins. + * - ``total_vfs`` + - u32 + - The max number of Virtual Functions (VFs) exposed by the PF. + after reboot/pci reset, 'sriov_totalvfs' entry under the device's sysfs + directory will report this value. diff --git a/include/net/devlink.h b/include/net/devlink.h index 5f44e702c25c..8d4362f010e4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -530,6 +530,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, + DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -594,6 +595,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id" #define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64 +#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME "total_vfs" +#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/devlink/param.c b/net/devlink/param.c index 41dcc86cfd94..33134940c266 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -102,6 +102,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, + .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME, + .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From bf2da4799fdb6eb58d9c9541b7dc1096c260499d Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 6 Sep 2025 18:29:44 -0700 Subject: net/mlx5: Implement cqe_compress_type via devlink params Selects which algorithm should be used by the NIC in order to decide rate of CQE compression dependeng on PCIe bus conditions. Supported values: 1) balanced, merges fewer CQEs, resulting in a moderate compression ratio but maintaining a balance between bandwidth savings and performance 2) aggressive, merges more CQEs into a single entry, achieving a higher compression rate and maximizing performance, particularly under high traffic loads. Signed-off-by: Saeed Mahameed Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250907012953.301746-3-saeed@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/mlx5.rst | 10 + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 8 + drivers/net/ethernet/mellanox/mlx5/core/devlink.h | 1 + .../net/ethernet/mellanox/mlx5/core/lib/nv_param.c | 245 +++++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/lib/nv_param.h | 14 ++ include/linux/mlx5/driver.h | 1 + 7 files changed, 280 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h (limited to 'include') diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst index 7febe0aecd53..2edc842b620d 100644 --- a/Documentation/networking/devlink/mlx5.rst +++ b/Documentation/networking/devlink/mlx5.rst @@ -117,6 +117,16 @@ parameters. - driverinit - Control the size (in packets) of the hairpin queues. + * - ``cqe_compress_type`` + - string + - permanent + - Configure which mechanism/algorithm should be used by the NIC that will + affect the rate (aggressiveness) of compressed CQEs depending on PCIe bus + conditions and other internal NIC factors. This mode affects all queues + that enable compression. + * ``balanced`` : Merges fewer CQEs, resulting in a moderate compression ratio but maintaining a balance between bandwidth savings and performance + * ``aggressive`` : Merges more CQEs into a single entry, achieving a higher compression rate and maximizing performance, particularly under high traffic loads + The ``mlx5`` driver supports reloading via ``DEVLINK_CMD_RELOAD`` Info versions diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index a65ab661375a..d77696f46eb5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -17,7 +17,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \ diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \ - fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o + fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o lib/nv_param.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 2c0e0c16ca90..326d438e75b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -10,6 +10,7 @@ #include "esw/qos.h" #include "sf/dev/dev.h" #include "sf/sf.h" +#include "lib/nv_param.h" static int mlx5_devlink_flash_update(struct devlink *devlink, struct devlink_flash_update_params *params, @@ -895,8 +896,14 @@ int mlx5_devlink_params_register(struct devlink *devlink) if (err) goto max_uc_list_err; + err = mlx5_nv_param_register_dl_params(devlink); + if (err) + goto nv_param_err; + return 0; +nv_param_err: + mlx5_devlink_max_uc_list_params_unregister(devlink); max_uc_list_err: mlx5_devlink_auxdev_params_unregister(devlink); auxdev_reg_err: @@ -907,6 +914,7 @@ auxdev_reg_err: void mlx5_devlink_params_unregister(struct devlink *devlink) { + mlx5_nv_param_unregister_dl_params(devlink); mlx5_devlink_max_uc_list_params_unregister(devlink); mlx5_devlink_auxdev_params_unregister(devlink); devl_params_unregister(devlink, mlx5_devlink_params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h index 961f75da6227..74bcdfa70361 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h @@ -22,6 +22,7 @@ enum mlx5_devlink_param_id { MLX5_DEVLINK_PARAM_ID_ESW_MULTIPORT, MLX5_DEVLINK_PARAM_ID_HAIRPIN_NUM_QUEUES, MLX5_DEVLINK_PARAM_ID_HAIRPIN_QUEUE_SIZE, + MLX5_DEVLINK_PARAM_ID_CQE_COMPRESSION_TYPE }; struct mlx5_trap_ctx { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c new file mode 100644 index 000000000000..20a39483be04 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include "nv_param.h" +#include "mlx5_core.h" + +enum { + MLX5_CLASS_0_CTRL_ID_NV_SW_OFFLOAD_CONFIG = 0x10a, +}; + +struct mlx5_ifc_configuration_item_type_class_global_bits { + u8 type_class[0x8]; + u8 parameter_index[0x18]; +}; + +union mlx5_ifc_config_item_type_auto_bits { + struct mlx5_ifc_configuration_item_type_class_global_bits + configuration_item_type_class_global; + u8 reserved_at_0[0x20]; +}; + +struct mlx5_ifc_config_item_bits { + u8 valid[0x2]; + u8 priority[0x2]; + u8 header_type[0x2]; + u8 ovr_en[0x1]; + u8 rd_en[0x1]; + u8 access_mode[0x2]; + u8 reserved_at_a[0x1]; + u8 writer_id[0x5]; + u8 version[0x4]; + u8 reserved_at_14[0x2]; + u8 host_id_valid[0x1]; + u8 length[0x9]; + + union mlx5_ifc_config_item_type_auto_bits type; + + u8 reserved_at_40[0x10]; + u8 crc16[0x10]; +}; + +struct mlx5_ifc_mnvda_reg_bits { + struct mlx5_ifc_config_item_bits configuration_item_header; + + u8 configuration_item_data[64][0x20]; +}; + +struct mlx5_ifc_nv_sw_offload_conf_bits { + u8 ip_over_vxlan_port[0x10]; + u8 tunnel_ecn_copy_offload_disable[0x1]; + u8 pci_atomic_mode[0x3]; + u8 sr_enable[0x1]; + u8 ptp_cyc2realtime[0x1]; + u8 vector_calc_disable[0x1]; + u8 uctx_en[0x1]; + u8 prio_tag_required_en[0x1]; + u8 esw_fdb_ipv4_ttl_modify_enable[0x1]; + u8 mkey_by_name[0x1]; + u8 ip_over_vxlan_en[0x1]; + u8 one_qp_per_recovery[0x1]; + u8 cqe_compression[0x3]; + u8 tunnel_udp_entropy_proto_disable[0x1]; + u8 reserved_at_21[0x1]; + u8 ar_enable[0x1]; + u8 log_max_outstanding_wqe[0x5]; + u8 vf_migration[0x2]; + u8 log_tx_psn_win[0x6]; + u8 lro_log_timeout3[0x4]; + u8 lro_log_timeout2[0x4]; + u8 lro_log_timeout1[0x4]; + u8 lro_log_timeout0[0x4]; +}; + +#define MNVDA_HDR_SZ \ + (MLX5_ST_SZ_BYTES(mnvda_reg) - \ + MLX5_BYTE_OFF(mnvda_reg, configuration_item_data)) + +#define MLX5_SET_CFG_ITEM_TYPE(_cls_name, _mnvda_ptr, _field, _val) \ + MLX5_SET(mnvda_reg, _mnvda_ptr, \ + configuration_item_header.type.configuration_item_type_class_##_cls_name._field, \ + _val) + +#define MLX5_SET_CFG_HDR_LEN(_mnvda_ptr, _cls_name) \ + MLX5_SET(mnvda_reg, _mnvda_ptr, configuration_item_header.length, \ + MLX5_ST_SZ_BYTES(_cls_name)) + +#define MLX5_GET_CFG_HDR_LEN(_mnvda_ptr) \ + MLX5_GET(mnvda_reg, _mnvda_ptr, configuration_item_header.length) + +static int mlx5_nv_param_read(struct mlx5_core_dev *dev, void *mnvda, + size_t len) +{ + u32 param_idx, type_class; + u32 header_len; + void *cls_ptr; + int err; + + if (WARN_ON(len > MLX5_ST_SZ_BYTES(mnvda_reg)) || len < MNVDA_HDR_SZ) + return -EINVAL; /* A caller bug */ + + err = mlx5_core_access_reg(dev, mnvda, len, mnvda, len, MLX5_REG_MNVDA, + 0, 0); + if (!err) + return 0; + + cls_ptr = MLX5_ADDR_OF(mnvda_reg, mnvda, + configuration_item_header.type.configuration_item_type_class_global); + + type_class = MLX5_GET(configuration_item_type_class_global, cls_ptr, + type_class); + param_idx = MLX5_GET(configuration_item_type_class_global, cls_ptr, + parameter_index); + header_len = MLX5_GET_CFG_HDR_LEN(mnvda); + + mlx5_core_warn(dev, "Failed to read mnvda reg: type_class 0x%x, param_idx 0x%x, header_len %u, err %d\n", + type_class, param_idx, header_len, err); + + return -EOPNOTSUPP; +} + +static int mlx5_nv_param_write(struct mlx5_core_dev *dev, void *mnvda, + size_t len) +{ + if (WARN_ON(len > MLX5_ST_SZ_BYTES(mnvda_reg)) || len < MNVDA_HDR_SZ) + return -EINVAL; + + if (WARN_ON(MLX5_GET_CFG_HDR_LEN(mnvda) == 0)) + return -EINVAL; + + return mlx5_core_access_reg(dev, mnvda, len, mnvda, len, MLX5_REG_MNVDA, + 0, 1); +} + +static int +mlx5_nv_param_read_sw_offload_conf(struct mlx5_core_dev *dev, void *mnvda, + size_t len) +{ + MLX5_SET_CFG_ITEM_TYPE(global, mnvda, type_class, 0); + MLX5_SET_CFG_ITEM_TYPE(global, mnvda, parameter_index, + MLX5_CLASS_0_CTRL_ID_NV_SW_OFFLOAD_CONFIG); + MLX5_SET_CFG_HDR_LEN(mnvda, nv_sw_offload_conf); + + return mlx5_nv_param_read(dev, mnvda, len); +} + +static const char *const + cqe_compress_str[] = { "balanced", "aggressive" }; + +static int +mlx5_nv_param_devlink_cqe_compress_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {}; + u8 value = U8_MAX; + void *data; + int err; + + err = mlx5_nv_param_read_sw_offload_conf(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + value = MLX5_GET(nv_sw_offload_conf, data, cqe_compression); + + if (value >= ARRAY_SIZE(cqe_compress_str)) + return -EOPNOTSUPP; + + strscpy(ctx->val.vstr, cqe_compress_str[value], sizeof(ctx->val.vstr)); + return 0; +} + +static int +mlx5_nv_param_devlink_cqe_compress_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cqe_compress_str); i++) { + if (!strcmp(val.vstr, cqe_compress_str[i])) + return 0; + } + + NL_SET_ERR_MSG_MOD(extack, + "Invalid value, supported values are balanced/aggressive"); + return -EOPNOTSUPP; +} + +static int +mlx5_nv_param_devlink_cqe_compress_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {}; + int err = 0; + void *data; + u8 value; + + if (!strcmp(ctx->val.vstr, "aggressive")) + value = 1; + else /* balanced: can't be anything else already validated above */ + value = 0; + + err = mlx5_nv_param_read_sw_offload_conf(dev, mnvda, sizeof(mnvda)); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to read sw_offload_conf mnvda reg"); + return err; + } + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + MLX5_SET(nv_sw_offload_conf, data, cqe_compression, value); + + return mlx5_nv_param_write(dev, mnvda, sizeof(mnvda)); +} + +static const struct devlink_param mlx5_nv_param_devlink_params[] = { + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_CQE_COMPRESSION_TYPE, + "cqe_compress_type", DEVLINK_PARAM_TYPE_STRING, + BIT(DEVLINK_PARAM_CMODE_PERMANENT), + mlx5_nv_param_devlink_cqe_compress_get, + mlx5_nv_param_devlink_cqe_compress_set, + mlx5_nv_param_devlink_cqe_compress_validate), +}; + +int mlx5_nv_param_register_dl_params(struct devlink *devlink) +{ + if (!mlx5_core_is_pf(devlink_priv(devlink))) + return 0; + + return devl_params_register(devlink, mlx5_nv_param_devlink_params, + ARRAY_SIZE(mlx5_nv_param_devlink_params)); +} + +void mlx5_nv_param_unregister_dl_params(struct devlink *devlink) +{ + if (!mlx5_core_is_pf(devlink_priv(devlink))) + return; + + devl_params_unregister(devlink, mlx5_nv_param_devlink_params, + ARRAY_SIZE(mlx5_nv_param_devlink_params)); +} + diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h new file mode 100644 index 000000000000..9f4922ff7745 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_NV_PARAM_H +#define __MLX5_NV_PARAM_H + +#include +#include "devlink.h" + +int mlx5_nv_param_register_dl_params(struct devlink *devlink); +void mlx5_nv_param_unregister_dl_params(struct devlink *devlink); + +#endif + diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c0858af0e854..fcfc18bfeba9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -137,6 +137,7 @@ enum { MLX5_REG_MTCAP = 0x9009, MLX5_REG_MTMP = 0x900A, MLX5_REG_MCIA = 0x9014, + MLX5_REG_MNVDA = 0x9024, MLX5_REG_MFRL = 0x9028, MLX5_REG_MLCR = 0x902b, MLX5_REG_MRTC = 0x902d, -- cgit v1.2.3 From e096a7cc0be126d9376e549a10d71cf16b1a1c1c Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 5 Sep 2025 11:07:09 +0800 Subject: ptp: add debugfs interfaces to loop back the periodic output signal For some PTP devices, they have the capability to loop back the periodic output signal for debugging, such as the ptp_qoriq device. So add the generic interfaces to set the periodic output signal loopback, rather than each vendor having a different implementation. Show how many channels support the periodic output signal loopback: $ cat /sys/kernel/debug/ptp/n_perout_loopback Enable the loopback of the periodic output signal of channel X: $ echo 1 > /sys/kernel/debug/ptp/perout_loopback Disable the loopback of the periodic output signal of channel X: $ echo 0 > /sys/kernel/debug/ptp/perout_loopback Suggested-by: Andrew Lunn Signed-off-by: Wei Fang Link: https://patch.msgid.link/20250905030711.1509648-2-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_clock.c | 69 ++++++++++++++++++++++++++++++++++++++++ include/linux/ptp_clock_kernel.h | 10 ++++++ 2 files changed, 79 insertions(+) (limited to 'include') diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 5739a57958c7..1d920f8e20a8 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -248,6 +248,69 @@ static void ptp_aux_kworker(struct kthread_work *work) kthread_queue_delayed_work(ptp->kworker, &ptp->aux_work, delay); } +static ssize_t ptp_n_perout_loopback_read(struct file *filep, + char __user *buffer, + size_t count, loff_t *pos) +{ + struct ptp_clock *ptp = filep->private_data; + char buf[12] = {}; + + snprintf(buf, sizeof(buf), "%d\n", ptp->info->n_per_lp); + + return simple_read_from_buffer(buffer, count, pos, buf, strlen(buf)); +} + +static const struct file_operations ptp_n_perout_loopback_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = ptp_n_perout_loopback_read, +}; + +static ssize_t ptp_perout_loopback_write(struct file *filep, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct ptp_clock *ptp = filep->private_data; + struct ptp_clock_info *ops = ptp->info; + unsigned int index, enable; + int len, cnt, err; + char buf[32] = {}; + + if (*ppos || !count) + return -EINVAL; + + if (count >= sizeof(buf)) + return -ENOSPC; + + len = simple_write_to_buffer(buf, sizeof(buf) - 1, + ppos, buffer, count); + if (len < 0) + return len; + + buf[len] = '\0'; + cnt = sscanf(buf, "%u %u", &index, &enable); + if (cnt != 2) + return -EINVAL; + + if (index >= ops->n_per_lp) + return -EINVAL; + + if (enable != 0 && enable != 1) + return -EINVAL; + + err = ops->perout_loopback(ops, index, enable); + if (err) + return err; + + return count; +} + +static const struct file_operations ptp_perout_loopback_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = ptp_perout_loopback_write, +}; + /* public interface */ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, @@ -389,6 +452,12 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, /* Debugfs initialization */ snprintf(debugfsname, sizeof(debugfsname), "ptp%d", ptp->index); ptp->debugfs_root = debugfs_create_dir(debugfsname, NULL); + if (info->n_per_lp > 0 && info->perout_loopback) { + debugfs_create_file("n_perout_loopback", 0400, ptp->debugfs_root, + ptp, &ptp_n_perout_loopback_fops); + debugfs_create_file("perout_loopback", 0200, ptp->debugfs_root, + ptp, &ptp_perout_loopback_ops); + } return ptp; diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 7dd7951b23d5..884364596dd3 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -67,6 +67,8 @@ struct ptp_system_timestamp { * @n_ext_ts: The number of external time stamp channels. * @n_per_out: The number of programmable periodic signals. * @n_pins: The number of programmable pins. + * @n_per_lp: The number of channels that support loopback the periodic + * output signal. * @pps: Indicates whether the clock supports a PPS callback. * * @supported_perout_flags: The set of flags the driver supports for the @@ -175,6 +177,11 @@ struct ptp_system_timestamp { * scheduling time (>=0) or negative value in case further * scheduling is not required. * + * @perout_loopback: Request driver to enable or disable the periodic output + * signal loopback. + * parameter index: index of the periodic output signal channel. + * parameter on: caller passes one to enable or zero to disable. + * * Drivers should embed their ptp_clock_info within a private * structure, obtaining a reference to it using container_of(). * @@ -189,6 +196,7 @@ struct ptp_clock_info { int n_ext_ts; int n_per_out; int n_pins; + int n_per_lp; int pps; unsigned int supported_perout_flags; unsigned int supported_extts_flags; @@ -213,6 +221,8 @@ struct ptp_clock_info { int (*verify)(struct ptp_clock_info *ptp, unsigned int pin, enum ptp_pin_function func, unsigned int chan); long (*do_aux_work)(struct ptp_clock_info *ptp); + int (*perout_loopback)(struct ptp_clock_info *ptp, unsigned int index, + int on); }; struct ptp_clock; -- cgit v1.2.3 From f3164840a136b123c8348ca5af4d83d99aa86eb7 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 5 Sep 2025 11:07:11 +0800 Subject: ptp: qoriq: convert to use generic interfaces to set loopback mode Since the generic debugfs interfaces for setting the periodic pulse signal loopback have been added to the ptp_clock driver, so convert the vendor-defined debugfs interfaces to the generic interfaces. Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Link: https://patch.msgid.link/20250905030711.1509648-4-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - drivers/ptp/Kconfig | 2 +- drivers/ptp/Makefile | 4 +- drivers/ptp/ptp_qoriq.c | 24 +++++++++- drivers/ptp/ptp_qoriq_debugfs.c | 101 ---------------------------------------- include/linux/fsl/ptp_qoriq.h | 10 ---- 6 files changed, 24 insertions(+), 118 deletions(-) delete mode 100644 drivers/ptp/ptp_qoriq_debugfs.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index b81595e9ea95..e3907f0c1243 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9817,7 +9817,6 @@ F: drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp* F: drivers/net/ethernet/freescale/dpaa2/dprtc* F: drivers/net/ethernet/freescale/enetc/enetc_ptp.c F: drivers/ptp/ptp_qoriq.c -F: drivers/ptp/ptp_qoriq_debugfs.c F: include/linux/fsl/ptp_qoriq.h FREESCALE QUAD SPI DRIVER diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 9256bf2e8ad4..5f8ea34d11d6 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -67,7 +67,7 @@ config PTP_1588_CLOCK_QORIQ packets using the SO_TIMESTAMPING API. To compile this driver as a module, choose M here: the module - will be called ptp-qoriq. + will be called ptp_qoriq. comment "Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks." depends on PHYLIB=n || NETWORK_PHY_TIMESTAMPING=n diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 8985d723d29c..bdc47e284f14 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -12,9 +12,7 @@ obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o obj-$(CONFIG_PTP_1588_CLOCK_KVM) += ptp_kvm.o obj-$(CONFIG_PTP_1588_CLOCK_VMCLOCK) += ptp_vmclock.o -obj-$(CONFIG_PTP_1588_CLOCK_QORIQ) += ptp-qoriq.o -ptp-qoriq-y += ptp_qoriq.o -ptp-qoriq-$(CONFIG_DEBUG_FS) += ptp_qoriq_debugfs.o +obj-$(CONFIG_PTP_1588_CLOCK_QORIQ) += ptp_qoriq.o obj-$(CONFIG_PTP_1588_CLOCK_IDTCM) += ptp_clockmatrix.o obj-$(CONFIG_PTP_1588_CLOCK_FC3W) += ptp_fc3.o obj-$(CONFIG_PTP_1588_CLOCK_IDT82P33) += ptp_idt82p33.o diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index 4d488c1f1941..8da995e36aeb 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -465,6 +465,25 @@ static int ptp_qoriq_auto_config(struct ptp_qoriq *ptp_qoriq, return 0; } +static int ptp_qoriq_perout_loopback(struct ptp_clock_info *ptp, + unsigned int index, int on) +{ + struct ptp_qoriq *ptp_qoriq = container_of(ptp, struct ptp_qoriq, caps); + struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; + u32 loopback_bit = index ? PP2L : PP1L; + u32 tmr_ctrl; + + tmr_ctrl = ptp_qoriq->read(®s->ctrl_regs->tmr_ctrl); + if (on) + tmr_ctrl |= loopback_bit; + else + tmr_ctrl &= ~loopback_bit; + + ptp_qoriq->write(®s->ctrl_regs->tmr_ctrl, tmr_ctrl); + + return 0; +} + int ptp_qoriq_init(struct ptp_qoriq *ptp_qoriq, void __iomem *base, const struct ptp_clock_info *caps) { @@ -479,6 +498,8 @@ int ptp_qoriq_init(struct ptp_qoriq *ptp_qoriq, void __iomem *base, ptp_qoriq->base = base; ptp_qoriq->caps = *caps; + ptp_qoriq->caps.n_per_lp = 2; + ptp_qoriq->caps.perout_loopback = ptp_qoriq_perout_loopback; if (of_property_read_u32(node, "fsl,cksel", &ptp_qoriq->cksel)) ptp_qoriq->cksel = DEFAULT_CKSEL; @@ -568,7 +589,7 @@ int ptp_qoriq_init(struct ptp_qoriq *ptp_qoriq, void __iomem *base, return PTR_ERR(ptp_qoriq->clock); ptp_qoriq->phc_index = ptp_clock_index(ptp_qoriq->clock); - ptp_qoriq_create_debugfs(ptp_qoriq); + return 0; } EXPORT_SYMBOL_GPL(ptp_qoriq_init); @@ -580,7 +601,6 @@ void ptp_qoriq_free(struct ptp_qoriq *ptp_qoriq) ptp_qoriq->write(®s->ctrl_regs->tmr_temask, 0); ptp_qoriq->write(®s->ctrl_regs->tmr_ctrl, 0); - ptp_qoriq_remove_debugfs(ptp_qoriq); ptp_clock_unregister(ptp_qoriq->clock); iounmap(ptp_qoriq->base); free_irq(ptp_qoriq->irq, ptp_qoriq); diff --git a/drivers/ptp/ptp_qoriq_debugfs.c b/drivers/ptp/ptp_qoriq_debugfs.c deleted file mode 100644 index e8dddcedf288..000000000000 --- a/drivers/ptp/ptp_qoriq_debugfs.c +++ /dev/null @@ -1,101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* Copyright 2019 NXP - */ -#include -#include -#include - -static int ptp_qoriq_fiper1_lpbk_get(void *data, u64 *val) -{ - struct ptp_qoriq *ptp_qoriq = data; - struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; - u32 ctrl; - - ctrl = ptp_qoriq->read(®s->ctrl_regs->tmr_ctrl); - *val = ctrl & PP1L ? 1 : 0; - - return 0; -} - -static int ptp_qoriq_fiper1_lpbk_set(void *data, u64 val) -{ - struct ptp_qoriq *ptp_qoriq = data; - struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; - u32 ctrl; - - ctrl = ptp_qoriq->read(®s->ctrl_regs->tmr_ctrl); - if (val == 0) - ctrl &= ~PP1L; - else - ctrl |= PP1L; - - ptp_qoriq->write(®s->ctrl_regs->tmr_ctrl, ctrl); - return 0; -} - -DEFINE_DEBUGFS_ATTRIBUTE(ptp_qoriq_fiper1_fops, ptp_qoriq_fiper1_lpbk_get, - ptp_qoriq_fiper1_lpbk_set, "%llu\n"); - -static int ptp_qoriq_fiper2_lpbk_get(void *data, u64 *val) -{ - struct ptp_qoriq *ptp_qoriq = data; - struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; - u32 ctrl; - - ctrl = ptp_qoriq->read(®s->ctrl_regs->tmr_ctrl); - *val = ctrl & PP2L ? 1 : 0; - - return 0; -} - -static int ptp_qoriq_fiper2_lpbk_set(void *data, u64 val) -{ - struct ptp_qoriq *ptp_qoriq = data; - struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; - u32 ctrl; - - ctrl = ptp_qoriq->read(®s->ctrl_regs->tmr_ctrl); - if (val == 0) - ctrl &= ~PP2L; - else - ctrl |= PP2L; - - ptp_qoriq->write(®s->ctrl_regs->tmr_ctrl, ctrl); - return 0; -} - -DEFINE_DEBUGFS_ATTRIBUTE(ptp_qoriq_fiper2_fops, ptp_qoriq_fiper2_lpbk_get, - ptp_qoriq_fiper2_lpbk_set, "%llu\n"); - -void ptp_qoriq_create_debugfs(struct ptp_qoriq *ptp_qoriq) -{ - struct dentry *root; - - root = debugfs_create_dir(dev_name(ptp_qoriq->dev), NULL); - if (IS_ERR(root)) - return; - if (!root) - goto err_root; - - ptp_qoriq->debugfs_root = root; - - if (!debugfs_create_file_unsafe("fiper1-loopback", 0600, root, - ptp_qoriq, &ptp_qoriq_fiper1_fops)) - goto err_node; - if (!debugfs_create_file_unsafe("fiper2-loopback", 0600, root, - ptp_qoriq, &ptp_qoriq_fiper2_fops)) - goto err_node; - return; - -err_node: - debugfs_remove_recursive(root); - ptp_qoriq->debugfs_root = NULL; -err_root: - dev_err(ptp_qoriq->dev, "failed to initialize debugfs\n"); -} - -void ptp_qoriq_remove_debugfs(struct ptp_qoriq *ptp_qoriq) -{ - debugfs_remove_recursive(ptp_qoriq->debugfs_root); - ptp_qoriq->debugfs_root = NULL; -} diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index b301bf7199d3..3601e25779ba 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -145,7 +145,6 @@ struct ptp_qoriq { struct ptp_clock *clock; struct ptp_clock_info caps; struct resource *rsrc; - struct dentry *debugfs_root; struct device *dev; bool extts_fifo_support; bool fiper3_support; @@ -195,14 +194,5 @@ int ptp_qoriq_settime(struct ptp_clock_info *ptp, int ptp_qoriq_enable(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on); int extts_clean_up(struct ptp_qoriq *ptp_qoriq, int index, bool update_event); -#ifdef CONFIG_DEBUG_FS -void ptp_qoriq_create_debugfs(struct ptp_qoriq *ptp_qoriq); -void ptp_qoriq_remove_debugfs(struct ptp_qoriq *ptp_qoriq); -#else -static inline void ptp_qoriq_create_debugfs(struct ptp_qoriq *ptp_qoriq) -{ } -static inline void ptp_qoriq_remove_debugfs(struct ptp_qoriq *ptp_qoriq) -{ } -#endif #endif -- cgit v1.2.3 From faac32d4ece30609f1a0930ca0ae951cf6dc1786 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Wed, 3 Sep 2025 10:44:37 +0800 Subject: scsi: ufs: host: mediatek: Enhance recovery on hibernation exit failure Improve the recovery process for hibernation exit failures. Trigger the error handler and break the suspend operation to ensure effective recovery from hibernation errors. Activate the error handling mechanism by ufshcd_force_error_recovery and scheduling the error handler work. Signed-off-by: Peter Wang Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 3 ++- drivers/ufs/host/ufs-mediatek.c | 14 +++++++++++--- include/ufs/ufshcd.h | 1 + 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 96ad57c3144b..c5e427520b1b 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -6446,13 +6446,14 @@ void ufshcd_schedule_eh_work(struct ufs_hba *hba) } } -static void ufshcd_force_error_recovery(struct ufs_hba *hba) +void ufshcd_force_error_recovery(struct ufs_hba *hba) { spin_lock_irq(hba->host->host_lock); hba->force_reset = true; ufshcd_schedule_eh_work(hba); spin_unlock_irq(hba->host->host_lock); } +EXPORT_SYMBOL_GPL(ufshcd_force_error_recovery); static void ufshcd_clk_scaling_allow(struct ufs_hba *hba, bool allow) { diff --git a/drivers/ufs/host/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c index 61c8fe135100..e62f02b7ca76 100644 --- a/drivers/ufs/host/ufs-mediatek.c +++ b/drivers/ufs/host/ufs-mediatek.c @@ -1686,7 +1686,7 @@ static void ufs_mtk_dev_vreg_set_lpm(struct ufs_hba *hba, bool lpm) } } -static void ufs_mtk_auto_hibern8_disable(struct ufs_hba *hba) +static int ufs_mtk_auto_hibern8_disable(struct ufs_hba *hba) { int ret; @@ -1697,8 +1697,16 @@ static void ufs_mtk_auto_hibern8_disable(struct ufs_hba *hba) ufs_mtk_wait_idle_state(hba, 5); ret = ufs_mtk_wait_link_state(hba, VS_LINK_UP, 100); - if (ret) + if (ret) { dev_warn(hba->dev, "exit h8 state fail, ret=%d\n", ret); + + ufshcd_force_error_recovery(hba); + + /* trigger error handler and break suspend */ + ret = -EBUSY; + } + + return ret; } static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, @@ -1709,7 +1717,7 @@ static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, if (status == PRE_CHANGE) { if (ufshcd_is_auto_hibern8_supported(hba)) - ufs_mtk_auto_hibern8_disable(hba); + return ufs_mtk_auto_hibern8_disable(hba); return 0; } diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 1d3943777584..219935b3a76f 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1508,5 +1508,6 @@ int __ufshcd_write_ee_control(struct ufs_hba *hba, u32 ee_ctrl_mask); int ufshcd_write_ee_control(struct ufs_hba *hba); int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, const u16 *other_mask, u16 set, u16 clr); +void ufshcd_force_error_recovery(struct ufs_hba *hba); #endif /* End of Header */ -- cgit v1.2.3 From bc5dbf7739594b05c673ab3905471257be9921e7 Mon Sep 17 00:00:00 2001 From: Nitin Rawat Date: Wed, 3 Sep 2025 13:18:15 +0530 Subject: scsi: ufs: ufs-qcom: Refactor MCQ register dump logic Refactor MCQ register dump to align with the new resource mapping. As part of refactor, below changes are done: - Update ufs_qcom_dump_regs() function signature to accept direct base address instead of resource ID enum - Modify ufs_qcom_dump_mcq_hci_regs() to use hba->mcq_base and calculated addresses from MCQ operation info - Replace enum ufshcd_res with direct memory-mapped I/O addresses Additionally remove the ufshcd_res_info structure and associated enum ufshcd_res definitions from the UFS host controller header. These were previously used for MCQ resource mapping but are no longer needed following recent refactoring to use direct base addresses instead of multiple separate resource regions. Signed-off-by: Nitin Rawat Reviewed-by: Bart Van Assche Reviewed-by: Manivannan Sadhasivam Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-qcom.c | 34 +++++++++++++++++++--------------- include/ufs/ufshcd.h | 25 ------------------------- 2 files changed, 19 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 382d9e0f25a5..fee0bb6fd23c 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -1742,7 +1742,7 @@ static void ufs_qcom_dump_testbus(struct ufs_hba *hba) } static int ufs_qcom_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, - const char *prefix, enum ufshcd_res id) + const char *prefix, void __iomem *base) { u32 *regs __free(kfree) = NULL; size_t pos; @@ -1755,7 +1755,7 @@ static int ufs_qcom_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, return -ENOMEM; for (pos = 0; pos < len; pos += 4) - regs[pos / 4] = readl(hba->res[id].base + offset + pos); + regs[pos / 4] = readl(base + offset + pos); print_hex_dump(KERN_ERR, prefix, len > 4 ? DUMP_PREFIX_OFFSET : DUMP_PREFIX_NONE, @@ -1766,30 +1766,34 @@ static int ufs_qcom_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, static void ufs_qcom_dump_mcq_hci_regs(struct ufs_hba *hba) { + struct ufshcd_mcq_opr_info_t *opr = &hba->mcq_opr[0]; + void __iomem *mcq_vs_base = hba->mcq_base + UFS_MEM_VS_BASE; + struct dump_info { + void __iomem *base; size_t offset; size_t len; const char *prefix; - enum ufshcd_res id; }; struct dump_info mcq_dumps[] = { - {0x0, 256 * 4, "MCQ HCI-0 ", RES_MCQ}, - {0x400, 256 * 4, "MCQ HCI-1 ", RES_MCQ}, - {0x0, 5 * 4, "MCQ VS-0 ", RES_MCQ_VS}, - {0x0, 256 * 4, "MCQ SQD-0 ", RES_MCQ_SQD}, - {0x400, 256 * 4, "MCQ SQD-1 ", RES_MCQ_SQD}, - {0x800, 256 * 4, "MCQ SQD-2 ", RES_MCQ_SQD}, - {0xc00, 256 * 4, "MCQ SQD-3 ", RES_MCQ_SQD}, - {0x1000, 256 * 4, "MCQ SQD-4 ", RES_MCQ_SQD}, - {0x1400, 256 * 4, "MCQ SQD-5 ", RES_MCQ_SQD}, - {0x1800, 256 * 4, "MCQ SQD-6 ", RES_MCQ_SQD}, - {0x1c00, 256 * 4, "MCQ SQD-7 ", RES_MCQ_SQD}, + {hba->mcq_base, 0x0, 256 * 4, "MCQ HCI-0 "}, + {hba->mcq_base, 0x400, 256 * 4, "MCQ HCI-1 "}, + {mcq_vs_base, 0x0, 5 * 4, "MCQ VS-0 "}, + {opr->base, 0x0, 256 * 4, "MCQ SQD-0 "}, + {opr->base, 0x400, 256 * 4, "MCQ SQD-1 "}, + {opr->base, 0x800, 256 * 4, "MCQ SQD-2 "}, + {opr->base, 0xc00, 256 * 4, "MCQ SQD-3 "}, + {opr->base, 0x1000, 256 * 4, "MCQ SQD-4 "}, + {opr->base, 0x1400, 256 * 4, "MCQ SQD-5 "}, + {opr->base, 0x1800, 256 * 4, "MCQ SQD-6 "}, + {opr->base, 0x1c00, 256 * 4, "MCQ SQD-7 "}, + }; for (int i = 0; i < ARRAY_SIZE(mcq_dumps); i++) { ufs_qcom_dump_regs(hba, mcq_dumps[i].offset, mcq_dumps[i].len, - mcq_dumps[i].prefix, mcq_dumps[i].id); + mcq_dumps[i].prefix, mcq_dumps[i].base); cond_resched(); } } diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 1d3943777584..a7bcf7c7a1af 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -794,30 +794,6 @@ struct ufs_hba_monitor { bool enabled; }; -/** - * struct ufshcd_res_info_t - MCQ related resource regions - * - * @name: resource name - * @resource: pointer to resource region - * @base: register base address - */ -struct ufshcd_res_info { - const char *name; - struct resource *resource; - void __iomem *base; -}; - -enum ufshcd_res { - RES_UFS, - RES_MCQ, - RES_MCQ_SQD, - RES_MCQ_SQIS, - RES_MCQ_CQD, - RES_MCQ_CQIS, - RES_MCQ_VS, - RES_MAX, -}; - /** * struct ufshcd_mcq_opr_info_t - Operation and Runtime registers * @@ -1127,7 +1103,6 @@ struct ufs_hba { bool lsdb_sup; bool mcq_enabled; bool mcq_esi_enabled; - struct ufshcd_res_info res[RES_MAX]; void __iomem *mcq_base; struct ufs_hw_queue *uhq; struct ufs_hw_queue *dev_cmd_queue; -- cgit v1.2.3 From 5f5598d945e2a69f764aa5c2074dad73e23bcfcb Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Sat, 6 Sep 2025 15:16:53 -0500 Subject: dt-bindings: memory: tegra210: Add memory client IDs Each memory client has unique hardware ID, add these IDs. Signed-off-by: Aaron Kling Signed-off-by: Krzysztof Kozlowski --- include/dt-bindings/memory/tegra210-mc.h | 74 ++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/memory/tegra210-mc.h b/include/dt-bindings/memory/tegra210-mc.h index 5e082547f179..881bf78aa8b2 100644 --- a/include/dt-bindings/memory/tegra210-mc.h +++ b/include/dt-bindings/memory/tegra210-mc.h @@ -75,4 +75,78 @@ #define TEGRA210_MC_RESET_ETR 28 #define TEGRA210_MC_RESET_TSECB 29 +#define TEGRA210_MC_PTCR 0 +#define TEGRA210_MC_DISPLAY0A 1 +#define TEGRA210_MC_DISPLAY0AB 2 +#define TEGRA210_MC_DISPLAY0B 3 +#define TEGRA210_MC_DISPLAY0BB 4 +#define TEGRA210_MC_DISPLAY0C 5 +#define TEGRA210_MC_DISPLAY0CB 6 +#define TEGRA210_MC_AFIR 14 +#define TEGRA210_MC_AVPCARM7R 15 +#define TEGRA210_MC_DISPLAYHC 16 +#define TEGRA210_MC_DISPLAYHCB 17 +#define TEGRA210_MC_HDAR 21 +#define TEGRA210_MC_HOST1XDMAR 22 +#define TEGRA210_MC_HOST1XR 23 +#define TEGRA210_MC_NVENCSRD 28 +#define TEGRA210_MC_PPCSAHBDMAR 29 +#define TEGRA210_MC_PPCSAHBSLVR 30 +#define TEGRA210_MC_SATAR 31 +#define TEGRA210_MC_MPCORER 39 +#define TEGRA210_MC_NVENCSWR 43 +#define TEGRA210_MC_AFIW 49 +#define TEGRA210_MC_AVPCARM7W 50 +#define TEGRA210_MC_HDAW 53 +#define TEGRA210_MC_HOST1XW 54 +#define TEGRA210_MC_MPCOREW 57 +#define TEGRA210_MC_PPCSAHBDMAW 59 +#define TEGRA210_MC_PPCSAHBSLVW 60 +#define TEGRA210_MC_SATAW 61 +#define TEGRA210_MC_ISPRA 68 +#define TEGRA210_MC_ISPWA 70 +#define TEGRA210_MC_ISPWB 71 +#define TEGRA210_MC_XUSB_HOSTR 74 +#define TEGRA210_MC_XUSB_HOSTW 75 +#define TEGRA210_MC_XUSB_DEVR 76 +#define TEGRA210_MC_XUSB_DEVW 77 +#define TEGRA210_MC_ISPRAB 78 +#define TEGRA210_MC_ISPWAB 80 +#define TEGRA210_MC_ISPWBB 81 +#define TEGRA210_MC_TSECSRD 84 +#define TEGRA210_MC_TSECSWR 85 +#define TEGRA210_MC_A9AVPSCR 86 +#define TEGRA210_MC_A9AVPSCW 87 +#define TEGRA210_MC_GPUSRD 88 +#define TEGRA210_MC_GPUSWR 89 +#define TEGRA210_MC_DISPLAYT 90 +#define TEGRA210_MC_SDMMCRA 96 +#define TEGRA210_MC_SDMMCRAA 97 +#define TEGRA210_MC_SDMMCR 98 +#define TEGRA210_MC_SDMMCRAB 99 +#define TEGRA210_MC_SDMMCWA 100 +#define TEGRA210_MC_SDMMCWAA 101 +#define TEGRA210_MC_SDMMCW 102 +#define TEGRA210_MC_SDMMCWAB 103 +#define TEGRA210_MC_VICSRD 108 +#define TEGRA210_MC_VICSWR 109 +#define TEGRA210_MC_VIW 114 +#define TEGRA210_MC_DISPLAYD 115 +#define TEGRA210_MC_NVDECSRD 120 +#define TEGRA210_MC_NVDECSWR 121 +#define TEGRA210_MC_APER 122 +#define TEGRA210_MC_APEW 123 +#define TEGRA210_MC_NVJPGRD 126 +#define TEGRA210_MC_NVJPGWR 127 +#define TEGRA210_MC_SESRD 128 +#define TEGRA210_MC_SESWR 129 +#define TEGRA210_MC_AXIAPR 130 +#define TEGRA210_MC_AXIAPW 131 +#define TEGRA210_MC_ETRR 132 +#define TEGRA210_MC_ETRW 133 +#define TEGRA210_MC_TSECSRDB 134 +#define TEGRA210_MC_TSECSWRB 135 +#define TEGRA210_MC_GPUSRD2 136 +#define TEGRA210_MC_GPUSWR2 137 + #endif -- cgit v1.2.3 From 7d6ca84aa985fc940f5544ed7feedb1b4a82b96b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 5 Sep 2025 03:05:26 -0700 Subject: KVM: arm64: vgic: Drop stale comment on IRQ active state While LPIs lack an active state, KVM unconditionally folds the active state from the LR into the vgic_irq struct meaning this field cannot be 'creatively' reused for something else. Drop the misleading comment to reflect this. Link: https://lore.kernel.org/r/20250905100531.282980-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- include/kvm/arm_vgic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 404883c7af6e..9f8a116925ca 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -139,7 +139,7 @@ struct vgic_irq { bool pending_latch; /* The pending latch state used to calculate * the pending state for both level * and edge triggered IRQs. */ - bool active; /* not used for LPIs */ + bool active; bool enabled; bool hw; /* Tied to HW IRQ */ struct kref refcount; /* Used for LPIs */ -- cgit v1.2.3 From 3a08a6ca7c373198c84e2a8c025c395ee966ff8a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 5 Sep 2025 03:05:27 -0700 Subject: KVM: arm64: vgic-v3: Use bare refcount for VGIC LPIs KVM's use of krefs to manage LPIs isn't adding much, especially considering vgic_irq_release() is a noop due to the lack of sufficient context. Switch to using a regular refcount in anticipation of adding a meaningful release concept for LPIs. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250905100531.282980-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-debug.c | 2 +- arch/arm64/kvm/vgic/vgic-init.c | 4 ++-- arch/arm64/kvm/vgic/vgic-its.c | 8 ++++---- arch/arm64/kvm/vgic/vgic-v4.c | 2 +- arch/arm64/kvm/vgic/vgic.c | 17 ++++------------- arch/arm64/kvm/vgic/vgic.h | 8 ++++---- include/kvm/arm_vgic.h | 4 ++-- 7 files changed, 18 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index 2684f273d9e1..4c1209261b65 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -69,7 +69,7 @@ static int iter_mark_lpis(struct kvm *kvm) int nr_lpis = 0; xa_for_each(&dist->lpi_xa, intid, irq) { - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) continue; xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 1e680ad6e863..4c777136ea5f 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -208,7 +208,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) raw_spin_lock_init(&irq->irq_lock); irq->vcpu = NULL; irq->target_vcpu = vcpu0; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 0); switch (dist->vgic_model) { case KVM_DEV_TYPE_ARM_VGIC_V2: irq->targets = 0; @@ -277,7 +277,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) irq->intid = i; irq->vcpu = NULL; irq->target_vcpu = vcpu; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 0); if (vgic_irq_is_sgi(i)) { /* SGIs */ irq->enabled = 1; diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 7368c13f16b7..f162206adb48 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -99,7 +99,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, raw_spin_lock_init(&irq->irq_lock); irq->config = VGIC_CONFIG_EDGE; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 1); irq->intid = intid; irq->target_vcpu = vcpu; irq->group = 1; @@ -111,7 +111,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, * check that we don't add a second list entry with the same LPI. */ oldirq = xa_load(&dist->lpi_xa, intid); - if (vgic_try_get_irq_kref(oldirq)) { + if (vgic_try_get_irq_ref(oldirq)) { /* Someone was faster with adding this LPI, lets use that. */ kfree(irq); irq = oldirq; @@ -547,7 +547,7 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, rcu_read_lock(); irq = xa_load(&its->translation_cache, cache_key); - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) irq = NULL; rcu_read_unlock(); @@ -571,7 +571,7 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, * its_lock, as the ITE (and the reference it holds) cannot be freed. */ lockdep_assert_held(&its->its_lock); - vgic_get_irq_kref(irq); + vgic_get_irq_ref(irq); old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 4d9343d2b0b1..548aec9d5a72 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -518,7 +518,7 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq) if (!irq->hw || irq->host_irq != host_irq) continue; - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) return NULL; return irq; diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index f5148b38120a..a1d6fab895c4 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -71,7 +71,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) rcu_read_lock(); irq = xa_load(&dist->lpi_xa, intid); - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) irq = NULL; rcu_read_unlock(); @@ -114,15 +114,6 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid) return vgic_get_irq(vcpu->kvm, intid); } -/* - * We can't do anything in here, because we lack the kvm pointer to - * lock and remove the item from the lpi_list. So we keep this function - * empty and use the return value of kref_put() to trigger the freeing. - */ -static void vgic_irq_release(struct kref *ref) -{ -} - void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; @@ -131,7 +122,7 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) if (irq->intid < VGIC_MIN_LPI) return; - if (!kref_put(&irq->refcount, vgic_irq_release)) + if (!refcount_dec_and_test(&irq->refcount)) return; xa_lock_irqsave(&dist->lpi_xa, flags); @@ -399,7 +390,7 @@ retry: * now in the ap_list. This is safe as the caller must already hold a * reference on the irq. */ - vgic_get_irq_kref(irq); + vgic_get_irq_ref(irq); list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); irq->vcpu = vcpu; @@ -657,7 +648,7 @@ retry: /* * This vgic_put_irq call matches the - * vgic_get_irq_kref in vgic_queue_irq_unlock, + * vgic_get_irq_ref in vgic_queue_irq_unlock, * where we added the LPI to the ap_list. As * we remove the irq from the list, we drop * also drop the refcount. diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index de1c1d3261c3..ac5f9c5d2b98 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -267,7 +267,7 @@ void vgic_v2_put(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); -static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) +static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq) { if (!irq) return false; @@ -275,12 +275,12 @@ static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) if (irq->intid < VGIC_MIN_LPI) return true; - return kref_get_unless_zero(&irq->refcount); + return refcount_inc_not_zero(&irq->refcount); } -static inline void vgic_get_irq_kref(struct vgic_irq *irq) +static inline void vgic_get_irq_ref(struct vgic_irq *irq) { - WARN_ON_ONCE(!vgic_try_get_irq_kref(irq)); + WARN_ON_ONCE(!vgic_try_get_irq_ref(irq)); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9f8a116925ca..640555ff5b54 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -142,7 +142,7 @@ struct vgic_irq { bool active; bool enabled; bool hw; /* Tied to HW IRQ */ - struct kref refcount; /* Used for LPIs */ + refcount_t refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ union { -- cgit v1.2.3 From d54594accf732d17891d276aa1f545ef25606555 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 5 Sep 2025 03:05:29 -0700 Subject: KVM: arm64: vgic-v3: Erase LPIs from xarray outside of raw spinlocks syzkaller has caught us red-handed once more, this time nesting regular spinlocks behind raw spinlocks: ============================= [ BUG: Invalid wait context ] 6.16.0-rc3-syzkaller-g7b8346bd9fce #0 Not tainted ----------------------------- syz.0.29/3743 is trying to lock: a3ff80008e2e9e18 (&xa->xa_lock#20){....}-{3:3}, at: vgic_put_irq+0xb4/0x190 arch/arm64/kvm/vgic/vgic.c:137 other info that might help us debug this: context-{5:5} 3 locks held by syz.0.29/3743: #0: a3ff80008e2e90a8 (&kvm->slots_lock){+.+.}-{4:4}, at: kvm_vgic_destroy+0x50/0x624 arch/arm64/kvm/vgic/vgic-init.c:499 #1: a3ff80008e2e9fa0 (&kvm->arch.config_lock){+.+.}-{4:4}, at: kvm_vgic_destroy+0x5c/0x624 arch/arm64/kvm/vgic/vgic-init.c:500 #2: 58f0000021be1428 (&vgic_cpu->ap_list_lock){....}-{2:2}, at: vgic_flush_pending_lpis+0x3c/0x31c arch/arm64/kvm/vgic/vgic.c:150 stack backtrace: CPU: 0 UID: 0 PID: 3743 Comm: syz.0.29 Not tainted 6.16.0-rc3-syzkaller-g7b8346bd9fce #0 PREEMPT Hardware name: linux,dummy-virt (DT) Call trace: show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:466 (C) __dump_stack+0x30/0x40 lib/dump_stack.c:94 dump_stack_lvl+0xd8/0x12c lib/dump_stack.c:120 dump_stack+0x1c/0x28 lib/dump_stack.c:129 print_lock_invalid_wait_context kernel/locking/lockdep.c:4833 [inline] check_wait_context kernel/locking/lockdep.c:4905 [inline] __lock_acquire+0x978/0x299c kernel/locking/lockdep.c:5190 lock_acquire+0x14c/0x2e0 kernel/locking/lockdep.c:5871 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x5c/0x7c kernel/locking/spinlock.c:162 vgic_put_irq+0xb4/0x190 arch/arm64/kvm/vgic/vgic.c:137 vgic_flush_pending_lpis+0x24c/0x31c arch/arm64/kvm/vgic/vgic.c:158 __kvm_vgic_vcpu_destroy+0x44/0x500 arch/arm64/kvm/vgic/vgic-init.c:455 kvm_vgic_destroy+0x100/0x624 arch/arm64/kvm/vgic/vgic-init.c:505 kvm_arch_destroy_vm+0x80/0x138 arch/arm64/kvm/arm.c:244 kvm_destroy_vm virt/kvm/kvm_main.c:1308 [inline] kvm_put_kvm+0x800/0xff8 virt/kvm/kvm_main.c:1344 kvm_vm_release+0x58/0x78 virt/kvm/kvm_main.c:1367 __fput+0x4ac/0x980 fs/file_table.c:465 ____fput+0x20/0x58 fs/file_table.c:493 task_work_run+0x1bc/0x254 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] do_notify_resume+0x1b4/0x270 arch/arm64/kernel/entry-common.c:151 exit_to_user_mode_prepare arch/arm64/kernel/entry-common.c:169 [inline] exit_to_user_mode arch/arm64/kernel/entry-common.c:178 [inline] el0_svc+0xb4/0x160 arch/arm64/kernel/entry-common.c:768 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:786 el0t_64_sync+0x198/0x19c arch/arm64/kernel/entry.S:600 This is of course no good, but is at odds with how LPI refcounts are managed. Solve the locking mess by deferring the release of unreferenced LPIs after the ap_list_lock is released. Mark these to-be-released LPIs specially to avoid racing with vgic_put_irq() and causing a double-free. Since references can only be taken on LPIs with a nonzero refcount, extending the lifetime of freed LPIs is still safe. Reviewed-by: Marc Zyngier Reported-by: syzbot+cef594105ac7e60c6d93@syzkaller.appspotmail.com Closes: https://lore.kernel.org/kvmarm/68acd0d9.a00a0220.33401d.048b.GAE@google.com/ Link: https://lore.kernel.org/r/20250905100531.282980-5-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 41 +++++++++++++++++++++++++++++++++++++---- include/kvm/arm_vgic.h | 3 +++ 2 files changed, 40 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index ec4d70936a5b..480391a0dcad 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -28,8 +28,8 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * kvm->arch.config_lock (mutex) * its->cmd_lock (mutex) * its->its_lock (mutex) - * vgic_cpu->ap_list_lock must be taken with IRQs disabled - * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_cpu->ap_list_lock must be taken with IRQs disabled * vgic_irq->irq_lock must be taken with IRQs disabled * * As the ap_list_lock might be taken from the timer interrupt handler, @@ -129,6 +129,15 @@ static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) return refcount_dec_and_test(&irq->refcount); } +static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq) +{ + if (!__vgic_put_irq(kvm, irq)) + return false; + + irq->pending_release = true; + return true; +} + void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; @@ -142,10 +151,27 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) xa_unlock_irqrestore(&dist->lpi_xa, flags); } +static void vgic_release_deleted_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long flags, intid; + struct vgic_irq *irq; + + xa_lock_irqsave(&dist->lpi_xa, flags); + + xa_for_each(&dist->lpi_xa, intid, irq) { + if (irq->pending_release) + vgic_release_lpi_locked(dist, irq); + } + + xa_unlock_irqrestore(&dist->lpi_xa, flags); +} + void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + bool deleted = false; unsigned long flags; raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); @@ -156,11 +182,14 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) list_del(&irq->ap_list); irq->vcpu = NULL; raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); + deleted |= vgic_put_irq_norelease(vcpu->kvm, irq); } } raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + + if (deleted) + vgic_release_deleted_lpis(vcpu->kvm); } void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) @@ -631,6 +660,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + bool deleted_lpis = false; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); @@ -663,7 +693,7 @@ retry: * we remove the irq from the list, we drop * also drop the refcount. */ - vgic_put_irq(vcpu->kvm, irq); + deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq); continue; } @@ -726,6 +756,9 @@ retry: } raw_spin_unlock(&vgic_cpu->ap_list_lock); + + if (unlikely(deleted_lpis)) + vgic_release_deleted_lpis(vcpu->kvm); } static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 640555ff5b54..4000ff16f295 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -140,6 +140,9 @@ struct vgic_irq { * the pending state for both level * and edge triggered IRQs. */ bool active; + bool pending_release; /* Used for LPIs only, unreferenced IRQ + * pending a release */ + bool enabled; bool hw; /* Tied to HW IRQ */ refcount_t refcount; /* Used for LPIs */ -- cgit v1.2.3 From 1733e88874838ddebf7774440c285700865e6b08 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:41 +0800 Subject: block: cleanup bio_issue Now that bio->bi_issue is only used by blk-iolatency to get bio issue time, replace bio_issue with u64 time directly and remove bio_issue to make code cleaner. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-cgroup.h | 2 +- block/blk-iolatency.c | 14 +++----------- block/blk.h | 42 ------------------------------------------ include/linux/blk_types.h | 7 ++----- 5 files changed, 7 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index 9603ca3ec770..3a1a848940dd 100644 --- a/block/bio.c +++ b/block/bio.c @@ -261,7 +261,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; - bio->bi_issue.value = 0; + bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 83367086cb6a..8328427e3165 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -372,7 +372,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) static inline void blkcg_bio_issue_init(struct bio *bio) { - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); + bio->issue_time_ns = blk_time_get_ns(); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2f8fdecdd7a9..554b191a6892 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -485,19 +485,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) mod_timer(&blkiolat->timer, jiffies + HZ); } -static void iolatency_record_time(struct iolatency_grp *iolat, - struct bio_issue *issue, u64 now, - bool issue_as_root) +static void iolatency_record_time(struct iolatency_grp *iolat, u64 start, + u64 now, bool issue_as_root) { - u64 start = bio_issue_time(issue); u64 req_time; - /* - * Have to do this so we are truncated to the correct time that our - * issue is truncated to. - */ - now = __bio_issue_time(now); - if (now <= start) return; @@ -625,7 +617,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) * submitted, so do not account for it. */ if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { - iolatency_record_time(iolat, &bio->bi_issue, now, + iolatency_record_time(iolat, bio->issue_time_ns, now, issue_as_root); window_start = atomic64_read(&iolat->window_start); if (now > window_start && diff --git a/block/blk.h b/block/blk.h index 7d420c247d81..41397688b941 100644 --- a/block/blk.h +++ b/block/blk.h @@ -681,48 +681,6 @@ static inline ktime_t blk_time_get(void) return ns_to_ktime(blk_time_get_ns()); } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4bd098fd61cb..8e8d1cc8b06c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -198,10 +198,6 @@ static inline bool blk_path_error(blk_status_t error) return true; } -struct bio_issue { - u64 value; -}; - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; @@ -242,7 +238,8 @@ struct bio { * on release of the bio. */ struct blkcg_gq *bi_blkg; - struct bio_issue bi_issue; + /* Time that this bio was issued. */ + u64 issue_time_ns; #ifdef CONFIG_BLK_CGROUP_IOCOST u64 bi_iocost_cost; #endif -- cgit v1.2.3 From ea3d1f104db60f9d5074b33819ccea3c216e0bee Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:43 +0800 Subject: blk-mq: add QUEUE_FLAG_BIO_ISSUE_TIME bio->issue_time_ns is initialized for every bio, however, it's only used by blk-iolatency. Add a new queue_flag and only set this flag when blk-iolatency is enabled, so that extra blk_time_get_ns() can be saved for disks that blk-iolatency is not enabled. Signed-off-by: Yu Kuai Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 5 +++++ block/blk-mq-debugfs.c | 1 + block/blk-mq.c | 8 +++++--- include/linux/blkdev.h | 1 + 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 554b191a6892..45bd18f68541 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -742,10 +742,15 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) */ enabled = atomic_read(&blkiolat->enable_cnt); if (enabled != blkiolat->enabled) { + struct request_queue *q = blkiolat->rqos.disk->queue; unsigned int memflags; memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue); blkiolat->enabled = enabled; + if (enabled) + blk_queue_flag_set(QUEUE_FLAG_BIO_ISSUE_TIME, q); + else + blk_queue_flag_clear(QUEUE_FLAG_BIO_ISSUE_TIME, q); blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags); } } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 32c65efdda46..4896525b1c05 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -96,6 +96,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(DISABLE_WBT_DEF), QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), + QUEUE_FLAG_NAME(BIO_ISSUE_TIME), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c0c8ead7080..31cc743ffad7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -396,10 +396,12 @@ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) #endif } -static inline void blk_mq_bio_issue_init(struct bio *bio) +static inline void blk_mq_bio_issue_init(struct request_queue *q, + struct bio *bio) { #ifdef CONFIG_BLK_CGROUP - bio->issue_time_ns = blk_time_get_ns(); + if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags)) + bio->issue_time_ns = blk_time_get_ns(); #endif } @@ -3175,7 +3177,7 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) goto queue_exit; - blk_mq_bio_issue_init(bio); + blk_mq_bio_issue_init(q, bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c8cb08b2ed29..7c542b1851fa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -657,6 +657,7 @@ enum { QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ + QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ QUEUE_FLAG_MAX }; -- cgit v1.2.3 From e37b5596a19be9a150cb194ec32e78f295a3574b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:46 +0800 Subject: block: factor out a helper bio_submit_split_bioset() No functional changes are intended, some drivers like mdraid will split bio by internal processing, prepare to unify bio split codes. Signed-off-by: Yu Kuai Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 59 ++++++++++++++++++++++++++++++++++---------------- include/linux/blkdev.h | 2 ++ 2 files changed, 42 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/block/blk-merge.c b/block/blk-merge.c index 354a3070e6a1..f680ccf81864 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -104,33 +104,54 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } +/* + * bio_submit_split_bioset - Submit a bio, splitting it at a designated sector + * @bio: the original bio to be submitted and split + * @split_sectors: the sector count at which to split + * @bs: the bio set used for allocating the new split bio + * + * The original bio is modified to contain the remaining sectors and submitted. + * The caller is responsible for submitting the returned bio. + * + * If succeed, the newly allocated bio representing the initial part will be + * returned, on failure NULL will be returned and original bio will fail. + */ +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs) +{ + struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs); + + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return NULL; + } + + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); + submit_bio_noacct(bio); + + return split; +} +EXPORT_SYMBOL_GPL(bio_submit_split_bioset); + static struct bio *bio_submit_split(struct bio *bio, int split_sectors) { - if (unlikely(split_sectors < 0)) - goto error; + if (unlikely(split_sectors < 0)) { + bio->bi_status = errno_to_blk_status(split_sectors); + bio_endio(bio); + return NULL; + } if (split_sectors) { - struct bio *split; - - split = bio_split(bio, split_sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, split_sectors, &bio->bi_bdev->bd_disk->bio_split); - if (IS_ERR(split)) { - split_sectors = PTR_ERR(split); - goto error; - } - split->bi_opf |= REQ_NOMERGE; - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); - return split; + if (bio) + bio->bi_opf |= REQ_NOMERGE; } return bio; -error: - bio->bi_status = errno_to_blk_status(split_sectors); - bio_endio(bio); - return NULL; } struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7c542b1851fa..066e5309bd45 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1000,6 +1000,8 @@ extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); void submit_bio_noacct(struct bio *bio); struct bio *bio_split_to_limits(struct bio *bio); +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs); extern int blk_lld_busy(struct request_queue *q); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); -- cgit v1.2.3 From 448097bbd3836d2ee46fa6eabd18661e9a3c8be8 Mon Sep 17 00:00:00 2001 From: Jean-François Lessard Date: Tue, 2 Sep 2025 15:04:39 -0400 Subject: device property: Add scoped fwnode child node iterators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add scoped versions of fwnode child node iterators that automatically handle reference counting cleanup using the __free() attribute: - fwnode_for_each_child_node_scoped() - fwnode_for_each_available_child_node_scoped() These macros follow the same pattern as existing scoped iterators in the kernel, ensuring fwnode references are automatically released when the iterator variable goes out of scope. This prevents resource leaks and eliminates the need for manual cleanup in error paths. The implementation mirrors the non-scoped variants but uses __free(fwnode_handle) for automatic resource management, providing a safer and more convenient interface for drivers iterating over firmware node children. Signed-off-by: Jean-François Lessard Acked-by: Danilo Krummrich Reviewed-by: Andy Shevchenko Reviewed-by: Sakari Ailus Signed-off-by: Wolfram Sang --- include/linux/property.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/property.h b/include/linux/property.h index 82f0cb3abd1e..862e208133f3 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -176,6 +176,16 @@ struct fwnode_handle *fwnode_get_next_available_child_node( for (child = fwnode_get_next_available_child_node(fwnode, NULL); child;\ child = fwnode_get_next_available_child_node(fwnode, child)) +#define fwnode_for_each_child_node_scoped(fwnode, child) \ + for (struct fwnode_handle *child __free(fwnode_handle) = \ + fwnode_get_next_child_node(fwnode, NULL); \ + child; child = fwnode_get_next_child_node(fwnode, child)) + +#define fwnode_for_each_available_child_node_scoped(fwnode, child) \ + for (struct fwnode_handle *child __free(fwnode_handle) = \ + fwnode_get_next_available_child_node(fwnode, NULL); \ + child; child = fwnode_get_next_available_child_node(fwnode, child)) + struct fwnode_handle *device_get_next_child_node(const struct device *dev, struct fwnode_handle *child); -- cgit v1.2.3 From c4272905c37930c19b54fa3549b22899122ce69e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Sep 2025 09:00:34 -0700 Subject: cxl/acpi: Rename CFMW coherency restrictions ACPICA commit 710745713ad3a2543dbfb70e84764f31f0e46bdc This has been renamed in more recent CXL specs, as type3 (memory expanders) can also use HDM-DB for device coherent memory. Link: https://github.com/acpica/acpica/commit/710745713ad3a2543dbfb70e84764f31f0e46bdc Acked-by: Rafael J. Wysocki (Intel) Signed-off-by: Davidlohr Bueso Reviewed-by: Jonathan Cameron Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20250908160034.86471-1-dave@stgolabs.net Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 4 ++-- include/acpi/actbl1.h | 4 ++-- tools/testing/cxl/test/cxl.c | 18 +++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index f1625212b08b..ff15dac3f294 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -116,9 +116,9 @@ static unsigned long cfmws_to_decoder_flags(int restrictions) { unsigned long flags = CXL_DECODER_F_ENABLE; - if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE2) + if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_DEVMEM) flags |= CXL_DECODER_F_TYPE2; - if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE3) + if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM) flags |= CXL_DECODER_F_TYPE3; if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_VOLATILE) flags |= CXL_DECODER_F_RAM; diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 99fd1588ff38..eb787dfbd2fa 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -560,8 +560,8 @@ struct acpi_cedt_cfmws_target_element { /* Values for Restrictions field above */ -#define ACPI_CEDT_CFMWS_RESTRICT_TYPE2 (1) -#define ACPI_CEDT_CFMWS_RESTRICT_TYPE3 (1<<1) +#define ACPI_CEDT_CFMWS_RESTRICT_DEVMEM (1) +#define ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM (1<<1) #define ACPI_CEDT_CFMWS_RESTRICT_VOLATILE (1<<2) #define ACPI_CEDT_CFMWS_RESTRICT_PMEM (1<<3) #define ACPI_CEDT_CFMWS_RESTRICT_FIXED (1<<4) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 6a25cca5636f..ba50338f8ada 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -210,7 +210,7 @@ static struct { }, .interleave_ways = 0, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 4UL, @@ -225,7 +225,7 @@ static struct { }, .interleave_ways = 1, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, @@ -240,7 +240,7 @@ static struct { }, .interleave_ways = 0, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 4UL, @@ -255,7 +255,7 @@ static struct { }, .interleave_ways = 1, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, @@ -270,7 +270,7 @@ static struct { }, .interleave_ways = 0, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 4UL, @@ -285,7 +285,7 @@ static struct { }, .interleave_ways = 0, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M, @@ -302,7 +302,7 @@ static struct { .interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR, .interleave_ways = 0, .granularity = 4, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, @@ -318,7 +318,7 @@ static struct { .interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR, .interleave_ways = 1, .granularity = 0, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_256M * 8UL, @@ -334,7 +334,7 @@ static struct { .interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR, .interleave_ways = 8, .granularity = 1, - .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | + .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, .window_size = SZ_512M * 6UL, -- cgit v1.2.3 From a1ffc8ad3165fa1cf6a60c6a4b4e00dfd6603cf2 Mon Sep 17 00:00:00 2001 From: Yi Tao Date: Wed, 10 Sep 2025 14:59:33 +0800 Subject: cgroup: refactor the cgroup_attach_lock code to make it clearer Dynamic cgroup migration involving threadgroup locks can be in one of two states: no lock held, or holding the global lock. Explicitly declaring the different lock modes to make the code easier to understand and facilitates future extensions of the lock modes. Signed-off-by: Yi Tao Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 8 +++++ kernel/cgroup/cgroup-internal.h | 9 +++--- kernel/cgroup/cgroup-v1.c | 14 ++++----- kernel/cgroup/cgroup.c | 67 ++++++++++++++++++++++++++++++----------- 4 files changed, 69 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 92ed6d18266d..ff3c7d0e3e01 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -140,6 +140,14 @@ enum { __CFTYPE_ADDED = (1 << 18), }; +enum cgroup_attach_lock_mode { + /* Default */ + CGRP_ATTACH_LOCK_GLOBAL, + + /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */ + CGRP_ATTACH_LOCK_NONE, +}; + /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b14e61c64a34..a6d6f30b6f65 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,12 +249,13 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(bool lock_threadgroup); -void cgroup_attach_unlock(bool lock_threadgroup); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) + enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task, bool locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0a23b65de013..852ebe7ca3a1 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -69,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); for_each_root(root) { struct cgroup *from_cgrp; @@ -81,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); cgroup_unlock(); return retval; @@ -118,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -154,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); cgroup_unlock(); return ret; } @@ -503,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; - bool locked; + enum cgroup_attach_lock_mode lock_mode; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -532,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0607c5d09237..0994eeaa2f69 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2482,7 +2482,7 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * @lock_mode: whether to down_write cgroup_threadgroup_rwsem * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() @@ -2503,21 +2503,39 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. */ -void cgroup_attach_lock(bool lock_threadgroup) +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode) { cpus_read_lock(); - if (lock_threadgroup) + + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + * @lock_mode: whether to up_write cgroup_threadgroup_rwsem */ -void cgroup_attach_unlock(bool lock_threadgroup) +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode) { - if (lock_threadgroup) + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } + cpus_read_unlock(); } @@ -2991,7 +3009,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *threadgroup_locked) + enum cgroup_attach_lock_mode *lock_mode) { struct task_struct *tsk; pid_t pid; @@ -3008,8 +3026,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); - *threadgroup_locked = pid || threadgroup; - cgroup_attach_lock(*threadgroup_locked); + + if (pid || threadgroup) + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + *lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(*lock_mode); rcu_read_lock(); if (pid) { @@ -3040,19 +3063,20 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, goto out_unlock_rcu; out_unlock_threadgroup: - cgroup_attach_unlock(*threadgroup_locked); - *threadgroup_locked = false; + cgroup_attach_unlock(*lock_mode); + *lock_mode = CGRP_ATTACH_LOCK_NONE; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) { /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - cgroup_attach_unlock(threadgroup_locked); + cgroup_attach_unlock(lock_mode); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3104,6 +3128,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + enum cgroup_attach_lock_mode lock_mode; bool has_tasks; int ret; @@ -3135,7 +3160,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - cgroup_attach_lock(has_tasks); + + if (has_tasks) + lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(lock_mode); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3156,7 +3187,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(has_tasks); + cgroup_attach_unlock(lock_mode); return ret; } @@ -5279,13 +5310,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool threadgroup_locked; + enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5311,7 +5342,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, threadgroup_locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); -- cgit v1.2.3 From 0568f89d4fb82d98001baeb870e92f43cd1f7317 Mon Sep 17 00:00:00 2001 From: Yi Tao Date: Wed, 10 Sep 2025 14:59:35 +0800 Subject: cgroup: replace global percpu_rwsem with per threadgroup resem when writing to cgroup.procs The static usage pattern of creating a cgroup, enabling controllers, and then seeding it with CLONE_INTO_CGROUP doesn't require write locking cgroup_threadgroup_rwsem and thus doesn't benefit from this patch. To avoid affecting other users, the per threadgroup rwsem is only used when the favordynmods is enabled. As computer hardware advances, modern systems are typically equipped with many CPU cores and large amounts of memory, enabling the deployment of numerous applications. On such systems, container creation and deletion become frequent operations, making cgroup process migration no longer a cold path. This leads to noticeable contention with common process operations such as fork, exec, and exit. To alleviate the contention between cgroup process migration and operations like process fork, this patch modifies lock to take the write lock on signal_struct->group_rwsem when writing pid to cgroup.procs/threads instead of holding a global write lock. Cgroup process migration has historically relied on signal_struct->group_rwsem to protect thread group integrity. In commit <1ed1328792ff> ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem"), this was changed to a global cgroup_threadgroup_rwsem. The advantage of using a global lock was simplified handling of process group migrations. This patch retains the use of the global lock for protecting process group migration, while reducing contention by using per thread group lock during cgroup.procs/threads writes. The locking behavior is as follows: write cgroup.procs/threads | process fork,exec,exit | process group migration ------------------------------------------------------------------------------ cgroup_lock() | down_read(&g_rwsem) | cgroup_lock() down_write(&p_rwsem) | down_read(&p_rwsem) | down_write(&g_rwsem) critical section | critical section | critical section up_write(&p_rwsem) | up_read(&p_rwsem) | up_write(&g_rwsem) cgroup_unlock() | up_read(&g_rwsem) | cgroup_unlock() g_rwsem denotes cgroup_threadgroup_rwsem, p_rwsem denotes signal_struct->group_rwsem. This patch eliminates contention between cgroup migration and fork operations for threads that belong to different thread groups, thereby reducing the long-tail latency of cgroup migrations and lowering system load. With this patch, under heavy fork and exec interference, the long-tail latency of cgroup migration has been reduced from milliseconds to microseconds. Under heavy cgroup migration interference, the multi-CPU score of the spawn test case in UnixBench increased by 9%. tj: Update comment in cgroup_favor_dynmods() and switch WARN_ONCE() to pr_warn_once(). Signed-off-by: Yi Tao Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 17 +++++++++- include/linux/sched/signal.h | 4 +++ init/init_task.c | 3 ++ kernel/cgroup/cgroup-internal.h | 6 ++-- kernel/cgroup/cgroup-v1.c | 8 ++--- kernel/cgroup/cgroup.c | 73 ++++++++++++++++++++++++++++++++--------- kernel/fork.c | 4 +++ 7 files changed, 93 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ff3c7d0e3e01..93318fce31f3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -91,6 +91,12 @@ enum { * cgroup_threadgroup_rwsem. This makes hot path operations such as * forks and exits into the slow path and more expensive. * + * Alleviate the contention between fork, exec, exit operations and + * writing to cgroup.procs by taking a per threadgroup rwsem instead of + * the global cgroup_threadgroup_rwsem. Fork and other operations + * from threads in different thread groups no longer contend with + * writing to cgroup.procs. + * * The static usage pattern of creating a cgroup, enabling controllers, * and then seeding it with CLONE_INTO_CGROUP doesn't require write * locking cgroup_threadgroup_rwsem and thus doesn't benefit from @@ -146,6 +152,9 @@ enum cgroup_attach_lock_mode { /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */ CGRP_ATTACH_LOCK_NONE, + + /* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */ + CGRP_ATTACH_LOCK_PER_THREADGROUP, }; /* @@ -846,6 +855,7 @@ struct cgroup_subsys { }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +extern bool cgroup_enable_per_threadgroup_rwsem; struct cgroup_of_peak { unsigned long value; @@ -857,11 +867,14 @@ struct cgroup_of_peak { * @tsk: target task * * Allows cgroup operations to synchronize against threadgroup changes - * using a percpu_rw_semaphore. + * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when + * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition. */ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { percpu_down_read(&cgroup_threadgroup_rwsem); + if (cgroup_enable_per_threadgroup_rwsem) + down_read(&tsk->signal->cgroup_threadgroup_rwsem); } /** @@ -872,6 +885,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) */ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { + if (cgroup_enable_per_threadgroup_rwsem) + up_read(&tsk->signal->cgroup_threadgroup_rwsem); percpu_up_read(&cgroup_threadgroup_rwsem); } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1ef1edbaaf79..7d6449982822 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -226,6 +226,10 @@ struct signal_struct { struct tty_audit_buf *tty_audit_buf; #endif +#ifdef CONFIG_CGROUPS + struct rw_semaphore cgroup_threadgroup_rwsem; +#endif + /* * Thread is the potential origin of an oom condition; kill first on * oom diff --git a/init/init_task.c b/init/init_task.c index e557f622bd90..a55e2189206f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -27,6 +27,9 @@ static struct signal_struct init_signals = { }, .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, +#ifdef CONFIG_CGROUPS + .cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem), +#endif .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index a6d6f30b6f65..22051b4f1ccb 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,8 +249,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode); -void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 852ebe7ca3a1..a9e029b570c8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -69,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); for_each_root(root) { struct cgroup *from_cgrp; @@ -81,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return retval; @@ -118,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -154,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a6b81b48bb70..fed701df1167 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -239,6 +239,14 @@ static u16 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); +/* + * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem, + * read protected by either. + * + * Can only be turned on, but not turned off. + */ +bool cgroup_enable_per_threadgroup_rwsem __read_mostly; + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), @@ -1325,14 +1333,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; - /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + /* + * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition. + * favordynmods can flip while task is between + * cgroup_threadgroup_change_begin() and end(), so down_write global + * cgroup_threadgroup_rwsem to synchronize them. + * + * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding + * cgroup_threadgroup_rwsem doesn't exlude tasks between + * cgroup_thread_group_change_begin() and end() and thus it's unsafe to + * turn off. As the scenario is unlikely, simply disallow disabling once + * enabled and print out a warning. + */ + percpu_down_write(&cgroup_threadgroup_rwsem); if (favor && !favoring) { + cgroup_enable_per_threadgroup_rwsem = true; rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { + if (cgroup_enable_per_threadgroup_rwsem) + pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n"); rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } + percpu_up_write(&cgroup_threadgroup_rwsem); } static int cgroup_init_root_id(struct cgroup_root *root) @@ -2482,7 +2506,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() - * @lock_mode: whether to down_write cgroup_threadgroup_rwsem + * @lock_mode: whether acquire and acquire which rwsem + * @tsk: thread group to lock * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() @@ -2502,8 +2527,15 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. + * + * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead + * on dynamic cgroup modifications. see the comment above + * CGRP_ROOT_FAVOR_DYNMODS definition. + * + * tsk is not NULL only when writing to cgroup.procs. */ -void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode) +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { cpus_read_lock(); @@ -2513,6 +2545,9 @@ void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode) case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + down_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; default: pr_warn("cgroup: Unexpected attach lock mode."); break; @@ -2521,9 +2556,11 @@ void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode) /** * cgroup_attach_unlock - Undo cgroup_attach_lock() - * @lock_mode: whether to up_write cgroup_threadgroup_rwsem + * @lock_mode: whether release and release which rwsem + * @tsk: thread group to lock */ -void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode) +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { switch (lock_mode) { case CGRP_ATTACH_LOCK_NONE: @@ -2531,6 +2568,9 @@ void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode) case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + up_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; default: pr_warn("cgroup: Unexpected attach lock mode."); break; @@ -3042,7 +3082,6 @@ retry_find_task: tsk = ERR_PTR(-EINVAL); goto out_unlock_rcu; } - get_task_struct(tsk); rcu_read_unlock(); @@ -3055,12 +3094,16 @@ retry_find_task: */ lockdep_assert_held(&cgroup_mutex); - if (pid || threadgroup) - *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; - else + if (pid || threadgroup) { + if (cgroup_enable_per_threadgroup_rwsem) + *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP; + else + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + } else { *lock_mode = CGRP_ATTACH_LOCK_NONE; + } - cgroup_attach_lock(*lock_mode); + cgroup_attach_lock(*lock_mode, tsk); if (threadgroup) { if (!thread_group_leader(tsk)) { @@ -3069,7 +3112,7 @@ retry_find_task: * may strip us of our leadership. If this happens, * throw this task away and try again. */ - cgroup_attach_unlock(*lock_mode); + cgroup_attach_unlock(*lock_mode, tsk); put_task_struct(tsk); goto retry_find_task; } @@ -3085,10 +3128,10 @@ out_unlock_rcu: void cgroup_procs_write_finish(struct task_struct *task, enum cgroup_attach_lock_mode lock_mode) { + cgroup_attach_unlock(lock_mode, task); + /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - - cgroup_attach_unlock(lock_mode); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3178,7 +3221,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) else lock_mode = CGRP_ATTACH_LOCK_NONE; - cgroup_attach_lock(lock_mode); + cgroup_attach_lock(lock_mode, NULL); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3199,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(lock_mode); + cgroup_attach_unlock(lock_mode, NULL); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index c4ada32598bd..9a039867ecfd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1688,6 +1688,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->cgroup_threadgroup_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; -- cgit v1.2.3 From 1dfdf4527fd391f653c53b634af9122613c58904 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Aug 2025 18:48:32 +0200 Subject: iio: adc: exynos_adc: Drop platform data support There are no Samsung Exynos SoC ADC driver users which bind via platform ID, thus platform data is never set and can be dropped. Reviewed-by: Andy Shevchenko Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250830-s3c-cleanup-adc-v2-3-4f8299343d32@linaro.org Signed-off-by: Jonathan Cameron --- drivers/iio/adc/exynos_adc.c | 11 +---------- include/linux/platform_data/touchscreen-s3c2410.h | 22 ---------------------- 2 files changed, 1 insertion(+), 32 deletions(-) delete mode 100644 include/linux/platform_data/touchscreen-s3c2410.h (limited to 'include') diff --git a/drivers/iio/adc/exynos_adc.c b/drivers/iio/adc/exynos_adc.c index dd4b9921a8b5..1484adff00df 100644 --- a/drivers/iio/adc/exynos_adc.c +++ b/drivers/iio/adc/exynos_adc.c @@ -29,8 +29,6 @@ #include #include -#include - /* S3C/EXYNOS4412/5250 ADC_V1 registers definitions */ #define ADC_V1_CON(x) ((x) + 0x00) #define ADC_V1_DLY(x) ((x) + 0x08) @@ -106,7 +104,6 @@ struct exynos_adc { struct clk *clk; struct clk *sclk; unsigned int irq; - unsigned int delay; struct regulator *vdd; struct completion completion; @@ -213,7 +210,7 @@ static void exynos_adc_v1_init_hw(struct exynos_adc *info) writel(con1, ADC_V1_CON(info->regs)); /* set touchscreen delay */ - writel(info->delay, ADC_V1_DLY(info->regs)); + writel(10000, ADC_V1_DLY(info->regs)); } static void exynos_adc_v1_exit_hw(struct exynos_adc *info) @@ -556,7 +553,6 @@ static int exynos_adc_probe(struct platform_device *pdev) { struct exynos_adc *info = NULL; struct device_node *np = pdev->dev.of_node; - struct s3c2410_ts_mach_info *pdata = dev_get_platdata(&pdev->dev); struct iio_dev *indio_dev = NULL; int ret; int irq; @@ -655,11 +651,6 @@ static int exynos_adc_probe(struct platform_device *pdev) if (info->data->init_hw) info->data->init_hw(info); - if (pdata) - info->delay = pdata->delay; - else - info->delay = 10000; - ret = of_platform_populate(np, exynos_adc_match, NULL, &indio_dev->dev); if (ret < 0) { dev_err(&pdev->dev, "failed adding child nodes\n"); diff --git a/include/linux/platform_data/touchscreen-s3c2410.h b/include/linux/platform_data/touchscreen-s3c2410.h deleted file mode 100644 index bf8d3b9d7c6a..000000000000 --- a/include/linux/platform_data/touchscreen-s3c2410.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (c) 2005 Arnaud Patard -*/ - -#ifndef __TOUCHSCREEN_S3C2410_H -#define __TOUCHSCREEN_S3C2410_H - -struct s3c2410_ts_mach_info { - int delay; - int presc; - int oversampling_shift; - void (*cfg_gpio)(struct platform_device *dev); -}; - -extern void s3c24xx_ts_set_platdata(struct s3c2410_ts_mach_info *); -extern void s3c64xx_ts_set_platdata(struct s3c2410_ts_mach_info *); - -/* defined by architecture to configure gpio */ -extern void s3c24xx_ts_cfg_gpio(struct platform_device *dev); - -#endif /*__TOUCHSCREEN_S3C2410_H */ -- cgit v1.2.3 From cec1aec9c46305743a2d4a1bfb06f6b0374d5ed0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 31 Aug 2025 12:48:22 +0200 Subject: iio: consumers: Add an iio_multiply_value() helper function The channel-scale handling in iio_convert_raw_to_processed() in essence does the following: processed = raw * caller-provided-scale * channel-scale Which can also be written as: multiplier = raw * caller-provided-scale iio-value = channel-scale processed = multiplier * iio-value Where iio-value is a set of IIO_VAL_* type + val + val2 integers, being able to handle multiplication of iio-values like this is something which is useful to have in general and, as previous bugfixes to iio_convert_raw_to_processed() have shown, also tricky to implement. Split the iio-value multiplication code from iio_convert_raw_to_processed() out into a new iio_multiply_value() helper. This serves multiple purposes: 1. Having this split out allows writing a KUnit test for this. 2. Having this split out allows re-use to get better precision when scaling values in iio_read_channel_processed_scale(). Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Link: https://patch.msgid.link/20250831104825.15097-4-hansg@kernel.org Signed-off-by: Jonathan Cameron --- drivers/iio/inkern.c | 72 ++++++++++++++++++++++++++------------------ include/linux/iio/consumer.h | 18 +++++++++++ 2 files changed, 60 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c index 642beb4b3360..158d54de14a7 100644 --- a/drivers/iio/inkern.c +++ b/drivers/iio/inkern.c @@ -599,13 +599,50 @@ int iio_read_channel_average_raw(struct iio_channel *chan, int *val) } EXPORT_SYMBOL_GPL(iio_read_channel_average_raw); +int iio_multiply_value(int *result, s64 multiplier, + unsigned int type, int val, int val2) +{ + s64 denominator; + + switch (type) { + case IIO_VAL_INT: + *result = multiplier * val; + return IIO_VAL_INT; + case IIO_VAL_INT_PLUS_MICRO: + case IIO_VAL_INT_PLUS_NANO: + switch (type) { + case IIO_VAL_INT_PLUS_MICRO: + denominator = MICRO; + break; + case IIO_VAL_INT_PLUS_NANO: + denominator = NANO; + break; + } + *result = multiplier * abs(val); + *result += div_s64(multiplier * abs(val2), denominator); + if (val < 0 || val2 < 0) + *result *= -1; + return IIO_VAL_INT; + case IIO_VAL_FRACTIONAL: + *result = div_s64(multiplier * val, val2); + return IIO_VAL_INT; + case IIO_VAL_FRACTIONAL_LOG2: + *result = (multiplier * val) >> val2; + return IIO_VAL_INT; + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_NS_GPL(iio_multiply_value, "IIO_UNIT_TEST"); + static int iio_convert_raw_to_processed_unlocked(struct iio_channel *chan, int raw, int *processed, unsigned int scale) { int scale_type, scale_val, scale_val2; int offset_type, offset_val, offset_val2; - s64 denominator, raw64 = raw; + s64 raw64 = raw; + int ret; offset_type = iio_channel_read(chan, &offset_val, &offset_val2, IIO_CHAN_INFO_OFFSET); @@ -644,35 +681,10 @@ static int iio_convert_raw_to_processed_unlocked(struct iio_channel *chan, return 0; } - switch (scale_type) { - case IIO_VAL_INT: - *processed = raw64 * scale_val * scale; - break; - case IIO_VAL_INT_PLUS_MICRO: - case IIO_VAL_INT_PLUS_NANO: - switch (scale_type) { - case IIO_VAL_INT_PLUS_MICRO: - denominator = MICRO; - break; - case IIO_VAL_INT_PLUS_NANO: - denominator = NANO; - break; - } - *processed = raw64 * scale * abs(scale_val); - *processed += div_s64(raw64 * scale * abs(scale_val2), denominator); - if (scale_val < 0 || scale_val2 < 0) - *processed *= -1; - break; - case IIO_VAL_FRACTIONAL: - *processed = div_s64(raw64 * (s64)scale_val * scale, - scale_val2); - break; - case IIO_VAL_FRACTIONAL_LOG2: - *processed = (raw64 * (s64)scale_val * scale) >> scale_val2; - break; - default: - return -EINVAL; - } + ret = iio_multiply_value(processed, raw64 * scale, + scale_type, scale_val, scale_val2); + if (ret < 0) + return ret; return 0; } diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 6a4479616479..a38b277c2c02 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -381,6 +381,24 @@ int iio_read_channel_offset(struct iio_channel *chan, int *val, int iio_read_channel_scale(struct iio_channel *chan, int *val, int *val2); +/** + * iio_multiply_value() - Multiply an IIO value + * @result: Destination pointer for the multiplication result + * @multiplier: Multiplier. + * @type: One of the IIO_VAL_* constants. This decides how the @val and + * @val2 parameters are interpreted. + * @val: Value being multiplied. + * @val2: Value being multiplied. @val2 use depends on type. + * + * Multiply an IIO value with a s64 multiplier storing the result as + * IIO_VAL_INT. This is typically used for scaling. + * + * Returns: + * IIO_VAL_INT on success or a negative error-number on failure. + */ +int iio_multiply_value(int *result, s64 multiplier, + unsigned int type, int val, int val2); + /** * iio_convert_raw_to_processed() - Converts a raw value to a processed value * @chan: The channel being queried -- cgit v1.2.3 From b2461e20fa9ac18b1305bba5bc7e22ebf644ea01 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 25 Aug 2025 15:00:30 +0800 Subject: firmware: imx: Add stub functions for SCMI MISC API To ensure successful builds when CONFIG_IMX_SCMI_MISC_DRV is not enabled, this patch adds static inline stub implementations for the following functions: - scmi_imx_misc_ctrl_get() - scmi_imx_misc_ctrl_set() These stubs return -EOPNOTSUPP to indicate that the functionality is not supported in the current configuration. This avoids potential build or link errors in code that conditionally calls these functions based on feature availability. This patch also drops the changes in commit 540c830212ed ("firmware: imx: remove duplicate scmi_imx_misc_ctrl_get()"). The original change aimed to simplify the handling of optional features by removing conditional stubs. However, the use of conditional stubs is necessary when CONFIG_IMX_SCMI_MISC_DRV is n, while consumer driver is set to y. This is not a matter of preserving legacy patterns, but rather to ensure that there is no link error whether for module or built-in. Fixes: 0b4f8a68b292 ("firmware: imx: Add i.MX95 MISC driver") Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/sm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index d4212bc42b2c..99c15bbb46aa 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -26,8 +26,20 @@ #define SCMI_IMX94_CTRL_SAI3_MCLK 5U /*!< WAKE SAI3 MCLK */ #define SCMI_IMX94_CTRL_SAI4_MCLK 6U /*!< WAKE SAI4 MCLK */ +#if IS_ENABLED(CONFIG_IMX_SCMI_MISC_DRV) int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val); int scmi_imx_misc_ctrl_set(u32 id, u32 val); +#else +static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val) +{ + return -EOPNOTSUPP; +} +#endif int scmi_imx_cpu_start(u32 cpuid, bool start); int scmi_imx_cpu_started(u32 cpuid, bool *started); -- cgit v1.2.3 From 3fb91b5c86d0fb5ff6f65c30a4f20193166e22fe Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 25 Aug 2025 15:00:31 +0800 Subject: firmware: imx: Add stub functions for SCMI LMM API To ensure successful builds when CONFIG_IMX_SCMI_LMM_DRV is not enabled, this patch adds static inline stub implementations for the following functions: - scmi_imx_lmm_operation() - scmi_imx_lmm_info() - scmi_imx_lmm_reset_vector_set() These stubs return -EOPNOTSUPP to indicate that the functionality is not supported in the current configuration. This avoids potential build or link errors in code that conditionally calls these functions based on feature availability. Fixes: 7242bbf418f0 ("firmware: imx: Add i.MX95 SCMI LMM driver") Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/sm.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index 99c15bbb46aa..f2a72177bb37 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -56,7 +56,24 @@ enum scmi_imx_lmm_op { #define SCMI_IMX_LMM_OP_FORCEFUL 0 #define SCMI_IMX_LMM_OP_GRACEFUL BIT(0) +#if IS_ENABLED(CONFIG_IMX_SCMI_LMM_DRV) int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags); int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info); int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector); +#else +static inline int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector) +{ + return -EOPNOTSUPP; +} +#endif #endif -- cgit v1.2.3 From 222accf05fc42f68ae02065d9c1542c20315118b Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 25 Aug 2025 15:00:32 +0800 Subject: firmware: imx: Add stub functions for SCMI CPU API To ensure successful builds when CONFIG_IMX_SCMI_CPU_DRV is not enabled, this patch adds static inline stub implementations for the following functions: - scmi_imx_cpu_start() - scmi_imx_cpu_started() - scmi_imx_cpu_reset_vector_set() These stubs return -EOPNOTSUPP to indicate that the functionality is not supported in the current configuration. This avoids potential build or link errors in code that conditionally calls these functions based on feature availability. Fixes: 1055faa5d660 ("firmware: imx: Add i.MX95 SCMI CPU driver") Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/sm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index f2a72177bb37..a33b45027356 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -41,10 +41,28 @@ static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val) } #endif +#if IS_ENABLED(CONFIG_IMX_SCMI_CPU_DRV) int scmi_imx_cpu_start(u32 cpuid, bool start); int scmi_imx_cpu_started(u32 cpuid, bool *started); int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, bool boot, bool resume); +#else +static inline int scmi_imx_cpu_start(u32 cpuid, bool start) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_cpu_started(u32 cpuid, bool *started) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, + bool boot, bool resume) +{ + return -EOPNOTSUPP; +} +#endif enum scmi_imx_lmm_op { SCMI_IMX_LMM_BOOT, -- cgit v1.2.3 From 83597c841ed53807a99a2ee837a8cbc3541ce62a Mon Sep 17 00:00:00 2001 From: Abhijit Gangurde Date: Wed, 3 Sep 2025 11:45:59 +0530 Subject: RDMA: Add IONIC to rdma_driver_id definition Define RDMA_DRIVER_IONIC in enum rdma_driver_id. Signed-off-by: Abhijit Gangurde Link: https://patch.msgid.link/20250903061606.4139957-8-abhijit.gangurde@amd.com Signed-off-by: Leon Romanovsky --- include/uapi/rdma/ib_user_ioctl_verbs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index fe15bc7e9f70..89e6a3f13191 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -255,6 +255,7 @@ enum rdma_driver_id { RDMA_DRIVER_SIW, RDMA_DRIVER_ERDMA, RDMA_DRIVER_MANA, + RDMA_DRIVER_IONIC, }; enum ib_uverbs_gid_type { -- cgit v1.2.3 From e8521822c733c6deab0f339843cd37cd62c12795 Mon Sep 17 00:00:00 2001 From: Abhijit Gangurde Date: Wed, 3 Sep 2025 11:46:02 +0530 Subject: RDMA/ionic: Register device ops for control path Implement device supported verb APIs for control path. Co-developed-by: Andrew Boyer Signed-off-by: Andrew Boyer Co-developed-by: Allen Hubbe Signed-off-by: Allen Hubbe Signed-off-by: Abhijit Gangurde Link: https://patch.msgid.link/20250903061606.4139957-11-abhijit.gangurde@amd.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/ionic/ionic_admin.c | 104 + drivers/infiniband/hw/ionic/ionic_controlpath.c | 2498 +++++++++++++++++++++++ drivers/infiniband/hw/ionic/ionic_fw.h | 717 +++++++ drivers/infiniband/hw/ionic/ionic_ibdev.c | 46 +- drivers/infiniband/hw/ionic/ionic_ibdev.h | 251 ++- drivers/infiniband/hw/ionic/ionic_pgtbl.c | 19 + include/uapi/rdma/ionic-abi.h | 115 ++ 7 files changed, 3741 insertions(+), 9 deletions(-) create mode 100644 include/uapi/rdma/ionic-abi.h (limited to 'include') diff --git a/drivers/infiniband/hw/ionic/ionic_admin.c b/drivers/infiniband/hw/ionic/ionic_admin.c index 845c03f6d9fb..1ba7a8ecc073 100644 --- a/drivers/infiniband/hw/ionic/ionic_admin.c +++ b/drivers/infiniband/hw/ionic/ionic_admin.c @@ -627,6 +627,44 @@ err_cmd: return ERR_PTR(rc); } +static void ionic_flush_qs(struct ionic_ibdev *dev) +{ + struct ionic_qp *qp, *qp_tmp; + struct ionic_cq *cq, *cq_tmp; + LIST_HEAD(flush_list); + unsigned long index; + + WARN_ON(!irqs_disabled()); + + /* Flush qp send and recv */ + xa_lock(&dev->qp_tbl); + xa_for_each(&dev->qp_tbl, index, qp) { + kref_get(&qp->qp_kref); + list_add_tail(&qp->ibkill_flush_ent, &flush_list); + } + xa_unlock(&dev->qp_tbl); + + list_for_each_entry_safe(qp, qp_tmp, &flush_list, ibkill_flush_ent) { + ionic_flush_qp(dev, qp); + kref_put(&qp->qp_kref, ionic_qp_complete); + list_del(&qp->ibkill_flush_ent); + } + + /* Notify completions */ + xa_lock(&dev->cq_tbl); + xa_for_each(&dev->cq_tbl, index, cq) { + kref_get(&cq->cq_kref); + list_add_tail(&cq->ibkill_flush_ent, &flush_list); + } + xa_unlock(&dev->cq_tbl); + + list_for_each_entry_safe(cq, cq_tmp, &flush_list, ibkill_flush_ent) { + ionic_notify_flush_cq(cq); + kref_put(&cq->cq_kref, ionic_cq_complete); + list_del(&cq->ibkill_flush_ent); + } +} + static void ionic_kill_ibdev(struct ionic_ibdev *dev, bool fatal_path) { unsigned long irqflags; @@ -650,6 +688,9 @@ static void ionic_kill_ibdev(struct ionic_ibdev *dev, bool fatal_path) spin_unlock(&aq->lock); } + if (do_flush) + ionic_flush_qs(dev); + local_irq_restore(irqflags); /* Post a fatal event if requested */ @@ -789,6 +830,65 @@ static void ionic_cq_event(struct ionic_ibdev *dev, u32 cqid, u8 code) kref_put(&cq->cq_kref, ionic_cq_complete); } +static void ionic_qp_event(struct ionic_ibdev *dev, u32 qpid, u8 code) +{ + unsigned long irqflags; + struct ib_event ibev; + struct ionic_qp *qp; + + xa_lock_irqsave(&dev->qp_tbl, irqflags); + qp = xa_load(&dev->qp_tbl, qpid); + if (qp) + kref_get(&qp->qp_kref); + xa_unlock_irqrestore(&dev->qp_tbl, irqflags); + + if (!qp) { + ibdev_dbg(&dev->ibdev, + "missing qpid %#x code %u\n", qpid, code); + return; + } + + ibev.device = &dev->ibdev; + ibev.element.qp = &qp->ibqp; + + switch (code) { + case IONIC_V1_EQE_SQ_DRAIN: + ibev.event = IB_EVENT_SQ_DRAINED; + break; + + case IONIC_V1_EQE_QP_COMM_EST: + ibev.event = IB_EVENT_COMM_EST; + break; + + case IONIC_V1_EQE_QP_LAST_WQE: + ibev.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + + case IONIC_V1_EQE_QP_ERR: + ibev.event = IB_EVENT_QP_FATAL; + break; + + case IONIC_V1_EQE_QP_ERR_REQUEST: + ibev.event = IB_EVENT_QP_REQ_ERR; + break; + + case IONIC_V1_EQE_QP_ERR_ACCESS: + ibev.event = IB_EVENT_QP_ACCESS_ERR; + break; + + default: + ibdev_dbg(&dev->ibdev, + "unrecognized qpid %#x code %u\n", qpid, code); + goto out; + } + + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ibev, qp->ibqp.qp_context); + +out: + kref_put(&qp->qp_kref, ionic_qp_complete); +} + static u16 ionic_poll_eq(struct ionic_eq *eq, u16 budget) { struct ionic_ibdev *dev = eq->dev; @@ -818,6 +918,10 @@ static u16 ionic_poll_eq(struct ionic_eq *eq, u16 budget) ionic_cq_event(dev, qid, code); break; + case IONIC_V1_EQE_TYPE_QP: + ionic_qp_event(dev, qid, code); + break; + default: ibdev_dbg(&dev->ibdev, "unknown event %#x type %u\n", evt, type); diff --git a/drivers/infiniband/hw/ionic/ionic_controlpath.c b/drivers/infiniband/hw/ionic/ionic_controlpath.c index e1130573bd39..9ce7c2e6d7a8 100644 --- a/drivers/infiniband/hw/ionic/ionic_controlpath.c +++ b/drivers/infiniband/hw/ionic/ionic_controlpath.c @@ -1,8 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ +#include +#include +#include +#include +#include +#include + +#include "ionic_fw.h" #include "ionic_ibdev.h" +#define ionic_set_ecn(tos) (((tos) | 2u) & ~1u) +#define ionic_clear_ecn(tos) ((tos) & ~3u) + static int ionic_validate_qdesc(struct ionic_qdesc *q) { if (!q->addr || !q->size || !q->mask || @@ -179,3 +190,2490 @@ void ionic_destroy_cq_common(struct ionic_ibdev *dev, struct ionic_cq *cq) cq->vcq = NULL; } + +static int ionic_validate_qdesc_zero(struct ionic_qdesc *q) +{ + if (q->addr || q->size || q->mask || q->depth_log2 || q->stride_log2) + return -EINVAL; + + return 0; +} + +static int ionic_get_pdid(struct ionic_ibdev *dev, u32 *pdid) +{ + int rc; + + rc = ionic_resid_get(&dev->inuse_pdid); + if (rc < 0) + return rc; + + *pdid = rc; + return 0; +} + +static int ionic_get_ahid(struct ionic_ibdev *dev, u32 *ahid) +{ + int rc; + + rc = ionic_resid_get(&dev->inuse_ahid); + if (rc < 0) + return rc; + + *ahid = rc; + return 0; +} + +static int ionic_get_mrid(struct ionic_ibdev *dev, u32 *mrid) +{ + int rc; + + /* wrap to 1, skip reserved lkey */ + rc = ionic_resid_get_shared(&dev->inuse_mrid, 1, + dev->inuse_mrid.inuse_size); + if (rc < 0) + return rc; + + *mrid = ionic_mrid(rc, dev->next_mrkey++); + return 0; +} + +static int ionic_get_gsi_qpid(struct ionic_ibdev *dev, u32 *qpid) +{ + int rc = 0; + + rc = ionic_resid_get_shared(&dev->inuse_qpid, IB_QPT_GSI, IB_QPT_GSI + 1); + if (rc < 0) + return rc; + + *qpid = IB_QPT_GSI; + return 0; +} + +static int ionic_get_qpid(struct ionic_ibdev *dev, u32 *qpid, + u8 *udma_idx, u8 udma_mask) +{ + unsigned int size, base, bound; + int udma_i, udma_x, udma_ix; + int rc = -EINVAL; + + udma_x = dev->next_qpid_udma_idx; + + dev->next_qpid_udma_idx ^= dev->lif_cfg.udma_count - 1; + + for (udma_i = 0; udma_i < dev->lif_cfg.udma_count; ++udma_i) { + udma_ix = udma_i ^ udma_x; + + if (!(udma_mask & BIT(udma_ix))) + continue; + + size = dev->lif_cfg.qp_count / dev->lif_cfg.udma_count; + base = size * udma_ix; + bound = base + size; + + /* skip reserved SMI and GSI qpids in group zero */ + if (!base) + base = 2; + + rc = ionic_resid_get_shared(&dev->inuse_qpid, base, bound); + if (rc >= 0) { + *qpid = ionic_bitid_to_qid(rc, + dev->lif_cfg.udma_qgrp_shift, + dev->half_qpid_udma_shift); + *udma_idx = udma_ix; + + rc = 0; + break; + } + } + + return rc; +} + +static int ionic_get_dbid(struct ionic_ibdev *dev, u32 *dbid, phys_addr_t *addr) +{ + int rc, dbpage_num; + + /* wrap to 1, skip kernel reserved */ + rc = ionic_resid_get_shared(&dev->inuse_dbid, 1, + dev->inuse_dbid.inuse_size); + if (rc < 0) + return rc; + + dbpage_num = (dev->lif_cfg.lif_hw_index * dev->lif_cfg.dbid_count) + rc; + *addr = dev->lif_cfg.db_phys + ((phys_addr_t)dbpage_num << PAGE_SHIFT); + + *dbid = rc; + + return 0; +} + +static void ionic_put_pdid(struct ionic_ibdev *dev, u32 pdid) +{ + ionic_resid_put(&dev->inuse_pdid, pdid); +} + +static void ionic_put_ahid(struct ionic_ibdev *dev, u32 ahid) +{ + ionic_resid_put(&dev->inuse_ahid, ahid); +} + +static void ionic_put_mrid(struct ionic_ibdev *dev, u32 mrid) +{ + ionic_resid_put(&dev->inuse_mrid, ionic_mrid_index(mrid)); +} + +static void ionic_put_qpid(struct ionic_ibdev *dev, u32 qpid) +{ + u32 bitid = ionic_qid_to_bitid(qpid, + dev->lif_cfg.udma_qgrp_shift, + dev->half_qpid_udma_shift); + + ionic_resid_put(&dev->inuse_qpid, bitid); +} + +static void ionic_put_dbid(struct ionic_ibdev *dev, u32 dbid) +{ + ionic_resid_put(&dev->inuse_dbid, dbid); +} + +static struct rdma_user_mmap_entry* +ionic_mmap_entry_insert(struct ionic_ctx *ctx, unsigned long size, + unsigned long pfn, u8 mmap_flags, u64 *offset) +{ + struct ionic_mmap_entry *entry; + int rc; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->size = size; + entry->pfn = pfn; + entry->mmap_flags = mmap_flags; + + rc = rdma_user_mmap_entry_insert(&ctx->ibctx, &entry->rdma_entry, + entry->size); + if (rc) { + kfree(entry); + return NULL; + } + + if (offset) + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return &entry->rdma_entry; +} + +int ionic_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibctx->device); + struct ionic_ctx *ctx = to_ionic_ctx(ibctx); + struct ionic_ctx_resp resp = {}; + struct ionic_ctx_req req; + phys_addr_t db_phys = 0; + int rc; + + rc = ib_copy_from_udata(&req, udata, sizeof(req)); + if (rc) + return rc; + + /* try to allocate dbid for user ctx */ + rc = ionic_get_dbid(dev, &ctx->dbid, &db_phys); + if (rc < 0) + return rc; + + ibdev_dbg(&dev->ibdev, "user space dbid %u\n", ctx->dbid); + + ctx->mmap_dbell = ionic_mmap_entry_insert(ctx, PAGE_SIZE, + PHYS_PFN(db_phys), 0, NULL); + if (!ctx->mmap_dbell) { + rc = -ENOMEM; + goto err_mmap_dbell; + } + + resp.page_shift = PAGE_SHIFT; + + resp.dbell_offset = db_phys & ~PAGE_MASK; + + resp.version = dev->lif_cfg.rdma_version; + resp.qp_opcodes = dev->lif_cfg.qp_opcodes; + resp.admin_opcodes = dev->lif_cfg.admin_opcodes; + + resp.sq_qtype = dev->lif_cfg.sq_qtype; + resp.rq_qtype = dev->lif_cfg.rq_qtype; + resp.cq_qtype = dev->lif_cfg.cq_qtype; + resp.admin_qtype = dev->lif_cfg.aq_qtype; + resp.max_stride = dev->lif_cfg.max_stride; + resp.max_spec = IONIC_SPEC_HIGH; + + resp.udma_count = dev->lif_cfg.udma_count; + resp.expdb_mask = dev->lif_cfg.expdb_mask; + + if (dev->lif_cfg.sq_expdb) + resp.expdb_qtypes |= IONIC_EXPDB_SQ; + if (dev->lif_cfg.rq_expdb) + resp.expdb_qtypes |= IONIC_EXPDB_RQ; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + + return 0; + +err_resp: + rdma_user_mmap_entry_remove(ctx->mmap_dbell); +err_mmap_dbell: + ionic_put_dbid(dev, ctx->dbid); + + return rc; +} + +void ionic_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibctx->device); + struct ionic_ctx *ctx = to_ionic_ctx(ibctx); + + rdma_user_mmap_entry_remove(ctx->mmap_dbell); + ionic_put_dbid(dev, ctx->dbid); +} + +int ionic_mmap(struct ib_ucontext *ibctx, struct vm_area_struct *vma) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibctx->device); + struct ionic_ctx *ctx = to_ionic_ctx(ibctx); + struct rdma_user_mmap_entry *rdma_entry; + struct ionic_mmap_entry *ionic_entry; + int rc = 0; + + rdma_entry = rdma_user_mmap_entry_get(&ctx->ibctx, vma); + if (!rdma_entry) { + ibdev_dbg(&dev->ibdev, "not found %#lx\n", + vma->vm_pgoff << PAGE_SHIFT); + return -EINVAL; + } + + ionic_entry = container_of(rdma_entry, struct ionic_mmap_entry, + rdma_entry); + + ibdev_dbg(&dev->ibdev, "writecombine? %d\n", + ionic_entry->mmap_flags & IONIC_MMAP_WC); + if (ionic_entry->mmap_flags & IONIC_MMAP_WC) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + ibdev_dbg(&dev->ibdev, "remap st %#lx pf %#lx sz %#lx\n", + vma->vm_start, ionic_entry->pfn, ionic_entry->size); + rc = rdma_user_mmap_io(&ctx->ibctx, vma, ionic_entry->pfn, + ionic_entry->size, vma->vm_page_prot, + rdma_entry); + if (rc) + ibdev_dbg(&dev->ibdev, "remap failed %d\n", rc); + + rdma_user_mmap_entry_put(rdma_entry); + return rc; +} + +void ionic_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct ionic_mmap_entry *ionic_entry; + + ionic_entry = container_of(rdma_entry, struct ionic_mmap_entry, + rdma_entry); + kfree(ionic_entry); +} + +int ionic_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + + return ionic_get_pdid(dev, &pd->pdid); +} + +int ionic_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + + ionic_put_pdid(dev, pd->pdid); + + return 0; +} + +static int ionic_build_hdr(struct ionic_ibdev *dev, + struct ib_ud_header *hdr, + const struct rdma_ah_attr *attr, + u16 sport, bool want_ecn) +{ + const struct ib_global_route *grh; + enum rdma_network_type net; + u16 vlan; + int rc; + + if (attr->ah_flags != IB_AH_GRH) + return -EINVAL; + if (attr->type != RDMA_AH_ATTR_TYPE_ROCE) + return -EINVAL; + + grh = rdma_ah_read_grh(attr); + + rc = rdma_read_gid_l2_fields(grh->sgid_attr, &vlan, &hdr->eth.smac_h[0]); + if (rc) + return rc; + + net = rdma_gid_attr_network_type(grh->sgid_attr); + + rc = ib_ud_header_init(0, /* no payload */ + 0, /* no lrh */ + 1, /* yes eth */ + vlan != 0xffff, + 0, /* no grh */ + net == RDMA_NETWORK_IPV4 ? 4 : 6, + 1, /* yes udp */ + 0, /* no imm */ + hdr); + if (rc) + return rc; + + ether_addr_copy(hdr->eth.dmac_h, attr->roce.dmac); + + if (net == RDMA_NETWORK_IPV4) { + hdr->eth.type = cpu_to_be16(ETH_P_IP); + hdr->ip4.frag_off = cpu_to_be16(0x4000); /* don't fragment */ + hdr->ip4.ttl = grh->hop_limit; + hdr->ip4.tot_len = cpu_to_be16(0xffff); + hdr->ip4.saddr = + *(const __be32 *)(grh->sgid_attr->gid.raw + 12); + hdr->ip4.daddr = *(const __be32 *)(grh->dgid.raw + 12); + + if (want_ecn) + hdr->ip4.tos = ionic_set_ecn(grh->traffic_class); + else + hdr->ip4.tos = ionic_clear_ecn(grh->traffic_class); + } else { + hdr->eth.type = cpu_to_be16(ETH_P_IPV6); + hdr->grh.flow_label = cpu_to_be32(grh->flow_label); + hdr->grh.hop_limit = grh->hop_limit; + hdr->grh.source_gid = grh->sgid_attr->gid; + hdr->grh.destination_gid = grh->dgid; + + if (want_ecn) + hdr->grh.traffic_class = + ionic_set_ecn(grh->traffic_class); + else + hdr->grh.traffic_class = + ionic_clear_ecn(grh->traffic_class); + } + + if (vlan != 0xffff) { + vlan |= rdma_ah_get_sl(attr) << VLAN_PRIO_SHIFT; + hdr->vlan.tag = cpu_to_be16(vlan); + hdr->vlan.type = hdr->eth.type; + hdr->eth.type = cpu_to_be16(ETH_P_8021Q); + } + + hdr->udp.sport = cpu_to_be16(sport); + hdr->udp.dport = cpu_to_be16(ROCE_V2_UDP_DPORT); + + return 0; +} + +static void ionic_set_ah_attr(struct ionic_ibdev *dev, + struct rdma_ah_attr *ah_attr, + struct ib_ud_header *hdr, + int sgid_index) +{ + u32 flow_label; + u16 vlan = 0; + u8 tos, ttl; + + if (hdr->vlan_present) + vlan = be16_to_cpu(hdr->vlan.tag); + + if (hdr->ipv4_present) { + flow_label = 0; + ttl = hdr->ip4.ttl; + tos = hdr->ip4.tos; + *(__be16 *)(hdr->grh.destination_gid.raw + 10) = cpu_to_be16(0xffff); + *(__be32 *)(hdr->grh.destination_gid.raw + 12) = hdr->ip4.daddr; + } else { + flow_label = be32_to_cpu(hdr->grh.flow_label); + ttl = hdr->grh.hop_limit; + tos = hdr->grh.traffic_class; + } + + memset(ah_attr, 0, sizeof(*ah_attr)); + ah_attr->type = RDMA_AH_ATTR_TYPE_ROCE; + if (hdr->eth_present) + memcpy(&ah_attr->roce.dmac, &hdr->eth.dmac_h, ETH_ALEN); + rdma_ah_set_sl(ah_attr, vlan >> VLAN_PRIO_SHIFT); + rdma_ah_set_port_num(ah_attr, 1); + rdma_ah_set_grh(ah_attr, NULL, flow_label, sgid_index, ttl, tos); + rdma_ah_set_dgid_raw(ah_attr, &hdr->grh.destination_gid); +} + +static int ionic_create_ah_cmd(struct ionic_ibdev *dev, + struct ionic_ah *ah, + struct ionic_pd *pd, + struct rdma_ah_attr *attr, + u32 flags) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_AH, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_AH_IN_V1_LEN), + .cmd.create_ah = { + .pd_id = cpu_to_le32(pd->pdid), + .dbid_flags = cpu_to_le16(dev->lif_cfg.dbid), + .id_ver = cpu_to_le32(ah->ahid), + } + } + }; + enum ionic_admin_flags admin_flags = 0; + dma_addr_t hdr_dma = 0; + void *hdr_buf; + gfp_t gfp = GFP_ATOMIC; + int rc, hdr_len = 0; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_AH) + return -EBADRQC; + + if (flags & RDMA_CREATE_AH_SLEEPABLE) + gfp = GFP_KERNEL; + else + admin_flags |= IONIC_ADMIN_F_BUSYWAIT; + + rc = ionic_build_hdr(dev, &ah->hdr, attr, IONIC_ROCE_UDP_SPORT, false); + if (rc) + return rc; + + if (ah->hdr.eth.type == cpu_to_be16(ETH_P_8021Q)) { + if (ah->hdr.vlan.type == cpu_to_be16(ETH_P_IP)) + wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP; + else + wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_UDP; + } else { + if (ah->hdr.eth.type == cpu_to_be16(ETH_P_IP)) + wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_IPV4_UDP; + else + wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_IPV6_UDP; + } + + ah->sgid_index = rdma_ah_read_grh(attr)->sgid_index; + + hdr_buf = kmalloc(PAGE_SIZE, gfp); + if (!hdr_buf) + return -ENOMEM; + + hdr_len = ib_ud_header_pack(&ah->hdr, hdr_buf); + hdr_len -= IB_BTH_BYTES; + hdr_len -= IB_DETH_BYTES; + ibdev_dbg(&dev->ibdev, "roce packet header template\n"); + print_hex_dump_debug("hdr ", DUMP_PREFIX_OFFSET, 16, 1, + hdr_buf, hdr_len, true); + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, hdr_buf, hdr_len, + DMA_TO_DEVICE); + + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_dma; + + wr.wqe.cmd.create_ah.dma_addr = cpu_to_le64(hdr_dma); + wr.wqe.cmd.create_ah.length = cpu_to_le32(hdr_len); + + ionic_admin_post(dev, &wr); + rc = ionic_admin_wait(dev, &wr, admin_flags); + + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, hdr_len, + DMA_TO_DEVICE); +err_dma: + kfree(hdr_buf); + + return rc; +} + +static int ionic_destroy_ah_cmd(struct ionic_ibdev *dev, u32 ahid, u32 flags) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_AH, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_AH_IN_V1_LEN), + .cmd.destroy_ah = { + .ah_id = cpu_to_le32(ahid), + }, + } + }; + enum ionic_admin_flags admin_flags = IONIC_ADMIN_F_TEARDOWN; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_AH) + return -EBADRQC; + + if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) + admin_flags |= IONIC_ADMIN_F_BUSYWAIT; + + ionic_admin_post(dev, &wr); + ionic_admin_wait(dev, &wr, admin_flags); + + /* No host-memory resource is associated with ah, so it is ok + * to "succeed" and complete this destroy ah on the host. + */ + return 0; +} + +int ionic_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibah->device); + struct rdma_ah_attr *attr = init_attr->ah_attr; + struct ionic_pd *pd = to_ionic_pd(ibah->pd); + struct ionic_ah *ah = to_ionic_ah(ibah); + struct ionic_ah_resp resp = {}; + u32 flags = init_attr->flags; + int rc; + + rc = ionic_get_ahid(dev, &ah->ahid); + if (rc) + return rc; + + rc = ionic_create_ah_cmd(dev, ah, pd, attr, flags); + if (rc) + goto err_cmd; + + if (udata) { + resp.ahid = ah->ahid; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + } + + return 0; + +err_resp: + ionic_destroy_ah_cmd(dev, ah->ahid, flags); +err_cmd: + ionic_put_ahid(dev, ah->ahid); + return rc; +} + +int ionic_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibah->device); + struct ionic_ah *ah = to_ionic_ah(ibah); + + ionic_set_ah_attr(dev, ah_attr, &ah->hdr, ah->sgid_index); + + return 0; +} + +int ionic_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibah->device); + struct ionic_ah *ah = to_ionic_ah(ibah); + int rc; + + rc = ionic_destroy_ah_cmd(dev, ah->ahid, flags); + if (rc) + return rc; + + ionic_put_ahid(dev, ah->ahid); + + return 0; +} + +static int ionic_create_mr_cmd(struct ionic_ibdev *dev, + struct ionic_pd *pd, + struct ionic_mr *mr, + u64 addr, + u64 length) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_MR, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_MR_IN_V1_LEN), + .cmd.create_mr = { + .va = cpu_to_le64(addr), + .length = cpu_to_le64(length), + .pd_id = cpu_to_le32(pd->pdid), + .page_size_log2 = mr->buf.page_size_log2, + .tbl_index = cpu_to_le32(~0), + .map_count = cpu_to_le32(mr->buf.tbl_pages), + .dma_addr = ionic_pgtbl_dma(&mr->buf, addr), + .dbid_flags = cpu_to_le16(mr->flags), + .id_ver = cpu_to_le32(mr->mrid), + } + } + }; + int rc; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_MR) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + rc = ionic_admin_wait(dev, &wr, 0); + if (!rc) + mr->created = true; + + return rc; +} + +static int ionic_destroy_mr_cmd(struct ionic_ibdev *dev, u32 mrid) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_MR, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_MR_IN_V1_LEN), + .cmd.destroy_mr = { + .mr_id = cpu_to_le32(mrid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_MR) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_TEARDOWN); +} + +struct ib_mr *ionic_get_dma_mr(struct ib_pd *ibpd, int access) +{ + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ionic_mr *mr; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->ibmr.lkey = IONIC_DMA_LKEY; + mr->ibmr.rkey = IONIC_DMA_RKEY; + + if (pd) + pd->flags |= IONIC_QPF_PRIVILEGED; + + return &mr->ibmr; +} + +struct ib_mr *ionic_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 addr, int access, struct ib_dmah *dmah, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ionic_mr *mr; + unsigned long pg_sz; + int rc; + + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + goto err_mrid; + + mr->ibmr.lkey = mr->mrid; + mr->ibmr.rkey = mr->mrid; + mr->ibmr.iova = addr; + mr->ibmr.length = length; + + mr->flags = IONIC_MRF_USER_MR | to_ionic_mr_flags(access); + + mr->umem = ib_umem_get(&dev->ibdev, start, length, access); + if (IS_ERR(mr->umem)) { + rc = PTR_ERR(mr->umem); + goto err_umem; + } + + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->lif_cfg.page_size_supported, + addr); + if (!pg_sz) { + rc = -EINVAL; + goto err_pgtbl; + } + + rc = ionic_pgtbl_init(dev, &mr->buf, mr->umem, 0, 1, pg_sz); + if (rc) + goto err_pgtbl; + + rc = ionic_create_mr_cmd(dev, pd, mr, addr, length); + if (rc) + goto err_cmd; + + ionic_pgtbl_unbuf(dev, &mr->buf); + + return &mr->ibmr; + +err_cmd: + ionic_pgtbl_unbuf(dev, &mr->buf); +err_pgtbl: + ib_umem_release(mr->umem); +err_umem: + ionic_put_mrid(dev, mr->mrid); +err_mrid: + kfree(mr); + return ERR_PTR(rc); +} + +struct ib_mr *ionic_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 offset, + u64 length, u64 addr, int fd, int access, + struct ib_dmah *dmah, + struct uverbs_attr_bundle *attrs) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ib_umem_dmabuf *umem_dmabuf; + struct ionic_mr *mr; + u64 pg_sz; + int rc; + + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + goto err_mrid; + + mr->ibmr.lkey = mr->mrid; + mr->ibmr.rkey = mr->mrid; + mr->ibmr.iova = addr; + mr->ibmr.length = length; + + mr->flags = IONIC_MRF_USER_MR | to_ionic_mr_flags(access); + + umem_dmabuf = ib_umem_dmabuf_get_pinned(&dev->ibdev, offset, length, + fd, access); + if (IS_ERR(umem_dmabuf)) { + rc = PTR_ERR(umem_dmabuf); + goto err_umem; + } + + mr->umem = &umem_dmabuf->umem; + + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->lif_cfg.page_size_supported, + addr); + if (!pg_sz) { + rc = -EINVAL; + goto err_pgtbl; + } + + rc = ionic_pgtbl_init(dev, &mr->buf, mr->umem, 0, 1, pg_sz); + if (rc) + goto err_pgtbl; + + rc = ionic_create_mr_cmd(dev, pd, mr, addr, length); + if (rc) + goto err_cmd; + + ionic_pgtbl_unbuf(dev, &mr->buf); + + return &mr->ibmr; + +err_cmd: + ionic_pgtbl_unbuf(dev, &mr->buf); +err_pgtbl: + ib_umem_release(mr->umem); +err_umem: + ionic_put_mrid(dev, mr->mrid); +err_mrid: + kfree(mr); + return ERR_PTR(rc); +} + +int ionic_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmr->device); + struct ionic_mr *mr = to_ionic_mr(ibmr); + int rc; + + if (!mr->ibmr.lkey) + goto out; + + if (mr->created) { + rc = ionic_destroy_mr_cmd(dev, mr->mrid); + if (rc) + return rc; + } + + ionic_pgtbl_unbuf(dev, &mr->buf); + + if (mr->umem) + ib_umem_release(mr->umem); + + ionic_put_mrid(dev, mr->mrid); + +out: + kfree(mr); + + return 0; +} + +struct ib_mr *ionic_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type type, + u32 max_sg) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ionic_mr *mr; + int rc; + + if (type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + goto err_mrid; + + mr->ibmr.lkey = mr->mrid; + mr->ibmr.rkey = mr->mrid; + + mr->flags = IONIC_MRF_PHYS_MR; + + rc = ionic_pgtbl_init(dev, &mr->buf, mr->umem, 0, max_sg, PAGE_SIZE); + if (rc) + goto err_pgtbl; + + mr->buf.tbl_pages = 0; + + rc = ionic_create_mr_cmd(dev, pd, mr, 0, 0); + if (rc) + goto err_cmd; + + return &mr->ibmr; + +err_cmd: + ionic_pgtbl_unbuf(dev, &mr->buf); +err_pgtbl: + ionic_put_mrid(dev, mr->mrid); +err_mrid: + kfree(mr); + return ERR_PTR(rc); +} + +static int ionic_map_mr_page(struct ib_mr *ibmr, u64 dma) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmr->device); + struct ionic_mr *mr = to_ionic_mr(ibmr); + + ibdev_dbg(&dev->ibdev, "dma %p\n", (void *)dma); + return ionic_pgtbl_page(&mr->buf, dma); +} + +int ionic_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmr->device); + struct ionic_mr *mr = to_ionic_mr(ibmr); + int rc; + + /* mr must be allocated using ib_alloc_mr() */ + if (unlikely(!mr->buf.tbl_limit)) + return -EINVAL; + + mr->buf.tbl_pages = 0; + + if (mr->buf.tbl_buf) + dma_sync_single_for_cpu(dev->lif_cfg.hwdev, mr->buf.tbl_dma, + mr->buf.tbl_size, DMA_TO_DEVICE); + + ibdev_dbg(&dev->ibdev, "sg %p nent %d\n", sg, sg_nents); + rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, ionic_map_mr_page); + + mr->buf.page_size_log2 = order_base_2(ibmr->page_size); + + if (mr->buf.tbl_buf) + dma_sync_single_for_device(dev->lif_cfg.hwdev, mr->buf.tbl_dma, + mr->buf.tbl_size, DMA_TO_DEVICE); + + return rc; +} + +int ionic_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmw->device); + struct ionic_pd *pd = to_ionic_pd(ibmw->pd); + struct ionic_mr *mr = to_ionic_mw(ibmw); + int rc; + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + return rc; + + mr->ibmw.rkey = mr->mrid; + + if (mr->ibmw.type == IB_MW_TYPE_1) + mr->flags = IONIC_MRF_MW_1; + else + mr->flags = IONIC_MRF_MW_2; + + rc = ionic_create_mr_cmd(dev, pd, mr, 0, 0); + if (rc) + goto err_cmd; + + return 0; + +err_cmd: + ionic_put_mrid(dev, mr->mrid); + return rc; +} + +int ionic_dealloc_mw(struct ib_mw *ibmw) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmw->device); + struct ionic_mr *mr = to_ionic_mw(ibmw); + int rc; + + rc = ionic_destroy_mr_cmd(dev, mr->mrid); + if (rc) + return rc; + + ionic_put_mrid(dev, mr->mrid); + + return 0; +} + +static int ionic_create_cq_cmd(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_cq *cq, + struct ionic_tbl_buf *buf) +{ + const u16 dbid = ionic_ctx_dbid(dev, ctx); + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_CQ, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_CQ_IN_V1_LEN), + .cmd.create_cq = { + .eq_id = cpu_to_le32(cq->eqid), + .depth_log2 = cq->q.depth_log2, + .stride_log2 = cq->q.stride_log2, + .page_size_log2 = buf->page_size_log2, + .tbl_index = cpu_to_le32(~0), + .map_count = cpu_to_le32(buf->tbl_pages), + .dma_addr = ionic_pgtbl_dma(buf, 0), + .dbid_flags = cpu_to_le16(dbid), + .id_ver = cpu_to_le32(cq->cqid), + } + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_CQ) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, 0); +} + +static int ionic_destroy_cq_cmd(struct ionic_ibdev *dev, u32 cqid) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_CQ, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_CQ_IN_V1_LEN), + .cmd.destroy_cq = { + .cq_id = cpu_to_le32(cqid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_CQ) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_TEARDOWN); +} + +int ionic_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibcq->device); + struct ib_udata *udata = &attrs->driver_udata; + struct ionic_ctx *ctx = + rdma_udata_to_drv_context(udata, struct ionic_ctx, ibctx); + struct ionic_vcq *vcq = to_ionic_vcq(ibcq); + struct ionic_tbl_buf buf = {}; + struct ionic_cq_resp resp; + struct ionic_cq_req req; + int udma_idx = 0, rc; + + if (udata) { + rc = ib_copy_from_udata(&req, udata, sizeof(req)); + if (rc) + return rc; + } + + vcq->udma_mask = BIT(dev->lif_cfg.udma_count) - 1; + + if (udata) + vcq->udma_mask &= req.udma_mask; + + if (!vcq->udma_mask) { + rc = -EINVAL; + goto err_init; + } + + for (; udma_idx < dev->lif_cfg.udma_count; ++udma_idx) { + if (!(vcq->udma_mask & BIT(udma_idx))) + continue; + + rc = ionic_create_cq_common(vcq, &buf, attr, ctx, udata, + &req.cq[udma_idx], + &resp.cqid[udma_idx], + udma_idx); + if (rc) + goto err_init; + + rc = ionic_create_cq_cmd(dev, ctx, &vcq->cq[udma_idx], &buf); + if (rc) + goto err_cmd; + + ionic_pgtbl_unbuf(dev, &buf); + } + + vcq->ibcq.cqe = attr->cqe; + + if (udata) { + resp.udma_mask = vcq->udma_mask; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + } + + return 0; + +err_resp: + while (udma_idx) { + --udma_idx; + if (!(vcq->udma_mask & BIT(udma_idx))) + continue; + ionic_destroy_cq_cmd(dev, vcq->cq[udma_idx].cqid); +err_cmd: + ionic_pgtbl_unbuf(dev, &buf); + ionic_destroy_cq_common(dev, &vcq->cq[udma_idx]); +err_init: + ; + } + + return rc; +} + +int ionic_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibcq->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibcq); + int udma_idx, rc_tmp, rc = 0; + + for (udma_idx = dev->lif_cfg.udma_count; udma_idx; ) { + --udma_idx; + + if (!(vcq->udma_mask & BIT(udma_idx))) + continue; + + rc_tmp = ionic_destroy_cq_cmd(dev, vcq->cq[udma_idx].cqid); + if (rc_tmp) { + if (!rc) + rc = rc_tmp; + + continue; + } + + ionic_destroy_cq_common(dev, &vcq->cq[udma_idx]); + } + + return rc; +} + +static bool pd_remote_privileged(struct ib_pd *pd) +{ + return pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; +} + +static int ionic_create_qp_cmd(struct ionic_ibdev *dev, + struct ionic_pd *pd, + struct ionic_cq *send_cq, + struct ionic_cq *recv_cq, + struct ionic_qp *qp, + struct ionic_tbl_buf *sq_buf, + struct ionic_tbl_buf *rq_buf, + struct ib_qp_init_attr *attr) +{ + const u16 dbid = ionic_obj_dbid(dev, pd->ibpd.uobject); + const u32 flags = to_ionic_qp_flags(0, 0, + qp->sq_cmb & IONIC_CMB_ENABLE, + qp->rq_cmb & IONIC_CMB_ENABLE, + qp->sq_spec, qp->rq_spec, + pd->flags & IONIC_QPF_PRIVILEGED, + pd_remote_privileged(&pd->ibpd)); + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_QP, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_QP_IN_V1_LEN), + .cmd.create_qp = { + .pd_id = cpu_to_le32(pd->pdid), + .priv_flags = cpu_to_be32(flags), + .type_state = to_ionic_qp_type(attr->qp_type), + .dbid_flags = cpu_to_le16(dbid), + .id_ver = cpu_to_le32(qp->qpid), + } + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_QP) + return -EBADRQC; + + if (qp->has_sq) { + wr.wqe.cmd.create_qp.sq_cq_id = cpu_to_le32(send_cq->cqid); + wr.wqe.cmd.create_qp.sq_depth_log2 = qp->sq.depth_log2; + wr.wqe.cmd.create_qp.sq_stride_log2 = qp->sq.stride_log2; + wr.wqe.cmd.create_qp.sq_page_size_log2 = sq_buf->page_size_log2; + wr.wqe.cmd.create_qp.sq_tbl_index_xrcd_id = cpu_to_le32(~0); + wr.wqe.cmd.create_qp.sq_map_count = + cpu_to_le32(sq_buf->tbl_pages); + wr.wqe.cmd.create_qp.sq_dma_addr = ionic_pgtbl_dma(sq_buf, 0); + } + + if (qp->has_rq) { + wr.wqe.cmd.create_qp.rq_cq_id = cpu_to_le32(recv_cq->cqid); + wr.wqe.cmd.create_qp.rq_depth_log2 = qp->rq.depth_log2; + wr.wqe.cmd.create_qp.rq_stride_log2 = qp->rq.stride_log2; + wr.wqe.cmd.create_qp.rq_page_size_log2 = rq_buf->page_size_log2; + wr.wqe.cmd.create_qp.rq_tbl_index_srq_id = cpu_to_le32(~0); + wr.wqe.cmd.create_qp.rq_map_count = + cpu_to_le32(rq_buf->tbl_pages); + wr.wqe.cmd.create_qp.rq_dma_addr = ionic_pgtbl_dma(rq_buf, 0); + } + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, 0); +} + +static int ionic_modify_qp_cmd(struct ionic_ibdev *dev, + struct ionic_pd *pd, + struct ionic_qp *qp, + struct ib_qp_attr *attr, + int mask) +{ + const u32 flags = to_ionic_qp_flags(attr->qp_access_flags, + attr->en_sqd_async_notify, + qp->sq_cmb & IONIC_CMB_ENABLE, + qp->rq_cmb & IONIC_CMB_ENABLE, + qp->sq_spec, qp->rq_spec, + pd->flags & IONIC_QPF_PRIVILEGED, + pd_remote_privileged(qp->ibqp.pd)); + const u8 state = to_ionic_qp_modify_state(attr->qp_state, + attr->cur_qp_state); + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_MODIFY_QP, + .len = cpu_to_le16(IONIC_ADMIN_MODIFY_QP_IN_V1_LEN), + .cmd.mod_qp = { + .attr_mask = cpu_to_be32(mask), + .access_flags = cpu_to_be16(flags), + .rq_psn = cpu_to_le32(attr->rq_psn), + .sq_psn = cpu_to_le32(attr->sq_psn), + .rate_limit_kbps = + cpu_to_le32(attr->rate_limit), + .pmtu = (attr->path_mtu + 7), + .retry = (attr->retry_cnt | + (attr->rnr_retry << 4)), + .rnr_timer = attr->min_rnr_timer, + .retry_timeout = attr->timeout, + .type_state = state, + .id_ver = cpu_to_le32(qp->qpid), + } + } + }; + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + void *hdr_buf = NULL; + dma_addr_t hdr_dma = 0; + int rc, hdr_len = 0; + u16 sport; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_MODIFY_QP) + return -EBADRQC; + + if ((mask & IB_QP_MAX_DEST_RD_ATOMIC) && attr->max_dest_rd_atomic) { + /* Note, round up/down was already done for allocating + * resources on the device. The allocation order is in cache + * line size. We can't use the order of the resource + * allocation to determine the order wqes here, because for + * queue length <= one cache line it is not distinct. + * + * Therefore, order wqes is computed again here. + * + * Account for hole and round up to the next order. + */ + wr.wqe.cmd.mod_qp.rsq_depth = + order_base_2(attr->max_dest_rd_atomic + 1); + wr.wqe.cmd.mod_qp.rsq_index = cpu_to_le32(~0); + } + + if ((mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) { + /* Account for hole and round down to the next order */ + wr.wqe.cmd.mod_qp.rrq_depth = + order_base_2(attr->max_rd_atomic + 2) - 1; + wr.wqe.cmd.mod_qp.rrq_index = cpu_to_le32(~0); + } + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) + wr.wqe.cmd.mod_qp.qkey_dest_qpn = + cpu_to_le32(attr->dest_qp_num); + else + wr.wqe.cmd.mod_qp.qkey_dest_qpn = cpu_to_le32(attr->qkey); + + if (mask & IB_QP_AV) { + if (!qp->hdr) + return -ENOMEM; + + sport = rdma_get_udp_sport(grh->flow_label, + qp->qpid, + attr->dest_qp_num); + + rc = ionic_build_hdr(dev, qp->hdr, &attr->ah_attr, sport, true); + if (rc) + return rc; + + qp->sgid_index = grh->sgid_index; + + hdr_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!hdr_buf) + return -ENOMEM; + + hdr_len = ib_ud_header_pack(qp->hdr, hdr_buf); + hdr_len -= IB_BTH_BYTES; + hdr_len -= IB_DETH_BYTES; + ibdev_dbg(&dev->ibdev, "roce packet header template\n"); + print_hex_dump_debug("hdr ", DUMP_PREFIX_OFFSET, 16, 1, + hdr_buf, hdr_len, true); + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, hdr_buf, hdr_len, + DMA_TO_DEVICE); + + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_dma; + + if (qp->hdr->ipv4_present) { + wr.wqe.cmd.mod_qp.tfp_csum_profile = + qp->hdr->vlan_present ? + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP : + IONIC_TFP_CSUM_PROF_ETH_IPV4_UDP; + } else { + wr.wqe.cmd.mod_qp.tfp_csum_profile = + qp->hdr->vlan_present ? + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_UDP : + IONIC_TFP_CSUM_PROF_ETH_IPV6_UDP; + } + + wr.wqe.cmd.mod_qp.ah_id_len = + cpu_to_le32(qp->ahid | (hdr_len << 24)); + wr.wqe.cmd.mod_qp.dma_addr = cpu_to_le64(hdr_dma); + + wr.wqe.cmd.mod_qp.en_pcp = attr->ah_attr.sl; + wr.wqe.cmd.mod_qp.ip_dscp = grh->traffic_class >> 2; + } + + ionic_admin_post(dev, &wr); + + rc = ionic_admin_wait(dev, &wr, 0); + + if (mask & IB_QP_AV) + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, hdr_len, + DMA_TO_DEVICE); +err_dma: + if (mask & IB_QP_AV) + kfree(hdr_buf); + + return rc; +} + +static int ionic_query_qp_cmd(struct ionic_ibdev *dev, + struct ionic_qp *qp, + struct ib_qp_attr *attr, + int mask) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_QUERY_QP, + .len = cpu_to_le16(IONIC_ADMIN_QUERY_QP_IN_V1_LEN), + .cmd.query_qp = { + .id_ver = cpu_to_le32(qp->qpid), + }, + } + }; + struct ionic_v1_admin_query_qp_sq *query_sqbuf; + struct ionic_v1_admin_query_qp_rq *query_rqbuf; + dma_addr_t query_sqdma; + dma_addr_t query_rqdma; + dma_addr_t hdr_dma = 0; + void *hdr_buf = NULL; + int flags, rc; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_QUERY_QP) + return -EBADRQC; + + if (qp->has_sq) { + bool expdb = !!(qp->sq_cmb & IONIC_CMB_EXPDB); + + attr->cap.max_send_sge = + ionic_v1_send_wqe_max_sge(qp->sq.stride_log2, + qp->sq_spec, + expdb); + attr->cap.max_inline_data = + ionic_v1_send_wqe_max_data(qp->sq.stride_log2, expdb); + } + + if (qp->has_rq) { + attr->cap.max_recv_sge = + ionic_v1_recv_wqe_max_sge(qp->rq.stride_log2, + qp->rq_spec, + qp->rq_cmb & IONIC_CMB_EXPDB); + } + + query_sqbuf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!query_sqbuf) + return -ENOMEM; + + query_rqbuf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!query_rqbuf) { + rc = -ENOMEM; + goto err_rqbuf; + } + + query_sqdma = dma_map_single(dev->lif_cfg.hwdev, query_sqbuf, PAGE_SIZE, + DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, query_sqdma); + if (rc) + goto err_sqdma; + + query_rqdma = dma_map_single(dev->lif_cfg.hwdev, query_rqbuf, PAGE_SIZE, + DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, query_rqdma); + if (rc) + goto err_rqdma; + + if (mask & IB_QP_AV) { + hdr_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!hdr_buf) { + rc = -ENOMEM; + goto err_hdrbuf; + } + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, hdr_buf, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_hdrdma; + } + + wr.wqe.cmd.query_qp.sq_dma_addr = cpu_to_le64(query_sqdma); + wr.wqe.cmd.query_qp.rq_dma_addr = cpu_to_le64(query_rqdma); + wr.wqe.cmd.query_qp.hdr_dma_addr = cpu_to_le64(hdr_dma); + wr.wqe.cmd.query_qp.ah_id = cpu_to_le32(qp->ahid); + + ionic_admin_post(dev, &wr); + + rc = ionic_admin_wait(dev, &wr, 0); + + if (rc) + goto err_hdrdma; + + flags = be16_to_cpu(query_sqbuf->access_perms_flags | + query_rqbuf->access_perms_flags); + + print_hex_dump_debug("sqbuf ", DUMP_PREFIX_OFFSET, 16, 1, + query_sqbuf, sizeof(*query_sqbuf), true); + print_hex_dump_debug("rqbuf ", DUMP_PREFIX_OFFSET, 16, 1, + query_rqbuf, sizeof(*query_rqbuf), true); + ibdev_dbg(&dev->ibdev, "query qp %u state_pmtu %#x flags %#x", + qp->qpid, query_rqbuf->state_pmtu, flags); + + attr->qp_state = from_ionic_qp_state(query_rqbuf->state_pmtu >> 4); + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = (query_rqbuf->state_pmtu & 0xf) - 7; + attr->path_mig_state = IB_MIG_MIGRATED; + attr->qkey = be32_to_cpu(query_sqbuf->qkey_dest_qpn); + attr->rq_psn = be32_to_cpu(query_sqbuf->rq_psn); + attr->sq_psn = be32_to_cpu(query_rqbuf->sq_psn); + attr->dest_qp_num = attr->qkey; + attr->qp_access_flags = from_ionic_qp_flags(flags); + attr->pkey_index = 0; + attr->alt_pkey_index = 0; + attr->en_sqd_async_notify = !!(flags & IONIC_QPF_SQD_NOTIFY); + attr->sq_draining = !!(flags & IONIC_QPF_SQ_DRAINING); + attr->max_rd_atomic = BIT(query_rqbuf->rrq_depth) - 1; + attr->max_dest_rd_atomic = BIT(query_rqbuf->rsq_depth) - 1; + attr->min_rnr_timer = query_sqbuf->rnr_timer; + attr->port_num = 0; + attr->timeout = query_sqbuf->retry_timeout; + attr->retry_cnt = query_rqbuf->retry_rnrtry & 0xf; + attr->rnr_retry = query_rqbuf->retry_rnrtry >> 4; + attr->alt_port_num = 0; + attr->alt_timeout = 0; + attr->rate_limit = be32_to_cpu(query_sqbuf->rate_limit_kbps); + + if (mask & IB_QP_AV) + ionic_set_ah_attr(dev, &attr->ah_attr, + qp->hdr, qp->sgid_index); + +err_hdrdma: + if (mask & IB_QP_AV) { + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, + PAGE_SIZE, DMA_FROM_DEVICE); + kfree(hdr_buf); + } +err_hdrbuf: + dma_unmap_single(dev->lif_cfg.hwdev, query_rqdma, sizeof(*query_rqbuf), + DMA_FROM_DEVICE); +err_rqdma: + dma_unmap_single(dev->lif_cfg.hwdev, query_sqdma, sizeof(*query_sqbuf), + DMA_FROM_DEVICE); +err_sqdma: + kfree(query_rqbuf); +err_rqbuf: + kfree(query_sqbuf); + + return rc; +} + +static int ionic_destroy_qp_cmd(struct ionic_ibdev *dev, u32 qpid) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_QP, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_QP_IN_V1_LEN), + .cmd.destroy_qp = { + .qp_id = cpu_to_le32(qpid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_QP) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_TEARDOWN); +} + +static bool ionic_expdb_wqe_size_supported(struct ionic_ibdev *dev, + uint32_t wqe_size) +{ + switch (wqe_size) { + case 64: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_64; + case 128: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_128; + case 256: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_256; + case 512: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_512; + } + + return false; +} + +static void ionic_qp_sq_init_cmb(struct ionic_ibdev *dev, + struct ionic_qp *qp, + struct ib_udata *udata, + int max_data) +{ + u8 expdb_stride_log2 = 0; + bool expdb; + int rc; + + if (!(qp->sq_cmb & IONIC_CMB_ENABLE)) + goto not_in_cmb; + + if (qp->sq_cmb & ~IONIC_CMB_SUPPORTED) { + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->sq_cmb &= IONIC_CMB_SUPPORTED; + } + + if ((qp->sq_cmb & IONIC_CMB_EXPDB) && !dev->lif_cfg.sq_expdb) { + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->sq_cmb &= ~IONIC_CMB_EXPDB; + } + + qp->sq_cmb_order = order_base_2(qp->sq.size / PAGE_SIZE); + + if (qp->sq_cmb_order >= IONIC_SQCMB_ORDER) + goto not_in_cmb; + + if (qp->sq_cmb & IONIC_CMB_EXPDB) + expdb_stride_log2 = qp->sq.stride_log2; + + rc = ionic_get_cmb(dev->lif_cfg.lif, &qp->sq_cmb_pgid, + &qp->sq_cmb_addr, qp->sq_cmb_order, + expdb_stride_log2, &expdb); + if (rc) + goto not_in_cmb; + + if ((qp->sq_cmb & IONIC_CMB_EXPDB) && !expdb) { + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + goto err_map; + + qp->sq_cmb &= ~IONIC_CMB_EXPDB; + } + + return; + +err_map: + ionic_put_cmb(dev->lif_cfg.lif, qp->sq_cmb_pgid, qp->sq_cmb_order); +not_in_cmb: + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + ibdev_dbg(&dev->ibdev, "could not place sq in cmb as required\n"); + + qp->sq_cmb = 0; + qp->sq_cmb_order = IONIC_RES_INVALID; + qp->sq_cmb_pgid = 0; + qp->sq_cmb_addr = 0; +} + +static void ionic_qp_sq_destroy_cmb(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!(qp->sq_cmb & IONIC_CMB_ENABLE)) + return; + + if (ctx) + rdma_user_mmap_entry_remove(qp->mmap_sq_cmb); + + ionic_put_cmb(dev->lif_cfg.lif, qp->sq_cmb_pgid, qp->sq_cmb_order); +} + +static int ionic_qp_sq_init(struct ionic_ibdev *dev, struct ionic_ctx *ctx, + struct ionic_qp *qp, struct ionic_qdesc *sq, + struct ionic_tbl_buf *buf, int max_wr, int max_sge, + int max_data, int sq_spec, struct ib_udata *udata) +{ + u32 wqe_size; + int rc = 0; + + qp->sq_msn_prod = 0; + qp->sq_msn_cons = 0; + + if (!qp->has_sq) { + if (buf) { + buf->tbl_buf = NULL; + buf->tbl_limit = 0; + buf->tbl_pages = 0; + } + if (udata) + rc = ionic_validate_qdesc_zero(sq); + + return rc; + } + + rc = -EINVAL; + + if (max_wr < 0 || max_wr > 0xffff) + return rc; + + if (max_sge < 1) + return rc; + + if (max_sge > min(ionic_v1_send_wqe_max_sge(dev->lif_cfg.max_stride, 0, + qp->sq_cmb & + IONIC_CMB_EXPDB), + IONIC_SPEC_HIGH)) + return rc; + + if (max_data < 0) + return rc; + + if (max_data > ionic_v1_send_wqe_max_data(dev->lif_cfg.max_stride, + qp->sq_cmb & IONIC_CMB_EXPDB)) + return rc; + + if (udata) { + rc = ionic_validate_qdesc(sq); + if (rc) + return rc; + + qp->sq_spec = sq_spec; + + qp->sq.ptr = NULL; + qp->sq.size = sq->size; + qp->sq.mask = sq->mask; + qp->sq.depth_log2 = sq->depth_log2; + qp->sq.stride_log2 = sq->stride_log2; + + qp->sq_meta = NULL; + qp->sq_msn_idx = NULL; + + qp->sq_umem = ib_umem_get(&dev->ibdev, sq->addr, sq->size, 0); + if (IS_ERR(qp->sq_umem)) + return PTR_ERR(qp->sq_umem); + } else { + qp->sq_umem = NULL; + + qp->sq_spec = ionic_v1_use_spec_sge(max_sge, sq_spec); + if (sq_spec && !qp->sq_spec) + ibdev_dbg(&dev->ibdev, + "init sq: max_sge %u disables spec\n", + max_sge); + + if (qp->sq_cmb & IONIC_CMB_EXPDB) { + wqe_size = ionic_v1_send_wqe_min_size(max_sge, max_data, + qp->sq_spec, + true); + + if (!ionic_expdb_wqe_size_supported(dev, wqe_size)) + qp->sq_cmb &= ~IONIC_CMB_EXPDB; + } + + if (!(qp->sq_cmb & IONIC_CMB_EXPDB)) + wqe_size = ionic_v1_send_wqe_min_size(max_sge, max_data, + qp->sq_spec, + false); + + rc = ionic_queue_init(&qp->sq, dev->lif_cfg.hwdev, + max_wr, wqe_size); + if (rc) + return rc; + + ionic_queue_dbell_init(&qp->sq, qp->qpid); + + qp->sq_meta = kmalloc_array((u32)qp->sq.mask + 1, + sizeof(*qp->sq_meta), + GFP_KERNEL); + if (!qp->sq_meta) { + rc = -ENOMEM; + goto err_sq_meta; + } + + qp->sq_msn_idx = kmalloc_array((u32)qp->sq.mask + 1, + sizeof(*qp->sq_msn_idx), + GFP_KERNEL); + if (!qp->sq_msn_idx) { + rc = -ENOMEM; + goto err_sq_msn; + } + } + + ionic_qp_sq_init_cmb(dev, qp, udata, max_data); + + if (qp->sq_cmb & IONIC_CMB_ENABLE) + rc = ionic_pgtbl_init(dev, buf, NULL, + (u64)qp->sq_cmb_pgid << PAGE_SHIFT, + 1, PAGE_SIZE); + else + rc = ionic_pgtbl_init(dev, buf, + qp->sq_umem, qp->sq.dma, 1, PAGE_SIZE); + if (rc) + goto err_sq_tbl; + + return 0; + +err_sq_tbl: + ionic_qp_sq_destroy_cmb(dev, ctx, qp); + kfree(qp->sq_msn_idx); +err_sq_msn: + kfree(qp->sq_meta); +err_sq_meta: + if (qp->sq_umem) + ib_umem_release(qp->sq_umem); + else + ionic_queue_destroy(&qp->sq, dev->lif_cfg.hwdev); + return rc; +} + +static void ionic_qp_sq_destroy(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!qp->has_sq) + return; + + ionic_qp_sq_destroy_cmb(dev, ctx, qp); + + kfree(qp->sq_msn_idx); + kfree(qp->sq_meta); + + if (qp->sq_umem) + ib_umem_release(qp->sq_umem); + else + ionic_queue_destroy(&qp->sq, dev->lif_cfg.hwdev); +} + +static void ionic_qp_rq_init_cmb(struct ionic_ibdev *dev, + struct ionic_qp *qp, + struct ib_udata *udata) +{ + u8 expdb_stride_log2 = 0; + bool expdb; + int rc; + + if (!(qp->rq_cmb & IONIC_CMB_ENABLE)) + goto not_in_cmb; + + if (qp->rq_cmb & ~IONIC_CMB_SUPPORTED) { + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->rq_cmb &= IONIC_CMB_SUPPORTED; + } + + if ((qp->rq_cmb & IONIC_CMB_EXPDB) && !dev->lif_cfg.rq_expdb) { + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->rq_cmb &= ~IONIC_CMB_EXPDB; + } + + qp->rq_cmb_order = order_base_2(qp->rq.size / PAGE_SIZE); + + if (qp->rq_cmb_order >= IONIC_RQCMB_ORDER) + goto not_in_cmb; + + if (qp->rq_cmb & IONIC_CMB_EXPDB) + expdb_stride_log2 = qp->rq.stride_log2; + + rc = ionic_get_cmb(dev->lif_cfg.lif, &qp->rq_cmb_pgid, + &qp->rq_cmb_addr, qp->rq_cmb_order, + expdb_stride_log2, &expdb); + if (rc) + goto not_in_cmb; + + if ((qp->rq_cmb & IONIC_CMB_EXPDB) && !expdb) { + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + goto err_map; + + qp->rq_cmb &= ~IONIC_CMB_EXPDB; + } + + return; + +err_map: + ionic_put_cmb(dev->lif_cfg.lif, qp->rq_cmb_pgid, qp->rq_cmb_order); +not_in_cmb: + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + ibdev_dbg(&dev->ibdev, "could not place rq in cmb as required\n"); + + qp->rq_cmb = 0; + qp->rq_cmb_order = IONIC_RES_INVALID; + qp->rq_cmb_pgid = 0; + qp->rq_cmb_addr = 0; +} + +static void ionic_qp_rq_destroy_cmb(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!(qp->rq_cmb & IONIC_CMB_ENABLE)) + return; + + if (ctx) + rdma_user_mmap_entry_remove(qp->mmap_rq_cmb); + + ionic_put_cmb(dev->lif_cfg.lif, qp->rq_cmb_pgid, qp->rq_cmb_order); +} + +static int ionic_qp_rq_init(struct ionic_ibdev *dev, struct ionic_ctx *ctx, + struct ionic_qp *qp, struct ionic_qdesc *rq, + struct ionic_tbl_buf *buf, int max_wr, int max_sge, + int rq_spec, struct ib_udata *udata) +{ + int rc = 0, i; + u32 wqe_size; + + if (!qp->has_rq) { + if (buf) { + buf->tbl_buf = NULL; + buf->tbl_limit = 0; + buf->tbl_pages = 0; + } + if (udata) + rc = ionic_validate_qdesc_zero(rq); + + return rc; + } + + rc = -EINVAL; + + if (max_wr < 0 || max_wr > 0xffff) + return rc; + + if (max_sge < 1) + return rc; + + if (max_sge > min(ionic_v1_recv_wqe_max_sge(dev->lif_cfg.max_stride, 0, false), + IONIC_SPEC_HIGH)) + return rc; + + if (udata) { + rc = ionic_validate_qdesc(rq); + if (rc) + return rc; + + qp->rq_spec = rq_spec; + + qp->rq.ptr = NULL; + qp->rq.size = rq->size; + qp->rq.mask = rq->mask; + qp->rq.depth_log2 = rq->depth_log2; + qp->rq.stride_log2 = rq->stride_log2; + + qp->rq_meta = NULL; + + qp->rq_umem = ib_umem_get(&dev->ibdev, rq->addr, rq->size, 0); + if (IS_ERR(qp->rq_umem)) + return PTR_ERR(qp->rq_umem); + } else { + qp->rq_umem = NULL; + + qp->rq_spec = ionic_v1_use_spec_sge(max_sge, rq_spec); + if (rq_spec && !qp->rq_spec) + ibdev_dbg(&dev->ibdev, + "init rq: max_sge %u disables spec\n", + max_sge); + + if (qp->rq_cmb & IONIC_CMB_EXPDB) { + wqe_size = ionic_v1_recv_wqe_min_size(max_sge, + qp->rq_spec, + true); + + if (!ionic_expdb_wqe_size_supported(dev, wqe_size)) + qp->rq_cmb &= ~IONIC_CMB_EXPDB; + } + + if (!(qp->rq_cmb & IONIC_CMB_EXPDB)) + wqe_size = ionic_v1_recv_wqe_min_size(max_sge, + qp->rq_spec, + false); + + rc = ionic_queue_init(&qp->rq, dev->lif_cfg.hwdev, + max_wr, wqe_size); + if (rc) + return rc; + + ionic_queue_dbell_init(&qp->rq, qp->qpid); + + qp->rq_meta = kmalloc_array((u32)qp->rq.mask + 1, + sizeof(*qp->rq_meta), + GFP_KERNEL); + if (!qp->rq_meta) { + rc = -ENOMEM; + goto err_rq_meta; + } + + for (i = 0; i < qp->rq.mask; ++i) + qp->rq_meta[i].next = &qp->rq_meta[i + 1]; + qp->rq_meta[i].next = IONIC_META_LAST; + qp->rq_meta_head = &qp->rq_meta[0]; + } + + ionic_qp_rq_init_cmb(dev, qp, udata); + + if (qp->rq_cmb & IONIC_CMB_ENABLE) + rc = ionic_pgtbl_init(dev, buf, NULL, + (u64)qp->rq_cmb_pgid << PAGE_SHIFT, + 1, PAGE_SIZE); + else + rc = ionic_pgtbl_init(dev, buf, + qp->rq_umem, qp->rq.dma, 1, PAGE_SIZE); + if (rc) + goto err_rq_tbl; + + return 0; + +err_rq_tbl: + ionic_qp_rq_destroy_cmb(dev, ctx, qp); + kfree(qp->rq_meta); +err_rq_meta: + if (qp->rq_umem) + ib_umem_release(qp->rq_umem); + else + ionic_queue_destroy(&qp->rq, dev->lif_cfg.hwdev); + return rc; +} + +static void ionic_qp_rq_destroy(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!qp->has_rq) + return; + + ionic_qp_rq_destroy_cmb(dev, ctx, qp); + + kfree(qp->rq_meta); + + if (qp->rq_umem) + ib_umem_release(qp->rq_umem); + else + ionic_queue_destroy(&qp->rq, dev->lif_cfg.hwdev); +} + +int ionic_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_tbl_buf sq_buf = {}, rq_buf = {}; + struct ionic_pd *pd = to_ionic_pd(ibqp->pd); + struct ionic_qp *qp = to_ionic_qp(ibqp); + struct ionic_ctx *ctx = + rdma_udata_to_drv_context(udata, struct ionic_ctx, ibctx); + struct ionic_qp_resp resp = {}; + struct ionic_qp_req req = {}; + struct ionic_cq *cq; + u8 udma_mask; + void *entry; + int rc; + + if (udata) { + rc = ib_copy_from_udata(&req, udata, sizeof(req)); + if (rc) + return rc; + } else { + req.sq_spec = IONIC_SPEC_HIGH; + req.rq_spec = IONIC_SPEC_HIGH; + } + + if (attr->qp_type == IB_QPT_SMI || attr->qp_type > IB_QPT_UD) + return -EOPNOTSUPP; + + qp->state = IB_QPS_RESET; + + INIT_LIST_HEAD(&qp->cq_poll_sq); + INIT_LIST_HEAD(&qp->cq_flush_sq); + INIT_LIST_HEAD(&qp->cq_flush_rq); + + spin_lock_init(&qp->sq_lock); + spin_lock_init(&qp->rq_lock); + + qp->has_sq = 1; + qp->has_rq = 1; + + if (attr->qp_type == IB_QPT_GSI) { + rc = ionic_get_gsi_qpid(dev, &qp->qpid); + } else { + udma_mask = BIT(dev->lif_cfg.udma_count) - 1; + + if (qp->has_sq) + udma_mask &= to_ionic_vcq(attr->send_cq)->udma_mask; + + if (qp->has_rq) + udma_mask &= to_ionic_vcq(attr->recv_cq)->udma_mask; + + if (udata && req.udma_mask) + udma_mask &= req.udma_mask; + + if (!udma_mask) + return -EINVAL; + + rc = ionic_get_qpid(dev, &qp->qpid, &qp->udma_idx, udma_mask); + } + if (rc) + return rc; + + qp->sig_all = attr->sq_sig_type == IB_SIGNAL_ALL_WR; + qp->has_ah = attr->qp_type == IB_QPT_RC; + + if (qp->has_ah) { + qp->hdr = kzalloc(sizeof(*qp->hdr), GFP_KERNEL); + if (!qp->hdr) { + rc = -ENOMEM; + goto err_ah_alloc; + } + + rc = ionic_get_ahid(dev, &qp->ahid); + if (rc) + goto err_ahid; + } + + if (udata) { + if (req.rq_cmb & IONIC_CMB_ENABLE) + qp->rq_cmb = req.rq_cmb; + + if (req.sq_cmb & IONIC_CMB_ENABLE) + qp->sq_cmb = req.sq_cmb; + } + + rc = ionic_qp_sq_init(dev, ctx, qp, &req.sq, &sq_buf, + attr->cap.max_send_wr, attr->cap.max_send_sge, + attr->cap.max_inline_data, req.sq_spec, udata); + if (rc) + goto err_sq; + + rc = ionic_qp_rq_init(dev, ctx, qp, &req.rq, &rq_buf, + attr->cap.max_recv_wr, attr->cap.max_recv_sge, + req.rq_spec, udata); + if (rc) + goto err_rq; + + rc = ionic_create_qp_cmd(dev, pd, + to_ionic_vcq_cq(attr->send_cq, qp->udma_idx), + to_ionic_vcq_cq(attr->recv_cq, qp->udma_idx), + qp, &sq_buf, &rq_buf, attr); + if (rc) + goto err_cmd; + + if (udata) { + resp.qpid = qp->qpid; + resp.udma_idx = qp->udma_idx; + + if (qp->sq_cmb & IONIC_CMB_ENABLE) { + bool wc; + + if ((qp->sq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) == + (IONIC_CMB_WC | IONIC_CMB_UC)) { + ibdev_dbg(&dev->ibdev, + "Both sq_cmb flags IONIC_CMB_WC and IONIC_CMB_UC are set, using default driver mapping\n"); + qp->sq_cmb &= ~(IONIC_CMB_WC | IONIC_CMB_UC); + } + + wc = (qp->sq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) + != IONIC_CMB_UC; + + /* let userspace know the mapping */ + if (wc) + qp->sq_cmb |= IONIC_CMB_WC; + else + qp->sq_cmb |= IONIC_CMB_UC; + + qp->mmap_sq_cmb = + ionic_mmap_entry_insert(ctx, + qp->sq.size, + PHYS_PFN(qp->sq_cmb_addr), + wc ? IONIC_MMAP_WC : 0, + &resp.sq_cmb_offset); + if (!qp->mmap_sq_cmb) { + rc = -ENOMEM; + goto err_mmap_sq; + } + + resp.sq_cmb = qp->sq_cmb; + } + + if (qp->rq_cmb & IONIC_CMB_ENABLE) { + bool wc; + + if ((qp->rq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) == + (IONIC_CMB_WC | IONIC_CMB_UC)) { + ibdev_dbg(&dev->ibdev, + "Both rq_cmb flags IONIC_CMB_WC and IONIC_CMB_UC are set, using default driver mapping\n"); + qp->rq_cmb &= ~(IONIC_CMB_WC | IONIC_CMB_UC); + } + + if (qp->rq_cmb & IONIC_CMB_EXPDB) + wc = (qp->rq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) + == IONIC_CMB_WC; + else + wc = (qp->rq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) + != IONIC_CMB_UC; + + /* let userspace know the mapping */ + if (wc) + qp->rq_cmb |= IONIC_CMB_WC; + else + qp->rq_cmb |= IONIC_CMB_UC; + + qp->mmap_rq_cmb = + ionic_mmap_entry_insert(ctx, + qp->rq.size, + PHYS_PFN(qp->rq_cmb_addr), + wc ? IONIC_MMAP_WC : 0, + &resp.rq_cmb_offset); + if (!qp->mmap_rq_cmb) { + rc = -ENOMEM; + goto err_mmap_rq; + } + + resp.rq_cmb = qp->rq_cmb; + } + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + } + + ionic_pgtbl_unbuf(dev, &rq_buf); + ionic_pgtbl_unbuf(dev, &sq_buf); + + qp->ibqp.qp_num = qp->qpid; + + init_completion(&qp->qp_rel_comp); + kref_init(&qp->qp_kref); + + entry = xa_store_irq(&dev->qp_tbl, qp->qpid, qp, GFP_KERNEL); + if (entry) { + if (!xa_is_err(entry)) + rc = -EINVAL; + else + rc = xa_err(entry); + + goto err_resp; + } + + if (qp->has_sq) { + cq = to_ionic_vcq_cq(attr->send_cq, qp->udma_idx); + + attr->cap.max_send_wr = qp->sq.mask; + attr->cap.max_send_sge = + ionic_v1_send_wqe_max_sge(qp->sq.stride_log2, + qp->sq_spec, + qp->sq_cmb & IONIC_CMB_EXPDB); + attr->cap.max_inline_data = + ionic_v1_send_wqe_max_data(qp->sq.stride_log2, + qp->sq_cmb & + IONIC_CMB_EXPDB); + qp->sq_cqid = cq->cqid; + } + + if (qp->has_rq) { + cq = to_ionic_vcq_cq(attr->recv_cq, qp->udma_idx); + + attr->cap.max_recv_wr = qp->rq.mask; + attr->cap.max_recv_sge = + ionic_v1_recv_wqe_max_sge(qp->rq.stride_log2, + qp->rq_spec, + qp->rq_cmb & IONIC_CMB_EXPDB); + qp->rq_cqid = cq->cqid; + } + + return 0; + +err_resp: + if (udata && (qp->rq_cmb & IONIC_CMB_ENABLE)) + rdma_user_mmap_entry_remove(qp->mmap_rq_cmb); +err_mmap_rq: + if (udata && (qp->sq_cmb & IONIC_CMB_ENABLE)) + rdma_user_mmap_entry_remove(qp->mmap_sq_cmb); +err_mmap_sq: + ionic_destroy_qp_cmd(dev, qp->qpid); +err_cmd: + ionic_pgtbl_unbuf(dev, &rq_buf); + ionic_qp_rq_destroy(dev, ctx, qp); +err_rq: + ionic_pgtbl_unbuf(dev, &sq_buf); + ionic_qp_sq_destroy(dev, ctx, qp); +err_sq: + if (qp->has_ah) + ionic_put_ahid(dev, qp->ahid); +err_ahid: + kfree(qp->hdr); +err_ah_alloc: + ionic_put_qpid(dev, qp->qpid); + return rc; +} + +void ionic_notify_flush_cq(struct ionic_cq *cq) +{ + if (cq->flush && cq->vcq->ibcq.comp_handler) + cq->vcq->ibcq.comp_handler(&cq->vcq->ibcq, + cq->vcq->ibcq.cq_context); +} + +static void ionic_notify_qp_cqs(struct ionic_ibdev *dev, struct ionic_qp *qp) +{ + if (qp->ibqp.send_cq) + ionic_notify_flush_cq(to_ionic_vcq_cq(qp->ibqp.send_cq, + qp->udma_idx)); + if (qp->ibqp.recv_cq && qp->ibqp.recv_cq != qp->ibqp.send_cq) + ionic_notify_flush_cq(to_ionic_vcq_cq(qp->ibqp.recv_cq, + qp->udma_idx)); +} + +void ionic_flush_qp(struct ionic_ibdev *dev, struct ionic_qp *qp) +{ + unsigned long irqflags; + struct ionic_cq *cq; + + if (qp->ibqp.send_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.send_cq, qp->udma_idx); + + /* Hold the CQ lock and QP sq_lock to set up flush */ + spin_lock_irqsave(&cq->lock, irqflags); + spin_lock(&qp->sq_lock); + qp->sq_flush = true; + if (!ionic_queue_empty(&qp->sq)) { + cq->flush = true; + list_move_tail(&qp->cq_flush_sq, &cq->flush_sq); + } + spin_unlock(&qp->sq_lock); + spin_unlock_irqrestore(&cq->lock, irqflags); + } + + if (qp->ibqp.recv_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.recv_cq, qp->udma_idx); + + /* Hold the CQ lock and QP rq_lock to set up flush */ + spin_lock_irqsave(&cq->lock, irqflags); + spin_lock(&qp->rq_lock); + qp->rq_flush = true; + if (!ionic_queue_empty(&qp->rq)) { + cq->flush = true; + list_move_tail(&qp->cq_flush_rq, &cq->flush_rq); + } + spin_unlock(&qp->rq_lock); + spin_unlock_irqrestore(&cq->lock, irqflags); + } +} + +static void ionic_clean_cq(struct ionic_cq *cq, u32 qpid) +{ + struct ionic_v1_cqe *qcqe; + int prod, qtf, qid, type; + bool color; + + if (!cq->q.ptr) + return; + + color = cq->color; + prod = cq->q.prod; + qcqe = ionic_queue_at(&cq->q, prod); + + while (color == ionic_v1_cqe_color(qcqe)) { + qtf = ionic_v1_cqe_qtf(qcqe); + qid = ionic_v1_cqe_qtf_qid(qtf); + type = ionic_v1_cqe_qtf_type(qtf); + + if (qid == qpid && type != IONIC_V1_CQE_TYPE_ADMIN) + ionic_v1_cqe_clean(qcqe); + + prod = ionic_queue_next(&cq->q, prod); + qcqe = ionic_queue_at(&cq->q, prod); + color = ionic_color_wrap(prod, color); + } +} + +static void ionic_reset_qp(struct ionic_ibdev *dev, struct ionic_qp *qp) +{ + unsigned long irqflags; + struct ionic_cq *cq; + int i; + + local_irq_save(irqflags); + + if (qp->ibqp.send_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.send_cq, qp->udma_idx); + spin_lock(&cq->lock); + ionic_clean_cq(cq, qp->qpid); + spin_unlock(&cq->lock); + } + + if (qp->ibqp.recv_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.recv_cq, qp->udma_idx); + spin_lock(&cq->lock); + ionic_clean_cq(cq, qp->qpid); + spin_unlock(&cq->lock); + } + + if (qp->has_sq) { + spin_lock(&qp->sq_lock); + qp->sq_flush = false; + qp->sq_flush_rcvd = false; + qp->sq_msn_prod = 0; + qp->sq_msn_cons = 0; + qp->sq.prod = 0; + qp->sq.cons = 0; + spin_unlock(&qp->sq_lock); + } + + if (qp->has_rq) { + spin_lock(&qp->rq_lock); + qp->rq_flush = false; + qp->rq.prod = 0; + qp->rq.cons = 0; + if (qp->rq_meta) { + for (i = 0; i < qp->rq.mask; ++i) + qp->rq_meta[i].next = &qp->rq_meta[i + 1]; + qp->rq_meta[i].next = IONIC_META_LAST; + } + qp->rq_meta_head = &qp->rq_meta[0]; + spin_unlock(&qp->rq_lock); + } + + local_irq_restore(irqflags); +} + +static bool ionic_qp_cur_state_is_ok(enum ib_qp_state q_state, + enum ib_qp_state attr_state) +{ + if (q_state == attr_state) + return true; + + if (attr_state == IB_QPS_ERR) + return true; + + if (attr_state == IB_QPS_SQE) + return q_state == IB_QPS_RTS || q_state == IB_QPS_SQD; + + return false; +} + +static int ionic_check_modify_qp(struct ionic_qp *qp, struct ib_qp_attr *attr, + int mask) +{ + enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : qp->state; + enum ib_qp_state next_state = (mask & IB_QP_STATE) ? + attr->qp_state : cur_state; + + if ((mask & IB_QP_CUR_STATE) && + !ionic_qp_cur_state_is_ok(qp->state, attr->cur_qp_state)) + return -EINVAL; + + if (!ib_modify_qp_is_ok(cur_state, next_state, qp->ibqp.qp_type, mask)) + return -EINVAL; + + /* unprivileged qp not allowed privileged qkey */ + if ((mask & IB_QP_QKEY) && (attr->qkey & 0x80000000) && + qp->ibqp.uobject) + return -EPERM; + + return 0; +} + +int ionic_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_pd *pd = to_ionic_pd(ibqp->pd); + struct ionic_qp *qp = to_ionic_qp(ibqp); + int rc; + + rc = ionic_check_modify_qp(qp, attr, mask); + if (rc) + return rc; + + if (mask & IB_QP_CAP) + return -EINVAL; + + rc = ionic_modify_qp_cmd(dev, pd, qp, attr, mask); + if (rc) + return rc; + + if (mask & IB_QP_STATE) { + qp->state = attr->qp_state; + + if (attr->qp_state == IB_QPS_ERR) { + ionic_flush_qp(dev, qp); + ionic_notify_qp_cqs(dev, qp); + } else if (attr->qp_state == IB_QPS_RESET) { + ionic_reset_qp(dev, qp); + } + } + + return 0; +} + +int ionic_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_qp_init_attr *init_attr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_qp *qp = to_ionic_qp(ibqp); + int rc; + + memset(attr, 0, sizeof(*attr)); + memset(init_attr, 0, sizeof(*init_attr)); + + rc = ionic_query_qp_cmd(dev, qp, attr, mask); + if (rc) + return rc; + + if (qp->has_sq) + attr->cap.max_send_wr = qp->sq.mask; + + if (qp->has_rq) + attr->cap.max_recv_wr = qp->rq.mask; + + init_attr->event_handler = ibqp->event_handler; + init_attr->qp_context = ibqp->qp_context; + init_attr->send_cq = ibqp->send_cq; + init_attr->recv_cq = ibqp->recv_cq; + init_attr->srq = ibqp->srq; + init_attr->xrcd = ibqp->xrcd; + init_attr->cap = attr->cap; + init_attr->sq_sig_type = qp->sig_all ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + init_attr->qp_type = ibqp->qp_type; + init_attr->create_flags = 0; + init_attr->port_num = 0; + init_attr->rwq_ind_tbl = ibqp->rwq_ind_tbl; + init_attr->source_qpn = 0; + + return rc; +} + +int ionic_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct ionic_ctx *ctx = + rdma_udata_to_drv_context(udata, struct ionic_ctx, ibctx); + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_qp *qp = to_ionic_qp(ibqp); + unsigned long irqflags; + struct ionic_cq *cq; + int rc; + + rc = ionic_destroy_qp_cmd(dev, qp->qpid); + if (rc) + return rc; + + xa_erase_irq(&dev->qp_tbl, qp->qpid); + + kref_put(&qp->qp_kref, ionic_qp_complete); + wait_for_completion(&qp->qp_rel_comp); + + if (qp->ibqp.send_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.send_cq, qp->udma_idx); + spin_lock_irqsave(&cq->lock, irqflags); + ionic_clean_cq(cq, qp->qpid); + list_del(&qp->cq_poll_sq); + list_del(&qp->cq_flush_sq); + spin_unlock_irqrestore(&cq->lock, irqflags); + } + + if (qp->ibqp.recv_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.recv_cq, qp->udma_idx); + spin_lock_irqsave(&cq->lock, irqflags); + ionic_clean_cq(cq, qp->qpid); + list_del(&qp->cq_flush_rq); + spin_unlock_irqrestore(&cq->lock, irqflags); + } + + ionic_qp_rq_destroy(dev, ctx, qp); + ionic_qp_sq_destroy(dev, ctx, qp); + if (qp->has_ah) { + ionic_put_ahid(dev, qp->ahid); + kfree(qp->hdr); + } + ionic_put_qpid(dev, qp->qpid); + + return 0; +} diff --git a/drivers/infiniband/hw/ionic/ionic_fw.h b/drivers/infiniband/hw/ionic/ionic_fw.h index 44ec69487519..62afe9220717 100644 --- a/drivers/infiniband/hw/ionic/ionic_fw.h +++ b/drivers/infiniband/hw/ionic/ionic_fw.h @@ -5,6 +5,266 @@ #define _IONIC_FW_H_ #include +#include + +/* common for ib spec */ + +#define IONIC_EXP_DBELL_SZ 8 + +enum ionic_mrid_bits { + IONIC_MRID_INDEX_SHIFT = 8, +}; + +static inline u32 ionic_mrid(u32 index, u8 key) +{ + return (index << IONIC_MRID_INDEX_SHIFT) | key; +} + +static inline u32 ionic_mrid_index(u32 lrkey) +{ + return lrkey >> IONIC_MRID_INDEX_SHIFT; +} + +/* common to all versions */ + +/* wqe scatter gather element */ +struct ionic_sge { + __be64 va; + __be32 len; + __be32 lkey; +}; + +/* admin queue mr type */ +enum ionic_mr_flags { + /* bits that determine mr access */ + IONIC_MRF_LOCAL_WRITE = BIT(0), + IONIC_MRF_REMOTE_WRITE = BIT(1), + IONIC_MRF_REMOTE_READ = BIT(2), + IONIC_MRF_REMOTE_ATOMIC = BIT(3), + IONIC_MRF_MW_BIND = BIT(4), + IONIC_MRF_ZERO_BASED = BIT(5), + IONIC_MRF_ON_DEMAND = BIT(6), + IONIC_MRF_PB = BIT(7), + IONIC_MRF_ACCESS_MASK = BIT(12) - 1, + + /* bits that determine mr type */ + IONIC_MRF_UKEY_EN = BIT(13), + IONIC_MRF_IS_MW = BIT(14), + IONIC_MRF_INV_EN = BIT(15), + + /* base flags combinations for mr types */ + IONIC_MRF_USER_MR = 0, + IONIC_MRF_PHYS_MR = (IONIC_MRF_UKEY_EN | + IONIC_MRF_INV_EN), + IONIC_MRF_MW_1 = (IONIC_MRF_UKEY_EN | + IONIC_MRF_IS_MW), + IONIC_MRF_MW_2 = (IONIC_MRF_UKEY_EN | + IONIC_MRF_IS_MW | + IONIC_MRF_INV_EN), +}; + +static inline int to_ionic_mr_flags(int access) +{ + int flags = 0; + + if (access & IB_ACCESS_LOCAL_WRITE) + flags |= IONIC_MRF_LOCAL_WRITE; + + if (access & IB_ACCESS_REMOTE_READ) + flags |= IONIC_MRF_REMOTE_READ; + + if (access & IB_ACCESS_REMOTE_WRITE) + flags |= IONIC_MRF_REMOTE_WRITE; + + if (access & IB_ACCESS_REMOTE_ATOMIC) + flags |= IONIC_MRF_REMOTE_ATOMIC; + + if (access & IB_ACCESS_MW_BIND) + flags |= IONIC_MRF_MW_BIND; + + if (access & IB_ZERO_BASED) + flags |= IONIC_MRF_ZERO_BASED; + + return flags; +} + +enum ionic_qp_flags { + /* bits that determine qp access */ + IONIC_QPF_REMOTE_WRITE = BIT(0), + IONIC_QPF_REMOTE_READ = BIT(1), + IONIC_QPF_REMOTE_ATOMIC = BIT(2), + + /* bits that determine other qp behavior */ + IONIC_QPF_SQ_PB = BIT(6), + IONIC_QPF_RQ_PB = BIT(7), + IONIC_QPF_SQ_SPEC = BIT(8), + IONIC_QPF_RQ_SPEC = BIT(9), + IONIC_QPF_REMOTE_PRIVILEGED = BIT(10), + IONIC_QPF_SQ_DRAINING = BIT(11), + IONIC_QPF_SQD_NOTIFY = BIT(12), + IONIC_QPF_SQ_CMB = BIT(13), + IONIC_QPF_RQ_CMB = BIT(14), + IONIC_QPF_PRIVILEGED = BIT(15), +}; + +static inline int from_ionic_qp_flags(int flags) +{ + int access_flags = 0; + + if (flags & IONIC_QPF_REMOTE_WRITE) + access_flags |= IB_ACCESS_REMOTE_WRITE; + + if (flags & IONIC_QPF_REMOTE_READ) + access_flags |= IB_ACCESS_REMOTE_READ; + + if (flags & IONIC_QPF_REMOTE_ATOMIC) + access_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return access_flags; +} + +static inline int to_ionic_qp_flags(int access, bool sqd_notify, + bool sq_is_cmb, bool rq_is_cmb, + bool sq_spec, bool rq_spec, + bool privileged, bool remote_privileged) +{ + int flags = 0; + + if (access & IB_ACCESS_REMOTE_WRITE) + flags |= IONIC_QPF_REMOTE_WRITE; + + if (access & IB_ACCESS_REMOTE_READ) + flags |= IONIC_QPF_REMOTE_READ; + + if (access & IB_ACCESS_REMOTE_ATOMIC) + flags |= IONIC_QPF_REMOTE_ATOMIC; + + if (sqd_notify) + flags |= IONIC_QPF_SQD_NOTIFY; + + if (sq_is_cmb) + flags |= IONIC_QPF_SQ_CMB; + + if (rq_is_cmb) + flags |= IONIC_QPF_RQ_CMB; + + if (sq_spec) + flags |= IONIC_QPF_SQ_SPEC; + + if (rq_spec) + flags |= IONIC_QPF_RQ_SPEC; + + if (privileged) + flags |= IONIC_QPF_PRIVILEGED; + + if (remote_privileged) + flags |= IONIC_QPF_REMOTE_PRIVILEGED; + + return flags; +} + +/* admin queue qp type */ +enum ionic_qp_type { + IONIC_QPT_RC, + IONIC_QPT_UC, + IONIC_QPT_RD, + IONIC_QPT_UD, + IONIC_QPT_SRQ, + IONIC_QPT_XRC_INI, + IONIC_QPT_XRC_TGT, + IONIC_QPT_XRC_SRQ, +}; + +static inline int to_ionic_qp_type(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_GSI: + case IB_QPT_UD: + return IONIC_QPT_UD; + case IB_QPT_RC: + return IONIC_QPT_RC; + case IB_QPT_UC: + return IONIC_QPT_UC; + case IB_QPT_XRC_INI: + return IONIC_QPT_XRC_INI; + case IB_QPT_XRC_TGT: + return IONIC_QPT_XRC_TGT; + default: + return -EINVAL; + } +} + +/* admin queue qp state */ +enum ionic_qp_state { + IONIC_QPS_RESET, + IONIC_QPS_INIT, + IONIC_QPS_RTR, + IONIC_QPS_RTS, + IONIC_QPS_SQD, + IONIC_QPS_SQE, + IONIC_QPS_ERR, +}; + +static inline int from_ionic_qp_state(enum ionic_qp_state state) +{ + switch (state) { + case IONIC_QPS_RESET: + return IB_QPS_RESET; + case IONIC_QPS_INIT: + return IB_QPS_INIT; + case IONIC_QPS_RTR: + return IB_QPS_RTR; + case IONIC_QPS_RTS: + return IB_QPS_RTS; + case IONIC_QPS_SQD: + return IB_QPS_SQD; + case IONIC_QPS_SQE: + return IB_QPS_SQE; + case IONIC_QPS_ERR: + return IB_QPS_ERR; + default: + return -EINVAL; + } +} + +static inline int to_ionic_qp_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return IONIC_QPS_RESET; + case IB_QPS_INIT: + return IONIC_QPS_INIT; + case IB_QPS_RTR: + return IONIC_QPS_RTR; + case IB_QPS_RTS: + return IONIC_QPS_RTS; + case IB_QPS_SQD: + return IONIC_QPS_SQD; + case IB_QPS_SQE: + return IONIC_QPS_SQE; + case IB_QPS_ERR: + return IONIC_QPS_ERR; + default: + return 0; + } +} + +static inline int to_ionic_qp_modify_state(enum ib_qp_state to_state, + enum ib_qp_state from_state) +{ + return to_ionic_qp_state(to_state) | + (to_ionic_qp_state(from_state) << 4); +} + +/* fw abi v1 */ + +/* data payload part of v1 wqe */ +union ionic_v1_pld { + struct ionic_sge sgl[2]; + __be32 spec32[8]; + __be16 spec16[16]; + __u8 data[32]; +}; /* completion queue v1 cqe */ struct ionic_v1_cqe { @@ -78,6 +338,390 @@ static inline u32 ionic_v1_cqe_qtf_qid(u32 qtf) return qtf >> IONIC_V1_CQE_QID_SHIFT; } +/* v1 base wqe header */ +struct ionic_v1_base_hdr { + __u64 wqe_id; + __u8 op; + __u8 num_sge_key; + __be16 flags; + __be32 imm_data_key; +}; + +/* v1 receive wqe body */ +struct ionic_v1_recv_bdy { + __u8 rsvd[16]; + union ionic_v1_pld pld; +}; + +/* v1 send/rdma wqe body (common, has sgl) */ +struct ionic_v1_common_bdy { + union { + struct { + __be32 ah_id; + __be32 dest_qpn; + __be32 dest_qkey; + } send; + struct { + __be32 remote_va_high; + __be32 remote_va_low; + __be32 remote_rkey; + } rdma; + }; + __be32 length; + union ionic_v1_pld pld; +}; + +/* v1 atomic wqe body */ +struct ionic_v1_atomic_bdy { + __be32 remote_va_high; + __be32 remote_va_low; + __be32 remote_rkey; + __be32 swap_add_high; + __be32 swap_add_low; + __be32 compare_high; + __be32 compare_low; + __u8 rsvd[4]; + struct ionic_sge sge; +}; + +/* v1 reg mr wqe body */ +struct ionic_v1_reg_mr_bdy { + __be64 va; + __be64 length; + __be64 offset; + __be64 dma_addr; + __be32 map_count; + __be16 flags; + __u8 dir_size_log2; + __u8 page_size_log2; + __u8 rsvd[8]; +}; + +/* v1 bind mw wqe body */ +struct ionic_v1_bind_mw_bdy { + __be64 va; + __be64 length; + __be32 lkey; + __be16 flags; + __u8 rsvd[26]; +}; + +/* v1 send/recv wqe */ +struct ionic_v1_wqe { + struct ionic_v1_base_hdr base; + union { + struct ionic_v1_recv_bdy recv; + struct ionic_v1_common_bdy common; + struct ionic_v1_atomic_bdy atomic; + struct ionic_v1_reg_mr_bdy reg_mr; + struct ionic_v1_bind_mw_bdy bind_mw; + }; +}; + +/* queue pair v1 send opcodes */ +enum ionic_v1_op { + IONIC_V1_OP_SEND, + IONIC_V1_OP_SEND_INV, + IONIC_V1_OP_SEND_IMM, + IONIC_V1_OP_RDMA_READ, + IONIC_V1_OP_RDMA_WRITE, + IONIC_V1_OP_RDMA_WRITE_IMM, + IONIC_V1_OP_ATOMIC_CS, + IONIC_V1_OP_ATOMIC_FA, + IONIC_V1_OP_REG_MR, + IONIC_V1_OP_LOCAL_INV, + IONIC_V1_OP_BIND_MW, + + /* flags */ + IONIC_V1_FLAG_FENCE = BIT(0), + IONIC_V1_FLAG_SOL = BIT(1), + IONIC_V1_FLAG_INL = BIT(2), + IONIC_V1_FLAG_SIG = BIT(3), + + /* flags last four bits for sgl spec format */ + IONIC_V1_FLAG_SPEC32 = (1u << 12), + IONIC_V1_FLAG_SPEC16 = (2u << 12), + IONIC_V1_SPEC_FIRST_SGE = 2, +}; + +static inline size_t ionic_v1_send_wqe_min_size(int min_sge, int min_data, + int spec, bool expdb) +{ + size_t sz_wqe, sz_sgl, sz_data; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + min_sge += IONIC_V1_SPEC_FIRST_SGE; + + if (expdb) { + min_sge += 1; + min_data += IONIC_EXP_DBELL_SZ; + } + + sz_wqe = sizeof(struct ionic_v1_wqe); + sz_sgl = offsetof(struct ionic_v1_wqe, common.pld.sgl[min_sge]); + sz_data = offsetof(struct ionic_v1_wqe, common.pld.data[min_data]); + + if (sz_sgl > sz_wqe) + sz_wqe = sz_sgl; + + if (sz_data > sz_wqe) + sz_wqe = sz_data; + + return sz_wqe; +} + +static inline int ionic_v1_send_wqe_max_sge(u8 stride_log2, int spec, + bool expdb) +{ + struct ionic_sge *sge = (void *)(1ull << stride_log2); + struct ionic_v1_wqe *wqe = (void *)0; + int num_sge = 0; + + if (expdb) + sge -= 1; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + num_sge = IONIC_V1_SPEC_FIRST_SGE; + + num_sge = sge - &wqe->common.pld.sgl[num_sge]; + + if (spec && num_sge > spec) + num_sge = spec; + + return num_sge; +} + +static inline int ionic_v1_send_wqe_max_data(u8 stride_log2, bool expdb) +{ + struct ionic_v1_wqe *wqe = (void *)0; + __u8 *data = (void *)(1ull << stride_log2); + + if (expdb) + data -= IONIC_EXP_DBELL_SZ; + + return data - wqe->common.pld.data; +} + +static inline size_t ionic_v1_recv_wqe_min_size(int min_sge, int spec, + bool expdb) +{ + size_t sz_wqe, sz_sgl; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + min_sge += IONIC_V1_SPEC_FIRST_SGE; + + if (expdb) + min_sge += 1; + + sz_wqe = sizeof(struct ionic_v1_wqe); + sz_sgl = offsetof(struct ionic_v1_wqe, recv.pld.sgl[min_sge]); + + if (sz_sgl > sz_wqe) + sz_wqe = sz_sgl; + + return sz_wqe; +} + +static inline int ionic_v1_recv_wqe_max_sge(u8 stride_log2, int spec, + bool expdb) +{ + struct ionic_sge *sge = (void *)(1ull << stride_log2); + struct ionic_v1_wqe *wqe = (void *)0; + int num_sge = 0; + + if (expdb) + sge -= 1; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + num_sge = IONIC_V1_SPEC_FIRST_SGE; + + num_sge = sge - &wqe->recv.pld.sgl[num_sge]; + + if (spec && num_sge > spec) + num_sge = spec; + + return num_sge; +} + +static inline int ionic_v1_use_spec_sge(int min_sge, int spec) +{ + if (!spec || min_sge > spec) + return 0; + + if (min_sge <= IONIC_V1_SPEC_FIRST_SGE) + return IONIC_V1_SPEC_FIRST_SGE; + + return spec; +} + +struct ionic_admin_create_ah { + __le64 dma_addr; + __le32 length; + __le32 pd_id; + __le32 id_ver; + __le16 dbid_flags; + __u8 csum_profile; + __u8 crypto; +} __packed; + +#define IONIC_ADMIN_CREATE_AH_IN_V1_LEN 24 +static_assert(sizeof(struct ionic_admin_create_ah) == + IONIC_ADMIN_CREATE_AH_IN_V1_LEN); + +struct ionic_admin_destroy_ah { + __le32 ah_id; +} __packed; + +#define IONIC_ADMIN_DESTROY_AH_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_ah) == + IONIC_ADMIN_DESTROY_AH_IN_V1_LEN); + +struct ionic_admin_query_ah { + __le64 dma_addr; +} __packed; + +#define IONIC_ADMIN_QUERY_AH_IN_V1_LEN 8 +static_assert(sizeof(struct ionic_admin_query_ah) == + IONIC_ADMIN_QUERY_AH_IN_V1_LEN); + +struct ionic_admin_create_mr { + __le64 va; + __le64 length; + __le32 pd_id; + __le32 id_ver; + __le32 tbl_index; + __le32 map_count; + __le64 dma_addr; + __le16 dbid_flags; + __u8 pt_type; + __u8 dir_size_log2; + __u8 page_size_log2; +} __packed; + +#define IONIC_ADMIN_CREATE_MR_IN_V1_LEN 45 +static_assert(sizeof(struct ionic_admin_create_mr) == + IONIC_ADMIN_CREATE_MR_IN_V1_LEN); + +struct ionic_admin_destroy_mr { + __le32 mr_id; +} __packed; + +#define IONIC_ADMIN_DESTROY_MR_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_mr) == + IONIC_ADMIN_DESTROY_MR_IN_V1_LEN); + +struct ionic_admin_create_cq { + __le32 eq_id; + __u8 depth_log2; + __u8 stride_log2; + __u8 dir_size_log2_rsvd; + __u8 page_size_log2; + __le32 cq_flags; + __le32 id_ver; + __le32 tbl_index; + __le32 map_count; + __le64 dma_addr; + __le16 dbid_flags; +} __packed; + +#define IONIC_ADMIN_CREATE_CQ_IN_V1_LEN 34 +static_assert(sizeof(struct ionic_admin_create_cq) == + IONIC_ADMIN_CREATE_CQ_IN_V1_LEN); + +struct ionic_admin_destroy_cq { + __le32 cq_id; +} __packed; + +#define IONIC_ADMIN_DESTROY_CQ_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_cq) == + IONIC_ADMIN_DESTROY_CQ_IN_V1_LEN); + +struct ionic_admin_create_qp { + __le32 pd_id; + __be32 priv_flags; + __le32 sq_cq_id; + __u8 sq_depth_log2; + __u8 sq_stride_log2; + __u8 sq_dir_size_log2_rsvd; + __u8 sq_page_size_log2; + __le32 sq_tbl_index_xrcd_id; + __le32 sq_map_count; + __le64 sq_dma_addr; + __le32 rq_cq_id; + __u8 rq_depth_log2; + __u8 rq_stride_log2; + __u8 rq_dir_size_log2_rsvd; + __u8 rq_page_size_log2; + __le32 rq_tbl_index_srq_id; + __le32 rq_map_count; + __le64 rq_dma_addr; + __le32 id_ver; + __le16 dbid_flags; + __u8 type_state; + __u8 rsvd; +} __packed; + +#define IONIC_ADMIN_CREATE_QP_IN_V1_LEN 64 +static_assert(sizeof(struct ionic_admin_create_qp) == + IONIC_ADMIN_CREATE_QP_IN_V1_LEN); + +struct ionic_admin_destroy_qp { + __le32 qp_id; +} __packed; + +#define IONIC_ADMIN_DESTROY_QP_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_qp) == + IONIC_ADMIN_DESTROY_QP_IN_V1_LEN); + +struct ionic_admin_mod_qp { + __be32 attr_mask; + __u8 dcqcn_profile; + __u8 tfp_csum_profile; + __be16 access_flags; + __le32 rq_psn; + __le32 sq_psn; + __le32 qkey_dest_qpn; + __le32 rate_limit_kbps; + __u8 pmtu; + __u8 retry; + __u8 rnr_timer; + __u8 retry_timeout; + __u8 rsq_depth; + __u8 rrq_depth; + __le16 pkey_id; + __le32 ah_id_len; + __u8 en_pcp; + __u8 ip_dscp; + __u8 rsvd2; + __u8 type_state; + union { + struct { + __le16 rsvd1; + }; + __le32 rrq_index; + }; + __le32 rsq_index; + __le64 dma_addr; + __le32 id_ver; +} __packed; + +#define IONIC_ADMIN_MODIFY_QP_IN_V1_LEN 60 +static_assert(sizeof(struct ionic_admin_mod_qp) == + IONIC_ADMIN_MODIFY_QP_IN_V1_LEN); + +struct ionic_admin_query_qp { + __le64 hdr_dma_addr; + __le64 sq_dma_addr; + __le64 rq_dma_addr; + __le32 ah_id; + __le32 id_ver; + __le16 dbid_flags; +} __packed; + +#define IONIC_ADMIN_QUERY_QP_IN_V1_LEN 34 +static_assert(sizeof(struct ionic_admin_query_qp) == + IONIC_ADMIN_QUERY_QP_IN_V1_LEN); + #define ADMIN_WQE_STRIDE 64 #define ADMIN_WQE_HDR_LEN 4 @@ -88,9 +732,66 @@ struct ionic_v1_admin_wqe { __le16 len; union { + struct ionic_admin_create_ah create_ah; + struct ionic_admin_destroy_ah destroy_ah; + struct ionic_admin_query_ah query_ah; + struct ionic_admin_create_mr create_mr; + struct ionic_admin_destroy_mr destroy_mr; + struct ionic_admin_create_cq create_cq; + struct ionic_admin_destroy_cq destroy_cq; + struct ionic_admin_create_qp create_qp; + struct ionic_admin_destroy_qp destroy_qp; + struct ionic_admin_mod_qp mod_qp; + struct ionic_admin_query_qp query_qp; } cmd; }; +/* side data for query qp */ +struct ionic_v1_admin_query_qp_sq { + __u8 rnr_timer; + __u8 retry_timeout; + __be16 access_perms_flags; + __be16 rsvd; + __be16 pkey_id; + __be32 qkey_dest_qpn; + __be32 rate_limit_kbps; + __be32 rq_psn; +}; + +struct ionic_v1_admin_query_qp_rq { + __u8 state_pmtu; + __u8 retry_rnrtry; + __u8 rrq_depth; + __u8 rsq_depth; + __be32 sq_psn; + __be16 access_perms_flags; + __be16 rsvd; +}; + +/* admin queue v1 opcodes */ +enum ionic_v1_admin_op { + IONIC_V1_ADMIN_NOOP, + IONIC_V1_ADMIN_CREATE_CQ, + IONIC_V1_ADMIN_CREATE_QP, + IONIC_V1_ADMIN_CREATE_MR, + IONIC_V1_ADMIN_STATS_HDRS, + IONIC_V1_ADMIN_STATS_VALS, + IONIC_V1_ADMIN_DESTROY_MR, + IONIC_V1_ADMIN_RSVD_7, /* RESIZE_CQ */ + IONIC_V1_ADMIN_DESTROY_CQ, + IONIC_V1_ADMIN_MODIFY_QP, + IONIC_V1_ADMIN_QUERY_QP, + IONIC_V1_ADMIN_DESTROY_QP, + IONIC_V1_ADMIN_DEBUG, + IONIC_V1_ADMIN_CREATE_AH, + IONIC_V1_ADMIN_QUERY_AH, + IONIC_V1_ADMIN_MODIFY_DCQCN, + IONIC_V1_ADMIN_DESTROY_AH, + IONIC_V1_ADMIN_QP_STATS_HDRS, + IONIC_V1_ADMIN_QP_STATS_VALS, + IONIC_V1_ADMIN_OPCODES_MAX, +}; + /* admin queue v1 cqe status */ enum ionic_v1_admin_status { IONIC_V1_ASTS_OK, @@ -136,6 +837,22 @@ enum ionic_v1_eqe_evt_bits { IONIC_V1_EQE_QP_ERR_ACCESS = 10, }; +enum ionic_tfp_csum_profiles { + IONIC_TFP_CSUM_PROF_ETH_IPV4_UDP = 0, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP = 1, + IONIC_TFP_CSUM_PROF_ETH_IPV6_UDP = 2, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_UDP = 3, + IONIC_TFP_CSUM_PROF_IPV4_UDP_VXLAN_ETH_QTAG_IPV4_UDP = 4, + IONIC_TFP_CSUM_PROF_IPV4_UDP_VXLAN_ETH_QTAG_IPV6_UDP = 5, + IONIC_TFP_CSUM_PROF_QTAG_IPV4_UDP_VXLAN_ETH_QTAG_IPV4_UDP = 6, + IONIC_TFP_CSUM_PROF_QTAG_IPV4_UDP_VXLAN_ETH_QTAG_IPV6_UDP = 7, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP_ESP_IPV4_UDP = 8, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_ESP_UDP = 9, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP_ESP_UDP = 10, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_ESP_UDP = 11, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP_CSUM = 12, +}; + static inline bool ionic_v1_eqe_color(struct ionic_v1_eqe *eqe) { return eqe->evt & cpu_to_be32(IONIC_V1_EQE_COLOR); diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.c b/drivers/infiniband/hw/ionic/ionic_ibdev.c index 7710190ff65f..6833abbfb1dc 100644 --- a/drivers/infiniband/hw/ionic/ionic_ibdev.c +++ b/drivers/infiniband/hw/ionic/ionic_ibdev.c @@ -15,6 +15,44 @@ MODULE_DESCRIPTION(DRIVER_DESCRIPTION); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("NET_IONIC"); +static const struct ib_device_ops ionic_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_IONIC, + .uverbs_abi_ver = IONIC_ABI_VERSION, + + .alloc_ucontext = ionic_alloc_ucontext, + .dealloc_ucontext = ionic_dealloc_ucontext, + .mmap = ionic_mmap, + .mmap_free = ionic_mmap_free, + .alloc_pd = ionic_alloc_pd, + .dealloc_pd = ionic_dealloc_pd, + .create_ah = ionic_create_ah, + .query_ah = ionic_query_ah, + .destroy_ah = ionic_destroy_ah, + .create_user_ah = ionic_create_ah, + .get_dma_mr = ionic_get_dma_mr, + .reg_user_mr = ionic_reg_user_mr, + .reg_user_mr_dmabuf = ionic_reg_user_mr_dmabuf, + .dereg_mr = ionic_dereg_mr, + .alloc_mr = ionic_alloc_mr, + .map_mr_sg = ionic_map_mr_sg, + .alloc_mw = ionic_alloc_mw, + .dealloc_mw = ionic_dealloc_mw, + .create_cq = ionic_create_cq, + .destroy_cq = ionic_destroy_cq, + .create_qp = ionic_create_qp, + .modify_qp = ionic_modify_qp, + .query_qp = ionic_query_qp, + .destroy_qp = ionic_destroy_qp, + + INIT_RDMA_OBJ_SIZE(ib_ucontext, ionic_ctx, ibctx), + INIT_RDMA_OBJ_SIZE(ib_pd, ionic_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ah, ionic_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, ionic_vcq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_qp, ionic_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_mw, ionic_mr, ibmw), +}; + static void ionic_init_resids(struct ionic_ibdev *dev) { ionic_resid_init(&dev->inuse_cqid, dev->lif_cfg.cq_count); @@ -48,6 +86,8 @@ static void ionic_destroy_ibdev(struct ionic_ibdev *dev) ib_unregister_device(&dev->ibdev); ionic_destroy_rdma_admin(dev); ionic_destroy_resids(dev); + WARN_ON(!xa_empty(&dev->qp_tbl)); + xa_destroy(&dev->qp_tbl); WARN_ON(!xa_empty(&dev->cq_tbl)); xa_destroy(&dev->cq_tbl); ib_dealloc_device(&dev->ibdev); @@ -66,6 +106,7 @@ static struct ionic_ibdev *ionic_create_ibdev(struct ionic_aux_dev *ionic_adev) ionic_fill_lif_cfg(ionic_adev->lif, &dev->lif_cfg); + xa_init_flags(&dev->qp_tbl, GFP_ATOMIC); xa_init_flags(&dev->cq_tbl, GFP_ATOMIC); ionic_init_resids(dev); @@ -98,6 +139,8 @@ static struct ionic_ibdev *ionic_create_ibdev(struct ionic_aux_dev *ionic_adev) if (rc) goto err_admin; + ib_set_device_ops(&dev->ibdev, &ionic_dev_ops); + rc = ib_register_device(ibdev, "ionic_%d", ibdev->dev.parent); if (rc) goto err_register; @@ -110,6 +153,7 @@ err_admin: ionic_destroy_rdma_admin(dev); err_reset: ionic_destroy_resids(dev); + xa_destroy(&dev->qp_tbl); xa_destroy(&dev->cq_tbl); ib_dealloc_device(&dev->ibdev); @@ -161,7 +205,7 @@ static int __init ionic_mod_init(void) { int rc; - ionic_evt_workq = create_workqueue(DRIVER_NAME "-evt"); + ionic_evt_workq = create_workqueue(KBUILD_MODNAME "-evt"); if (!ionic_evt_workq) return -ENOMEM; diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.h b/drivers/infiniband/hw/ionic/ionic_ibdev.h index 490897628f41..94ef76aaca43 100644 --- a/drivers/infiniband/hw/ionic/ionic_ibdev.h +++ b/drivers/infiniband/hw/ionic/ionic_ibdev.h @@ -6,7 +6,10 @@ #include #include +#include +#include +#include #include #include @@ -24,9 +27,26 @@ #define IONIC_EQ_ISR_BUDGET 10 #define IONIC_EQ_WORK_BUDGET 1000 #define IONIC_MAX_PD 1024 +#define IONIC_SPEC_HIGH 8 +#define IONIC_SQCMB_ORDER 5 +#define IONIC_RQCMB_ORDER 0 + +#define IONIC_META_LAST ((void *)1ul) +#define IONIC_META_POSTED ((void *)2ul) #define IONIC_CQ_GRACE 100 +#define IONIC_ROCE_UDP_SPORT 28272 +#define IONIC_DMA_LKEY 0 +#define IONIC_DMA_RKEY IONIC_DMA_LKEY + +#define IONIC_CMB_SUPPORTED \ + (IONIC_CMB_ENABLE | IONIC_CMB_REQUIRE | IONIC_CMB_EXPDB | \ + IONIC_CMB_WC | IONIC_CMB_UC) + +/* resource is not reserved on the device, indicated in tbl_order */ +#define IONIC_RES_INVALID -1 + struct ionic_aq; struct ionic_cq; struct ionic_eq; @@ -44,14 +64,6 @@ enum ionic_admin_flags { IONIC_ADMIN_F_INTERRUPT = BIT(2), /* Interruptible w/timeout */ }; -struct ionic_qdesc { - __aligned_u64 addr; - __u32 size; - __u16 mask; - __u8 depth_log2; - __u8 stride_log2; -}; - enum ionic_mmap_flag { IONIC_MMAP_WC = BIT(0), }; @@ -160,6 +172,13 @@ struct ionic_tbl_buf { u8 page_size_log2; }; +struct ionic_pd { + struct ib_pd ibpd; + + u32 pdid; + u32 flags; +}; + struct ionic_cq { struct ionic_vcq *vcq; @@ -193,11 +212,188 @@ struct ionic_vcq { u8 poll_idx; }; +struct ionic_sq_meta { + u64 wrid; + u32 len; + u16 seq; + u8 ibop; + u8 ibsts; + u8 remote:1; + u8 signal:1; + u8 local_comp:1; +}; + +struct ionic_rq_meta { + struct ionic_rq_meta *next; + u64 wrid; +}; + +struct ionic_qp { + struct ib_qp ibqp; + enum ib_qp_state state; + + u32 qpid; + u32 ahid; + u32 sq_cqid; + u32 rq_cqid; + u8 udma_idx; + u8 has_ah:1; + u8 has_sq:1; + u8 has_rq:1; + u8 sig_all:1; + + struct list_head qp_list_counter; + + struct list_head cq_poll_sq; + struct list_head cq_flush_sq; + struct list_head cq_flush_rq; + struct list_head ibkill_flush_ent; + + spinlock_t sq_lock; /* for posting and polling */ + struct ionic_queue sq; + struct ionic_sq_meta *sq_meta; + u16 *sq_msn_idx; + int sq_spec; + u16 sq_old_prod; + u16 sq_msn_prod; + u16 sq_msn_cons; + u8 sq_cmb; + bool sq_flush; + bool sq_flush_rcvd; + + spinlock_t rq_lock; /* for posting and polling */ + struct ionic_queue rq; + struct ionic_rq_meta *rq_meta; + struct ionic_rq_meta *rq_meta_head; + int rq_spec; + u16 rq_old_prod; + u8 rq_cmb; + bool rq_flush; + + struct kref qp_kref; + struct completion qp_rel_comp; + + /* infrequently accessed, keep at end */ + int sgid_index; + int sq_cmb_order; + u32 sq_cmb_pgid; + phys_addr_t sq_cmb_addr; + struct rdma_user_mmap_entry *mmap_sq_cmb; + + struct ib_umem *sq_umem; + + int rq_cmb_order; + u32 rq_cmb_pgid; + phys_addr_t rq_cmb_addr; + struct rdma_user_mmap_entry *mmap_rq_cmb; + + struct ib_umem *rq_umem; + + int dcqcn_profile; + + struct ib_ud_header *hdr; +}; + +struct ionic_ah { + struct ib_ah ibah; + u32 ahid; + int sgid_index; + struct ib_ud_header hdr; +}; + +struct ionic_mr { + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + }; + + u32 mrid; + int flags; + + struct ib_umem *umem; + struct ionic_tbl_buf buf; + bool created; +}; + static inline struct ionic_ibdev *to_ionic_ibdev(struct ib_device *ibdev) { return container_of(ibdev, struct ionic_ibdev, ibdev); } +static inline struct ionic_ctx *to_ionic_ctx(struct ib_ucontext *ibctx) +{ + return container_of(ibctx, struct ionic_ctx, ibctx); +} + +static inline struct ionic_ctx *to_ionic_ctx_uobj(struct ib_uobject *uobj) +{ + if (!uobj) + return NULL; + + if (!uobj->context) + return NULL; + + return to_ionic_ctx(uobj->context); +} + +static inline struct ionic_pd *to_ionic_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct ionic_pd, ibpd); +} + +static inline struct ionic_mr *to_ionic_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct ionic_mr, ibmr); +} + +static inline struct ionic_mr *to_ionic_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct ionic_mr, ibmw); +} + +static inline struct ionic_vcq *to_ionic_vcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct ionic_vcq, ibcq); +} + +static inline struct ionic_cq *to_ionic_vcq_cq(struct ib_cq *ibcq, + uint8_t udma_idx) +{ + return &to_ionic_vcq(ibcq)->cq[udma_idx]; +} + +static inline struct ionic_qp *to_ionic_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct ionic_qp, ibqp); +} + +static inline struct ionic_ah *to_ionic_ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct ionic_ah, ibah); +} + +static inline u32 ionic_ctx_dbid(struct ionic_ibdev *dev, + struct ionic_ctx *ctx) +{ + if (!ctx) + return dev->lif_cfg.dbid; + + return ctx->dbid; +} + +static inline u32 ionic_obj_dbid(struct ionic_ibdev *dev, + struct ib_uobject *uobj) +{ + return ionic_ctx_dbid(dev, to_ionic_ctx_uobj(uobj)); +} + +static inline void ionic_qp_complete(struct kref *kref) +{ + struct ionic_qp *qp = container_of(kref, struct ionic_qp, qp_kref); + + complete(&qp->qp_rel_comp); +} + static inline void ionic_cq_complete(struct kref *kref) { struct ionic_cq *cq = container_of(kref, struct ionic_cq, cq_kref); @@ -227,8 +423,47 @@ int ionic_create_cq_common(struct ionic_vcq *vcq, __u32 *resp_cqid, int udma_idx); void ionic_destroy_cq_common(struct ionic_ibdev *dev, struct ionic_cq *cq); +void ionic_flush_qp(struct ionic_ibdev *dev, struct ionic_qp *qp); +void ionic_notify_flush_cq(struct ionic_cq *cq); + +int ionic_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata); +void ionic_dealloc_ucontext(struct ib_ucontext *ibctx); +int ionic_mmap(struct ib_ucontext *ibctx, struct vm_area_struct *vma); +void ionic_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +int ionic_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int ionic_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int ionic_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int ionic_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); +int ionic_destroy_ah(struct ib_ah *ibah, u32 flags); +struct ib_mr *ionic_get_dma_mr(struct ib_pd *ibpd, int access); +struct ib_mr *ionic_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 addr, int access, struct ib_dmah *dmah, + struct ib_udata *udata); +struct ib_mr *ionic_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 offset, + u64 length, u64 addr, int fd, int access, + struct ib_dmah *dmah, + struct uverbs_attr_bundle *attrs); +int ionic_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); +struct ib_mr *ionic_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type type, + u32 max_sg); +int ionic_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +int ionic_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata); +int ionic_dealloc_mw(struct ib_mw *ibmw); +int ionic_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs); +int ionic_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +int ionic_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata); +int ionic_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata); +int ionic_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_qp_init_attr *init_attr); +int ionic_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); /* ionic_pgtbl.c */ +__le64 ionic_pgtbl_dma(struct ionic_tbl_buf *buf, u64 va); int ionic_pgtbl_page(struct ionic_tbl_buf *buf, u64 dma); int ionic_pgtbl_init(struct ionic_ibdev *dev, struct ionic_tbl_buf *buf, diff --git a/drivers/infiniband/hw/ionic/ionic_pgtbl.c b/drivers/infiniband/hw/ionic/ionic_pgtbl.c index 11461f7642bc..a8eb73be6f86 100644 --- a/drivers/infiniband/hw/ionic/ionic_pgtbl.c +++ b/drivers/infiniband/hw/ionic/ionic_pgtbl.c @@ -7,6 +7,25 @@ #include "ionic_fw.h" #include "ionic_ibdev.h" +__le64 ionic_pgtbl_dma(struct ionic_tbl_buf *buf, u64 va) +{ + u64 pg_mask = BIT_ULL(buf->page_size_log2) - 1; + u64 dma; + + if (!buf->tbl_pages) + return cpu_to_le64(0); + + if (buf->tbl_pages > 1) + return cpu_to_le64(buf->tbl_dma); + + if (buf->tbl_buf) + dma = le64_to_cpu(buf->tbl_buf[0]); + else + dma = buf->tbl_dma; + + return cpu_to_le64(dma + (va & pg_mask)); +} + int ionic_pgtbl_page(struct ionic_tbl_buf *buf, u64 dma) { if (unlikely(buf->tbl_pages == buf->tbl_limit)) diff --git a/include/uapi/rdma/ionic-abi.h b/include/uapi/rdma/ionic-abi.h new file mode 100644 index 000000000000..7b589d3e9728 --- /dev/null +++ b/include/uapi/rdma/ionic-abi.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc */ + +#ifndef IONIC_ABI_H +#define IONIC_ABI_H + +#include + +#define IONIC_ABI_VERSION 1 + +#define IONIC_EXPDB_64 1 +#define IONIC_EXPDB_128 2 +#define IONIC_EXPDB_256 4 +#define IONIC_EXPDB_512 8 + +#define IONIC_EXPDB_SQ 1 +#define IONIC_EXPDB_RQ 2 + +#define IONIC_CMB_ENABLE 1 +#define IONIC_CMB_REQUIRE 2 +#define IONIC_CMB_EXPDB 4 +#define IONIC_CMB_WC 8 +#define IONIC_CMB_UC 16 + +struct ionic_ctx_req { + __u32 rsvd[2]; +}; + +struct ionic_ctx_resp { + __u32 rsvd; + __u32 page_shift; + + __aligned_u64 dbell_offset; + + __u16 version; + __u8 qp_opcodes; + __u8 admin_opcodes; + + __u8 sq_qtype; + __u8 rq_qtype; + __u8 cq_qtype; + __u8 admin_qtype; + + __u8 max_stride; + __u8 max_spec; + __u8 udma_count; + __u8 expdb_mask; + __u8 expdb_qtypes; + + __u8 rsvd2[3]; +}; + +struct ionic_qdesc { + __aligned_u64 addr; + __u32 size; + __u16 mask; + __u8 depth_log2; + __u8 stride_log2; +}; + +struct ionic_ah_resp { + __u32 ahid; + __u32 pad; +}; + +struct ionic_cq_req { + struct ionic_qdesc cq[2]; + __u8 udma_mask; + __u8 rsvd[7]; +}; + +struct ionic_cq_resp { + __u32 cqid[2]; + __u8 udma_mask; + __u8 rsvd[7]; +}; + +struct ionic_qp_req { + struct ionic_qdesc sq; + struct ionic_qdesc rq; + __u8 sq_spec; + __u8 rq_spec; + __u8 sq_cmb; + __u8 rq_cmb; + __u8 udma_mask; + __u8 rsvd[3]; +}; + +struct ionic_qp_resp { + __u32 qpid; + __u8 sq_cmb; + __u8 rq_cmb; + __u8 udma_idx; + __u8 rsvd[1]; + __aligned_u64 sq_cmb_offset; + __aligned_u64 rq_cmb_offset; +}; + +struct ionic_srq_req { + struct ionic_qdesc rq; + __u8 rq_spec; + __u8 rq_cmb; + __u8 udma_mask; + __u8 rsvd[5]; +}; + +struct ionic_srq_resp { + __u32 qpid; + __u8 rq_cmb; + __u8 udma_idx; + __u8 rsvd[2]; + __aligned_u64 rq_cmb_offset; +}; + +#endif /* IONIC_ABI_H */ -- cgit v1.2.3 From c924c65f52c300ba36373e140a43a8e723c3abdd Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Wed, 13 Aug 2025 08:02:52 +0200 Subject: tee: implement protected DMA-heap Implement DMA heap for protected DMA-buf allocation in the TEE subsystem. Protected memory refers to memory buffers behind a hardware enforced firewall. It is not accessible to the kernel during normal circumstances but rather only accessible to certain hardware IPs or CPUs executing in higher or differently privileged mode than the kernel itself. This interface allows to allocate and manage such protected memory buffers via interaction with a TEE implementation. The protected memory is allocated for a specific use-case, like Secure Video Playback, Trusted UI, or Secure Video Recording where certain hardware devices can access the memory. The DMA-heaps are enabled explicitly by the TEE backend driver. The TEE backend drivers needs to implement protected memory pool to manage the protected memory. Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/Kconfig | 5 + drivers/tee/Makefile | 1 + drivers/tee/tee_core.c | 4 + drivers/tee/tee_heap.c | 500 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/tee/tee_private.h | 6 + include/linux/tee_core.h | 53 +++++ 6 files changed, 569 insertions(+) create mode 100644 drivers/tee/tee_heap.c (limited to 'include') diff --git a/drivers/tee/Kconfig b/drivers/tee/Kconfig index 61b507c18780..90600607a9d8 100644 --- a/drivers/tee/Kconfig +++ b/drivers/tee/Kconfig @@ -13,6 +13,11 @@ menuconfig TEE if TEE +config TEE_DMABUF_HEAPS + bool + depends on HAS_DMA && DMABUF_HEAPS + default y + source "drivers/tee/optee/Kconfig" source "drivers/tee/amdtee/Kconfig" source "drivers/tee/tstee/Kconfig" diff --git a/drivers/tee/Makefile b/drivers/tee/Makefile index 5488cba30bd2..949a6a79fb06 100644 --- a/drivers/tee/Makefile +++ b/drivers/tee/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_TEE) += tee.o tee-objs += tee_core.o +tee-objs += tee_heap.o tee-objs += tee_shm.o tee-objs += tee_shm_pool.o obj-$(CONFIG_OPTEE) += optee/ diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index acc7998758ad..2411d1e2aa7a 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -1064,6 +1064,8 @@ void tee_device_unregister(struct tee_device *teedev) if (!teedev) return; + tee_device_put_all_dma_heaps(teedev); + if (teedev->flags & TEE_DEVICE_FLAG_REGISTERED) cdev_device_del(&teedev->cdev, &teedev->dev); @@ -1287,3 +1289,5 @@ MODULE_AUTHOR("Linaro"); MODULE_DESCRIPTION("TEE Driver"); MODULE_VERSION("1.0"); MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS("DMA_BUF"); +MODULE_IMPORT_NS("DMA_BUF_HEAP"); diff --git a/drivers/tee/tee_heap.c b/drivers/tee/tee_heap.c new file mode 100644 index 000000000000..d8d7735cdffb --- /dev/null +++ b/drivers/tee/tee_heap.c @@ -0,0 +1,500 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, Linaro Limited + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tee_private.h" + +struct tee_dma_heap { + struct dma_heap *heap; + enum tee_dma_heap_id id; + struct kref kref; + struct tee_protmem_pool *pool; + struct tee_device *teedev; + bool shutting_down; + /* Protects pool, teedev, and shutting_down above */ + struct mutex mu; +}; + +struct tee_heap_buffer { + struct tee_dma_heap *heap; + size_t size; + size_t offs; + struct sg_table table; +}; + +struct tee_heap_attachment { + struct sg_table table; + struct device *dev; +}; + +struct tee_protmem_static_pool { + struct tee_protmem_pool pool; + struct gen_pool *gen_pool; + phys_addr_t pa_base; +}; + +#if IS_ENABLED(CONFIG_TEE_DMABUF_HEAPS) +static DEFINE_XARRAY_ALLOC(tee_dma_heap); + +static void tee_heap_release(struct kref *kref) +{ + struct tee_dma_heap *h = container_of(kref, struct tee_dma_heap, kref); + + h->pool->ops->destroy_pool(h->pool); + tee_device_put(h->teedev); + h->pool = NULL; + h->teedev = NULL; +} + +static void put_tee_heap(struct tee_dma_heap *h) +{ + kref_put(&h->kref, tee_heap_release); +} + +static void get_tee_heap(struct tee_dma_heap *h) +{ + kref_get(&h->kref); +} + +static int copy_sg_table(struct sg_table *dst, struct sg_table *src) +{ + struct scatterlist *dst_sg; + struct scatterlist *src_sg; + int ret; + int i; + + ret = sg_alloc_table(dst, src->orig_nents, GFP_KERNEL); + if (ret) + return ret; + + dst_sg = dst->sgl; + for_each_sgtable_sg(src, src_sg, i) { + sg_set_page(dst_sg, sg_page(src_sg), src_sg->length, + src_sg->offset); + dst_sg = sg_next(dst_sg); + } + + return 0; +} + +static int tee_heap_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct tee_heap_buffer *buf = dmabuf->priv; + struct tee_heap_attachment *a; + int ret; + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return -ENOMEM; + + ret = copy_sg_table(&a->table, &buf->table); + if (ret) { + kfree(a); + return ret; + } + + a->dev = attachment->dev; + attachment->priv = a; + + return 0; +} + +static void tee_heap_detach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct tee_heap_attachment *a = attachment->priv; + + sg_free_table(&a->table); + kfree(a); +} + +static struct sg_table * +tee_heap_map_dma_buf(struct dma_buf_attachment *attachment, + enum dma_data_direction direction) +{ + struct tee_heap_attachment *a = attachment->priv; + int ret; + + ret = dma_map_sgtable(attachment->dev, &a->table, direction, + DMA_ATTR_SKIP_CPU_SYNC); + if (ret) + return ERR_PTR(ret); + + return &a->table; +} + +static void tee_heap_unmap_dma_buf(struct dma_buf_attachment *attachment, + struct sg_table *table, + enum dma_data_direction direction) +{ + struct tee_heap_attachment *a = attachment->priv; + + WARN_ON(&a->table != table); + + dma_unmap_sgtable(attachment->dev, table, direction, + DMA_ATTR_SKIP_CPU_SYNC); +} + +static void tee_heap_buf_free(struct dma_buf *dmabuf) +{ + struct tee_heap_buffer *buf = dmabuf->priv; + + buf->heap->pool->ops->free(buf->heap->pool, &buf->table); + mutex_lock(&buf->heap->mu); + put_tee_heap(buf->heap); + mutex_unlock(&buf->heap->mu); + kfree(buf); +} + +static const struct dma_buf_ops tee_heap_buf_ops = { + .attach = tee_heap_attach, + .detach = tee_heap_detach, + .map_dma_buf = tee_heap_map_dma_buf, + .unmap_dma_buf = tee_heap_unmap_dma_buf, + .release = tee_heap_buf_free, +}; + +static struct dma_buf *tee_dma_heap_alloc(struct dma_heap *heap, + unsigned long len, u32 fd_flags, + u64 heap_flags) +{ + struct tee_dma_heap *h = dma_heap_get_drvdata(heap); + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct tee_device *teedev = NULL; + struct tee_heap_buffer *buf; + struct tee_protmem_pool *pool; + struct dma_buf *dmabuf; + int rc; + + mutex_lock(&h->mu); + if (h->teedev) { + teedev = h->teedev; + pool = h->pool; + get_tee_heap(h); + } + mutex_unlock(&h->mu); + + if (!teedev) + return ERR_PTR(-EINVAL); + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) { + dmabuf = ERR_PTR(-ENOMEM); + goto err; + } + buf->size = len; + buf->heap = h; + + rc = pool->ops->alloc(pool, &buf->table, len, &buf->offs); + if (rc) { + dmabuf = ERR_PTR(rc); + goto err_kfree; + } + + exp_info.ops = &tee_heap_buf_ops; + exp_info.size = len; + exp_info.priv = buf; + exp_info.flags = fd_flags; + dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(dmabuf)) + goto err_protmem_free; + + return dmabuf; + +err_protmem_free: + pool->ops->free(pool, &buf->table); +err_kfree: + kfree(buf); +err: + mutex_lock(&h->mu); + put_tee_heap(h); + mutex_unlock(&h->mu); + return dmabuf; +} + +static const struct dma_heap_ops tee_dma_heap_ops = { + .allocate = tee_dma_heap_alloc, +}; + +static const char *heap_id_2_name(enum tee_dma_heap_id id) +{ + switch (id) { + case TEE_DMA_HEAP_SECURE_VIDEO_PLAY: + return "protected,secure-video"; + case TEE_DMA_HEAP_TRUSTED_UI: + return "protected,trusted-ui"; + case TEE_DMA_HEAP_SECURE_VIDEO_RECORD: + return "protected,secure-video-record"; + default: + return NULL; + } +} + +static int alloc_dma_heap(struct tee_device *teedev, enum tee_dma_heap_id id, + struct tee_protmem_pool *pool) +{ + struct dma_heap_export_info exp_info = { + .ops = &tee_dma_heap_ops, + .name = heap_id_2_name(id), + }; + struct tee_dma_heap *h; + int rc; + + if (!exp_info.name) + return -EINVAL; + + if (xa_reserve(&tee_dma_heap, id, GFP_KERNEL)) { + if (!xa_load(&tee_dma_heap, id)) + return -EEXIST; + return -ENOMEM; + } + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return -ENOMEM; + h->id = id; + kref_init(&h->kref); + h->teedev = teedev; + h->pool = pool; + mutex_init(&h->mu); + + exp_info.priv = h; + h->heap = dma_heap_add(&exp_info); + if (IS_ERR(h->heap)) { + rc = PTR_ERR(h->heap); + kfree(h); + + return rc; + } + + /* "can't fail" due to the call to xa_reserve() above */ + return WARN_ON(xa_is_err(xa_store(&tee_dma_heap, id, h, GFP_KERNEL))); +} + +int tee_device_register_dma_heap(struct tee_device *teedev, + enum tee_dma_heap_id id, + struct tee_protmem_pool *pool) +{ + struct tee_dma_heap *h; + int rc; + + if (!tee_device_get(teedev)) + return -EINVAL; + + h = xa_load(&tee_dma_heap, id); + if (h) { + mutex_lock(&h->mu); + if (h->teedev) { + rc = -EBUSY; + } else { + kref_init(&h->kref); + h->shutting_down = false; + h->teedev = teedev; + h->pool = pool; + rc = 0; + } + mutex_unlock(&h->mu); + } else { + rc = alloc_dma_heap(teedev, id, pool); + } + + if (rc) { + tee_device_put(teedev); + dev_err(&teedev->dev, "can't register DMA heap id %d (%s)\n", + id, heap_id_2_name(id)); + } + + return rc; +} +EXPORT_SYMBOL_GPL(tee_device_register_dma_heap); + +void tee_device_put_all_dma_heaps(struct tee_device *teedev) +{ + struct tee_dma_heap *h; + u_long i; + + xa_for_each(&tee_dma_heap, i, h) { + if (h) { + mutex_lock(&h->mu); + if (h->teedev == teedev && !h->shutting_down) { + h->shutting_down = true; + put_tee_heap(h); + } + mutex_unlock(&h->mu); + } + } +} +EXPORT_SYMBOL_GPL(tee_device_put_all_dma_heaps); + +int tee_heap_update_from_dma_buf(struct tee_device *teedev, + struct dma_buf *dmabuf, size_t *offset, + struct tee_shm *shm, + struct tee_shm **parent_shm) +{ + struct tee_heap_buffer *buf; + int rc; + + /* The DMA-buf must be from our heap */ + if (dmabuf->ops != &tee_heap_buf_ops) + return -EINVAL; + + buf = dmabuf->priv; + /* The buffer must be from the same teedev */ + if (buf->heap->teedev != teedev) + return -EINVAL; + + shm->size = buf->size; + + rc = buf->heap->pool->ops->update_shm(buf->heap->pool, &buf->table, + buf->offs, shm, parent_shm); + if (!rc && *parent_shm) + *offset = buf->offs; + + return rc; +} +#else +int tee_device_register_dma_heap(struct tee_device *teedev __always_unused, + enum tee_dma_heap_id id __always_unused, + struct tee_protmem_pool *pool __always_unused) +{ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(tee_device_register_dma_heap); + +void +tee_device_put_all_dma_heaps(struct tee_device *teedev __always_unused) +{ +} +EXPORT_SYMBOL_GPL(tee_device_put_all_dma_heaps); + +int tee_heap_update_from_dma_buf(struct tee_device *teedev __always_unused, + struct dma_buf *dmabuf __always_unused, + size_t *offset __always_unused, + struct tee_shm *shm __always_unused, + struct tee_shm **parent_shm __always_unused) +{ + return -EINVAL; +} +#endif + +static struct tee_protmem_static_pool * +to_protmem_static_pool(struct tee_protmem_pool *pool) +{ + return container_of(pool, struct tee_protmem_static_pool, pool); +} + +static int protmem_pool_op_static_alloc(struct tee_protmem_pool *pool, + struct sg_table *sgt, size_t size, + size_t *offs) +{ + struct tee_protmem_static_pool *stp = to_protmem_static_pool(pool); + phys_addr_t pa; + int ret; + + pa = gen_pool_alloc(stp->gen_pool, size); + if (!pa) + return -ENOMEM; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (ret) { + gen_pool_free(stp->gen_pool, pa, size); + return ret; + } + + sg_set_page(sgt->sgl, phys_to_page(pa), size, 0); + *offs = pa - stp->pa_base; + + return 0; +} + +static void protmem_pool_op_static_free(struct tee_protmem_pool *pool, + struct sg_table *sgt) +{ + struct tee_protmem_static_pool *stp = to_protmem_static_pool(pool); + struct scatterlist *sg; + int i; + + for_each_sgtable_sg(sgt, sg, i) + gen_pool_free(stp->gen_pool, sg_phys(sg), sg->length); + sg_free_table(sgt); +} + +static int protmem_pool_op_static_update_shm(struct tee_protmem_pool *pool, + struct sg_table *sgt, size_t offs, + struct tee_shm *shm, + struct tee_shm **parent_shm) +{ + struct tee_protmem_static_pool *stp = to_protmem_static_pool(pool); + + shm->paddr = stp->pa_base + offs; + *parent_shm = NULL; + + return 0; +} + +static void protmem_pool_op_static_destroy_pool(struct tee_protmem_pool *pool) +{ + struct tee_protmem_static_pool *stp = to_protmem_static_pool(pool); + + gen_pool_destroy(stp->gen_pool); + kfree(stp); +} + +static struct tee_protmem_pool_ops protmem_pool_ops_static = { + .alloc = protmem_pool_op_static_alloc, + .free = protmem_pool_op_static_free, + .update_shm = protmem_pool_op_static_update_shm, + .destroy_pool = protmem_pool_op_static_destroy_pool, +}; + +struct tee_protmem_pool *tee_protmem_static_pool_alloc(phys_addr_t paddr, + size_t size) +{ + const size_t page_mask = PAGE_SIZE - 1; + struct tee_protmem_static_pool *stp; + int rc; + + /* Check it's page aligned */ + if ((paddr | size) & page_mask) + return ERR_PTR(-EINVAL); + + if (!pfn_valid(PHYS_PFN(paddr))) + return ERR_PTR(-EINVAL); + + stp = kzalloc(sizeof(*stp), GFP_KERNEL); + if (!stp) + return ERR_PTR(-ENOMEM); + + stp->gen_pool = gen_pool_create(PAGE_SHIFT, -1); + if (!stp->gen_pool) { + rc = -ENOMEM; + goto err_free; + } + + rc = gen_pool_add(stp->gen_pool, paddr, size, -1); + if (rc) + goto err_free_pool; + + stp->pool.ops = &protmem_pool_ops_static; + stp->pa_base = paddr; + return &stp->pool; + +err_free_pool: + gen_pool_destroy(stp->gen_pool); +err_free: + kfree(stp); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(tee_protmem_static_pool_alloc); diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h index 9bc50605227c..6c6ff5d5eed2 100644 --- a/drivers/tee/tee_private.h +++ b/drivers/tee/tee_private.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -24,4 +25,9 @@ struct tee_shm *tee_shm_alloc_user_buf(struct tee_context *ctx, size_t size); struct tee_shm *tee_shm_register_user_buf(struct tee_context *ctx, unsigned long addr, size_t length); +int tee_heap_update_from_dma_buf(struct tee_device *teedev, + struct dma_buf *dmabuf, size_t *offset, + struct tee_shm *shm, + struct tee_shm **parent_shm); + #endif /*TEE_PRIVATE_H*/ diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index a38494d6b5f4..28b65010b9ed 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -8,9 +8,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -30,6 +32,12 @@ #define TEE_DEVICE_FLAG_REGISTERED 0x1 #define TEE_MAX_DEV_NAME_LEN 32 +enum tee_dma_heap_id { + TEE_DMA_HEAP_SECURE_VIDEO_PLAY = 1, + TEE_DMA_HEAP_TRUSTED_UI, + TEE_DMA_HEAP_SECURE_VIDEO_RECORD, +}; + /** * struct tee_device - TEE Device representation * @name: name of device @@ -116,6 +124,36 @@ struct tee_desc { u32 flags; }; +/** + * struct tee_protmem_pool - protected memory pool + * @ops: operations + * + * This is an abstract interface where this struct is expected to be + * embedded in another struct specific to the implementation. + */ +struct tee_protmem_pool { + const struct tee_protmem_pool_ops *ops; +}; + +/** + * struct tee_protmem_pool_ops - protected memory pool operations + * @alloc: called when allocating protected memory + * @free: called when freeing protected memory + * @update_shm: called when registering a dma-buf to update the @shm + * with physical address of the buffer or to return the + * @parent_shm of the memory pool + * @destroy_pool: called when destroying the pool + */ +struct tee_protmem_pool_ops { + int (*alloc)(struct tee_protmem_pool *pool, struct sg_table *sgt, + size_t size, size_t *offs); + void (*free)(struct tee_protmem_pool *pool, struct sg_table *sgt); + int (*update_shm)(struct tee_protmem_pool *pool, struct sg_table *sgt, + size_t offs, struct tee_shm *shm, + struct tee_shm **parent_shm); + void (*destroy_pool)(struct tee_protmem_pool *pool); +}; + /** * tee_device_alloc() - Allocate a new struct tee_device instance * @teedesc: Descriptor for this driver @@ -154,6 +192,11 @@ int tee_device_register(struct tee_device *teedev); */ void tee_device_unregister(struct tee_device *teedev); +int tee_device_register_dma_heap(struct tee_device *teedev, + enum tee_dma_heap_id id, + struct tee_protmem_pool *pool); +void tee_device_put_all_dma_heaps(struct tee_device *teedev); + /** * tee_device_set_dev_groups() - Set device attribute groups * @teedev: Device to register @@ -229,6 +272,16 @@ static inline void tee_shm_pool_free(struct tee_shm_pool *pool) pool->ops->destroy_pool(pool); } +/** + * tee_protmem_static_pool_alloc() - Create a protected memory manager + * @paddr: Physical address of start of pool + * @size: Size in bytes of the pool + * + * @returns pointer to a 'struct tee_protmem_pool' or an ERR_PTR on failure. + */ +struct tee_protmem_pool *tee_protmem_static_pool_alloc(phys_addr_t paddr, + size_t size); + /** * tee_get_drvdata() - Return driver_data pointer * @returns the driver_data pointer supplied to tee_register(). -- cgit v1.2.3 From 146bf4e75ecab9759ed78c9d167e860042d627fb Mon Sep 17 00:00:00 2001 From: Etienne Carriere Date: Wed, 13 Aug 2025 08:02:54 +0200 Subject: tee: new ioctl to a register tee_shm from a dmabuf file descriptor Add a userspace API to create a tee_shm object that refers to a dmabuf reference. Userspace registers the dmabuf file descriptor as in a tee_shm object. The registration is completed with a tee_shm returned file descriptor. Userspace is free to close the dmabuf file descriptor after it has been registered since all the resources are now held via the new tee_shm object. Closing the tee_shm file descriptor will eventually release all resources used by the tee_shm object when all references are released. The new IOCTL, TEE_IOC_SHM_REGISTER_FD, supports dmabuf references to physically contiguous memory buffers. Dmabuf references acquired from the TEE DMA-heap can be used as protected memory for Secure Video Path and such use cases. It depends on the TEE and the TEE driver if dmabuf references acquired by other means can be used. A new tee_shm flag is added to identify tee_shm objects built from a registered dmabuf, TEE_SHM_DMA_BUF. Signed-off-by: Etienne Carriere Signed-off-by: Olivier Masse Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 62 ++++++++++++++++++++++++++++++++++++++- drivers/tee/tee_private.h | 8 +++++ drivers/tee/tee_shm.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-- include/linux/tee_core.h | 1 + include/linux/tee_drv.h | 10 +++++++ include/uapi/linux/tee.h | 31 ++++++++++++++++++++ 6 files changed, 182 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 4996f194cd9e..cea4da3ea46f 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -354,11 +354,49 @@ tee_ioctl_shm_register(struct tee_context *ctx, return ret; } +static int +tee_ioctl_shm_register_fd(struct tee_context *ctx, + struct tee_ioctl_shm_register_fd_data __user *udata) +{ + struct tee_ioctl_shm_register_fd_data data; + struct tee_shm *shm; + long ret; + + if (copy_from_user(&data, udata, sizeof(data))) + return -EFAULT; + + /* Currently no input flags are supported */ + if (data.flags) + return -EINVAL; + + shm = tee_shm_register_fd(ctx, data.fd); + if (IS_ERR(shm)) + return -EINVAL; + + data.id = shm->id; + data.flags = shm->flags; + data.size = shm->size; + + if (copy_to_user(udata, &data, sizeof(data))) + ret = -EFAULT; + else + ret = tee_shm_get_fd(shm); + + /* + * When user space closes the file descriptor the shared memory + * should be freed or if tee_shm_get_fd() failed then it will + * be freed immediately. + */ + tee_shm_put(shm); + return ret; +} + static int param_from_user_memref(struct tee_context *ctx, struct tee_param_memref *memref, struct tee_ioctl_param *ip) { struct tee_shm *shm; + size_t offs = 0; /* * If a NULL pointer is passed to a TA in the TEE, @@ -389,6 +427,26 @@ static int param_from_user_memref(struct tee_context *ctx, tee_shm_put(shm); return -EINVAL; } + + if (shm->flags & TEE_SHM_DMA_BUF) { + struct tee_shm_dmabuf_ref *ref; + + ref = container_of(shm, struct tee_shm_dmabuf_ref, shm); + if (ref->parent_shm) { + /* + * The shm already has one reference to + * ref->parent_shm so we are clear of 0. + * We're getting another reference since + * this shm will be used in the parameter + * list instead of the shm we got with + * tee_shm_get_from_id() above. + */ + refcount_inc(&ref->parent_shm->refcount); + tee_shm_put(shm); + shm = ref->parent_shm; + offs = ref->offset; + } + } } else if (ctx->cap_memref_null) { /* Pass NULL pointer to OP-TEE */ shm = NULL; @@ -396,7 +454,7 @@ static int param_from_user_memref(struct tee_context *ctx, return -EINVAL; } - memref->shm_offs = ip->a; + memref->shm_offs = ip->a + offs; memref->size = ip->b; memref->shm = shm; @@ -842,6 +900,8 @@ static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return tee_ioctl_shm_alloc(ctx, uarg); case TEE_IOC_SHM_REGISTER: return tee_ioctl_shm_register(ctx, uarg); + case TEE_IOC_SHM_REGISTER_FD: + return tee_ioctl_shm_register_fd(ctx, uarg); case TEE_IOC_OPEN_SESSION: return tee_ioctl_open_session(ctx, uarg); case TEE_IOC_INVOKE: diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h index 6c6ff5d5eed2..a9b5e4a6a8f7 100644 --- a/drivers/tee/tee_private.h +++ b/drivers/tee/tee_private.h @@ -13,6 +13,14 @@ #include #include +/* extra references appended to shm object for registered shared memory */ +struct tee_shm_dmabuf_ref { + struct tee_shm shm; + size_t offset; + struct dma_buf *dmabuf; + struct tee_shm *parent_shm; +}; + int tee_shm_get_fd(struct tee_shm *shm); bool tee_device_get(struct tee_device *teedev); diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index daf6e5cfd59a..76195a398c89 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -4,6 +4,7 @@ */ #include #include +#include #include #include #include @@ -45,7 +46,15 @@ static void release_registered_pages(struct tee_shm *shm) static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm) { - if (shm->flags & TEE_SHM_POOL) { + void *p = shm; + + if (shm->flags & TEE_SHM_DMA_BUF) { + struct tee_shm_dmabuf_ref *ref; + + ref = container_of(shm, struct tee_shm_dmabuf_ref, shm); + p = ref; + dma_buf_put(ref->dmabuf); + } else if (shm->flags & TEE_SHM_POOL) { teedev->pool->ops->free(teedev->pool, shm); } else if (shm->flags & TEE_SHM_DYNAMIC) { int rc = teedev->desc->ops->shm_unregister(shm->ctx, shm); @@ -59,7 +68,7 @@ static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm) teedev_ctx_put(shm->ctx); - kfree(shm); + kfree(p); tee_device_put(teedev); } @@ -169,7 +178,7 @@ struct tee_shm *tee_shm_alloc_user_buf(struct tee_context *ctx, size_t size) * tee_client_invoke_func(). The memory allocated is later freed with a * call to tee_shm_free(). * - * @returns a pointer to 'struct tee_shm' + * @returns a pointer to 'struct tee_shm' on success, and ERR_PTR on failure */ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size) { @@ -179,6 +188,62 @@ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size) } EXPORT_SYMBOL_GPL(tee_shm_alloc_kernel_buf); +struct tee_shm *tee_shm_register_fd(struct tee_context *ctx, int fd) +{ + struct tee_shm_dmabuf_ref *ref; + int rc; + + if (!tee_device_get(ctx->teedev)) + return ERR_PTR(-EINVAL); + + teedev_ctx_get(ctx); + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) { + rc = -ENOMEM; + goto err_put_tee; + } + + refcount_set(&ref->shm.refcount, 1); + ref->shm.ctx = ctx; + ref->shm.id = -1; + ref->shm.flags = TEE_SHM_DMA_BUF; + + ref->dmabuf = dma_buf_get(fd); + if (IS_ERR(ref->dmabuf)) { + rc = PTR_ERR(ref->dmabuf); + goto err_kfree_ref; + } + + rc = tee_heap_update_from_dma_buf(ctx->teedev, ref->dmabuf, + &ref->offset, &ref->shm, + &ref->parent_shm); + if (rc) + goto err_put_dmabuf; + + mutex_lock(&ref->shm.ctx->teedev->mutex); + ref->shm.id = idr_alloc(&ref->shm.ctx->teedev->idr, &ref->shm, + 1, 0, GFP_KERNEL); + mutex_unlock(&ref->shm.ctx->teedev->mutex); + if (ref->shm.id < 0) { + rc = ref->shm.id; + goto err_put_dmabuf; + } + + return &ref->shm; + +err_put_dmabuf: + dma_buf_put(ref->dmabuf); +err_kfree_ref: + kfree(ref); +err_put_tee: + teedev_ctx_put(ctx); + tee_device_put(ctx->teedev); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(tee_shm_register_fd); + /** * tee_shm_alloc_priv_buf() - Allocate shared memory for a privately shared * kernel buffer @@ -442,6 +507,9 @@ static int tee_shm_fop_mmap(struct file *filp, struct vm_area_struct *vma) /* Refuse sharing shared memory provided by application */ if (shm->flags & TEE_SHM_USER_MAPPED) return -EINVAL; + /* Refuse sharing registered DMA_bufs with the application */ + if (shm->flags & TEE_SHM_DMA_BUF) + return -EINVAL; /* check for overflowing the buffer's size */ if (vma->vm_pgoff + vma_pages(vma) > shm->size >> PAGE_SHIFT) diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index 28b65010b9ed..b6c54b34a8b5 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -28,6 +28,7 @@ #define TEE_SHM_USER_MAPPED BIT(1) /* Memory mapped in user space */ #define TEE_SHM_POOL BIT(2) /* Memory allocated from pool */ #define TEE_SHM_PRIV BIT(3) /* Memory private to TEE driver */ +#define TEE_SHM_DMA_BUF BIT(4) /* Memory with dma-buf handle */ #define TEE_DEVICE_FLAG_REGISTERED 0x1 #define TEE_MAX_DEV_NAME_LEN 32 diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index a54c203000ed..824f1251de60 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -116,6 +116,16 @@ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size); struct tee_shm *tee_shm_register_kernel_buf(struct tee_context *ctx, void *addr, size_t length); +/** + * tee_shm_register_fd() - Register shared memory from file descriptor + * + * @ctx: Context that allocates the shared memory + * @fd: Shared memory file descriptor reference + * + * @returns a pointer to 'struct tee_shm' on success, and ERR_PTR on failure + */ +struct tee_shm *tee_shm_register_fd(struct tee_context *ctx, int fd); + /** * tee_shm_free() - Free shared memory * @shm: Handle to shared memory to free diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index d0430bee8292..d843cf980d98 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -378,6 +378,37 @@ struct tee_ioctl_shm_register_data { __s32 id; }; +/** + * struct tee_ioctl_shm_register_fd_data - Shared memory registering argument + * @fd: [in] File descriptor identifying dmabuf reference + * @size: [out] Size of referenced memory + * @flags: [in] Flags to/from allocation. + * @id: [out] Identifier of the shared memory + * + * The flags field should currently be zero as input. Updated by the call + * with actual flags as defined by TEE_IOCTL_SHM_* above. + * This structure is used as argument for TEE_IOC_SHM_REGISTER_FD below. + */ +struct tee_ioctl_shm_register_fd_data { + __s64 fd; + __u64 size; + __u32 flags; + __s32 id; +}; + +/** + * TEE_IOC_SHM_REGISTER_FD - register a shared memory from a file descriptor + * + * Returns a file descriptor on success or < 0 on failure + * + * The returned file descriptor refers to the shared memory object in the + * kernel. The supplied file deccriptor can be closed if it's not needed + * for other purposes. The shared memory is freed when the descriptor is + * closed. + */ +#define TEE_IOC_SHM_REGISTER_FD _IOWR(TEE_IOC_MAGIC, TEE_IOC_BASE + 8, \ + struct tee_ioctl_shm_register_fd_data) + /** * TEE_IOC_SHM_REGISTER - Register shared memory argument * -- cgit v1.2.3 From ab09dd6d9201af9930efd5a5a0cb56a0fea6a169 Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Wed, 13 Aug 2025 08:02:55 +0200 Subject: tee: add tee_shm_alloc_dma_mem() Add tee_shm_alloc_dma_mem() to allocate DMA memory. The memory is represented by a tee_shm object using the new flag TEE_SHM_DMA_MEM to identify it as DMA memory. The allocated memory will later be lent to the TEE to be used as protected memory. Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/tee_shm.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/tee_core.h | 5 +++ 2 files changed, 88 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 76195a398c89..e195c892431d 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include #include @@ -13,9 +15,14 @@ #include #include #include -#include #include "tee_private.h" +struct tee_shm_dma_mem { + struct tee_shm shm; + dma_addr_t dma_addr; + struct page *page; +}; + static void shm_put_kernel_pages(struct page **pages, size_t page_count) { size_t n; @@ -48,7 +55,16 @@ static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm) { void *p = shm; - if (shm->flags & TEE_SHM_DMA_BUF) { + if (shm->flags & TEE_SHM_DMA_MEM) { +#if IS_ENABLED(CONFIG_TEE_DMABUF_HEAPS) + struct tee_shm_dma_mem *dma_mem; + + dma_mem = container_of(shm, struct tee_shm_dma_mem, shm); + p = dma_mem; + dma_free_pages(&teedev->dev, shm->size, dma_mem->page, + dma_mem->dma_addr, DMA_BIDIRECTIONAL); +#endif + } else if (shm->flags & TEE_SHM_DMA_BUF) { struct tee_shm_dmabuf_ref *ref; ref = container_of(shm, struct tee_shm_dmabuf_ref, shm); @@ -268,6 +284,71 @@ struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size) } EXPORT_SYMBOL_GPL(tee_shm_alloc_priv_buf); +#if IS_ENABLED(CONFIG_TEE_DMABUF_HEAPS) +/** + * tee_shm_alloc_dma_mem() - Allocate DMA memory as shared memory object + * @ctx: Context that allocates the shared memory + * @page_count: Number of pages + * + * The allocated memory is expected to be lent (made inaccessible to the + * kernel) to the TEE while it's used and returned (accessible to the + * kernel again) before it's freed. + * + * This function should normally only be used internally in the TEE + * drivers. + * + * @returns a pointer to 'struct tee_shm' + */ +struct tee_shm *tee_shm_alloc_dma_mem(struct tee_context *ctx, + size_t page_count) +{ + struct tee_device *teedev = ctx->teedev; + struct tee_shm_dma_mem *dma_mem; + dma_addr_t dma_addr; + struct page *page; + + if (!tee_device_get(teedev)) + return ERR_PTR(-EINVAL); + + page = dma_alloc_pages(&teedev->dev, page_count * PAGE_SIZE, + &dma_addr, DMA_BIDIRECTIONAL, GFP_KERNEL); + if (!page) + goto err_put_teedev; + + dma_mem = kzalloc(sizeof(*dma_mem), GFP_KERNEL); + if (!dma_mem) + goto err_free_pages; + + refcount_set(&dma_mem->shm.refcount, 1); + dma_mem->shm.ctx = ctx; + dma_mem->shm.paddr = page_to_phys(page); + dma_mem->dma_addr = dma_addr; + dma_mem->page = page; + dma_mem->shm.size = page_count * PAGE_SIZE; + dma_mem->shm.flags = TEE_SHM_DMA_MEM; + + teedev_ctx_get(ctx); + + return &dma_mem->shm; + +err_free_pages: + dma_free_pages(&teedev->dev, page_count * PAGE_SIZE, page, dma_addr, + DMA_BIDIRECTIONAL); +err_put_teedev: + tee_device_put(teedev); + + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(tee_shm_alloc_dma_mem); +#else +struct tee_shm *tee_shm_alloc_dma_mem(struct tee_context *ctx, + size_t page_count) +{ + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL_GPL(tee_shm_alloc_dma_mem); +#endif + int tee_dyn_shm_alloc_helper(struct tee_shm *shm, size_t size, size_t align, int (*shm_register)(struct tee_context *ctx, struct tee_shm *shm, diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index b6c54b34a8b5..7b0c1da2ca6c 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -29,6 +29,8 @@ #define TEE_SHM_POOL BIT(2) /* Memory allocated from pool */ #define TEE_SHM_PRIV BIT(3) /* Memory private to TEE driver */ #define TEE_SHM_DMA_BUF BIT(4) /* Memory with dma-buf handle */ +#define TEE_SHM_DMA_MEM BIT(5) /* Memory allocated with */ + /* dma_alloc_pages() */ #define TEE_DEVICE_FLAG_REGISTERED 0x1 #define TEE_MAX_DEV_NAME_LEN 32 @@ -298,6 +300,9 @@ void *tee_get_drvdata(struct tee_device *teedev); */ struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size); +struct tee_shm *tee_shm_alloc_dma_mem(struct tee_context *ctx, + size_t page_count); + int tee_dyn_shm_alloc_helper(struct tee_shm *shm, size_t size, size_t align, int (*shm_register)(struct tee_context *ctx, struct tee_shm *shm, -- cgit v1.2.3 From 1827f773e416842bb0a1be93f313e02591e0b0c2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 5 Sep 2025 15:15:38 -0700 Subject: net: xdp: pass full flags to xdp_update_skb_shared_info() xdp_update_skb_shared_info() needs to update skb state which was maintained in xdp_buff / frame. Pass full flags into it, instead of breaking it out bit by bit. We will need to add a bit for unreadable frags (even tho XDP doesn't support those the driver paths may be common), at which point almost all call sites would become: xdp_update_skb_shared_info(skb, num_frags, sinfo->xdp_frags_size, MY_PAGE_SIZE * num_frags, xdp_buff_is_frag_pfmemalloc(xdp), xdp_buff_is_frag_unreadable(xdp)); Keep a helper for accessing the flags, in case we need to transform them somehow in the future (e.g. to cover up xdp_buff vs xdp_frame differences). While we are touching call callers - rename the helper to xdp_update_skb_frags_info(), previous name may have implied that it's shinfo that's updated. We are updating flags in struct sk_buff based on frags that got attched. Signed-off-by: Jakub Kicinski Acked-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20250905221539.2930285-2-kuba@kernel.org Acked-by: Stanislav Fomichev Reviewed-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 7 +++---- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 15 +++++++-------- drivers/net/ethernet/intel/ice/ice_txrx.c | 15 +++++++-------- drivers/net/ethernet/marvell/mvneta.c | 7 +++---- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 23 +++++++++++------------ drivers/net/virtio_net.c | 7 +++---- include/net/xdp.h | 23 +++++++++++------------ net/core/xdp.c | 21 ++++++++++----------- 8 files changed, 55 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 58d579dca3f1..3e77a96e5a3e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -468,9 +468,8 @@ bnxt_xdp_build_skb(struct bnxt *bp, struct sk_buff *skb, u8 num_frags, if (!skb) return NULL; - xdp_update_skb_shared_info(skb, num_frags, - sinfo->xdp_frags_size, - BNXT_RX_PAGE_SIZE * num_frags, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, num_frags, sinfo->xdp_frags_size, + BNXT_RX_PAGE_SIZE * num_frags, + xdp_buff_get_skb_flags(xdp)); return skb; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 048c33039130..98601c62c592 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2151,10 +2151,10 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0], sizeof(skb_frag_t) * nr_frags); - xdp_update_skb_shared_info(skb, skinfo->nr_frags + nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, skinfo->nr_frags + nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); /* First buffer has already been processed, so bump ntc */ if (++rx_ring->next_to_clean == rx_ring->count) @@ -2206,10 +2206,9 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) { - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); i40e_process_rx_buffs(rx_ring, I40E_XDP_PASS, xdp); } else { diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index d2871757ec94..107632a71f3c 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1035,10 +1035,9 @@ ice_build_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); return skb; } @@ -1115,10 +1114,10 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0], sizeof(skb_frag_t) * nr_frags); - xdp_update_skb_shared_info(skb, skinfo->nr_frags + nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, skinfo->nr_frags + nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); } return skb; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 476e73e502fe..7351e98d73f4 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2416,10 +2416,9 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool, skb->ip_summed = mvneta_rx_csum(pp, desc_status); if (unlikely(xdp_buff_has_frags(xdp))) - xdp_update_skb_shared_info(skb, num_frags, - sinfo->xdp_frags_size, - num_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, num_frags, sinfo->xdp_frags_size, + num_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); return skb; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index b8c609d91d11..2925ece136c4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1796,10 +1796,9 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi if (xdp_buff_has_frags(&mxbuf->xdp)) { /* sinfo->nr_frags is reset by build_skb, calculate again. */ - xdp_update_skb_shared_info(skb, wi - head_wi - 1, - sinfo->xdp_frags_size, truesize, - xdp_buff_is_frag_pfmemalloc( - &mxbuf->xdp)); + xdp_update_skb_frags_info(skb, wi - head_wi - 1, + sinfo->xdp_frags_size, truesize, + xdp_buff_get_skb_flags(&mxbuf->xdp)); for (struct mlx5e_wqe_frag_info *pwi = head_wi + 1; pwi < wi; pwi++) pwi->frag_page->frags++; @@ -2105,10 +2104,10 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w struct mlx5e_frag_page *pagep; /* sinfo->nr_frags is reset by build_skb, calculate again. */ - xdp_update_skb_shared_info(skb, frag_page - head_page, - sinfo->xdp_frags_size, truesize, - xdp_buff_is_frag_pfmemalloc( - &mxbuf->xdp)); + xdp_update_skb_frags_info(skb, frag_page - head_page, + sinfo->xdp_frags_size, + truesize, + xdp_buff_get_skb_flags(&mxbuf->xdp)); pagep = head_page; do @@ -2122,10 +2121,10 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w if (xdp_buff_has_frags(&mxbuf->xdp)) { struct mlx5e_frag_page *pagep; - xdp_update_skb_shared_info(skb, sinfo->nr_frags, - sinfo->xdp_frags_size, truesize, - xdp_buff_is_frag_pfmemalloc( - &mxbuf->xdp)); + xdp_update_skb_frags_info(skb, sinfo->nr_frags, + sinfo->xdp_frags_size, + truesize, + xdp_buff_get_skb_flags(&mxbuf->xdp)); pagep = frag_page - sinfo->nr_frags; do diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 975bdc5dab84..06708c9a979e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2185,10 +2185,9 @@ static struct sk_buff *build_skb_from_xdp_buff(struct net_device *dev, skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - xdp_frags_truesz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + xdp_frags_truesz, + xdp_buff_get_skb_flags(xdp)); return skb; } diff --git a/include/net/xdp.h b/include/net/xdp.h index af60e11b336c..976cfd2f113c 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -116,15 +116,14 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } -static __always_inline bool -xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) +static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { - return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } -static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) +static __always_inline u32 xdp_buff_get_skb_flags(const struct xdp_buff *xdp) { - xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; + return xdp->flags; } static __always_inline void @@ -294,10 +293,10 @@ static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } -static __always_inline bool -xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) +static __always_inline u32 +xdp_frame_get_skb_flags(const struct xdp_frame *frame) { - return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + return frame->flags; } #define XDP_BULK_QUEUE_SIZE 16 @@ -334,9 +333,9 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) } static inline void -xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, - unsigned int size, unsigned int truesize, - bool pfmemalloc) +xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags, + unsigned int size, unsigned int truesize, + u32 xdp_flags) { struct skb_shared_info *sinfo = skb_shinfo(skb); @@ -350,7 +349,7 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, skb->len += size; skb->data_len += size; skb->truesize += truesize; - skb->pfmemalloc |= pfmemalloc; + skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } /* Avoids inlining WARN macro in fast-path */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 491334b9b8be..9100e160113a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -663,9 +663,8 @@ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) u32 tsize; tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, tsize, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + tsize, xdp_buff_get_skb_flags(xdp)); } skb->protocol = eth_type_trans(skb, rxq->dev); @@ -692,7 +691,7 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, struct skb_shared_info *sinfo = skb_shinfo(skb); const struct skb_shared_info *xinfo; u32 nr_frags, tsize = 0; - bool pfmemalloc = false; + u32 flags = 0; xinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = xinfo->nr_frags; @@ -714,11 +713,12 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); tsize += truesize; - pfmemalloc |= page_is_pfmemalloc(page); + if (page_is_pfmemalloc(page)) + flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } - xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, - tsize, pfmemalloc); + xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize, + flags); return true; } @@ -823,10 +823,9 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, skb_metadata_set(skb, xdpf->metasize); if (unlikely(xdp_frame_has_frags(xdpf))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdpf->frame_sz, - xdp_frame_is_frag_pfmemalloc(xdpf)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdpf->frame_sz, + xdp_frame_get_skb_flags(xdpf)); /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); -- cgit v1.2.3 From 6bffdc0f88f85cd15b261286be4dc3c62ddea7d3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 5 Sep 2025 15:15:39 -0700 Subject: net: xdp: handle frags with unreadable memory We don't expect frags with unreadable memory to be presented to XDP programs today, but the XDP helpers are designed to be usable whether XDP is enabled or not. Support handling frags with unreadable memory. Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250905221539.2930285-3-kuba@kernel.org Acked-by: Stanislav Fomichev Reviewed-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/net/xdp.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 976cfd2f113c..6fd294fa6841 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -76,6 +76,11 @@ enum xdp_buff_flags { XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ + /* frags have unreadable mem, this can't be true for real XDP packets, + * but drivers may use XDP helpers to construct Rx pkt state even when + * XDP program is not attached. + */ + XDP_FLAGS_FRAGS_UNREADABLE = BIT(2), }; struct xdp_buff { @@ -121,6 +126,11 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } +static __always_inline void xdp_buff_set_frag_unreadable(struct xdp_buff *xdp) +{ + xdp->flags |= XDP_FLAGS_FRAGS_UNREADABLE; +} + static __always_inline u32 xdp_buff_get_skb_flags(const struct xdp_buff *xdp) { return xdp->flags; @@ -270,6 +280,8 @@ static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, if (unlikely(netmem_is_pfmemalloc(netmem))) xdp_buff_set_frag_pfmemalloc(xdp); + if (unlikely(netmem_is_net_iov(netmem))) + xdp_buff_set_frag_unreadable(xdp); return true; } @@ -350,6 +362,7 @@ xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags, skb->data_len += size; skb->truesize += truesize; skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + skb->unreadable |= !!(xdp_flags & XDP_FLAGS_FRAGS_UNREADABLE); } /* Avoids inlining WARN macro in fast-path */ -- cgit v1.2.3 From 0d3c4a441686663ad34aa3d6abe8c5317d21e707 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 8 Sep 2025 10:32:32 +0300 Subject: ipv4: icmp: Pass IPv4 control block structure as an argument to __icmp_send() __icmp_send() is used to generate ICMP error messages in response to various situations such as MTU errors (i.e., "Fragmentation Required") and too many hops (i.e., "Time Exceeded"). The skb that generated the error does not necessarily come from the IPv4 layer and does not always have a valid IPv4 control block in skb->cb. Therefore, commit 9ef6b42ad6fd ("net: Add __icmp_send helper.") changed the function to take the IP options structure as argument instead of deriving it from the skb's control block. Some callers of this function such as icmp_send() pass the IP options structure from the skb's control block as in these call paths the control block is known to be valid, but other callers simply pass a zeroed structure. A subsequent patch will need __icmp_send() to access more information from the IPv4 control block (specifically, the ifindex of the input interface). As a preparation for this change, change the function to take the IPv4 control block structure as an argument instead of the IP options structure. This makes the function similar to its IPv6 counterpart that already takes the IPv6 control block structure as an argument. No functional changes intended. Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250908073238.119240-3-idosch@nvidia.com Signed-off-by: Paolo Abeni --- include/net/icmp.h | 10 ++++++---- net/ipv4/cipso_ipv4.c | 12 ++++++------ net/ipv4/icmp.c | 12 +++++++----- net/ipv4/route.c | 10 +++++----- 4 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/icmp.h b/include/net/icmp.h index caddf4a59ad1..935ee13d9ae9 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -37,10 +37,10 @@ struct sk_buff; struct net; void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, - const struct ip_options *opt); + const struct inet_skb_parm *parm); static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { - __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt); + __icmp_send(skb_in, type, code, info, IPCB(skb_in)); } #if IS_ENABLED(CONFIG_NF_NAT) @@ -48,8 +48,10 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); #else static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { - struct ip_options opts = { 0 }; - __icmp_send(skb_in, type, code, info, &opts); + struct inet_skb_parm parm; + + memset(&parm, 0, sizeof(parm)); + __icmp_send(skb_in, type, code, info, &parm); } #endif diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index c7c949c37e2d..709021197e1c 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1715,7 +1715,7 @@ validate_return: */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { - struct ip_options opt; + struct inet_skb_parm parm; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) @@ -1726,19 +1726,19 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) * so we can not use icmp_send and IPCB here. */ - memset(&opt, 0, sizeof(opt)); - opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); + memset(&parm, 0, sizeof(parm)); + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm); else - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm); } /** diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 863bf5023f2a..59fd0e1993a6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -594,7 +594,7 @@ relookup_failed: */ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, - const struct ip_options *opt) + const struct inet_skb_parm *parm) { struct iphdr *iph; int room; @@ -725,7 +725,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) + if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, + &parm->opt)) goto out_unlock; @@ -799,15 +800,16 @@ EXPORT_SYMBOL(__icmp_send); void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct sk_buff *cloned_skb = NULL; - struct ip_options opts = { 0 }; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; + struct inet_skb_parm parm; struct nf_conn *ct; __be32 orig_ip; + memset(&parm, 0, sizeof(parm)); ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { - __icmp_send(skb_in, type, code, info, &opts); + __icmp_send(skb_in, type, code, info, &parm); return; } @@ -823,7 +825,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) orig_ip = ip_hdr(skb_in)->saddr; dir = CTINFO2DIR(ctinfo); ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip; - __icmp_send(skb_in, type, code, info, &opts); + __icmp_send(skb_in, type, code, info, &parm); ip_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 50309f2ab132..6d27d3610c1c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1222,8 +1222,8 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { + struct inet_skb_parm parm; struct net_device *dev; - struct ip_options opt; int res; /* Recompile ip options since IPCB may not be valid anymore. @@ -1233,21 +1233,21 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb) ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; - memset(&opt, 0, sizeof(opt)); + memset(&parm, 0, sizeof(parm)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; - opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; - res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); + res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; } - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm); } static void ipv4_link_failure(struct sk_buff *skb) -- cgit v1.2.3 From 002ebddd695a53999550e241b71950f1aa0e1ac4 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:20 +0200 Subject: pmdomain: core: Restore behaviour for disabling unused PM domains Recent changes to genpd prevents those PM domains being powered-on during initialization from being powered-off during the boot sequence. Based upon whether CONFIG_PM_CONFIG_PM_GENERIC_DOMAINS_OF is set of not, genpd relies on the sync_state mechanism or the genpd_power_off_unused() (which is a late_initcall_sync), to understand when it's okay to allow these PM domains to be powered-off. This new behaviour in genpd has lead to problems on different platforms. Let's therefore restore the behavior of genpd_power_off_unused(). Moreover, let's introduce GENPD_FLAG_NO_STAY_ON, to allow genpd OF providers to opt-out from the new behaviour. Link: https://lore.kernel.org/all/20250701114733.636510-1-ulf.hansson@linaro.org/ Reported-by: Geert Uytterhoeven Link: https://lore.kernel.org/all/20250902-rk3576-lockup-regression-v1-1-c4a0c9daeb00@collabora.com/ Reported-by: Nicolas Frattaroli Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Tested-by: Heiko Stuebner Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 20 ++++++++++++++------ include/linux/pm_domain.h | 7 +++++++ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index 0006ab3d0789..61c2277c9ce3 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -187,6 +187,7 @@ static const struct genpd_lock_ops genpd_raw_spin_ops = { #define genpd_is_opp_table_fw(genpd) (genpd->flags & GENPD_FLAG_OPP_TABLE_FW) #define genpd_is_dev_name_fw(genpd) (genpd->flags & GENPD_FLAG_DEV_NAME_FW) #define genpd_is_no_sync_state(genpd) (genpd->flags & GENPD_FLAG_NO_SYNC_STATE) +#define genpd_is_no_stay_on(genpd) (genpd->flags & GENPD_FLAG_NO_STAY_ON) static inline bool irq_safe_dev_in_sleep_domain(struct device *dev, const struct generic_pm_domain *genpd) @@ -1357,7 +1358,6 @@ err_poweroff: return ret; } -#ifndef CONFIG_PM_GENERIC_DOMAINS_OF static bool pd_ignore_unused; static int __init pd_ignore_unused_setup(char *__unused) { @@ -1382,9 +1382,6 @@ static int __init genpd_power_off_unused(void) mutex_lock(&gpd_list_lock); list_for_each_entry(genpd, &gpd_list, gpd_list_node) { - genpd_lock(genpd); - genpd->stay_on = false; - genpd_unlock(genpd); genpd_queue_power_off_work(genpd); } @@ -1393,7 +1390,6 @@ static int __init genpd_power_off_unused(void) return 0; } late_initcall_sync(genpd_power_off_unused); -#endif #ifdef CONFIG_PM_SLEEP @@ -2367,6 +2363,18 @@ static void genpd_lock_init(struct generic_pm_domain *genpd) } } +#ifdef CONFIG_PM_GENERIC_DOMAINS_OF +static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off) +{ + genpd->stay_on = !genpd_is_no_stay_on(genpd) && !is_off; +} +#else +static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off) +{ + genpd->stay_on = false; +} +#endif + /** * pm_genpd_init - Initialize a generic I/O PM domain object. * @genpd: PM domain object to initialize. @@ -2392,7 +2400,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd, INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); atomic_set(&genpd->sd_count, 0); genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON; - genpd->stay_on = !is_off; + genpd_set_stay_on(genpd, is_off); genpd->sync_state = GENPD_SYNC_STATE_OFF; genpd->device_count = 0; genpd->provider = NULL; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index c84edf217819..f67a2cb7d781 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -115,6 +115,12 @@ struct dev_pm_domain_list { * genpd provider specific way, likely through a * parent device node. This flag makes genpd to * skip its internal support for this. + * + * GENPD_FLAG_NO_STAY_ON: For genpd OF providers a powered-on PM domain at + * initialization is prevented from being + * powered-off until the ->sync_state() callback is + * invoked. This flag informs genpd to allow a + * power-off without waiting for ->sync_state(). */ #define GENPD_FLAG_PM_CLK (1U << 0) #define GENPD_FLAG_IRQ_SAFE (1U << 1) @@ -126,6 +132,7 @@ struct dev_pm_domain_list { #define GENPD_FLAG_OPP_TABLE_FW (1U << 7) #define GENPD_FLAG_DEV_NAME_FW (1U << 8) #define GENPD_FLAG_NO_SYNC_STATE (1U << 9) +#define GENPD_FLAG_NO_STAY_ON (1U << 10) enum gpd_status { GENPD_STATE_ON = 0, /* PM domain is on */ -- cgit v1.2.3 From f0addd325ef692c92c522a2ba4d9db13fc90e664 Mon Sep 17 00:00:00 2001 From: Alexander Kurz Date: Mon, 11 Aug 2025 06:43:58 +0000 Subject: mfd: input: rtc: mc13783: Remove deprecated mc13xxx_irq_ack() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mc13xxx_irq_ack() got deprecated and became dead code with commit 10f9edaeaa30 ("mfd: mc13xxx: Use regmap irq framework for interrupts"). It should be safe to remove it now. Signed-off-by: Alexander Kurz Acked-by: Alexandre Belloni Acked-by: Uwe Kleine-König Acked-by: Dmitry Torokhov # for input Link: https://lore.kernel.org/r/20250811064358.1659-1-akurz@blala.de Signed-off-by: Lee Jones --- drivers/input/misc/mc13783-pwrbutton.c | 1 - drivers/input/touchscreen/mc13783_ts.c | 4 ---- drivers/rtc/rtc-mc13xxx.c | 13 ------------- include/linux/mfd/mc13xxx.h | 6 ------ 4 files changed, 24 deletions(-) (limited to 'include') diff --git a/drivers/input/misc/mc13783-pwrbutton.c b/drivers/input/misc/mc13783-pwrbutton.c index 1c7faa9b7afe..b83d762ae2e9 100644 --- a/drivers/input/misc/mc13783-pwrbutton.c +++ b/drivers/input/misc/mc13783-pwrbutton.c @@ -57,7 +57,6 @@ static irqreturn_t button_irq(int irq, void *_priv) struct mc13783_pwrb *priv = _priv; int val; - mc13xxx_irq_ack(priv->mc13783, irq); mc13xxx_reg_read(priv->mc13783, MC13783_REG_INTERRUPT_SENSE_1, &val); switch (irq) { diff --git a/drivers/input/touchscreen/mc13783_ts.c b/drivers/input/touchscreen/mc13783_ts.c index 33635da85079..47b8da00027f 100644 --- a/drivers/input/touchscreen/mc13783_ts.c +++ b/drivers/input/touchscreen/mc13783_ts.c @@ -42,8 +42,6 @@ static irqreturn_t mc13783_ts_handler(int irq, void *data) { struct mc13783_ts_priv *priv = data; - mc13xxx_irq_ack(priv->mc13xxx, irq); - /* * Kick off reading coordinates. Note that if work happens already * be queued for future execution (it rearms itself) it will not @@ -137,8 +135,6 @@ static int mc13783_ts_open(struct input_dev *dev) mc13xxx_lock(priv->mc13xxx); - mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_TS); - ret = mc13xxx_irq_request(priv->mc13xxx, MC13XXX_IRQ_TS, mc13783_ts_handler, MC13783_TS_NAME, priv); if (ret) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index e7b87130e624..2494d13fd767 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -137,10 +137,6 @@ static int mc13xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) } if (!priv->valid) { - ret = mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_RTCRST); - if (unlikely(ret)) - goto out; - ret = mc13xxx_irq_unmask(priv->mc13xxx, MC13XXX_IRQ_RTCRST); } @@ -208,10 +204,6 @@ static int mc13xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) if (unlikely(ret)) goto out; - ret = mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_TODA); - if (unlikely(ret)) - goto out; - s1970 = rtc_tm_to_time64(&alarm->time); dev_dbg(dev, "%s: %s %lld\n", __func__, alarm->enabled ? "on" : "off", @@ -239,12 +231,9 @@ out: static irqreturn_t mc13xxx_rtc_alarm_handler(int irq, void *dev) { struct mc13xxx_rtc *priv = dev; - struct mc13xxx *mc13xxx = priv->mc13xxx; rtc_update_irq(priv->rtc, 1, RTC_IRQF | RTC_AF); - mc13xxx_irq_ack(mc13xxx, irq); - return IRQ_HANDLED; } @@ -293,8 +282,6 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) mc13xxx_lock(mc13xxx); - mc13xxx_irq_ack(mc13xxx, MC13XXX_IRQ_RTCRST); - ret = mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_RTCRST, mc13xxx_rtc_reset_handler, DRIVER_NAME, priv); if (ret) diff --git a/include/linux/mfd/mc13xxx.h b/include/linux/mfd/mc13xxx.h index f372926d5894..dd46fe424a80 100644 --- a/include/linux/mfd/mc13xxx.h +++ b/include/linux/mfd/mc13xxx.h @@ -31,12 +31,6 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode, unsigned int channel, u8 ato, bool atox, unsigned int *sample); -/* Deprecated calls */ -static inline int mc13xxx_irq_ack(struct mc13xxx *mc13xxx, int irq) -{ - return 0; -} - static inline int mc13xxx_irq_request_nounmask(struct mc13xxx *mc13xxx, int irq, irq_handler_t handler, const char *name, void *dev) -- cgit v1.2.3 From cbd2257dc96e3e46217540fcb095a757ffa20d96 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 2 Sep 2025 13:28:08 +0200 Subject: netfilter: nft_meta_bridge: introduce NFT_META_BRI_IIFHWADDR support Expose the input bridge interface ethernet address so it can be used to redirect the packet to the receiving physical device for processing. Tested with nft command line tool. table bridge nat { chain PREROUTING { type filter hook prerouting priority 0; policy accept; ether daddr de:ad:00:00:be:ef meta pkttype set host ether daddr set meta ibrhwdr accept } } Joint work with Pablo Neira. Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/bridge/netfilter/nft_meta_bridge.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 8e0eb832bc01..7c0c915f0306 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -959,6 +959,7 @@ enum nft_exthdr_attributes { * @NFT_META_SDIF: slave device interface index * @NFT_META_SDIFNAME: slave device interface name * @NFT_META_BRI_BROUTE: packet br_netfilter_broute bit + * @NFT_META_BRI_IIFHWADDR: packet input bridge interface ethernet address */ enum nft_meta_keys { NFT_META_LEN, @@ -999,6 +1000,7 @@ enum nft_meta_keys { NFT_META_SDIFNAME, NFT_META_BRI_BROUTE, __NFT_META_IIFTYPE, + NFT_META_BRI_IIFHWADDR, }; /** diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 5adced1e7d0c..b7af36bbd306 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -59,6 +59,13 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, nft_reg_store_be16(dest, htons(p_proto)); return; } + case NFT_META_BRI_IIFHWADDR: + br_dev = nft_meta_get_bridge(in); + if (!br_dev) + goto err; + + memcpy(dest, br_dev->dev_addr, ETH_ALEN); + return; default: return nft_meta_get_eval(expr, regs, pkt); } @@ -86,6 +93,9 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, case NFT_META_BRI_IIFVPROTO: len = sizeof(u16); break; + case NFT_META_BRI_IIFHWADDR: + len = ETH_ALEN; + break; default: return nft_meta_get_init(ctx, expr, tb); } @@ -175,6 +185,7 @@ static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, switch (priv->key) { case NFT_META_BRI_BROUTE: + case NFT_META_BRI_IIFHWADDR: hooks = 1 << NF_BR_PRE_ROUTING; break; default: -- cgit v1.2.3 From b8cf8fda522d5a37f8948ad8a19a1113cc38710f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 9 Sep 2025 16:30:47 +0200 Subject: fanotify: add watchdog for permission events This is to make it easier to debug issues with AV software, which time and again deadlocks with no indication of where the issue comes from, and the kernel being blamed for the deadlock. Then we need to analyze dumps to prove that the kernel is not in fact at fault. The deadlock comes from recursion: handling the event triggers another permission event, in some roundabout way, obviously, otherwise it would have been found in testing. With this patch a warning is printed when permission event is received by userspace but not answered for more than the timeout specified in /proc/sys/fs/fanotify/watchdog_timeout. The watchdog can be turned off by setting the timeout to zero (which is the default). The timeout is very coarse (T <= t < 2T) but I guess it's good enough for the purpose. Overhead should be minimal. Signed-off-by: Miklos Szeredi Reviewed-by: Amir Goldstein Link: https://patch.msgid.link/20250909143053.112171-1-mszeredi@redhat.com Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.h | 2 + fs/notify/fanotify/fanotify_user.c | 102 +++++++++++++++++++++++++++++++++++++ include/linux/fsnotify_backend.h | 2 + 3 files changed, 106 insertions(+) (limited to 'include') diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index b78308975082..39e60218df7c 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -441,7 +441,9 @@ struct fanotify_perm_event { size_t count; u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ + unsigned short watchdog_cnt; /* already scanned by watchdog? */ int fd; /* fd we passed to userspace for this event */ + pid_t recv_pid; /* pid of task receiving the event */ union { struct fanotify_response_info_header hdr; struct fanotify_response_info_audit_rule audit_rule; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 561339b4cf75..1dadda82cae5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -50,6 +50,7 @@ /* configurable via /proc/sys/fs/fanotify/ */ static int fanotify_max_queued_events __read_mostly; +static int perm_group_timeout __read_mostly; #ifdef CONFIG_SYSCTL @@ -85,6 +86,14 @@ static const struct ctl_table fanotify_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, + { + .procname = "watchdog_timeout", + .data = &perm_group_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, }; static void __init fanotify_sysctls_init(void) @@ -95,6 +104,91 @@ static void __init fanotify_sysctls_init(void) #define fanotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ +static LIST_HEAD(perm_group_list); +static DEFINE_SPINLOCK(perm_group_lock); +static void perm_group_watchdog(struct work_struct *work); +static DECLARE_DELAYED_WORK(perm_group_work, perm_group_watchdog); + +static void perm_group_watchdog_schedule(void) +{ + schedule_delayed_work(&perm_group_work, secs_to_jiffies(perm_group_timeout)); +} + +static void perm_group_watchdog(struct work_struct *work) +{ + struct fsnotify_group *group; + struct fanotify_perm_event *event; + struct task_struct *task; + pid_t failed_pid = 0; + + guard(spinlock)(&perm_group_lock); + if (list_empty(&perm_group_list)) + return; + + list_for_each_entry(group, &perm_group_list, + fanotify_data.perm_grp_list) { + /* + * Ok to test without lock, racing with an addition is + * fine, will deal with it next round + */ + if (list_empty(&group->fanotify_data.access_list)) + continue; + + spin_lock(&group->notification_lock); + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (likely(event->watchdog_cnt == 0)) { + event->watchdog_cnt = 1; + } else if (event->watchdog_cnt == 1) { + /* Report on event only once */ + event->watchdog_cnt = 2; + + /* Do not report same pid repeatedly */ + if (event->recv_pid == failed_pid) + continue; + + failed_pid = event->recv_pid; + rcu_read_lock(); + task = find_task_by_pid_ns(event->recv_pid, + &init_pid_ns); + pr_warn_ratelimited( + "PID %u (%s) failed to respond to fanotify queue for more than %d seconds\n", + event->recv_pid, + task ? task->comm : NULL, + perm_group_timeout); + rcu_read_unlock(); + } + } + spin_unlock(&group->notification_lock); + } + perm_group_watchdog_schedule(); +} + +static void fanotify_perm_watchdog_group_remove(struct fsnotify_group *group) +{ + if (!list_empty(&group->fanotify_data.perm_grp_list)) { + /* Perm event watchdog can no longer scan this group. */ + spin_lock(&perm_group_lock); + list_del_init(&group->fanotify_data.perm_grp_list); + spin_unlock(&perm_group_lock); + } +} + +static void fanotify_perm_watchdog_group_add(struct fsnotify_group *group) +{ + if (!perm_group_timeout) + return; + + spin_lock(&perm_group_lock); + if (list_empty(&group->fanotify_data.perm_grp_list)) { + /* Add to perm_group_list for monitoring by watchdog. */ + if (list_empty(&perm_group_list)) + perm_group_watchdog_schedule(); + list_add_tail(&group->fanotify_data.perm_grp_list, &perm_group_list); + } + spin_unlock(&perm_group_lock); +} + /* * All flags that may be specified in parameter event_f_flags of fanotify_init. * @@ -953,6 +1047,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, spin_lock(&group->notification_lock); list_add_tail(&event->fse.list, &group->fanotify_data.access_list); + FANOTIFY_PERM(event)->recv_pid = current->pid; spin_unlock(&group->notification_lock); } } @@ -1012,6 +1107,8 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ fsnotify_group_stop_queueing(group); + fanotify_perm_watchdog_group_remove(group); + /* * Process all permission events on access_list and notification queue * and simulate reply from userspace. @@ -1465,6 +1562,10 @@ out: fsnotify_group_unlock(group); fsnotify_put_mark(fsn_mark); + + if (!ret && (mask & FANOTIFY_PERM_EVENTS)) + fanotify_perm_watchdog_group_add(group); + return ret; } @@ -1625,6 +1726,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); + INIT_LIST_HEAD(&group->fanotify_data.perm_grp_list); switch (class) { case FAN_CLASS_NOTIF: group->priority = FSNOTIFY_PRIO_NORMAL; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index d4034ddaf392..0d954ea7b179 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -273,6 +273,8 @@ struct fsnotify_group { int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; mempool_t error_events_pool; + /* chained on perm_group_list */ + struct list_head perm_grp_list; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; -- cgit v1.2.3 From 3b1bbfb5fce3ca9fffc92ac1b053b0cfbb1f322b Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Thu, 4 Sep 2025 11:05:27 -0500 Subject: mfd: bq257xx: Add support for BQ25703A core driver The Texas Instruments BQ25703A is an integrated charger manager and boost converter. The MFD driver initializes the device for the regulator driver and power supply driver. Signed-off-by: Chris Morgan Link: https://lore.kernel.org/r/20250904160530.66178-3-macroalpha82@gmail.com Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 11 +++++ drivers/mfd/Makefile | 1 + drivers/mfd/bq257xx.c | 99 +++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/bq257xx.h | 104 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 215 insertions(+) create mode 100644 drivers/mfd/bq257xx.c create mode 100644 include/linux/mfd/bq257xx.h (limited to 'include') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1..768417c97339 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1641,6 +1641,17 @@ config MFD_TI_LMU LM36274. It consists of backlight, LED and regulator driver. It provides consistent device controls for lighting functions. +config MFD_BQ257XX + tristate "TI BQ257XX Buck/Boost Charge Controller" + depends on I2C + select MFD_CORE + select REGMAP_I2C + help + Support Texas Instruments BQ25703 Buck/Boost converter with + charge controller. It consists of regulators that provide + system voltage and OTG voltage, and a charger manager for + batteries containing one or more cells. + config MFD_OMAP_USB_HOST bool "TI OMAP USBHS core and TLL driver" depends on USB_EHCI_HCD_OMAP || USB_OHCI_HCD_OMAP3 diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d..3d700374a42d 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_MFD_SM501) += sm501.o obj-$(CONFIG_ARCH_BCM2835) += bcm2835-pm.o obj-$(CONFIG_MFD_BCM590XX) += bcm590xx.o obj-$(CONFIG_MFD_BD9571MWV) += bd9571mwv.o +obj-$(CONFIG_MFD_BQ257XX) += bq257xx.o obj-$(CONFIG_MFD_CGBC) += cgbc-core.o obj-$(CONFIG_MFD_CROS_EC_DEV) += cros_ec_dev.o obj-$(CONFIG_MFD_CS42L43) += cs42l43.o diff --git a/drivers/mfd/bq257xx.c b/drivers/mfd/bq257xx.c new file mode 100644 index 000000000000..e9d49dac0a16 --- /dev/null +++ b/drivers/mfd/bq257xx.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BQ257XX Core Driver + * Copyright (C) 2025 Chris Morgan + */ + +#include +#include +#include +#include +#include + +static const struct regmap_range bq25703_readonly_reg_ranges[] = { + regmap_reg_range(BQ25703_CHARGER_STATUS, BQ25703_MANUFACT_DEV_ID), +}; + +static const struct regmap_access_table bq25703_writeable_regs = { + .no_ranges = bq25703_readonly_reg_ranges, + .n_no_ranges = ARRAY_SIZE(bq25703_readonly_reg_ranges), +}; + +static const struct regmap_range bq25703_volatile_reg_ranges[] = { + regmap_reg_range(BQ25703_CHARGE_OPTION_0, BQ25703_IIN_HOST), + regmap_reg_range(BQ25703_CHARGER_STATUS, BQ25703_ADC_OPTION), +}; + +static const struct regmap_access_table bq25703_volatile_regs = { + .yes_ranges = bq25703_volatile_reg_ranges, + .n_yes_ranges = ARRAY_SIZE(bq25703_volatile_reg_ranges), +}; + +static const struct regmap_config bq25703_regmap_config = { + .reg_bits = 8, + .val_bits = 16, + .max_register = BQ25703_ADC_OPTION, + .cache_type = REGCACHE_MAPLE, + .wr_table = &bq25703_writeable_regs, + .volatile_table = &bq25703_volatile_regs, + .val_format_endian = REGMAP_ENDIAN_LITTLE, +}; + +static const struct mfd_cell cells[] = { + MFD_CELL_NAME("bq257xx-regulator"), + MFD_CELL_NAME("bq257xx-charger"), +}; + +static int bq257xx_probe(struct i2c_client *client) +{ + struct bq257xx_device *ddata; + int ret; + + ddata = devm_kzalloc(&client->dev, sizeof(*ddata), GFP_KERNEL); + if (!ddata) + return -ENOMEM; + + ddata->client = client; + + ddata->regmap = devm_regmap_init_i2c(client, &bq25703_regmap_config); + if (IS_ERR(ddata->regmap)) { + return dev_err_probe(&client->dev, PTR_ERR(ddata->regmap), + "Failed to allocate register map\n"); + } + + i2c_set_clientdata(client, ddata); + + ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_AUTO, + cells, ARRAY_SIZE(cells), NULL, 0, NULL); + if (ret) + return dev_err_probe(&client->dev, ret, + "Failed to register child devices\n"); + + return ret; +} + +static const struct i2c_device_id bq257xx_i2c_ids[] = { + { "bq25703a" }, + {} +}; +MODULE_DEVICE_TABLE(i2c, bq257xx_i2c_ids); + +static const struct of_device_id bq257xx_of_match[] = { + { .compatible = "ti,bq25703a" }, + {} +}; +MODULE_DEVICE_TABLE(of, bq257xx_of_match); + +static struct i2c_driver bq257xx_driver = { + .driver = { + .name = "bq257xx", + .of_match_table = bq257xx_of_match, + }, + .probe = bq257xx_probe, + .id_table = bq257xx_i2c_ids, +}; +module_i2c_driver(bq257xx_driver); + +MODULE_DESCRIPTION("bq257xx buck/boost/charger driver"); +MODULE_AUTHOR("Chris Morgan "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/bq257xx.h b/include/linux/mfd/bq257xx.h new file mode 100644 index 000000000000..1d6ddc7fb09f --- /dev/null +++ b/include/linux/mfd/bq257xx.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Register definitions for TI BQ257XX + * Copyright (C) 2020 Texas Instruments Incorporated - http://www.ti.com/ + */ + +#define BQ25703_CHARGE_OPTION_0 0x00 +#define BQ25703_CHARGE_CURRENT 0x02 +#define BQ25703_MAX_CHARGE_VOLT 0x04 +#define BQ25703_OTG_VOLT 0x06 +#define BQ25703_OTG_CURRENT 0x08 +#define BQ25703_INPUT_VOLTAGE 0x0a +#define BQ25703_MIN_VSYS 0x0c +#define BQ25703_IIN_HOST 0x0e +#define BQ25703_CHARGER_STATUS 0x20 +#define BQ25703_PROCHOT_STATUS 0x22 +#define BQ25703_IIN_DPM 0x24 +#define BQ25703_ADCIBAT_CHG 0x28 +#define BQ25703_ADCIINCMPIN 0x2a +#define BQ25703_ADCVSYSVBAT 0x2c +#define BQ25703_MANUFACT_DEV_ID 0x2e +#define BQ25703_CHARGE_OPTION_1 0x30 +#define BQ25703_CHARGE_OPTION_2 0x32 +#define BQ25703_CHARGE_OPTION_3 0x34 +#define BQ25703_ADC_OPTION 0x3a + +#define BQ25703_EN_LWPWR BIT(15) +#define BQ25703_WDTMR_ADJ_MASK GENMASK(14, 13) +#define BQ25703_WDTMR_DISABLE 0 +#define BQ25703_WDTMR_5_SEC 1 +#define BQ25703_WDTMR_88_SEC 2 +#define BQ25703_WDTMR_175_SEC 3 + +#define BQ25703_ICHG_MASK GENMASK(12, 6) +#define BQ25703_ICHG_STEP_UA 64000 +#define BQ25703_ICHG_MIN_UA 64000 +#define BQ25703_ICHG_MAX_UA 8128000 + +#define BQ25703_MAX_CHARGE_VOLT_MASK GENMASK(15, 4) +#define BQ25703_VBATREG_STEP_UV 16000 +#define BQ25703_VBATREG_MIN_UV 1024000 +#define BQ25703_VBATREG_MAX_UV 19200000 + +#define BQ25703_OTG_VOLT_MASK GENMASK(13, 6) +#define BQ25703_OTG_VOLT_STEP_UV 64000 +#define BQ25703_OTG_VOLT_MIN_UV 4480000 +#define BQ25703_OTG_VOLT_MAX_UV 20800000 +#define BQ25703_OTG_VOLT_NUM_VOLT 256 + +#define BQ25703_OTG_CUR_MASK GENMASK(14, 8) +#define BQ25703_OTG_CUR_STEP_UA 50000 +#define BQ25703_OTG_CUR_MAX_UA 6350000 + +#define BQ25703_MINVSYS_MASK GENMASK(13, 8) +#define BQ25703_MINVSYS_STEP_UV 256000 +#define BQ25703_MINVSYS_MIN_UV 1024000 +#define BQ25703_MINVSYS_MAX_UV 16128000 + +#define BQ25703_STS_AC_STAT BIT(15) +#define BQ25703_STS_IN_FCHRG BIT(10) +#define BQ25703_STS_IN_PCHRG BIT(9) +#define BQ25703_STS_FAULT_ACOV BIT(7) +#define BQ25703_STS_FAULT_BATOC BIT(6) +#define BQ25703_STS_FAULT_ACOC BIT(5) + +#define BQ25703_IINDPM_MASK GENMASK(14, 8) +#define BQ25703_IINDPM_STEP_UA 50000 +#define BQ25703_IINDPM_MIN_UA 50000 +#define BQ25703_IINDPM_MAX_UA 6400000 +#define BQ25703_IINDPM_DEFAULT_UA 3300000 +#define BQ25703_IINDPM_OFFSET_UA 50000 + +#define BQ25703_ADCIBAT_DISCHG_MASK GENMASK(6, 0) +#define BQ25703_ADCIBAT_CHG_MASK GENMASK(14, 8) +#define BQ25703_ADCIBAT_CHG_STEP_UA 64000 +#define BQ25703_ADCIBAT_DIS_STEP_UA 256000 + +#define BQ25703_ADCIIN GENMASK(15, 8) +#define BQ25703_ADCIINCMPIN_STEP 50000 + +#define BQ25703_ADCVSYS_MASK GENMASK(15, 8) +#define BQ25703_ADCVBAT_MASK GENMASK(7, 0) +#define BQ25703_ADCVSYSVBAT_OFFSET_UV 2880000 +#define BQ25703_ADCVSYSVBAT_STEP 64000 + +#define BQ25703_ADC_CH_MASK GENMASK(7, 0) +#define BQ25703_ADC_CONV_EN BIT(15) +#define BQ25703_ADC_START BIT(14) +#define BQ25703_ADC_FULL_SCALE BIT(13) +#define BQ25703_ADC_CMPIN_EN BIT(7) +#define BQ25703_ADC_VBUS_EN BIT(6) +#define BQ25703_ADC_PSYS_EN BIT(5) +#define BQ25703_ADC_IIN_EN BIT(4) +#define BQ25703_ADC_IDCHG_EN BIT(3) +#define BQ25703_ADC_ICHG_EN BIT(2) +#define BQ25703_ADC_VSYS_EN BIT(1) +#define BQ25703_ADC_VBAT_EN BIT(0) + +#define BQ25703_EN_OTG_MASK BIT(12) + +struct bq257xx_device { + struct i2c_client *client; + struct regmap *regmap; +}; -- cgit v1.2.3 From 948cb194bcb4c01fb4cd029936f0c02b10780394 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Jul 2025 15:59:24 +0200 Subject: mtd: map: add back asm/barrier.h inclusion The mb() macro is used in this header: In file included from include/linux/mtd/qinfo.h:5, from include/linux/mtd/pfow.h:8, from drivers/mtd/lpddr/lpddr_cmds.c:14: include/linux/mtd/map.h: In function 'inline_map_write': include/linux/mtd/map.h:428:9: error: implicit declaration of function 'mb' [-Wimplicit-function-declaration] Fixes: 56eb7c13b97c ("mtd: map: Don't use "proxy" headers") Signed-off-by: Arnd Bergmann Acked-by: Andy Shevchenko Signed-off-by: Miquel Raynal --- include/linux/mtd/map.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h index 288ef765a44e..75b0b2abc880 100644 --- a/include/linux/mtd/map.h +++ b/include/linux/mtd/map.h @@ -14,6 +14,7 @@ #include #include #include +#include struct device_node; struct module; -- cgit v1.2.3 From fc02f529a8dbf617f6d211cb693f56a842b6dbe5 Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Sat, 6 Sep 2025 16:53:23 +0300 Subject: dt-bindings: clock: tegra30: Add IDs for CSI pad clocks Tegra30 has CSI pad clock enable bits embedded into PLLD/PLLD2 registers. Add ids for these clocks. Additionally, move TEGRA30_CLK_CLK_MAX into clk-tegra30 source. Signed-off-by: Svyatoslav Ryhel Acked-by: Krzysztof Kozlowski Signed-off-by: Thierry Reding --- drivers/clk/tegra/clk-tegra30.c | 1 + include/dt-bindings/clock/tegra30-car.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/clk/tegra/clk-tegra30.c b/drivers/clk/tegra/clk-tegra30.c index 82a8cb9545eb..e7ebb63970d3 100644 --- a/drivers/clk/tegra/clk-tegra30.c +++ b/drivers/clk/tegra/clk-tegra30.c @@ -53,6 +53,7 @@ #define SYSTEM_CLK_RATE 0x030 #define TEGRA30_CLK_PERIPH_BANKS 5 +#define TEGRA30_CLK_CLK_MAX 311 #define PLLC_BASE 0x80 #define PLLC_MISC 0x8c diff --git a/include/dt-bindings/clock/tegra30-car.h b/include/dt-bindings/clock/tegra30-car.h index f193663e6f28..763b81f80908 100644 --- a/include/dt-bindings/clock/tegra30-car.h +++ b/include/dt-bindings/clock/tegra30-car.h @@ -271,6 +271,7 @@ #define TEGRA30_CLK_AUDIO3_MUX 306 #define TEGRA30_CLK_AUDIO4_MUX 307 #define TEGRA30_CLK_SPDIF_MUX 308 -#define TEGRA30_CLK_CLK_MAX 309 +#define TEGRA30_CLK_CSIA_PAD 309 +#define TEGRA30_CLK_CSIB_PAD 310 #endif /* _DT_BINDINGS_CLOCK_TEGRA30_CAR_H */ -- cgit v1.2.3 From 7526e6db4703d0fe81b5397939c2aefd5fe8d9bc Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Fri, 29 Aug 2025 15:22:31 +0300 Subject: dt-bindings: reset: Add Tegra114 CAR header The way that resets are handled on these Tegra devices is that there is a set of peripheral clocks & resets which are paired up. This is because they are laid out in banks within the CAR (clock and reset) controller. In most cases we're referring to those resets, so you'll often see a clock ID used in conjection with the same reset ID for a given IP block. In addition to those peripheral resets, there are a number of extra resets that don't have a corresponding clock and which are exposed in registers outside of the peripheral banks, but still part of the CAR. To support those "special" registers, the TEGRA*_RESET() is used to denote resets outside of the regular peripheral resets. Essentially it defines the offset within the CAR at which special resets start. In the above case, Tegra114 has 5 banks with 32 peripheral resets each. The first special reset, TEGRA114_RESET(0), therefore gets ID 5 * 32 + 0 = 160. Signed-off-by: Svyatoslav Ryhel Reviewed-by: Mikko Perttunen Acked-by: Krzysztof Kozlowski Signed-off-by: Thierry Reding --- include/dt-bindings/reset/nvidia,tegra114-car.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 include/dt-bindings/reset/nvidia,tegra114-car.h (limited to 'include') diff --git a/include/dt-bindings/reset/nvidia,tegra114-car.h b/include/dt-bindings/reset/nvidia,tegra114-car.h new file mode 100644 index 000000000000..9b8c320402db --- /dev/null +++ b/include/dt-bindings/reset/nvidia,tegra114-car.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +/* + * This header provides Tegra114-specific constants for binding + * nvidia,tegra114-car. + */ + +#ifndef _DT_BINDINGS_RESET_NVIDIA_TEGRA114_CAR_H +#define _DT_BINDINGS_RESET_NVIDIA_TEGRA114_CAR_H + +#define TEGRA114_RESET(x) (5 * 32 + (x)) +#define TEGRA114_RST_DFLL_DVCO TEGRA114_RESET(0) + +#endif /* _DT_BINDINGS_RESET_NVIDIA_TEGRA114_CAR_H */ -- cgit v1.2.3 From 67a529b7d3c50a56c162476509361f4fe11350dd Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 15 Aug 2025 12:40:02 -0500 Subject: include: adi-axi-common: add version check function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a version check function for checking ADI AXI IP core versions. These cores use a semantic versioning scheme, so it is useful to have a version check function that can check the minor version to enable features in driver while maintaining backward compatibility. Signed-off-by: David Lechner Reviewed-by: Nuno Sá Link: https://patch.msgid.link/20250815-spi-axi-spi-enigne-improve-version-checks-v1-1-13bde357d5b6@baylibre.com Signed-off-by: Mark Brown --- include/linux/adi-axi-common.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/adi-axi-common.h b/include/linux/adi-axi-common.h index f64f4ad4beda..37962ba530df 100644 --- a/include/linux/adi-axi-common.h +++ b/include/linux/adi-axi-common.h @@ -8,6 +8,8 @@ * https://wiki.analog.com/resources/fpga/docs/hdl/regmap */ +#include + #ifndef ADI_AXI_COMMON_H_ #define ADI_AXI_COMMON_H_ @@ -21,6 +23,25 @@ #define ADI_AXI_PCORE_VER_MINOR(version) (((version) >> 8) & 0xff) #define ADI_AXI_PCORE_VER_PATCH(version) ((version) & 0xff) +/** + * adi_axi_pcore_ver_gteq() - check if a version is satisfied + * @version: the full version read from the hardware + * @major: the major version to compare against + * @minor: the minor version to compare against + * + * ADI AXI IP Cores use semantic versioning, so this can be used to check for + * feature availability. + * + * Return: true if the version is greater than or equal to the specified + * major and minor version, false otherwise. + */ +static inline bool adi_axi_pcore_ver_gteq(u32 version, u32 major, u32 minor) +{ + return ADI_AXI_PCORE_VER_MAJOR(version) > (major) || + (ADI_AXI_PCORE_VER_MAJOR(version) == (major) && + ADI_AXI_PCORE_VER_MINOR(version) >= (minor)); +} + #define ADI_AXI_INFO_FPGA_TECH(info) (((info) >> 24) & 0xff) #define ADI_AXI_INFO_FPGA_FAMILY(info) (((info) >> 16) & 0xff) #define ADI_AXI_INFO_FPGA_SPEED_GRADE(info) (((info) >> 8) & 0xff) -- cgit v1.2.3 From a24cd110e664396061b0a72930734bf419bf88c4 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 9 Sep 2025 19:07:46 +0100 Subject: dt-bindings: clock: renesas,r9a09g047-cpg: Add USB3.0 core clocks Add definitions for USB3.0 core clocks in the R9A09G047 CPG DT bindings header file. Reviewed-by: Geert Uytterhoeven Acked-by: Conor Dooley Signed-off-by: Biju Das Link: https://patch.msgid.link/20250909180803.140939-2-biju.das.jz@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/renesas,r9a09g047-cpg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/renesas,r9a09g047-cpg.h b/include/dt-bindings/clock/renesas,r9a09g047-cpg.h index a27132f9a6c8..f165df8a6f5a 100644 --- a/include/dt-bindings/clock/renesas,r9a09g047-cpg.h +++ b/include/dt-bindings/clock/renesas,r9a09g047-cpg.h @@ -20,5 +20,7 @@ #define R9A09G047_SPI_CLK_SPI 9 #define R9A09G047_GBETH_0_CLK_PTP_REF_I 10 #define R9A09G047_GBETH_1_CLK_PTP_REF_I 11 +#define R9A09G047_USB3_0_REF_ALT_CLK_P 12 +#define R9A09G047_USB3_0_CLKCORE 13 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G047_CPG_H__ */ -- cgit v1.2.3 From 413cf5db2fee00fdd69bc62debdbf655f97f4c08 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 12 Aug 2025 06:23:31 +0200 Subject: libie, ice: move fwlog admin queue to libie Copy the code and: - change ICE_AQC to LIBIE_AQC - change ice_aqc to libie_aqc - move definitions outside the structures Reviewed-by: Przemek Kitszel Signed-off-by: Michal Swiatkowski Tested-by: Rinitha S (A Contingent worker at Intel) Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 78 ---------------------- drivers/net/ethernet/intel/ice/ice_debugfs.c | 21 +++--- drivers/net/ethernet/intel/ice/ice_fwlog.c | 46 ++++++------- drivers/net/ethernet/intel/ice/ice_fwlog.h | 2 +- include/linux/net/intel/libie/adminq.h | 89 +++++++++++++++++++++++++ 5 files changed, 124 insertions(+), 112 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index caae1780fd37..93aedb35fd17 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -2399,42 +2399,6 @@ struct ice_aqc_event_lan_overflow { u8 reserved[8]; }; -enum ice_aqc_fw_logging_mod { - ICE_AQC_FW_LOG_ID_GENERAL = 0, - ICE_AQC_FW_LOG_ID_CTRL, - ICE_AQC_FW_LOG_ID_LINK, - ICE_AQC_FW_LOG_ID_LINK_TOPO, - ICE_AQC_FW_LOG_ID_DNL, - ICE_AQC_FW_LOG_ID_I2C, - ICE_AQC_FW_LOG_ID_SDP, - ICE_AQC_FW_LOG_ID_MDIO, - ICE_AQC_FW_LOG_ID_ADMINQ, - ICE_AQC_FW_LOG_ID_HDMA, - ICE_AQC_FW_LOG_ID_LLDP, - ICE_AQC_FW_LOG_ID_DCBX, - ICE_AQC_FW_LOG_ID_DCB, - ICE_AQC_FW_LOG_ID_XLR, - ICE_AQC_FW_LOG_ID_NVM, - ICE_AQC_FW_LOG_ID_AUTH, - ICE_AQC_FW_LOG_ID_VPD, - ICE_AQC_FW_LOG_ID_IOSF, - ICE_AQC_FW_LOG_ID_PARSER, - ICE_AQC_FW_LOG_ID_SW, - ICE_AQC_FW_LOG_ID_SCHEDULER, - ICE_AQC_FW_LOG_ID_TXQ, - ICE_AQC_FW_LOG_ID_RSVD, - ICE_AQC_FW_LOG_ID_POST, - ICE_AQC_FW_LOG_ID_WATCHDOG, - ICE_AQC_FW_LOG_ID_TASK_DISPATCH, - ICE_AQC_FW_LOG_ID_MNG, - ICE_AQC_FW_LOG_ID_SYNCE, - ICE_AQC_FW_LOG_ID_HEALTH, - ICE_AQC_FW_LOG_ID_TSDRV, - ICE_AQC_FW_LOG_ID_PFREG, - ICE_AQC_FW_LOG_ID_MDLVER, - ICE_AQC_FW_LOG_ID_MAX, -}; - enum ice_aqc_health_status_mask { ICE_AQC_HEALTH_STATUS_SET_PF_SPECIFIC_MASK = BIT(0), ICE_AQC_HEALTH_STATUS_SET_ALL_PF_MASK = BIT(1), @@ -2516,48 +2480,6 @@ struct ice_aqc_health_status_elem { __le32 internal_data2; }; -/* Set FW Logging configuration (indirect 0xFF30) - * Register for FW Logging (indirect 0xFF31) - * Query FW Logging (indirect 0xFF32) - * FW Log Event (indirect 0xFF33) - */ -struct ice_aqc_fw_log { - u8 cmd_flags; -#define ICE_AQC_FW_LOG_CONF_UART_EN BIT(0) -#define ICE_AQC_FW_LOG_CONF_AQ_EN BIT(1) -#define ICE_AQC_FW_LOG_QUERY_REGISTERED BIT(2) -#define ICE_AQC_FW_LOG_CONF_SET_VALID BIT(3) -#define ICE_AQC_FW_LOG_AQ_REGISTER BIT(0) -#define ICE_AQC_FW_LOG_AQ_QUERY BIT(2) - - u8 rsp_flag; - __le16 fw_rt_msb; - union { - struct { - __le32 fw_rt_lsb; - } sync; - struct { - __le16 log_resolution; -#define ICE_AQC_FW_LOG_MIN_RESOLUTION (1) -#define ICE_AQC_FW_LOG_MAX_RESOLUTION (128) - - __le16 mdl_cnt; - } cfg; - } ops; - __le32 addr_high; - __le32 addr_low; -}; - -/* Response Buffer for: - * Set Firmware Logging Configuration (0xFF30) - * Query FW Logging (0xFF32) - */ -struct ice_aqc_fw_log_cfg_resp { - __le16 module_identifier; - u8 log_level; - u8 rsvd0; -}; - /* Admin Queue command opcodes */ enum ice_adminq_opc { /* AQ commands */ diff --git a/drivers/net/ethernet/intel/ice/ice_debugfs.c b/drivers/net/ethernet/intel/ice/ice_debugfs.c index 161e0571bae7..6f7f6788447e 100644 --- a/drivers/net/ethernet/intel/ice/ice_debugfs.c +++ b/drivers/net/ethernet/intel/ice/ice_debugfs.c @@ -12,10 +12,11 @@ static struct dentry *ice_debugfs_root; /* create a define that has an extra module that doesn't really exist. this * is so we can add a module 'all' to easily enable/disable all the modules */ -#define ICE_NR_FW_LOG_MODULES (ICE_AQC_FW_LOG_ID_MAX + 1) +#define ICE_NR_FW_LOG_MODULES (LIBIE_AQC_FW_LOG_ID_MAX + 1) /* the ordering in this array is important. it matches the ordering of the - * values in the FW so the index is the same value as in ice_aqc_fw_logging_mod + * values in the FW so the index is the same value as in + * libie_aqc_fw_logging_mod */ static const char * const ice_fwlog_module_string[] = { "general", @@ -84,7 +85,7 @@ ice_fwlog_print_module_cfg(struct ice_fwlog_cfg *cfg, int module, { struct ice_fwlog_module_entry *entry; - if (module != ICE_AQC_FW_LOG_ID_MAX) { + if (module != LIBIE_AQC_FW_LOG_ID_MAX) { entry = &cfg->module_entries[module]; seq_printf(s, "\tModule: %s, Log Level: %s\n", @@ -93,7 +94,7 @@ ice_fwlog_print_module_cfg(struct ice_fwlog_cfg *cfg, int module, } else { int i; - for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) { + for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) { entry = &cfg->module_entries[i]; seq_printf(s, "\tModule: %s, Log Level: %s\n", @@ -190,7 +191,7 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, return -EINVAL; } - if (module != ICE_AQC_FW_LOG_ID_MAX) { + if (module != LIBIE_AQC_FW_LOG_ID_MAX) { fwlog->cfg.module_entries[module].log_level = log_level; } else { /* the module 'all' is a shortcut so that we can set @@ -198,7 +199,7 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, */ int i; - for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) + for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) fwlog->cfg.module_entries[i].log_level = log_level; } @@ -266,11 +267,11 @@ ice_debugfs_nr_messages_write(struct file *filp, const char __user *buf, if (ret) return ret; - if (nr_messages < ICE_AQC_FW_LOG_MIN_RESOLUTION || - nr_messages > ICE_AQC_FW_LOG_MAX_RESOLUTION) { + if (nr_messages < LIBIE_AQC_FW_LOG_MIN_RESOLUTION || + nr_messages > LIBIE_AQC_FW_LOG_MAX_RESOLUTION) { dev_err(dev, "Invalid FW log number of messages %d, value must be between %d - %d\n", - nr_messages, ICE_AQC_FW_LOG_MIN_RESOLUTION, - ICE_AQC_FW_LOG_MAX_RESOLUTION); + nr_messages, LIBIE_AQC_FW_LOG_MIN_RESOLUTION, + LIBIE_AQC_FW_LOG_MAX_RESOLUTION); return -EINVAL; } diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.c b/drivers/net/ethernet/intel/ice/ice_fwlog.c index 9e640e942feb..30175be484a5 100644 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.c +++ b/drivers/net/ethernet/intel/ice/ice_fwlog.c @@ -142,8 +142,8 @@ static bool ice_fwlog_supported(struct ice_fwlog *fwlog) */ static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) { - struct ice_aqc_fw_log_cfg_resp *fw_modules; - struct ice_aqc_fw_log *cmd; + struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aqc_fw_log *cmd; struct libie_aq_desc desc; u16 module_id_cnt; int status; @@ -156,10 +156,10 @@ static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) if (!buf) return -ENOMEM; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_query); + ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_query); cmd = libie_aq_raw(&desc); - cmd->cmd_flags = ICE_AQC_FW_LOG_AQ_QUERY; + cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_QUERY; status = fwlog->send_cmd(fwlog->priv, &desc, buf, ICE_AQ_MAX_BUF_LEN); if (status) { @@ -168,26 +168,26 @@ static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) } module_id_cnt = le16_to_cpu(cmd->ops.cfg.mdl_cnt); - if (module_id_cnt < ICE_AQC_FW_LOG_ID_MAX) { + if (module_id_cnt < LIBIE_AQC_FW_LOG_ID_MAX) { dev_dbg(&fwlog->pdev->dev, "FW returned less than the expected number of FW log module IDs\n"); - } else if (module_id_cnt > ICE_AQC_FW_LOG_ID_MAX) { + } else if (module_id_cnt > LIBIE_AQC_FW_LOG_ID_MAX) { dev_dbg(&fwlog->pdev->dev, "FW returned more than expected number of FW log module IDs, setting module_id_cnt to software expected max %u\n", - ICE_AQC_FW_LOG_ID_MAX); - module_id_cnt = ICE_AQC_FW_LOG_ID_MAX; + LIBIE_AQC_FW_LOG_ID_MAX); + module_id_cnt = LIBIE_AQC_FW_LOG_ID_MAX; } cfg->log_resolution = le16_to_cpu(cmd->ops.cfg.log_resolution); - if (cmd->cmd_flags & ICE_AQC_FW_LOG_CONF_AQ_EN) + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_AQ_EN) cfg->options |= ICE_FWLOG_OPTION_ARQ_ENA; - if (cmd->cmd_flags & ICE_AQC_FW_LOG_CONF_UART_EN) + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_UART_EN) cfg->options |= ICE_FWLOG_OPTION_UART_ENA; - if (cmd->cmd_flags & ICE_AQC_FW_LOG_QUERY_REGISTERED) + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_QUERY_REGISTERED) cfg->options |= ICE_FWLOG_OPTION_IS_REGISTERED; - fw_modules = (struct ice_aqc_fw_log_cfg_resp *)buf; + fw_modules = (struct libie_aqc_fw_log_cfg_resp *)buf; for (i = 0; i < module_id_cnt; i++) { - struct ice_aqc_fw_log_cfg_resp *fw_module = &fw_modules[i]; + struct libie_aqc_fw_log_cfg_resp *fw_module = &fw_modules[i]; cfg->module_entries[i].module_id = le16_to_cpu(fw_module->module_identifier); @@ -325,8 +325,8 @@ ice_aq_fwlog_set(struct ice_fwlog *fwlog, struct ice_fwlog_module_entry *entries, u16 num_entries, u16 options, u16 log_resolution) { - struct ice_aqc_fw_log_cfg_resp *fw_modules; - struct ice_aqc_fw_log *cmd; + struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aqc_fw_log *cmd; struct libie_aq_desc desc; int status; int i; @@ -341,19 +341,19 @@ ice_aq_fwlog_set(struct ice_fwlog *fwlog, fw_modules[i].log_level = entries[i].log_level; } - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_config); + ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_config); desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD); cmd = libie_aq_raw(&desc); - cmd->cmd_flags = ICE_AQC_FW_LOG_CONF_SET_VALID; + cmd->cmd_flags = LIBIE_AQC_FW_LOG_CONF_SET_VALID; cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); if (options & ICE_FWLOG_OPTION_ARQ_ENA) - cmd->cmd_flags |= ICE_AQC_FW_LOG_CONF_AQ_EN; + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_AQ_EN; if (options & ICE_FWLOG_OPTION_UART_ENA) - cmd->cmd_flags |= ICE_AQC_FW_LOG_CONF_UART_EN; + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_UART_EN; status = fwlog->send_cmd(fwlog->priv, &desc, fw_modules, sizeof(*fw_modules) * num_entries); @@ -382,7 +382,7 @@ int ice_fwlog_set(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) return -EOPNOTSUPP; return ice_aq_fwlog_set(fwlog, cfg->module_entries, - ICE_AQC_FW_LOG_ID_MAX, cfg->options, + LIBIE_AQC_FW_LOG_ID_MAX, cfg->options, cfg->log_resolution); } @@ -393,14 +393,14 @@ int ice_fwlog_set(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) */ static int ice_aq_fwlog_register(struct ice_fwlog *fwlog, bool reg) { - struct ice_aqc_fw_log *cmd; + struct libie_aqc_fw_log *cmd; struct libie_aq_desc desc; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_register); + ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_register); cmd = libie_aq_raw(&desc); if (reg) - cmd->cmd_flags = ICE_AQC_FW_LOG_AQ_REGISTER; + cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_REGISTER; return fwlog->send_cmd(fwlog->priv, &desc, NULL, 0); } diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.h b/drivers/net/ethernet/intel/ice/ice_fwlog.h index 22585ea9ec93..9efa4a83c957 100644 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.h +++ b/drivers/net/ethernet/intel/ice/ice_fwlog.h @@ -29,7 +29,7 @@ struct ice_fwlog_module_entry { struct ice_fwlog_cfg { /* list of modules for configuring log level */ - struct ice_fwlog_module_entry module_entries[ICE_AQC_FW_LOG_ID_MAX]; + struct ice_fwlog_module_entry module_entries[LIBIE_AQC_FW_LOG_ID_MAX]; /* options used to configure firmware logging */ u16 options; #define ICE_FWLOG_OPTION_ARQ_ENA BIT(0) diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h index ba62f703df43..ca2ac88b5709 100644 --- a/include/linux/net/intel/libie/adminq.h +++ b/include/linux/net/intel/libie/adminq.h @@ -222,6 +222,94 @@ struct libie_aqc_list_caps_elem { }; LIBIE_CHECK_STRUCT_LEN(32, libie_aqc_list_caps_elem); +/* Admin Queue command opcodes */ +enum libie_adminq_opc { + /* FW Logging Commands */ + libie_aqc_opc_fw_logs_config = 0xFF30, + libie_aqc_opc_fw_logs_register = 0xFF31, + libie_aqc_opc_fw_logs_query = 0xFF32, + libie_aqc_opc_fw_logs_event = 0xFF33, +}; + +enum libie_aqc_fw_logging_mod { + LIBIE_AQC_FW_LOG_ID_GENERAL = 0, + LIBIE_AQC_FW_LOG_ID_CTRL, + LIBIE_AQC_FW_LOG_ID_LINK, + LIBIE_AQC_FW_LOG_ID_LINK_TOPO, + LIBIE_AQC_FW_LOG_ID_DNL, + LIBIE_AQC_FW_LOG_ID_I2C, + LIBIE_AQC_FW_LOG_ID_SDP, + LIBIE_AQC_FW_LOG_ID_MDIO, + LIBIE_AQC_FW_LOG_ID_ADMINQ, + LIBIE_AQC_FW_LOG_ID_HDMA, + LIBIE_AQC_FW_LOG_ID_LLDP, + LIBIE_AQC_FW_LOG_ID_DCBX, + LIBIE_AQC_FW_LOG_ID_DCB, + LIBIE_AQC_FW_LOG_ID_XLR, + LIBIE_AQC_FW_LOG_ID_NVM, + LIBIE_AQC_FW_LOG_ID_AUTH, + LIBIE_AQC_FW_LOG_ID_VPD, + LIBIE_AQC_FW_LOG_ID_IOSF, + LIBIE_AQC_FW_LOG_ID_PARSER, + LIBIE_AQC_FW_LOG_ID_SW, + LIBIE_AQC_FW_LOG_ID_SCHEDULER, + LIBIE_AQC_FW_LOG_ID_TXQ, + LIBIE_AQC_FW_LOG_ID_RSVD, + LIBIE_AQC_FW_LOG_ID_POST, + LIBIE_AQC_FW_LOG_ID_WATCHDOG, + LIBIE_AQC_FW_LOG_ID_TASK_DISPATCH, + LIBIE_AQC_FW_LOG_ID_MNG, + LIBIE_AQC_FW_LOG_ID_SYNCE, + LIBIE_AQC_FW_LOG_ID_HEALTH, + LIBIE_AQC_FW_LOG_ID_TSDRV, + LIBIE_AQC_FW_LOG_ID_PFREG, + LIBIE_AQC_FW_LOG_ID_MDLVER, + LIBIE_AQC_FW_LOG_ID_MAX, +}; + +/* Set FW Logging configuration (indirect 0xFF30) + * Register for FW Logging (indirect 0xFF31) + * Query FW Logging (indirect 0xFF32) + * FW Log Event (indirect 0xFF33) + */ +#define LIBIE_AQC_FW_LOG_CONF_UART_EN BIT(0) +#define LIBIE_AQC_FW_LOG_CONF_AQ_EN BIT(1) +#define LIBIE_AQC_FW_LOG_QUERY_REGISTERED BIT(2) +#define LIBIE_AQC_FW_LOG_CONF_SET_VALID BIT(3) +#define LIBIE_AQC_FW_LOG_AQ_REGISTER BIT(0) +#define LIBIE_AQC_FW_LOG_AQ_QUERY BIT(2) + +#define LIBIE_AQC_FW_LOG_MIN_RESOLUTION (1) +#define LIBIE_AQC_FW_LOG_MAX_RESOLUTION (128) + +struct libie_aqc_fw_log { + u8 cmd_flags; + + u8 rsp_flag; + __le16 fw_rt_msb; + union { + struct { + __le32 fw_rt_lsb; + } sync; + struct { + __le16 log_resolution; + __le16 mdl_cnt; + } cfg; + } ops; + __le32 addr_high; + __le32 addr_low; +}; + +/* Response Buffer for: + * Set Firmware Logging Configuration (0xFF30) + * Query FW Logging (0xFF32) + */ +struct libie_aqc_fw_log_cfg_resp { + __le16 module_identifier; + u8 log_level; + u8 rsvd0; +}; + /** * struct libie_aq_desc - Admin Queue (AQ) descriptor * @flags: LIBIE_AQ_FLAG_* flags @@ -253,6 +341,7 @@ struct libie_aq_desc { struct libie_aqc_driver_ver driver_ver; struct libie_aqc_req_res res_owner; struct libie_aqc_list_caps get_cap; + struct libie_aqc_fw_log fw_log; } params; }; LIBIE_CHECK_STRUCT_LEN(32, libie_aq_desc); -- cgit v1.2.3 From 02f44dac8930dc7cc43aa3eba872ce35382f6332 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 12 Aug 2025 06:23:33 +0200 Subject: ice: prepare for moving file to libie s/ice/libie There is no function for filling default descriptor in libie. Zero descriptor structure and set opcode without calling the function. Make functions that are caled only in ice_fwlog.c static. Reviewed-by: Przemek Kitszel Signed-off-by: Michal Swiatkowski Tested-by: Rinitha S (A Contingent worker at Intel) Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 6 +- drivers/net/ethernet/intel/ice/ice_fwlog.c | 624 ++++++++++++++-------------- drivers/net/ethernet/intel/ice/ice_fwlog.h | 78 ++-- drivers/net/ethernet/intel/ice/ice_main.c | 4 +- drivers/net/ethernet/intel/ice/ice_type.h | 2 +- include/linux/net/intel/libie/adminq.h | 1 + 6 files changed, 359 insertions(+), 356 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index fd15d7385aaa..1baac0b74caf 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -995,7 +995,7 @@ static int __fwlog_send_cmd(void *priv, struct libie_aq_desc *desc, void *buf, static int __fwlog_init(struct ice_hw *hw) { struct ice_pf *pf = hw->back; - struct ice_fwlog_api api = { + struct libie_fwlog_api api = { .pdev = pf->pdev, .send_cmd = __fwlog_send_cmd, .priv = hw, @@ -1012,7 +1012,7 @@ static int __fwlog_init(struct ice_hw *hw) api.debugfs_root = pf->ice_debugfs_pf; - return ice_fwlog_init(&hw->fwlog, &api); + return libie_fwlog_init(&hw->fwlog, &api); } /** @@ -1197,7 +1197,7 @@ static void __fwlog_deinit(struct ice_hw *hw) return; ice_debugfs_pf_deinit(hw->back); - ice_fwlog_deinit(&hw->fwlog); + libie_fwlog_deinit(&hw->fwlog); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.c b/drivers/net/ethernet/intel/ice/ice_fwlog.c index 036c595cf92b..cb2d3ff203c4 100644 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.c +++ b/drivers/net/ethernet/intel/ice/ice_fwlog.c @@ -12,13 +12,13 @@ /* create a define that has an extra module that doesn't really exist. this * is so we can add a module 'all' to easily enable/disable all the modules */ -#define ICE_NR_FW_LOG_MODULES (LIBIE_AQC_FW_LOG_ID_MAX + 1) +#define LIBIE_NR_FW_LOG_MODULES (LIBIE_AQC_FW_LOG_ID_MAX + 1) /* the ordering in this array is important. it matches the ordering of the * values in the FW so the index is the same value as in * libie_aqc_fw_logging_mod */ -static const char * const ice_fwlog_module_string[] = { +static const char * const libie_fwlog_module_string[] = { "general", "ctrl", "link", @@ -55,9 +55,9 @@ static const char * const ice_fwlog_module_string[] = { }; /* the ordering in this array is important. it matches the ordering of the - * values in the FW so the index is the same value as in ice_fwlog_level + * values in the FW so the index is the same value as in libie_fwlog_level */ -static const char * const ice_fwlog_level_string[] = { +static const char * const libie_fwlog_level_string[] = { "none", "error", "warning", @@ -65,7 +65,7 @@ static const char * const ice_fwlog_level_string[] = { "verbose", }; -static const char * const ice_fwlog_log_size[] = { +static const char * const libie_fwlog_log_size[] = { "128K", "256K", "512K", @@ -73,43 +73,43 @@ static const char * const ice_fwlog_log_size[] = { "2M", }; -static bool ice_fwlog_ring_empty(struct ice_fwlog_ring *rings) +static bool libie_fwlog_ring_empty(struct libie_fwlog_ring *rings) { return rings->head == rings->tail; } -static void ice_fwlog_ring_increment(u16 *item, u16 size) +static void libie_fwlog_ring_increment(u16 *item, u16 size) { *item = (*item + 1) & (size - 1); } -static int ice_fwlog_alloc_ring_buffs(struct ice_fwlog_ring *rings) +static int libie_fwlog_alloc_ring_buffs(struct libie_fwlog_ring *rings) { int i, nr_bytes; u8 *mem; - nr_bytes = rings->size * ICE_AQ_MAX_BUF_LEN; + nr_bytes = rings->size * LIBIE_AQ_MAX_BUF_LEN; mem = vzalloc(nr_bytes); if (!mem) return -ENOMEM; for (i = 0; i < rings->size; i++) { - struct ice_fwlog_data *ring = &rings->rings[i]; + struct libie_fwlog_data *ring = &rings->rings[i]; - ring->data_size = ICE_AQ_MAX_BUF_LEN; + ring->data_size = LIBIE_AQ_MAX_BUF_LEN; ring->data = mem; - mem += ICE_AQ_MAX_BUF_LEN; + mem += LIBIE_AQ_MAX_BUF_LEN; } return 0; } -static void ice_fwlog_free_ring_buffs(struct ice_fwlog_ring *rings) +static void libie_fwlog_free_ring_buffs(struct libie_fwlog_ring *rings) { int i; for (i = 0; i < rings->size; i++) { - struct ice_fwlog_data *ring = &rings->rings[i]; + struct libie_fwlog_data *ring = &rings->rings[i]; /* the first ring is the base memory for the whole range so * free it @@ -122,16 +122,16 @@ static void ice_fwlog_free_ring_buffs(struct ice_fwlog_ring *rings) } } -#define ICE_FWLOG_INDEX_TO_BYTES(n) ((128 * 1024) << (n)) +#define LIBIE_FWLOG_INDEX_TO_BYTES(n) ((128 * 1024) << (n)) /** - * ice_fwlog_realloc_rings - reallocate the FW log rings + * libie_fwlog_realloc_rings - reallocate the FW log rings * @fwlog: pointer to the fwlog structure * @index: the new index to use to allocate memory for the log data * */ -static void ice_fwlog_realloc_rings(struct ice_fwlog *fwlog, int index) +static void libie_fwlog_realloc_rings(struct libie_fwlog *fwlog, int index) { - struct ice_fwlog_ring ring; + struct libie_fwlog_ring ring; int status, ring_size; /* convert the number of bytes into a number of 4K buffers. externally @@ -143,7 +143,7 @@ static void ice_fwlog_realloc_rings(struct ice_fwlog *fwlog, int index) * the user the driver knows that the data is correct and the FW log * can be correctly parsed by the tools */ - ring_size = ICE_FWLOG_INDEX_TO_BYTES(index) / ICE_AQ_MAX_BUF_LEN; + ring_size = LIBIE_FWLOG_INDEX_TO_BYTES(index) / LIBIE_AQ_MAX_BUF_LEN; if (ring_size == fwlog->ring.size) return; @@ -157,15 +157,15 @@ static void ice_fwlog_realloc_rings(struct ice_fwlog *fwlog, int index) ring.size = ring_size; - status = ice_fwlog_alloc_ring_buffs(&ring); + status = libie_fwlog_alloc_ring_buffs(&ring); if (status) { dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); - ice_fwlog_free_ring_buffs(&ring); + libie_fwlog_free_ring_buffs(&ring); kfree(ring.rings); return; } - ice_fwlog_free_ring_buffs(&fwlog->ring); + libie_fwlog_free_ring_buffs(&fwlog->ring); kfree(fwlog->ring.rings); fwlog->ring.rings = ring.rings; @@ -176,23 +176,174 @@ static void ice_fwlog_realloc_rings(struct ice_fwlog *fwlog, int index) } /** - * ice_fwlog_print_module_cfg - print current FW logging module configuration + * libie_fwlog_supported - Cached for whether FW supports FW logging or not + * @fwlog: pointer to the fwlog structure + * + * This will always return false if called before libie_init_hw(), so it must be + * called after libie_init_hw(). + */ +static bool libie_fwlog_supported(struct libie_fwlog *fwlog) +{ + return fwlog->supported; +} + +/** + * libie_aq_fwlog_set - Set FW logging configuration AQ command (0xFF30) + * @fwlog: pointer to the fwlog structure + * @entries: entries to configure + * @num_entries: number of @entries + * @options: options from libie_fwlog_cfg->options structure + * @log_resolution: logging resolution + */ +static int +libie_aq_fwlog_set(struct libie_fwlog *fwlog, + struct libie_fwlog_module_entry *entries, u16 num_entries, + u16 options, u16 log_resolution) +{ + struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aq_desc desc = {0}; + struct libie_aqc_fw_log *cmd; + int status; + int i; + + fw_modules = kcalloc(num_entries, sizeof(*fw_modules), GFP_KERNEL); + if (!fw_modules) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) { + fw_modules[i].module_identifier = + cpu_to_le16(entries[i].module_id); + fw_modules[i].log_level = entries[i].log_level; + } + + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_config); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI) | + cpu_to_le16(LIBIE_AQ_FLAG_RD); + + cmd = libie_aq_raw(&desc); + + cmd->cmd_flags = LIBIE_AQC_FW_LOG_CONF_SET_VALID; + cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); + cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); + + if (options & LIBIE_FWLOG_OPTION_ARQ_ENA) + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_AQ_EN; + if (options & LIBIE_FWLOG_OPTION_UART_ENA) + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_UART_EN; + + status = fwlog->send_cmd(fwlog->priv, &desc, fw_modules, + sizeof(*fw_modules) * num_entries); + + kfree(fw_modules); + + return status; +} + +/** + * libie_fwlog_set - Set the firmware logging settings + * @fwlog: pointer to the fwlog structure + * @cfg: config used to set firmware logging + * + * This function should be called whenever the driver needs to set the firmware + * logging configuration. It can be called on initialization, reset, or during + * runtime. + * + * If the PF wishes to receive FW logging then it must register via + * libie_fwlog_register. Note, that libie_fwlog_register does not need to be called + * for init. + */ +static int libie_fwlog_set(struct libie_fwlog *fwlog, + struct libie_fwlog_cfg *cfg) +{ + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + return libie_aq_fwlog_set(fwlog, cfg->module_entries, + LIBIE_AQC_FW_LOG_ID_MAX, cfg->options, + cfg->log_resolution); +} + +/** + * libie_aq_fwlog_register - Register PF for firmware logging events (0xFF31) + * @fwlog: pointer to the fwlog structure + * @reg: true to register and false to unregister + */ +static int libie_aq_fwlog_register(struct libie_fwlog *fwlog, bool reg) +{ + struct libie_aq_desc desc = {0}; + struct libie_aqc_fw_log *cmd; + + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_register); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); + cmd = libie_aq_raw(&desc); + + if (reg) + cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_REGISTER; + + return fwlog->send_cmd(fwlog->priv, &desc, NULL, 0); +} + +/** + * libie_fwlog_register - Register the PF for firmware logging + * @fwlog: pointer to the fwlog structure + * + * After this call the PF will start to receive firmware logging based on the + * configuration set in libie_fwlog_set. + */ +int libie_fwlog_register(struct libie_fwlog *fwlog) +{ + int status; + + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + status = libie_aq_fwlog_register(fwlog, true); + if (status) + dev_dbg(&fwlog->pdev->dev, "Failed to register for firmware logging events over ARQ\n"); + else + fwlog->cfg.options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; + + return status; +} + +/** + * libie_fwlog_unregister - Unregister the PF from firmware logging + * @fwlog: pointer to the fwlog structure + */ +static int libie_fwlog_unregister(struct libie_fwlog *fwlog) +{ + int status; + + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + status = libie_aq_fwlog_register(fwlog, false); + if (status) + dev_dbg(&fwlog->pdev->dev, "Failed to unregister from firmware logging events over ARQ\n"); + else + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_IS_REGISTERED; + + return status; +} + +/** + * libie_fwlog_print_module_cfg - print current FW logging module configuration * @cfg: pointer to the fwlog cfg structure * @module: module to print * @s: the seq file to put data into */ static void -ice_fwlog_print_module_cfg(struct ice_fwlog_cfg *cfg, int module, - struct seq_file *s) +libie_fwlog_print_module_cfg(struct libie_fwlog_cfg *cfg, int module, + struct seq_file *s) { - struct ice_fwlog_module_entry *entry; + struct libie_fwlog_module_entry *entry; if (module != LIBIE_AQC_FW_LOG_ID_MAX) { entry = &cfg->module_entries[module]; seq_printf(s, "\tModule: %s, Log Level: %s\n", - ice_fwlog_module_string[entry->module_id], - ice_fwlog_level_string[entry->log_level]); + libie_fwlog_module_string[entry->module_id], + libie_fwlog_level_string[entry->log_level]); } else { int i; @@ -200,19 +351,19 @@ ice_fwlog_print_module_cfg(struct ice_fwlog_cfg *cfg, int module, entry = &cfg->module_entries[i]; seq_printf(s, "\tModule: %s, Log Level: %s\n", - ice_fwlog_module_string[entry->module_id], - ice_fwlog_level_string[entry->log_level]); + libie_fwlog_module_string[entry->module_id], + libie_fwlog_level_string[entry->log_level]); } } } -static int ice_find_module_by_dentry(struct dentry **modules, struct dentry *d) +static int libie_find_module_by_dentry(struct dentry **modules, struct dentry *d) { int i, module; module = -1; /* find the module based on the dentry */ - for (i = 0; i < ICE_NR_FW_LOG_MODULES; i++) { + for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { if (d == modules[i]) { module = i; break; @@ -223,47 +374,47 @@ static int ice_find_module_by_dentry(struct dentry **modules, struct dentry *d) } /** - * ice_debugfs_module_show - read from 'module' file + * libie_debugfs_module_show - read from 'module' file * @s: the opened file * @v: pointer to the offset */ -static int ice_debugfs_module_show(struct seq_file *s, void *v) +static int libie_debugfs_module_show(struct seq_file *s, void *v) { - struct ice_fwlog *fwlog = s->private; + struct libie_fwlog *fwlog = s->private; const struct file *filp = s->file; struct dentry *dentry; int module; dentry = file_dentry(filp); - module = ice_find_module_by_dentry(fwlog->debugfs_modules, dentry); + module = libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); if (module < 0) { dev_info(&fwlog->pdev->dev, "unknown module\n"); return -EINVAL; } - ice_fwlog_print_module_cfg(&fwlog->cfg, module, s); + libie_fwlog_print_module_cfg(&fwlog->cfg, module, s); return 0; } -static int ice_debugfs_module_open(struct inode *inode, struct file *filp) +static int libie_debugfs_module_open(struct inode *inode, struct file *filp) { - return single_open(filp, ice_debugfs_module_show, inode->i_private); + return single_open(filp, libie_debugfs_module_show, inode->i_private); } /** - * ice_debugfs_module_write - write into 'module' file + * libie_debugfs_module_write - write into 'module' file * @filp: the opened file * @buf: where to find the user's data * @count: the length of the user's data * @ppos: file position offset */ static ssize_t -ice_debugfs_module_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) +libie_debugfs_module_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) { - struct ice_fwlog *fwlog = file_inode(filp)->i_private; + struct libie_fwlog *fwlog = file_inode(filp)->i_private; struct dentry *dentry = file_dentry(filp); struct device *dev = &fwlog->pdev->dev; char user_val[16], *cmd_buf; @@ -277,7 +428,7 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, if (IS_ERR(cmd_buf)) return PTR_ERR(cmd_buf); - module = ice_find_module_by_dentry(fwlog->debugfs_modules, dentry); + module = libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); if (module < 0) { dev_info(dev, "unknown module\n"); return -EINVAL; @@ -287,7 +438,7 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, if (cnt != 1) return -EINVAL; - log_level = sysfs_match_string(ice_fwlog_level_string, user_val); + log_level = sysfs_match_string(libie_fwlog_level_string, user_val); if (log_level < 0) { dev_info(dev, "unknown log level '%s'\n", user_val); return -EINVAL; @@ -308,26 +459,26 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, return count; } -static const struct file_operations ice_debugfs_module_fops = { +static const struct file_operations libie_debugfs_module_fops = { .owner = THIS_MODULE, - .open = ice_debugfs_module_open, + .open = libie_debugfs_module_open, .read = seq_read, .release = single_release, - .write = ice_debugfs_module_write, + .write = libie_debugfs_module_write, }; /** - * ice_debugfs_nr_messages_read - read from 'nr_messages' file + * libie_debugfs_nr_messages_read - read from 'nr_messages' file * @filp: the opened file * @buffer: where to write the data for the user to read * @count: the size of the user's buffer * @ppos: file position offset */ -static ssize_t ice_debugfs_nr_messages_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) +static ssize_t libie_debugfs_nr_messages_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; char buff[32] = {}; snprintf(buff, sizeof(buff), "%d\n", @@ -337,17 +488,17 @@ static ssize_t ice_debugfs_nr_messages_read(struct file *filp, } /** - * ice_debugfs_nr_messages_write - write into 'nr_messages' file + * libie_debugfs_nr_messages_write - write into 'nr_messages' file * @filp: the opened file * @buf: where to find the user's data * @count: the length of the user's data * @ppos: file position offset */ static ssize_t -ice_debugfs_nr_messages_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) +libie_debugfs_nr_messages_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; struct device *dev = &fwlog->pdev->dev; char user_val[8], *cmd_buf; s16 nr_messages; @@ -382,46 +533,46 @@ ice_debugfs_nr_messages_write(struct file *filp, const char __user *buf, return count; } -static const struct file_operations ice_debugfs_nr_messages_fops = { +static const struct file_operations libie_debugfs_nr_messages_fops = { .owner = THIS_MODULE, .open = simple_open, - .read = ice_debugfs_nr_messages_read, - .write = ice_debugfs_nr_messages_write, + .read = libie_debugfs_nr_messages_read, + .write = libie_debugfs_nr_messages_write, }; /** - * ice_debugfs_enable_read - read from 'enable' file + * libie_debugfs_enable_read - read from 'enable' file * @filp: the opened file * @buffer: where to write the data for the user to read * @count: the size of the user's buffer * @ppos: file position offset */ -static ssize_t ice_debugfs_enable_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) +static ssize_t libie_debugfs_enable_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; char buff[32] = {}; snprintf(buff, sizeof(buff), "%u\n", (u16)(fwlog->cfg.options & - ICE_FWLOG_OPTION_IS_REGISTERED) >> 3); + LIBIE_FWLOG_OPTION_IS_REGISTERED) >> 3); return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); } /** - * ice_debugfs_enable_write - write into 'enable' file + * libie_debugfs_enable_write - write into 'enable' file * @filp: the opened file * @buf: where to find the user's data * @count: the length of the user's data * @ppos: file position offset */ static ssize_t -ice_debugfs_enable_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) +libie_debugfs_enable_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; char user_val[8], *cmd_buf; bool enable; ssize_t ret; @@ -443,18 +594,18 @@ ice_debugfs_enable_write(struct file *filp, const char __user *buf, goto enable_write_error; if (enable) - fwlog->cfg.options |= ICE_FWLOG_OPTION_ARQ_ENA; + fwlog->cfg.options |= LIBIE_FWLOG_OPTION_ARQ_ENA; else - fwlog->cfg.options &= ~ICE_FWLOG_OPTION_ARQ_ENA; + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; - ret = ice_fwlog_set(fwlog, &fwlog->cfg); + ret = libie_fwlog_set(fwlog, &fwlog->cfg); if (ret) goto enable_write_error; if (enable) - ret = ice_fwlog_register(fwlog); + ret = libie_fwlog_register(fwlog); else - ret = ice_fwlog_unregister(fwlog); + ret = libie_fwlog_unregister(fwlog); if (ret) goto enable_write_error; @@ -475,46 +626,46 @@ enable_write_error: return ret; } -static const struct file_operations ice_debugfs_enable_fops = { +static const struct file_operations libie_debugfs_enable_fops = { .owner = THIS_MODULE, .open = simple_open, - .read = ice_debugfs_enable_read, - .write = ice_debugfs_enable_write, + .read = libie_debugfs_enable_read, + .write = libie_debugfs_enable_write, }; /** - * ice_debugfs_log_size_read - read from 'log_size' file + * libie_debugfs_log_size_read - read from 'log_size' file * @filp: the opened file * @buffer: where to write the data for the user to read * @count: the size of the user's buffer * @ppos: file position offset */ -static ssize_t ice_debugfs_log_size_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) +static ssize_t libie_debugfs_log_size_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; char buff[32] = {}; int index; index = fwlog->ring.index; - snprintf(buff, sizeof(buff), "%s\n", ice_fwlog_log_size[index]); + snprintf(buff, sizeof(buff), "%s\n", libie_fwlog_log_size[index]); return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); } /** - * ice_debugfs_log_size_write - write into 'log_size' file + * libie_debugfs_log_size_write - write into 'log_size' file * @filp: the opened file * @buf: where to find the user's data * @count: the length of the user's data * @ppos: file position offset */ static ssize_t -ice_debugfs_log_size_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) +libie_debugfs_log_size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; struct device *dev = &fwlog->pdev->dev; char user_val[8], *cmd_buf; ssize_t ret; @@ -532,20 +683,20 @@ ice_debugfs_log_size_write(struct file *filp, const char __user *buf, if (ret != 1) return -EINVAL; - index = sysfs_match_string(ice_fwlog_log_size, user_val); + index = sysfs_match_string(libie_fwlog_log_size, user_val); if (index < 0) { dev_info(dev, "Invalid log size '%s'. The value must be one of 128K, 256K, 512K, 1M, 2M\n", user_val); ret = -EINVAL; goto log_size_write_error; - } else if (fwlog->cfg.options & ICE_FWLOG_OPTION_IS_REGISTERED) { + } else if (fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED) { dev_info(dev, "FW logging is currently running. Please disable FW logging to change log_size\n"); ret = -EINVAL; goto log_size_write_error; } /* free all the buffers and the tracking info and resize */ - ice_fwlog_realloc_rings(fwlog, index); + libie_fwlog_realloc_rings(fwlog, index); /* if we get here, nothing went wrong; return count since we didn't * really write anything @@ -563,32 +714,32 @@ log_size_write_error: return ret; } -static const struct file_operations ice_debugfs_log_size_fops = { +static const struct file_operations libie_debugfs_log_size_fops = { .owner = THIS_MODULE, .open = simple_open, - .read = ice_debugfs_log_size_read, - .write = ice_debugfs_log_size_write, + .read = libie_debugfs_log_size_read, + .write = libie_debugfs_log_size_write, }; /** - * ice_debugfs_data_read - read from 'data' file + * libie_debugfs_data_read - read from 'data' file * @filp: the opened file * @buffer: where to write the data for the user to read * @count: the size of the user's buffer * @ppos: file position offset */ -static ssize_t ice_debugfs_data_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) +static ssize_t libie_debugfs_data_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; int data_copied = 0; bool done = false; - if (ice_fwlog_ring_empty(&fwlog->ring)) + if (libie_fwlog_ring_empty(&fwlog->ring)) return 0; - while (!ice_fwlog_ring_empty(&fwlog->ring) && !done) { - struct ice_fwlog_data *log; + while (!libie_fwlog_ring_empty(&fwlog->ring) && !done) { + struct libie_fwlog_data *log; u16 cur_buf_len; log = &fwlog->ring.rings[fwlog->ring.head]; @@ -610,24 +761,24 @@ static ssize_t ice_debugfs_data_read(struct file *filp, char __user *buffer, buffer += cur_buf_len; count -= cur_buf_len; *ppos += cur_buf_len; - ice_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); + libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); } return data_copied; } /** - * ice_debugfs_data_write - write into 'data' file + * libie_debugfs_data_write - write into 'data' file * @filp: the opened file * @buf: where to find the user's data * @count: the length of the user's data * @ppos: file position offset */ static ssize_t -ice_debugfs_data_write(struct file *filp, const char __user *buf, size_t count, - loff_t *ppos) +libie_debugfs_data_write(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; struct device *dev = &fwlog->pdev->dev; ssize_t ret; @@ -638,7 +789,7 @@ ice_debugfs_data_write(struct file *filp, const char __user *buf, size_t count, /* any value is allowed to clear the buffer so no need to even look at * what the value is */ - if (!(fwlog->cfg.options & ICE_FWLOG_OPTION_IS_REGISTERED)) { + if (!(fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED)) { fwlog->ring.head = 0; fwlog->ring.tail = 0; } else { @@ -663,19 +814,20 @@ nr_buffs_write_error: return ret; } -static const struct file_operations ice_debugfs_data_fops = { +static const struct file_operations libie_debugfs_data_fops = { .owner = THIS_MODULE, .open = simple_open, - .read = ice_debugfs_data_read, - .write = ice_debugfs_data_write, + .read = libie_debugfs_data_read, + .write = libie_debugfs_data_write, }; /** - * ice_debugfs_fwlog_init - setup the debugfs directory + * libie_debugfs_fwlog_init - setup the debugfs directory * @fwlog: pointer to the fwlog structure * @root: debugfs root entry on which fwlog director will be registered */ -static void ice_debugfs_fwlog_init(struct ice_fwlog *fwlog, struct dentry *root) +static void libie_debugfs_fwlog_init(struct libie_fwlog *fwlog, + struct dentry *root) { struct dentry *fw_modules_dir; struct dentry **fw_modules; @@ -684,7 +836,7 @@ static void ice_debugfs_fwlog_init(struct ice_fwlog *fwlog, struct dentry *root) /* allocate space for this first because if it fails then we don't * need to unwind */ - fw_modules = kcalloc(ICE_NR_FW_LOG_MODULES, sizeof(*fw_modules), + fw_modules = kcalloc(LIBIE_NR_FW_LOG_MODULES, sizeof(*fw_modules), GFP_KERNEL); if (!fw_modules) return; @@ -697,27 +849,27 @@ static void ice_debugfs_fwlog_init(struct ice_fwlog *fwlog, struct dentry *root) if (IS_ERR(fw_modules_dir)) goto err_create_module_files; - for (i = 0; i < ICE_NR_FW_LOG_MODULES; i++) { - fw_modules[i] = debugfs_create_file(ice_fwlog_module_string[i], + for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { + fw_modules[i] = debugfs_create_file(libie_fwlog_module_string[i], 0600, fw_modules_dir, fwlog, - &ice_debugfs_module_fops); + &libie_debugfs_module_fops); if (IS_ERR(fw_modules[i])) goto err_create_module_files; } debugfs_create_file("nr_messages", 0600, fwlog->debugfs, fwlog, - &ice_debugfs_nr_messages_fops); + &libie_debugfs_nr_messages_fops); fwlog->debugfs_modules = fw_modules; debugfs_create_file("enable", 0600, fwlog->debugfs, fwlog, - &ice_debugfs_enable_fops); + &libie_debugfs_enable_fops); debugfs_create_file("log_size", 0600, fwlog->debugfs, fwlog, - &ice_debugfs_log_size_fops); + &libie_debugfs_log_size_fops); debugfs_create_file("data", 0600, fwlog->debugfs, fwlog, - &ice_debugfs_data_fops); + &libie_debugfs_data_fops); return; @@ -726,7 +878,7 @@ err_create_module_files: kfree(fw_modules); } -static bool ice_fwlog_ring_full(struct ice_fwlog_ring *rings) +static bool libie_fwlog_ring_full(struct libie_fwlog_ring *rings) { u16 head, tail; @@ -742,27 +894,16 @@ static bool ice_fwlog_ring_full(struct ice_fwlog_ring *rings) } /** - * ice_fwlog_supported - Cached for whether FW supports FW logging or not - * @fwlog: pointer to the fwlog structure - * - * This will always return false if called before ice_init_hw(), so it must be - * called after ice_init_hw(). - */ -static bool ice_fwlog_supported(struct ice_fwlog *fwlog) -{ - return fwlog->supported; -} - -/** - * ice_aq_fwlog_get - Get the current firmware logging configuration (0xFF32) + * libie_aq_fwlog_get - Get the current firmware logging configuration (0xFF32) * @fwlog: pointer to the fwlog structure * @cfg: firmware logging configuration to populate */ -static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) +static int libie_aq_fwlog_get(struct libie_fwlog *fwlog, + struct libie_fwlog_cfg *cfg) { struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aq_desc desc = {0}; struct libie_aqc_fw_log *cmd; - struct libie_aq_desc desc; u16 module_id_cnt; int status; void *buf; @@ -770,16 +911,17 @@ static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) memset(cfg, 0, sizeof(*cfg)); - buf = kzalloc(ICE_AQ_MAX_BUF_LEN, GFP_KERNEL); + buf = kzalloc(LIBIE_AQ_MAX_BUF_LEN, GFP_KERNEL); if (!buf) return -ENOMEM; - ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_query); + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_query); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); cmd = libie_aq_raw(&desc); cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_QUERY; - status = fwlog->send_cmd(fwlog->priv, &desc, buf, ICE_AQ_MAX_BUF_LEN); + status = fwlog->send_cmd(fwlog->priv, &desc, buf, LIBIE_AQ_MAX_BUF_LEN); if (status) { dev_dbg(&fwlog->pdev->dev, "Failed to get FW log configuration\n"); goto status_out; @@ -796,11 +938,11 @@ static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) cfg->log_resolution = le16_to_cpu(cmd->ops.cfg.log_resolution); if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_AQ_EN) - cfg->options |= ICE_FWLOG_OPTION_ARQ_ENA; + cfg->options |= LIBIE_FWLOG_OPTION_ARQ_ENA; if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_UART_EN) - cfg->options |= ICE_FWLOG_OPTION_UART_ENA; + cfg->options |= LIBIE_FWLOG_OPTION_UART_ENA; if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_QUERY_REGISTERED) - cfg->options |= ICE_FWLOG_OPTION_IS_REGISTERED; + cfg->options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; fw_modules = (struct libie_aqc_fw_log_cfg_resp *)buf; @@ -818,18 +960,18 @@ status_out: } /** - * ice_fwlog_set_supported - Set if FW logging is supported by FW + * libie_fwlog_set_supported - Set if FW logging is supported by FW * @fwlog: pointer to the fwlog structure * - * If FW returns success to the ice_aq_fwlog_get call then it supports FW + * If FW returns success to the libie_aq_fwlog_get call then it supports FW * logging, else it doesn't. Set the fwlog_supported flag accordingly. * * This function is only meant to be called during driver init to determine if * the FW support FW logging. */ -static void ice_fwlog_set_supported(struct ice_fwlog *fwlog) +static void libie_fwlog_set_supported(struct libie_fwlog *fwlog) { - struct ice_fwlog_cfg *cfg; + struct libie_fwlog_cfg *cfg; int status; fwlog->supported = false; @@ -838,9 +980,9 @@ static void ice_fwlog_set_supported(struct ice_fwlog *fwlog) if (!cfg) return; - status = ice_aq_fwlog_get(fwlog, cfg); + status = libie_aq_fwlog_get(fwlog, cfg); if (status) - dev_dbg(&fwlog->pdev->dev, "ice_aq_fwlog_get failed, FW logging is not supported on this version of FW, status %d\n", + dev_dbg(&fwlog->pdev->dev, "libie_aq_fwlog_get failed, FW logging is not supported on this version of FW, status %d\n", status); else fwlog->supported = true; @@ -849,27 +991,27 @@ static void ice_fwlog_set_supported(struct ice_fwlog *fwlog) } /** - * ice_fwlog_init - Initialize FW logging configuration + * libie_fwlog_init - Initialize FW logging configuration * @fwlog: pointer to the fwlog structure * @api: api structure to init fwlog * * This function should be called on driver initialization during - * ice_init_hw(). + * libie_init_hw(). */ -int ice_fwlog_init(struct ice_fwlog *fwlog, struct ice_fwlog_api *api) +int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api) { fwlog->api = *api; - ice_fwlog_set_supported(fwlog); + libie_fwlog_set_supported(fwlog); - if (ice_fwlog_supported(fwlog)) { + if (libie_fwlog_supported(fwlog)) { int status; /* read the current config from the FW and store it */ - status = ice_aq_fwlog_get(fwlog, &fwlog->cfg); + status = libie_aq_fwlog_get(fwlog, &fwlog->cfg); if (status) return status; - fwlog->ring.rings = kcalloc(ICE_FWLOG_RING_SIZE_DFLT, + fwlog->ring.rings = kcalloc(LIBIE_FWLOG_RING_SIZE_DFLT, sizeof(*fwlog->ring.rings), GFP_KERNEL); if (!fwlog->ring.rings) { @@ -877,18 +1019,18 @@ int ice_fwlog_init(struct ice_fwlog *fwlog, struct ice_fwlog_api *api) return -ENOMEM; } - fwlog->ring.size = ICE_FWLOG_RING_SIZE_DFLT; - fwlog->ring.index = ICE_FWLOG_RING_SIZE_INDEX_DFLT; + fwlog->ring.size = LIBIE_FWLOG_RING_SIZE_DFLT; + fwlog->ring.index = LIBIE_FWLOG_RING_SIZE_INDEX_DFLT; - status = ice_fwlog_alloc_ring_buffs(&fwlog->ring); + status = libie_fwlog_alloc_ring_buffs(&fwlog->ring); if (status) { dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); - ice_fwlog_free_ring_buffs(&fwlog->ring); + libie_fwlog_free_ring_buffs(&fwlog->ring); kfree(fwlog->ring.rings); return status; } - ice_debugfs_fwlog_init(fwlog, api->debugfs_root); + libie_debugfs_fwlog_init(fwlog, api->debugfs_root); } else { dev_warn(&fwlog->pdev->dev, "FW logging is not supported in this NVM image. Please update the NVM to get FW log support\n"); } @@ -897,20 +1039,20 @@ int ice_fwlog_init(struct ice_fwlog *fwlog, struct ice_fwlog_api *api) } /** - * ice_fwlog_deinit - unroll FW logging configuration + * libie_fwlog_deinit - unroll FW logging configuration * @fwlog: pointer to the fwlog structure * - * This function should be called in ice_deinit_hw(). + * This function should be called in libie_deinit_hw(). */ -void ice_fwlog_deinit(struct ice_fwlog *fwlog) +void libie_fwlog_deinit(struct libie_fwlog *fwlog) { int status; /* make sure FW logging is disabled to not put the FW in a weird state * for the next driver load */ - fwlog->cfg.options &= ~ICE_FWLOG_OPTION_ARQ_ENA; - status = ice_fwlog_set(fwlog, &fwlog->cfg); + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; + status = libie_fwlog_set(fwlog, &fwlog->cfg); if (status) dev_warn(&fwlog->pdev->dev, "Unable to turn off FW logging, status: %d\n", status); @@ -919,162 +1061,26 @@ void ice_fwlog_deinit(struct ice_fwlog *fwlog) fwlog->debugfs_modules = NULL; - status = ice_fwlog_unregister(fwlog); + status = libie_fwlog_unregister(fwlog); if (status) dev_warn(&fwlog->pdev->dev, "Unable to unregister FW logging, status: %d\n", status); if (fwlog->ring.rings) { - ice_fwlog_free_ring_buffs(&fwlog->ring); + libie_fwlog_free_ring_buffs(&fwlog->ring); kfree(fwlog->ring.rings); } } /** - * ice_aq_fwlog_set - Set FW logging configuration AQ command (0xFF30) - * @fwlog: pointer to the fwlog structure - * @entries: entries to configure - * @num_entries: number of @entries - * @options: options from ice_fwlog_cfg->options structure - * @log_resolution: logging resolution - */ -static int -ice_aq_fwlog_set(struct ice_fwlog *fwlog, - struct ice_fwlog_module_entry *entries, u16 num_entries, - u16 options, u16 log_resolution) -{ - struct libie_aqc_fw_log_cfg_resp *fw_modules; - struct libie_aqc_fw_log *cmd; - struct libie_aq_desc desc; - int status; - int i; - - fw_modules = kcalloc(num_entries, sizeof(*fw_modules), GFP_KERNEL); - if (!fw_modules) - return -ENOMEM; - - for (i = 0; i < num_entries; i++) { - fw_modules[i].module_identifier = - cpu_to_le16(entries[i].module_id); - fw_modules[i].log_level = entries[i].log_level; - } - - ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_config); - desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD); - - cmd = libie_aq_raw(&desc); - - cmd->cmd_flags = LIBIE_AQC_FW_LOG_CONF_SET_VALID; - cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); - cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); - - if (options & ICE_FWLOG_OPTION_ARQ_ENA) - cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_AQ_EN; - if (options & ICE_FWLOG_OPTION_UART_ENA) - cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_UART_EN; - - status = fwlog->send_cmd(fwlog->priv, &desc, fw_modules, - sizeof(*fw_modules) * num_entries); - - kfree(fw_modules); - - return status; -} - -/** - * ice_fwlog_set - Set the firmware logging settings - * @fwlog: pointer to the fwlog structure - * @cfg: config used to set firmware logging - * - * This function should be called whenever the driver needs to set the firmware - * logging configuration. It can be called on initialization, reset, or during - * runtime. - * - * If the PF wishes to receive FW logging then it must register via - * ice_fwlog_register. Note, that ice_fwlog_register does not need to be called - * for init. - */ -int ice_fwlog_set(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) -{ - if (!ice_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - return ice_aq_fwlog_set(fwlog, cfg->module_entries, - LIBIE_AQC_FW_LOG_ID_MAX, cfg->options, - cfg->log_resolution); -} - -/** - * ice_aq_fwlog_register - Register PF for firmware logging events (0xFF31) - * @fwlog: pointer to the fwlog structure - * @reg: true to register and false to unregister - */ -static int ice_aq_fwlog_register(struct ice_fwlog *fwlog, bool reg) -{ - struct libie_aqc_fw_log *cmd; - struct libie_aq_desc desc; - - ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_register); - cmd = libie_aq_raw(&desc); - - if (reg) - cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_REGISTER; - - return fwlog->send_cmd(fwlog->priv, &desc, NULL, 0); -} - -/** - * ice_fwlog_register - Register the PF for firmware logging - * @fwlog: pointer to the fwlog structure - * - * After this call the PF will start to receive firmware logging based on the - * configuration set in ice_fwlog_set. - */ -int ice_fwlog_register(struct ice_fwlog *fwlog) -{ - int status; - - if (!ice_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - status = ice_aq_fwlog_register(fwlog, true); - if (status) - dev_dbg(&fwlog->pdev->dev, "Failed to register for firmware logging events over ARQ\n"); - else - fwlog->cfg.options |= ICE_FWLOG_OPTION_IS_REGISTERED; - - return status; -} - -/** - * ice_fwlog_unregister - Unregister the PF from firmware logging - * @fwlog: pointer to the fwlog structure - */ -int ice_fwlog_unregister(struct ice_fwlog *fwlog) -{ - int status; - - if (!ice_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - status = ice_aq_fwlog_register(fwlog, false); - if (status) - dev_dbg(&fwlog->pdev->dev, "Failed to unregister from firmware logging events over ARQ\n"); - else - fwlog->cfg.options &= ~ICE_FWLOG_OPTION_IS_REGISTERED; - - return status; -} - -/** - * ice_get_fwlog_data - copy the FW log data from ARQ event + * libie_get_fwlog_data - copy the FW log data from ARQ event * @fwlog: fwlog that the FW log event is associated with * @buf: event buffer pointer * @len: len of event descriptor */ -void ice_get_fwlog_data(struct ice_fwlog *fwlog, u8 *buf, u16 len) +void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len) { - struct ice_fwlog_data *log; + struct libie_fwlog_data *log; log = &fwlog->ring.rings[fwlog->ring.tail]; @@ -1082,10 +1088,10 @@ void ice_get_fwlog_data(struct ice_fwlog *fwlog, u8 *buf, u16 len) log->data_size = len; memcpy(log->data, buf, log->data_size); - ice_fwlog_ring_increment(&fwlog->ring.tail, fwlog->ring.size); + libie_fwlog_ring_increment(&fwlog->ring.tail, fwlog->ring.size); - if (ice_fwlog_ring_full(&fwlog->ring)) { + if (libie_fwlog_ring_full(&fwlog->ring)) { /* the rings are full so bump the head to create room */ - ice_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); + libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); } } diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.h b/drivers/net/ethernet/intel/ice/ice_fwlog.h index d5868b9e4de6..3698759c8ebb 100644 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.h +++ b/drivers/net/ethernet/intel/ice/ice_fwlog.h @@ -1,77 +1,75 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2022, Intel Corporation. */ -#ifndef _ICE_FWLOG_H_ -#define _ICE_FWLOG_H_ +#ifndef _LIBIE_FWLOG_H_ +#define _LIBIE_FWLOG_H_ #include "ice_adminq_cmd.h" -struct ice_hw; - /* Only a single log level should be set and all log levels under the set value - * are enabled, e.g. if log level is set to ICE_FW_LOG_LEVEL_VERBOSE, then all - * other log levels are included (except ICE_FW_LOG_LEVEL_NONE) + * are enabled, e.g. if log level is set to LIBIE_FW_LOG_LEVEL_VERBOSE, then all + * other log levels are included (except LIBIE_FW_LOG_LEVEL_NONE) */ -enum ice_fwlog_level { - ICE_FWLOG_LEVEL_NONE = 0, - ICE_FWLOG_LEVEL_ERROR = 1, - ICE_FWLOG_LEVEL_WARNING = 2, - ICE_FWLOG_LEVEL_NORMAL = 3, - ICE_FWLOG_LEVEL_VERBOSE = 4, - ICE_FWLOG_LEVEL_INVALID, /* all values >= this entry are invalid */ +enum libie_fwlog_level { + LIBIE_FWLOG_LEVEL_NONE = 0, + LIBIE_FWLOG_LEVEL_ERROR = 1, + LIBIE_FWLOG_LEVEL_WARNING = 2, + LIBIE_FWLOG_LEVEL_NORMAL = 3, + LIBIE_FWLOG_LEVEL_VERBOSE = 4, + LIBIE_FWLOG_LEVEL_INVALID, /* all values >= this entry are invalid */ }; -struct ice_fwlog_module_entry { +struct libie_fwlog_module_entry { /* module ID for the corresponding firmware logging event */ u16 module_id; /* verbosity level for the module_id */ u8 log_level; }; -struct ice_fwlog_cfg { +struct libie_fwlog_cfg { /* list of modules for configuring log level */ - struct ice_fwlog_module_entry module_entries[LIBIE_AQC_FW_LOG_ID_MAX]; + struct libie_fwlog_module_entry module_entries[LIBIE_AQC_FW_LOG_ID_MAX]; /* options used to configure firmware logging */ u16 options; -#define ICE_FWLOG_OPTION_ARQ_ENA BIT(0) -#define ICE_FWLOG_OPTION_UART_ENA BIT(1) - /* set before calling ice_fwlog_init() so the PF registers for firmware - * logging on initialization +#define LIBIE_FWLOG_OPTION_ARQ_ENA BIT(0) +#define LIBIE_FWLOG_OPTION_UART_ENA BIT(1) + /* set before calling libie_fwlog_init() so the PF registers for + * firmware logging on initialization */ -#define ICE_FWLOG_OPTION_REGISTER_ON_INIT BIT(2) - /* set in the ice_aq_fwlog_get() response if the PF is registered for FW - * logging events over ARQ +#define LIBIE_FWLOG_OPTION_REGISTER_ON_INIT BIT(2) + /* set in the libie_aq_fwlog_get() response if the PF is registered for + * FW logging events over ARQ */ -#define ICE_FWLOG_OPTION_IS_REGISTERED BIT(3) +#define LIBIE_FWLOG_OPTION_IS_REGISTERED BIT(3) /* minimum number of log events sent per Admin Receive Queue event */ u16 log_resolution; }; -struct ice_fwlog_data { +struct libie_fwlog_data { u16 data_size; u8 *data; }; -struct ice_fwlog_ring { - struct ice_fwlog_data *rings; +struct libie_fwlog_ring { + struct libie_fwlog_data *rings; u16 index; u16 size; u16 head; u16 tail; }; -#define ICE_FWLOG_RING_SIZE_INDEX_DFLT 3 -#define ICE_FWLOG_RING_SIZE_DFLT 256 -#define ICE_FWLOG_RING_SIZE_MAX 512 +#define LIBIE_FWLOG_RING_SIZE_INDEX_DFLT 3 +#define LIBIE_FWLOG_RING_SIZE_DFLT 256 +#define LIBIE_FWLOG_RING_SIZE_MAX 512 -struct ice_fwlog { - struct ice_fwlog_cfg cfg; +struct libie_fwlog { + struct libie_fwlog_cfg cfg; bool supported; /* does hardware support FW logging? */ - struct ice_fwlog_ring ring; + struct libie_fwlog_ring ring; struct dentry *debugfs; /* keep track of all the dentrys for FW log modules */ struct dentry **debugfs_modules; - struct_group_tagged(ice_fwlog_api, api, + struct_group_tagged(libie_fwlog_api, api, struct pci_dev *pdev; int (*send_cmd)(void *, struct libie_aq_desc *, void *, u16); void *priv; @@ -79,10 +77,8 @@ struct ice_fwlog { ); }; -int ice_fwlog_init(struct ice_fwlog *fwlog, struct ice_fwlog_api *api); -void ice_fwlog_deinit(struct ice_fwlog *fwlog); -int ice_fwlog_set(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg); -int ice_fwlog_register(struct ice_fwlog *fwlog); -int ice_fwlog_unregister(struct ice_fwlog *fwlog); -void ice_get_fwlog_data(struct ice_fwlog *fwlog, u8 *buf, u16 len); -#endif /* _ICE_FWLOG_H_ */ +int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api); +void libie_fwlog_deinit(struct libie_fwlog *fwlog); +int libie_fwlog_register(struct libie_fwlog *fwlog); +void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len); +#endif /* _LIBIE_FWLOG_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index e7e775f7ea34..44e184c6c416 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1540,8 +1540,8 @@ static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type) } break; case ice_aqc_opc_fw_logs_event: - ice_get_fwlog_data(&hw->fwlog, event.msg_buf, - le16_to_cpu(event.desc.datalen)); + libie_get_fwlog_data(&hw->fwlog, event.msg_buf, + le16_to_cpu(event.desc.datalen)); break; case ice_aqc_opc_lldp_set_mib_change: ice_dcb_process_lldp_set_mib_change(pf, &event); diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 643d84cc78df..14296bccc4c9 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -948,7 +948,7 @@ struct ice_hw { u8 fw_patch; /* firmware patch version */ u32 fw_build; /* firmware build number */ - struct ice_fwlog fwlog; + struct libie_fwlog fwlog; /* Device max aggregate bandwidths corresponding to the GL_PWR_MODE_CTL * register. Used for determining the ITR/INTRL granularity during diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h index ca2ac88b5709..29420193889a 100644 --- a/include/linux/net/intel/libie/adminq.h +++ b/include/linux/net/intel/libie/adminq.h @@ -9,6 +9,7 @@ #define LIBIE_CHECK_STRUCT_LEN(n, X) \ static_assert((n) == sizeof(struct X)) +#define LIBIE_AQ_MAX_BUF_LEN 4096 /** * struct libie_aqc_generic - Generic structure used in adminq communication -- cgit v1.2.3 From f3b3fc1ff0823b73f6f66b6340e6ebc4b00d2ed3 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 12 Aug 2025 06:23:35 +0200 Subject: ice, libie: move fwlog code to libie Move whole code from ice_fwlog.c/h to libie/fwlog.c/h. Reviewed-by: Przemek Kitszel Signed-off-by: Michal Swiatkowski Tested-by: Rinitha S (A Contingent worker at Intel) Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/Kconfig | 1 + drivers/net/ethernet/intel/ice/Makefile | 1 - drivers/net/ethernet/intel/ice/ice_fwlog.c | 1106 --------------------------- drivers/net/ethernet/intel/ice/ice_fwlog.h | 84 --- drivers/net/ethernet/intel/ice/ice_main.c | 1 + drivers/net/ethernet/intel/ice/ice_type.h | 2 +- drivers/net/ethernet/intel/libie/Kconfig | 9 + drivers/net/ethernet/intel/libie/Makefile | 4 + drivers/net/ethernet/intel/libie/fwlog.c | 1115 ++++++++++++++++++++++++++++ include/linux/net/intel/libie/adminq.h | 6 +- include/linux/net/intel/libie/fwlog.h | 85 +++ 11 files changed, 1219 insertions(+), 1195 deletions(-) delete mode 100644 drivers/net/ethernet/intel/ice/ice_fwlog.c delete mode 100644 drivers/net/ethernet/intel/ice/ice_fwlog.h create mode 100644 drivers/net/ethernet/intel/libie/fwlog.c create mode 100644 include/linux/net/intel/libie/fwlog.h (limited to 'include') diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index b05cc0d7a15d..09f0af386af1 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -297,6 +297,7 @@ config ICE select DIMLIB select LIBIE select LIBIE_ADMINQ + select LIBIE_FWLOG select NET_DEVLINK select PACKING select PLDMFW diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index eac45d7c0cf1..5b2c666496e7 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -42,7 +42,6 @@ ice-y := ice_main.o \ ice_ethtool.o \ ice_repr.o \ ice_tc_lib.o \ - ice_fwlog.o \ ice_debugfs.o \ ice_adapter.o ice-$(CONFIG_PCI_IOV) += \ diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.c b/drivers/net/ethernet/intel/ice/ice_fwlog.c deleted file mode 100644 index 8e7086191030..000000000000 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.c +++ /dev/null @@ -1,1106 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2022, Intel Corporation. */ - -#include -#include -#include -#include -#include "ice.h" -#include "ice_common.h" -#include "ice_fwlog.h" - -/* create a define that has an extra module that doesn't really exist. this - * is so we can add a module 'all' to easily enable/disable all the modules - */ -#define LIBIE_NR_FW_LOG_MODULES (LIBIE_AQC_FW_LOG_ID_MAX + 1) - -/* the ordering in this array is important. it matches the ordering of the - * values in the FW so the index is the same value as in - * libie_aqc_fw_logging_mod - */ -static const char * const libie_fwlog_module_string[] = { - "general", - "ctrl", - "link", - "link_topo", - "dnl", - "i2c", - "sdp", - "mdio", - "adminq", - "hdma", - "lldp", - "dcbx", - "dcb", - "xlr", - "nvm", - "auth", - "vpd", - "iosf", - "parser", - "sw", - "scheduler", - "txq", - "rsvd", - "post", - "watchdog", - "task_dispatch", - "mng", - "synce", - "health", - "tsdrv", - "pfreg", - "mdlver", - "all", -}; - -/* the ordering in this array is important. it matches the ordering of the - * values in the FW so the index is the same value as in libie_fwlog_level - */ -static const char * const libie_fwlog_level_string[] = { - "none", - "error", - "warning", - "normal", - "verbose", -}; - -static const char * const libie_fwlog_log_size[] = { - "128K", - "256K", - "512K", - "1M", - "2M", -}; - -static bool libie_fwlog_ring_empty(struct libie_fwlog_ring *rings) -{ - return rings->head == rings->tail; -} - -static void libie_fwlog_ring_increment(u16 *item, u16 size) -{ - *item = (*item + 1) & (size - 1); -} - -static int libie_fwlog_alloc_ring_buffs(struct libie_fwlog_ring *rings) -{ - int i, nr_bytes; - u8 *mem; - - nr_bytes = rings->size * LIBIE_AQ_MAX_BUF_LEN; - mem = vzalloc(nr_bytes); - if (!mem) - return -ENOMEM; - - for (i = 0; i < rings->size; i++) { - struct libie_fwlog_data *ring = &rings->rings[i]; - - ring->data_size = LIBIE_AQ_MAX_BUF_LEN; - ring->data = mem; - mem += LIBIE_AQ_MAX_BUF_LEN; - } - - return 0; -} - -static void libie_fwlog_free_ring_buffs(struct libie_fwlog_ring *rings) -{ - int i; - - for (i = 0; i < rings->size; i++) { - struct libie_fwlog_data *ring = &rings->rings[i]; - - /* the first ring is the base memory for the whole range so - * free it - */ - if (!i) - vfree(ring->data); - - ring->data = NULL; - ring->data_size = 0; - } -} - -#define LIBIE_FWLOG_INDEX_TO_BYTES(n) ((128 * 1024) << (n)) -/** - * libie_fwlog_realloc_rings - reallocate the FW log rings - * @fwlog: pointer to the fwlog structure - * @index: the new index to use to allocate memory for the log data - * - */ -static void libie_fwlog_realloc_rings(struct libie_fwlog *fwlog, int index) -{ - struct libie_fwlog_ring ring; - int status, ring_size; - - /* convert the number of bytes into a number of 4K buffers. externally - * the driver presents the interface to the FW log data as a number of - * bytes because that's easy for users to understand. internally the - * driver uses a ring of buffers because the driver doesn't know where - * the beginning and end of any line of log data is so the driver has - * to overwrite data as complete blocks. when the data is returned to - * the user the driver knows that the data is correct and the FW log - * can be correctly parsed by the tools - */ - ring_size = LIBIE_FWLOG_INDEX_TO_BYTES(index) / LIBIE_AQ_MAX_BUF_LEN; - if (ring_size == fwlog->ring.size) - return; - - /* allocate space for the new rings and buffers then release the - * old rings and buffers. that way if we don't have enough - * memory then we at least have what we had before - */ - ring.rings = kcalloc(ring_size, sizeof(*ring.rings), GFP_KERNEL); - if (!ring.rings) - return; - - ring.size = ring_size; - - status = libie_fwlog_alloc_ring_buffs(&ring); - if (status) { - dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); - libie_fwlog_free_ring_buffs(&ring); - kfree(ring.rings); - return; - } - - libie_fwlog_free_ring_buffs(&fwlog->ring); - kfree(fwlog->ring.rings); - - fwlog->ring.rings = ring.rings; - fwlog->ring.size = ring.size; - fwlog->ring.index = index; - fwlog->ring.head = 0; - fwlog->ring.tail = 0; -} - -/** - * libie_fwlog_supported - Cached for whether FW supports FW logging or not - * @fwlog: pointer to the fwlog structure - * - * This will always return false if called before libie_init_hw(), so it must be - * called after libie_init_hw(). - */ -static bool libie_fwlog_supported(struct libie_fwlog *fwlog) -{ - return fwlog->supported; -} - -/** - * libie_aq_fwlog_set - Set FW logging configuration AQ command (0xFF30) - * @fwlog: pointer to the fwlog structure - * @entries: entries to configure - * @num_entries: number of @entries - * @options: options from libie_fwlog_cfg->options structure - * @log_resolution: logging resolution - */ -static int -libie_aq_fwlog_set(struct libie_fwlog *fwlog, - struct libie_fwlog_module_entry *entries, u16 num_entries, - u16 options, u16 log_resolution) -{ - struct libie_aqc_fw_log_cfg_resp *fw_modules; - struct libie_aq_desc desc = {0}; - struct libie_aqc_fw_log *cmd; - int status; - int i; - - fw_modules = kcalloc(num_entries, sizeof(*fw_modules), GFP_KERNEL); - if (!fw_modules) - return -ENOMEM; - - for (i = 0; i < num_entries; i++) { - fw_modules[i].module_identifier = - cpu_to_le16(entries[i].module_id); - fw_modules[i].log_level = entries[i].log_level; - } - - desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_config); - desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI) | - cpu_to_le16(LIBIE_AQ_FLAG_RD); - - cmd = libie_aq_raw(&desc); - - cmd->cmd_flags = LIBIE_AQC_FW_LOG_CONF_SET_VALID; - cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); - cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); - - if (options & LIBIE_FWLOG_OPTION_ARQ_ENA) - cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_AQ_EN; - if (options & LIBIE_FWLOG_OPTION_UART_ENA) - cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_UART_EN; - - status = fwlog->send_cmd(fwlog->priv, &desc, fw_modules, - sizeof(*fw_modules) * num_entries); - - kfree(fw_modules); - - return status; -} - -/** - * libie_fwlog_set - Set the firmware logging settings - * @fwlog: pointer to the fwlog structure - * @cfg: config used to set firmware logging - * - * This function should be called whenever the driver needs to set the firmware - * logging configuration. It can be called on initialization, reset, or during - * runtime. - * - * If the PF wishes to receive FW logging then it must register via - * libie_fwlog_register. Note, that libie_fwlog_register does not need to be called - * for init. - */ -static int libie_fwlog_set(struct libie_fwlog *fwlog, - struct libie_fwlog_cfg *cfg) -{ - if (!libie_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - return libie_aq_fwlog_set(fwlog, cfg->module_entries, - LIBIE_AQC_FW_LOG_ID_MAX, cfg->options, - cfg->log_resolution); -} - -/** - * libie_aq_fwlog_register - Register PF for firmware logging events (0xFF31) - * @fwlog: pointer to the fwlog structure - * @reg: true to register and false to unregister - */ -static int libie_aq_fwlog_register(struct libie_fwlog *fwlog, bool reg) -{ - struct libie_aq_desc desc = {0}; - struct libie_aqc_fw_log *cmd; - - desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_register); - desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); - cmd = libie_aq_raw(&desc); - - if (reg) - cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_REGISTER; - - return fwlog->send_cmd(fwlog->priv, &desc, NULL, 0); -} - -/** - * libie_fwlog_register - Register the PF for firmware logging - * @fwlog: pointer to the fwlog structure - * - * After this call the PF will start to receive firmware logging based on the - * configuration set in libie_fwlog_set. - */ -static int libie_fwlog_register(struct libie_fwlog *fwlog) -{ - int status; - - if (!libie_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - status = libie_aq_fwlog_register(fwlog, true); - if (status) - dev_dbg(&fwlog->pdev->dev, "Failed to register for firmware logging events over ARQ\n"); - else - fwlog->cfg.options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; - - return status; -} - -/** - * libie_fwlog_unregister - Unregister the PF from firmware logging - * @fwlog: pointer to the fwlog structure - */ -static int libie_fwlog_unregister(struct libie_fwlog *fwlog) -{ - int status; - - if (!libie_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - status = libie_aq_fwlog_register(fwlog, false); - if (status) - dev_dbg(&fwlog->pdev->dev, "Failed to unregister from firmware logging events over ARQ\n"); - else - fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_IS_REGISTERED; - - return status; -} - -/** - * libie_fwlog_print_module_cfg - print current FW logging module configuration - * @cfg: pointer to the fwlog cfg structure - * @module: module to print - * @s: the seq file to put data into - */ -static void -libie_fwlog_print_module_cfg(struct libie_fwlog_cfg *cfg, int module, - struct seq_file *s) -{ - struct libie_fwlog_module_entry *entry; - - if (module != LIBIE_AQC_FW_LOG_ID_MAX) { - entry = &cfg->module_entries[module]; - - seq_printf(s, "\tModule: %s, Log Level: %s\n", - libie_fwlog_module_string[entry->module_id], - libie_fwlog_level_string[entry->log_level]); - } else { - int i; - - for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) { - entry = &cfg->module_entries[i]; - - seq_printf(s, "\tModule: %s, Log Level: %s\n", - libie_fwlog_module_string[entry->module_id], - libie_fwlog_level_string[entry->log_level]); - } - } -} - -static int libie_find_module_by_dentry(struct dentry **modules, struct dentry *d) -{ - int i, module; - - module = -1; - /* find the module based on the dentry */ - for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { - if (d == modules[i]) { - module = i; - break; - } - } - - return module; -} - -/** - * libie_debugfs_module_show - read from 'module' file - * @s: the opened file - * @v: pointer to the offset - */ -static int libie_debugfs_module_show(struct seq_file *s, void *v) -{ - struct libie_fwlog *fwlog = s->private; - const struct file *filp = s->file; - struct dentry *dentry; - int module; - - dentry = file_dentry(filp); - - module = libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); - if (module < 0) { - dev_info(&fwlog->pdev->dev, "unknown module\n"); - return -EINVAL; - } - - libie_fwlog_print_module_cfg(&fwlog->cfg, module, s); - - return 0; -} - -static int libie_debugfs_module_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, libie_debugfs_module_show, inode->i_private); -} - -/** - * libie_debugfs_module_write - write into 'module' file - * @filp: the opened file - * @buf: where to find the user's data - * @count: the length of the user's data - * @ppos: file position offset - */ -static ssize_t -libie_debugfs_module_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct libie_fwlog *fwlog = file_inode(filp)->i_private; - struct dentry *dentry = file_dentry(filp); - struct device *dev = &fwlog->pdev->dev; - char user_val[16], *cmd_buf; - int module, log_level, cnt; - - /* don't allow partial writes or invalid input */ - if (*ppos != 0 || count > 8) - return -EINVAL; - - cmd_buf = memdup_user_nul(buf, count); - if (IS_ERR(cmd_buf)) - return PTR_ERR(cmd_buf); - - module = libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); - if (module < 0) { - dev_info(dev, "unknown module\n"); - return -EINVAL; - } - - cnt = sscanf(cmd_buf, "%s", user_val); - if (cnt != 1) - return -EINVAL; - - log_level = sysfs_match_string(libie_fwlog_level_string, user_val); - if (log_level < 0) { - dev_info(dev, "unknown log level '%s'\n", user_val); - return -EINVAL; - } - - if (module != LIBIE_AQC_FW_LOG_ID_MAX) { - fwlog->cfg.module_entries[module].log_level = log_level; - } else { - /* the module 'all' is a shortcut so that we can set - * all of the modules to the same level quickly - */ - int i; - - for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) - fwlog->cfg.module_entries[i].log_level = log_level; - } - - return count; -} - -static const struct file_operations libie_debugfs_module_fops = { - .owner = THIS_MODULE, - .open = libie_debugfs_module_open, - .read = seq_read, - .release = single_release, - .write = libie_debugfs_module_write, -}; - -/** - * libie_debugfs_nr_messages_read - read from 'nr_messages' file - * @filp: the opened file - * @buffer: where to write the data for the user to read - * @count: the size of the user's buffer - * @ppos: file position offset - */ -static ssize_t libie_debugfs_nr_messages_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - char buff[32] = {}; - - snprintf(buff, sizeof(buff), "%d\n", - fwlog->cfg.log_resolution); - - return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); -} - -/** - * libie_debugfs_nr_messages_write - write into 'nr_messages' file - * @filp: the opened file - * @buf: where to find the user's data - * @count: the length of the user's data - * @ppos: file position offset - */ -static ssize_t -libie_debugfs_nr_messages_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - struct device *dev = &fwlog->pdev->dev; - char user_val[8], *cmd_buf; - s16 nr_messages; - ssize_t ret; - - /* don't allow partial writes or invalid input */ - if (*ppos != 0 || count > 4) - return -EINVAL; - - cmd_buf = memdup_user_nul(buf, count); - if (IS_ERR(cmd_buf)) - return PTR_ERR(cmd_buf); - - ret = sscanf(cmd_buf, "%s", user_val); - if (ret != 1) - return -EINVAL; - - ret = kstrtos16(user_val, 0, &nr_messages); - if (ret) - return ret; - - if (nr_messages < LIBIE_AQC_FW_LOG_MIN_RESOLUTION || - nr_messages > LIBIE_AQC_FW_LOG_MAX_RESOLUTION) { - dev_err(dev, "Invalid FW log number of messages %d, value must be between %d - %d\n", - nr_messages, LIBIE_AQC_FW_LOG_MIN_RESOLUTION, - LIBIE_AQC_FW_LOG_MAX_RESOLUTION); - return -EINVAL; - } - - fwlog->cfg.log_resolution = nr_messages; - - return count; -} - -static const struct file_operations libie_debugfs_nr_messages_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = libie_debugfs_nr_messages_read, - .write = libie_debugfs_nr_messages_write, -}; - -/** - * libie_debugfs_enable_read - read from 'enable' file - * @filp: the opened file - * @buffer: where to write the data for the user to read - * @count: the size of the user's buffer - * @ppos: file position offset - */ -static ssize_t libie_debugfs_enable_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - char buff[32] = {}; - - snprintf(buff, sizeof(buff), "%u\n", - (u16)(fwlog->cfg.options & - LIBIE_FWLOG_OPTION_IS_REGISTERED) >> 3); - - return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); -} - -/** - * libie_debugfs_enable_write - write into 'enable' file - * @filp: the opened file - * @buf: where to find the user's data - * @count: the length of the user's data - * @ppos: file position offset - */ -static ssize_t -libie_debugfs_enable_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - char user_val[8], *cmd_buf; - bool enable; - ssize_t ret; - - /* don't allow partial writes or invalid input */ - if (*ppos != 0 || count > 2) - return -EINVAL; - - cmd_buf = memdup_user_nul(buf, count); - if (IS_ERR(cmd_buf)) - return PTR_ERR(cmd_buf); - - ret = sscanf(cmd_buf, "%s", user_val); - if (ret != 1) - return -EINVAL; - - ret = kstrtobool(user_val, &enable); - if (ret) - goto enable_write_error; - - if (enable) - fwlog->cfg.options |= LIBIE_FWLOG_OPTION_ARQ_ENA; - else - fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; - - ret = libie_fwlog_set(fwlog, &fwlog->cfg); - if (ret) - goto enable_write_error; - - if (enable) - ret = libie_fwlog_register(fwlog); - else - ret = libie_fwlog_unregister(fwlog); - - if (ret) - goto enable_write_error; - - /* if we get here, nothing went wrong; return count since we didn't - * really write anything - */ - ret = (ssize_t)count; - -enable_write_error: - /* This function always consumes all of the written input, or produces - * an error. Check and enforce this. Otherwise, the write operation - * won't complete properly. - */ - if (WARN_ON(ret != (ssize_t)count && ret >= 0)) - ret = -EIO; - - return ret; -} - -static const struct file_operations libie_debugfs_enable_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = libie_debugfs_enable_read, - .write = libie_debugfs_enable_write, -}; - -/** - * libie_debugfs_log_size_read - read from 'log_size' file - * @filp: the opened file - * @buffer: where to write the data for the user to read - * @count: the size of the user's buffer - * @ppos: file position offset - */ -static ssize_t libie_debugfs_log_size_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - char buff[32] = {}; - int index; - - index = fwlog->ring.index; - snprintf(buff, sizeof(buff), "%s\n", libie_fwlog_log_size[index]); - - return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); -} - -/** - * libie_debugfs_log_size_write - write into 'log_size' file - * @filp: the opened file - * @buf: where to find the user's data - * @count: the length of the user's data - * @ppos: file position offset - */ -static ssize_t -libie_debugfs_log_size_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - struct device *dev = &fwlog->pdev->dev; - char user_val[8], *cmd_buf; - ssize_t ret; - int index; - - /* don't allow partial writes or invalid input */ - if (*ppos != 0 || count > 5) - return -EINVAL; - - cmd_buf = memdup_user_nul(buf, count); - if (IS_ERR(cmd_buf)) - return PTR_ERR(cmd_buf); - - ret = sscanf(cmd_buf, "%s", user_val); - if (ret != 1) - return -EINVAL; - - index = sysfs_match_string(libie_fwlog_log_size, user_val); - if (index < 0) { - dev_info(dev, "Invalid log size '%s'. The value must be one of 128K, 256K, 512K, 1M, 2M\n", - user_val); - ret = -EINVAL; - goto log_size_write_error; - } else if (fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED) { - dev_info(dev, "FW logging is currently running. Please disable FW logging to change log_size\n"); - ret = -EINVAL; - goto log_size_write_error; - } - - /* free all the buffers and the tracking info and resize */ - libie_fwlog_realloc_rings(fwlog, index); - - /* if we get here, nothing went wrong; return count since we didn't - * really write anything - */ - ret = (ssize_t)count; - -log_size_write_error: - /* This function always consumes all of the written input, or produces - * an error. Check and enforce this. Otherwise, the write operation - * won't complete properly. - */ - if (WARN_ON(ret != (ssize_t)count && ret >= 0)) - ret = -EIO; - - return ret; -} - -static const struct file_operations libie_debugfs_log_size_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = libie_debugfs_log_size_read, - .write = libie_debugfs_log_size_write, -}; - -/** - * libie_debugfs_data_read - read from 'data' file - * @filp: the opened file - * @buffer: where to write the data for the user to read - * @count: the size of the user's buffer - * @ppos: file position offset - */ -static ssize_t libie_debugfs_data_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - int data_copied = 0; - bool done = false; - - if (libie_fwlog_ring_empty(&fwlog->ring)) - return 0; - - while (!libie_fwlog_ring_empty(&fwlog->ring) && !done) { - struct libie_fwlog_data *log; - u16 cur_buf_len; - - log = &fwlog->ring.rings[fwlog->ring.head]; - cur_buf_len = log->data_size; - if (cur_buf_len >= count) { - done = true; - continue; - } - - if (copy_to_user(buffer, log->data, cur_buf_len)) { - /* if there is an error then bail and return whatever - * the driver has copied so far - */ - done = true; - continue; - } - - data_copied += cur_buf_len; - buffer += cur_buf_len; - count -= cur_buf_len; - *ppos += cur_buf_len; - libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); - } - - return data_copied; -} - -/** - * libie_debugfs_data_write - write into 'data' file - * @filp: the opened file - * @buf: where to find the user's data - * @count: the length of the user's data - * @ppos: file position offset - */ -static ssize_t -libie_debugfs_data_write(struct file *filp, const char __user *buf, size_t count, - loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - struct device *dev = &fwlog->pdev->dev; - ssize_t ret; - - /* don't allow partial writes */ - if (*ppos != 0) - return 0; - - /* any value is allowed to clear the buffer so no need to even look at - * what the value is - */ - if (!(fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED)) { - fwlog->ring.head = 0; - fwlog->ring.tail = 0; - } else { - dev_info(dev, "Can't clear FW log data while FW log running\n"); - ret = -EINVAL; - goto nr_buffs_write_error; - } - - /* if we get here, nothing went wrong; return count since we didn't - * really write anything - */ - ret = (ssize_t)count; - -nr_buffs_write_error: - /* This function always consumes all of the written input, or produces - * an error. Check and enforce this. Otherwise, the write operation - * won't complete properly. - */ - if (WARN_ON(ret != (ssize_t)count && ret >= 0)) - ret = -EIO; - - return ret; -} - -static const struct file_operations libie_debugfs_data_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = libie_debugfs_data_read, - .write = libie_debugfs_data_write, -}; - -/** - * libie_debugfs_fwlog_init - setup the debugfs directory - * @fwlog: pointer to the fwlog structure - * @root: debugfs root entry on which fwlog director will be registered - */ -static void libie_debugfs_fwlog_init(struct libie_fwlog *fwlog, - struct dentry *root) -{ - struct dentry *fw_modules_dir; - struct dentry **fw_modules; - int i; - - /* allocate space for this first because if it fails then we don't - * need to unwind - */ - fw_modules = kcalloc(LIBIE_NR_FW_LOG_MODULES, sizeof(*fw_modules), - GFP_KERNEL); - if (!fw_modules) - return; - - fwlog->debugfs = debugfs_create_dir("fwlog", root); - if (IS_ERR(fwlog->debugfs)) - goto err_create_module_files; - - fw_modules_dir = debugfs_create_dir("modules", fwlog->debugfs); - if (IS_ERR(fw_modules_dir)) - goto err_create_module_files; - - for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { - fw_modules[i] = debugfs_create_file(libie_fwlog_module_string[i], - 0600, fw_modules_dir, fwlog, - &libie_debugfs_module_fops); - if (IS_ERR(fw_modules[i])) - goto err_create_module_files; - } - - debugfs_create_file("nr_messages", 0600, fwlog->debugfs, fwlog, - &libie_debugfs_nr_messages_fops); - - fwlog->debugfs_modules = fw_modules; - - debugfs_create_file("enable", 0600, fwlog->debugfs, fwlog, - &libie_debugfs_enable_fops); - - debugfs_create_file("log_size", 0600, fwlog->debugfs, fwlog, - &libie_debugfs_log_size_fops); - - debugfs_create_file("data", 0600, fwlog->debugfs, fwlog, - &libie_debugfs_data_fops); - - return; - -err_create_module_files: - debugfs_remove_recursive(fwlog->debugfs); - kfree(fw_modules); -} - -static bool libie_fwlog_ring_full(struct libie_fwlog_ring *rings) -{ - u16 head, tail; - - head = rings->head; - tail = rings->tail; - - if (head < tail && (tail - head == (rings->size - 1))) - return true; - else if (head > tail && (tail == (head - 1))) - return true; - - return false; -} - -/** - * libie_aq_fwlog_get - Get the current firmware logging configuration (0xFF32) - * @fwlog: pointer to the fwlog structure - * @cfg: firmware logging configuration to populate - */ -static int libie_aq_fwlog_get(struct libie_fwlog *fwlog, - struct libie_fwlog_cfg *cfg) -{ - struct libie_aqc_fw_log_cfg_resp *fw_modules; - struct libie_aq_desc desc = {0}; - struct libie_aqc_fw_log *cmd; - u16 module_id_cnt; - int status; - void *buf; - int i; - - memset(cfg, 0, sizeof(*cfg)); - - buf = kzalloc(LIBIE_AQ_MAX_BUF_LEN, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_query); - desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); - cmd = libie_aq_raw(&desc); - - cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_QUERY; - - status = fwlog->send_cmd(fwlog->priv, &desc, buf, LIBIE_AQ_MAX_BUF_LEN); - if (status) { - dev_dbg(&fwlog->pdev->dev, "Failed to get FW log configuration\n"); - goto status_out; - } - - module_id_cnt = le16_to_cpu(cmd->ops.cfg.mdl_cnt); - if (module_id_cnt < LIBIE_AQC_FW_LOG_ID_MAX) { - dev_dbg(&fwlog->pdev->dev, "FW returned less than the expected number of FW log module IDs\n"); - } else if (module_id_cnt > LIBIE_AQC_FW_LOG_ID_MAX) { - dev_dbg(&fwlog->pdev->dev, "FW returned more than expected number of FW log module IDs, setting module_id_cnt to software expected max %u\n", - LIBIE_AQC_FW_LOG_ID_MAX); - module_id_cnt = LIBIE_AQC_FW_LOG_ID_MAX; - } - - cfg->log_resolution = le16_to_cpu(cmd->ops.cfg.log_resolution); - if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_AQ_EN) - cfg->options |= LIBIE_FWLOG_OPTION_ARQ_ENA; - if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_UART_EN) - cfg->options |= LIBIE_FWLOG_OPTION_UART_ENA; - if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_QUERY_REGISTERED) - cfg->options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; - - fw_modules = (struct libie_aqc_fw_log_cfg_resp *)buf; - - for (i = 0; i < module_id_cnt; i++) { - struct libie_aqc_fw_log_cfg_resp *fw_module = &fw_modules[i]; - - cfg->module_entries[i].module_id = - le16_to_cpu(fw_module->module_identifier); - cfg->module_entries[i].log_level = fw_module->log_level; - } - -status_out: - kfree(buf); - return status; -} - -/** - * libie_fwlog_set_supported - Set if FW logging is supported by FW - * @fwlog: pointer to the fwlog structure - * - * If FW returns success to the libie_aq_fwlog_get call then it supports FW - * logging, else it doesn't. Set the fwlog_supported flag accordingly. - * - * This function is only meant to be called during driver init to determine if - * the FW support FW logging. - */ -static void libie_fwlog_set_supported(struct libie_fwlog *fwlog) -{ - struct libie_fwlog_cfg *cfg; - int status; - - fwlog->supported = false; - - cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); - if (!cfg) - return; - - status = libie_aq_fwlog_get(fwlog, cfg); - if (status) - dev_dbg(&fwlog->pdev->dev, "libie_aq_fwlog_get failed, FW logging is not supported on this version of FW, status %d\n", - status); - else - fwlog->supported = true; - - kfree(cfg); -} - -/** - * libie_fwlog_init - Initialize FW logging configuration - * @fwlog: pointer to the fwlog structure - * @api: api structure to init fwlog - * - * This function should be called on driver initialization during - * libie_init_hw(). - */ -int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api) -{ - fwlog->api = *api; - libie_fwlog_set_supported(fwlog); - - if (libie_fwlog_supported(fwlog)) { - int status; - - /* read the current config from the FW and store it */ - status = libie_aq_fwlog_get(fwlog, &fwlog->cfg); - if (status) - return status; - - fwlog->ring.rings = kcalloc(LIBIE_FWLOG_RING_SIZE_DFLT, - sizeof(*fwlog->ring.rings), - GFP_KERNEL); - if (!fwlog->ring.rings) { - dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log rings\n"); - return -ENOMEM; - } - - fwlog->ring.size = LIBIE_FWLOG_RING_SIZE_DFLT; - fwlog->ring.index = LIBIE_FWLOG_RING_SIZE_INDEX_DFLT; - - status = libie_fwlog_alloc_ring_buffs(&fwlog->ring); - if (status) { - dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); - libie_fwlog_free_ring_buffs(&fwlog->ring); - kfree(fwlog->ring.rings); - return status; - } - - libie_debugfs_fwlog_init(fwlog, api->debugfs_root); - } else { - dev_warn(&fwlog->pdev->dev, "FW logging is not supported in this NVM image. Please update the NVM to get FW log support\n"); - } - - return 0; -} - -/** - * libie_fwlog_deinit - unroll FW logging configuration - * @fwlog: pointer to the fwlog structure - * - * This function should be called in libie_deinit_hw(). - */ -void libie_fwlog_deinit(struct libie_fwlog *fwlog) -{ - int status; - - /* make sure FW logging is disabled to not put the FW in a weird state - * for the next driver load - */ - fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; - status = libie_fwlog_set(fwlog, &fwlog->cfg); - if (status) - dev_warn(&fwlog->pdev->dev, "Unable to turn off FW logging, status: %d\n", - status); - - kfree(fwlog->debugfs_modules); - - fwlog->debugfs_modules = NULL; - - status = libie_fwlog_unregister(fwlog); - if (status) - dev_warn(&fwlog->pdev->dev, "Unable to unregister FW logging, status: %d\n", - status); - - if (fwlog->ring.rings) { - libie_fwlog_free_ring_buffs(&fwlog->ring); - kfree(fwlog->ring.rings); - } -} - -/** - * libie_get_fwlog_data - copy the FW log data from ARQ event - * @fwlog: fwlog that the FW log event is associated with - * @buf: event buffer pointer - * @len: len of event descriptor - */ -void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len) -{ - struct libie_fwlog_data *log; - - log = &fwlog->ring.rings[fwlog->ring.tail]; - - memset(log->data, 0, PAGE_SIZE); - log->data_size = len; - - memcpy(log->data, buf, log->data_size); - libie_fwlog_ring_increment(&fwlog->ring.tail, fwlog->ring.size); - - if (libie_fwlog_ring_full(&fwlog->ring)) { - /* the rings are full so bump the head to create room */ - libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); - } -} - -void libie_fwlog_reregister(struct libie_fwlog *fwlog) -{ - if (!(fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED)) - return; - - if (libie_fwlog_register(fwlog)) - fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_IS_REGISTERED; -} diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.h b/drivers/net/ethernet/intel/ice/ice_fwlog.h deleted file mode 100644 index e534205a2d04..000000000000 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.h +++ /dev/null @@ -1,84 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2022, Intel Corporation. */ - -#ifndef _LIBIE_FWLOG_H_ -#define _LIBIE_FWLOG_H_ -#include "ice_adminq_cmd.h" - -/* Only a single log level should be set and all log levels under the set value - * are enabled, e.g. if log level is set to LIBIE_FW_LOG_LEVEL_VERBOSE, then all - * other log levels are included (except LIBIE_FW_LOG_LEVEL_NONE) - */ -enum libie_fwlog_level { - LIBIE_FWLOG_LEVEL_NONE = 0, - LIBIE_FWLOG_LEVEL_ERROR = 1, - LIBIE_FWLOG_LEVEL_WARNING = 2, - LIBIE_FWLOG_LEVEL_NORMAL = 3, - LIBIE_FWLOG_LEVEL_VERBOSE = 4, - LIBIE_FWLOG_LEVEL_INVALID, /* all values >= this entry are invalid */ -}; - -struct libie_fwlog_module_entry { - /* module ID for the corresponding firmware logging event */ - u16 module_id; - /* verbosity level for the module_id */ - u8 log_level; -}; - -struct libie_fwlog_cfg { - /* list of modules for configuring log level */ - struct libie_fwlog_module_entry module_entries[LIBIE_AQC_FW_LOG_ID_MAX]; - /* options used to configure firmware logging */ - u16 options; -#define LIBIE_FWLOG_OPTION_ARQ_ENA BIT(0) -#define LIBIE_FWLOG_OPTION_UART_ENA BIT(1) - /* set before calling libie_fwlog_init() so the PF registers for - * firmware logging on initialization - */ -#define LIBIE_FWLOG_OPTION_REGISTER_ON_INIT BIT(2) - /* set in the libie_aq_fwlog_get() response if the PF is registered for - * FW logging events over ARQ - */ -#define LIBIE_FWLOG_OPTION_IS_REGISTERED BIT(3) - - /* minimum number of log events sent per Admin Receive Queue event */ - u16 log_resolution; -}; - -struct libie_fwlog_data { - u16 data_size; - u8 *data; -}; - -struct libie_fwlog_ring { - struct libie_fwlog_data *rings; - u16 index; - u16 size; - u16 head; - u16 tail; -}; - -#define LIBIE_FWLOG_RING_SIZE_INDEX_DFLT 3 -#define LIBIE_FWLOG_RING_SIZE_DFLT 256 -#define LIBIE_FWLOG_RING_SIZE_MAX 512 - -struct libie_fwlog { - struct libie_fwlog_cfg cfg; - bool supported; /* does hardware support FW logging? */ - struct libie_fwlog_ring ring; - struct dentry *debugfs; - /* keep track of all the dentrys for FW log modules */ - struct dentry **debugfs_modules; - struct_group_tagged(libie_fwlog_api, api, - struct pci_dev *pdev; - int (*send_cmd)(void *, struct libie_aq_desc *, void *, u16); - void *priv; - struct dentry *debugfs_root; - ); -}; - -int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api); -void libie_fwlog_deinit(struct libie_fwlog *fwlog); -void libie_fwlog_reregister(struct libie_fwlog *fwlog); -void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len); -#endif /* _LIBIE_FWLOG_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 44e184c6c416..f0d3aee24a93 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -39,6 +39,7 @@ static const char ice_copyright[] = "Copyright (c) 2018, Intel Corporation."; MODULE_DESCRIPTION(DRV_SUMMARY); MODULE_IMPORT_NS("LIBIE"); MODULE_IMPORT_NS("LIBIE_ADMINQ"); +MODULE_IMPORT_NS("LIBIE_FWLOG"); MODULE_LICENSE("GPL v2"); MODULE_FIRMWARE(ICE_DDP_PKG_FILE); diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 14296bccc4c9..b0a1b67071c5 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -17,7 +17,7 @@ #include "ice_protocol_type.h" #include "ice_sbq_cmd.h" #include "ice_vlan_mode.h" -#include "ice_fwlog.h" +#include #include #include diff --git a/drivers/net/ethernet/intel/libie/Kconfig b/drivers/net/ethernet/intel/libie/Kconfig index e6072758e3d8..70831c7e336e 100644 --- a/drivers/net/ethernet/intel/libie/Kconfig +++ b/drivers/net/ethernet/intel/libie/Kconfig @@ -14,3 +14,12 @@ config LIBIE_ADMINQ help Helper functions used by Intel Ethernet drivers for administration queue command interface (aka adminq). + +config LIBIE_FWLOG + tristate + select LIBIE_ADMINQ + help + Library to support firmware logging on device that have support + for it. Firmware logging is using admin queue interface to communicate + with the device. Debugfs is a user interface used to config logging + and dump all collected logs. diff --git a/drivers/net/ethernet/intel/libie/Makefile b/drivers/net/ethernet/intel/libie/Makefile index e98f00b865d3..db57fc6780ea 100644 --- a/drivers/net/ethernet/intel/libie/Makefile +++ b/drivers/net/ethernet/intel/libie/Makefile @@ -8,3 +8,7 @@ libie-y := rx.o obj-$(CONFIG_LIBIE_ADMINQ) += libie_adminq.o libie_adminq-y := adminq.o + +obj-$(CONFIG_LIBIE_FWLOG) += libie_fwlog.o + +libie_fwlog-y := fwlog.o diff --git a/drivers/net/ethernet/intel/libie/fwlog.c b/drivers/net/ethernet/intel/libie/fwlog.c new file mode 100644 index 000000000000..f39cc11cb7c5 --- /dev/null +++ b/drivers/net/ethernet/intel/libie/fwlog.c @@ -0,0 +1,1115 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022, Intel Corporation. */ + +#include +#include +#include +#include +#include +#include +#include + +#define DEFAULT_SYMBOL_NAMESPACE "LIBIE_FWLOG" + +/* create a define that has an extra module that doesn't really exist. this + * is so we can add a module 'all' to easily enable/disable all the modules + */ +#define LIBIE_NR_FW_LOG_MODULES (LIBIE_AQC_FW_LOG_ID_MAX + 1) + +/* the ordering in this array is important. it matches the ordering of the + * values in the FW so the index is the same value as in + * libie_aqc_fw_logging_mod + */ +static const char * const libie_fwlog_module_string[] = { + "general", + "ctrl", + "link", + "link_topo", + "dnl", + "i2c", + "sdp", + "mdio", + "adminq", + "hdma", + "lldp", + "dcbx", + "dcb", + "xlr", + "nvm", + "auth", + "vpd", + "iosf", + "parser", + "sw", + "scheduler", + "txq", + "rsvd", + "post", + "watchdog", + "task_dispatch", + "mng", + "synce", + "health", + "tsdrv", + "pfreg", + "mdlver", + "all", +}; + +/* the ordering in this array is important. it matches the ordering of the + * values in the FW so the index is the same value as in libie_fwlog_level + */ +static const char * const libie_fwlog_level_string[] = { + "none", + "error", + "warning", + "normal", + "verbose", +}; + +static const char * const libie_fwlog_log_size[] = { + "128K", + "256K", + "512K", + "1M", + "2M", +}; + +static bool libie_fwlog_ring_empty(struct libie_fwlog_ring *rings) +{ + return rings->head == rings->tail; +} + +static void libie_fwlog_ring_increment(u16 *item, u16 size) +{ + *item = (*item + 1) & (size - 1); +} + +static int libie_fwlog_alloc_ring_buffs(struct libie_fwlog_ring *rings) +{ + int i, nr_bytes; + u8 *mem; + + nr_bytes = rings->size * LIBIE_AQ_MAX_BUF_LEN; + mem = vzalloc(nr_bytes); + if (!mem) + return -ENOMEM; + + for (i = 0; i < rings->size; i++) { + struct libie_fwlog_data *ring = &rings->rings[i]; + + ring->data_size = LIBIE_AQ_MAX_BUF_LEN; + ring->data = mem; + mem += LIBIE_AQ_MAX_BUF_LEN; + } + + return 0; +} + +static void libie_fwlog_free_ring_buffs(struct libie_fwlog_ring *rings) +{ + int i; + + for (i = 0; i < rings->size; i++) { + struct libie_fwlog_data *ring = &rings->rings[i]; + + /* the first ring is the base memory for the whole range so + * free it + */ + if (!i) + vfree(ring->data); + + ring->data = NULL; + ring->data_size = 0; + } +} + +#define LIBIE_FWLOG_INDEX_TO_BYTES(n) ((128 * 1024) << (n)) +/** + * libie_fwlog_realloc_rings - reallocate the FW log rings + * @fwlog: pointer to the fwlog structure + * @index: the new index to use to allocate memory for the log data + * + */ +static void libie_fwlog_realloc_rings(struct libie_fwlog *fwlog, int index) +{ + struct libie_fwlog_ring ring; + int status, ring_size; + + /* convert the number of bytes into a number of 4K buffers. externally + * the driver presents the interface to the FW log data as a number of + * bytes because that's easy for users to understand. internally the + * driver uses a ring of buffers because the driver doesn't know where + * the beginning and end of any line of log data is so the driver has + * to overwrite data as complete blocks. when the data is returned to + * the user the driver knows that the data is correct and the FW log + * can be correctly parsed by the tools + */ + ring_size = LIBIE_FWLOG_INDEX_TO_BYTES(index) / LIBIE_AQ_MAX_BUF_LEN; + if (ring_size == fwlog->ring.size) + return; + + /* allocate space for the new rings and buffers then release the + * old rings and buffers. that way if we don't have enough + * memory then we at least have what we had before + */ + ring.rings = kcalloc(ring_size, sizeof(*ring.rings), GFP_KERNEL); + if (!ring.rings) + return; + + ring.size = ring_size; + + status = libie_fwlog_alloc_ring_buffs(&ring); + if (status) { + dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); + libie_fwlog_free_ring_buffs(&ring); + kfree(ring.rings); + return; + } + + libie_fwlog_free_ring_buffs(&fwlog->ring); + kfree(fwlog->ring.rings); + + fwlog->ring.rings = ring.rings; + fwlog->ring.size = ring.size; + fwlog->ring.index = index; + fwlog->ring.head = 0; + fwlog->ring.tail = 0; +} + +/** + * libie_fwlog_supported - Cached for whether FW supports FW logging or not + * @fwlog: pointer to the fwlog structure + * + * This will always return false if called before libie_init_hw(), so it must be + * called after libie_init_hw(). + */ +static bool libie_fwlog_supported(struct libie_fwlog *fwlog) +{ + return fwlog->supported; +} + +/** + * libie_aq_fwlog_set - Set FW logging configuration AQ command (0xFF30) + * @fwlog: pointer to the fwlog structure + * @entries: entries to configure + * @num_entries: number of @entries + * @options: options from libie_fwlog_cfg->options structure + * @log_resolution: logging resolution + */ +static int +libie_aq_fwlog_set(struct libie_fwlog *fwlog, + struct libie_fwlog_module_entry *entries, u16 num_entries, + u16 options, u16 log_resolution) +{ + struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aq_desc desc = {0}; + struct libie_aqc_fw_log *cmd; + int status; + int i; + + fw_modules = kcalloc(num_entries, sizeof(*fw_modules), GFP_KERNEL); + if (!fw_modules) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) { + fw_modules[i].module_identifier = + cpu_to_le16(entries[i].module_id); + fw_modules[i].log_level = entries[i].log_level; + } + + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_config); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI) | + cpu_to_le16(LIBIE_AQ_FLAG_RD); + + cmd = libie_aq_raw(&desc); + + cmd->cmd_flags = LIBIE_AQC_FW_LOG_CONF_SET_VALID; + cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); + cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); + + if (options & LIBIE_FWLOG_OPTION_ARQ_ENA) + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_AQ_EN; + if (options & LIBIE_FWLOG_OPTION_UART_ENA) + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_UART_EN; + + status = fwlog->send_cmd(fwlog->priv, &desc, fw_modules, + sizeof(*fw_modules) * num_entries); + + kfree(fw_modules); + + return status; +} + +/** + * libie_fwlog_set - Set the firmware logging settings + * @fwlog: pointer to the fwlog structure + * @cfg: config used to set firmware logging + * + * This function should be called whenever the driver needs to set the firmware + * logging configuration. It can be called on initialization, reset, or during + * runtime. + * + * If the PF wishes to receive FW logging then it must register via + * libie_fwlog_register. Note, that libie_fwlog_register does not need to be called + * for init. + */ +static int libie_fwlog_set(struct libie_fwlog *fwlog, + struct libie_fwlog_cfg *cfg) +{ + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + return libie_aq_fwlog_set(fwlog, cfg->module_entries, + LIBIE_AQC_FW_LOG_ID_MAX, cfg->options, + cfg->log_resolution); +} + +/** + * libie_aq_fwlog_register - Register PF for firmware logging events (0xFF31) + * @fwlog: pointer to the fwlog structure + * @reg: true to register and false to unregister + */ +static int libie_aq_fwlog_register(struct libie_fwlog *fwlog, bool reg) +{ + struct libie_aq_desc desc = {0}; + struct libie_aqc_fw_log *cmd; + + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_register); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); + cmd = libie_aq_raw(&desc); + + if (reg) + cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_REGISTER; + + return fwlog->send_cmd(fwlog->priv, &desc, NULL, 0); +} + +/** + * libie_fwlog_register - Register the PF for firmware logging + * @fwlog: pointer to the fwlog structure + * + * After this call the PF will start to receive firmware logging based on the + * configuration set in libie_fwlog_set. + */ +static int libie_fwlog_register(struct libie_fwlog *fwlog) +{ + int status; + + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + status = libie_aq_fwlog_register(fwlog, true); + if (status) + dev_dbg(&fwlog->pdev->dev, "Failed to register for firmware logging events over ARQ\n"); + else + fwlog->cfg.options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; + + return status; +} + +/** + * libie_fwlog_unregister - Unregister the PF from firmware logging + * @fwlog: pointer to the fwlog structure + */ +static int libie_fwlog_unregister(struct libie_fwlog *fwlog) +{ + int status; + + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + status = libie_aq_fwlog_register(fwlog, false); + if (status) + dev_dbg(&fwlog->pdev->dev, "Failed to unregister from firmware logging events over ARQ\n"); + else + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_IS_REGISTERED; + + return status; +} + +/** + * libie_fwlog_print_module_cfg - print current FW logging module configuration + * @cfg: pointer to the fwlog cfg structure + * @module: module to print + * @s: the seq file to put data into + */ +static void +libie_fwlog_print_module_cfg(struct libie_fwlog_cfg *cfg, int module, + struct seq_file *s) +{ + struct libie_fwlog_module_entry *entry; + + if (module != LIBIE_AQC_FW_LOG_ID_MAX) { + entry = &cfg->module_entries[module]; + + seq_printf(s, "\tModule: %s, Log Level: %s\n", + libie_fwlog_module_string[entry->module_id], + libie_fwlog_level_string[entry->log_level]); + } else { + int i; + + for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) { + entry = &cfg->module_entries[i]; + + seq_printf(s, "\tModule: %s, Log Level: %s\n", + libie_fwlog_module_string[entry->module_id], + libie_fwlog_level_string[entry->log_level]); + } + } +} + +static int libie_find_module_by_dentry(struct dentry **modules, struct dentry *d) +{ + int i, module; + + module = -1; + /* find the module based on the dentry */ + for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { + if (d == modules[i]) { + module = i; + break; + } + } + + return module; +} + +/** + * libie_debugfs_module_show - read from 'module' file + * @s: the opened file + * @v: pointer to the offset + */ +static int libie_debugfs_module_show(struct seq_file *s, void *v) +{ + struct libie_fwlog *fwlog = s->private; + const struct file *filp = s->file; + struct dentry *dentry; + int module; + + dentry = file_dentry(filp); + + module = libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); + if (module < 0) { + dev_info(&fwlog->pdev->dev, "unknown module\n"); + return -EINVAL; + } + + libie_fwlog_print_module_cfg(&fwlog->cfg, module, s); + + return 0; +} + +static int libie_debugfs_module_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, libie_debugfs_module_show, inode->i_private); +} + +/** + * libie_debugfs_module_write - write into 'module' file + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +libie_debugfs_module_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct libie_fwlog *fwlog = file_inode(filp)->i_private; + struct dentry *dentry = file_dentry(filp); + struct device *dev = &fwlog->pdev->dev; + char user_val[16], *cmd_buf; + int module, log_level, cnt; + + /* don't allow partial writes or invalid input */ + if (*ppos != 0 || count > 8) + return -EINVAL; + + cmd_buf = memdup_user_nul(buf, count); + if (IS_ERR(cmd_buf)) + return PTR_ERR(cmd_buf); + + module = libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); + if (module < 0) { + dev_info(dev, "unknown module\n"); + return -EINVAL; + } + + cnt = sscanf(cmd_buf, "%s", user_val); + if (cnt != 1) + return -EINVAL; + + log_level = sysfs_match_string(libie_fwlog_level_string, user_val); + if (log_level < 0) { + dev_info(dev, "unknown log level '%s'\n", user_val); + return -EINVAL; + } + + if (module != LIBIE_AQC_FW_LOG_ID_MAX) { + fwlog->cfg.module_entries[module].log_level = log_level; + } else { + /* the module 'all' is a shortcut so that we can set + * all of the modules to the same level quickly + */ + int i; + + for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) + fwlog->cfg.module_entries[i].log_level = log_level; + } + + return count; +} + +static const struct file_operations libie_debugfs_module_fops = { + .owner = THIS_MODULE, + .open = libie_debugfs_module_open, + .read = seq_read, + .release = single_release, + .write = libie_debugfs_module_write, +}; + +/** + * libie_debugfs_nr_messages_read - read from 'nr_messages' file + * @filp: the opened file + * @buffer: where to write the data for the user to read + * @count: the size of the user's buffer + * @ppos: file position offset + */ +static ssize_t libie_debugfs_nr_messages_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + char buff[32] = {}; + + snprintf(buff, sizeof(buff), "%d\n", + fwlog->cfg.log_resolution); + + return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); +} + +/** + * libie_debugfs_nr_messages_write - write into 'nr_messages' file + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +libie_debugfs_nr_messages_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + struct device *dev = &fwlog->pdev->dev; + char user_val[8], *cmd_buf; + s16 nr_messages; + ssize_t ret; + + /* don't allow partial writes or invalid input */ + if (*ppos != 0 || count > 4) + return -EINVAL; + + cmd_buf = memdup_user_nul(buf, count); + if (IS_ERR(cmd_buf)) + return PTR_ERR(cmd_buf); + + ret = sscanf(cmd_buf, "%s", user_val); + if (ret != 1) + return -EINVAL; + + ret = kstrtos16(user_val, 0, &nr_messages); + if (ret) + return ret; + + if (nr_messages < LIBIE_AQC_FW_LOG_MIN_RESOLUTION || + nr_messages > LIBIE_AQC_FW_LOG_MAX_RESOLUTION) { + dev_err(dev, "Invalid FW log number of messages %d, value must be between %d - %d\n", + nr_messages, LIBIE_AQC_FW_LOG_MIN_RESOLUTION, + LIBIE_AQC_FW_LOG_MAX_RESOLUTION); + return -EINVAL; + } + + fwlog->cfg.log_resolution = nr_messages; + + return count; +} + +static const struct file_operations libie_debugfs_nr_messages_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = libie_debugfs_nr_messages_read, + .write = libie_debugfs_nr_messages_write, +}; + +/** + * libie_debugfs_enable_read - read from 'enable' file + * @filp: the opened file + * @buffer: where to write the data for the user to read + * @count: the size of the user's buffer + * @ppos: file position offset + */ +static ssize_t libie_debugfs_enable_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + char buff[32] = {}; + + snprintf(buff, sizeof(buff), "%u\n", + (u16)(fwlog->cfg.options & + LIBIE_FWLOG_OPTION_IS_REGISTERED) >> 3); + + return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); +} + +/** + * libie_debugfs_enable_write - write into 'enable' file + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +libie_debugfs_enable_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + char user_val[8], *cmd_buf; + bool enable; + ssize_t ret; + + /* don't allow partial writes or invalid input */ + if (*ppos != 0 || count > 2) + return -EINVAL; + + cmd_buf = memdup_user_nul(buf, count); + if (IS_ERR(cmd_buf)) + return PTR_ERR(cmd_buf); + + ret = sscanf(cmd_buf, "%s", user_val); + if (ret != 1) + return -EINVAL; + + ret = kstrtobool(user_val, &enable); + if (ret) + goto enable_write_error; + + if (enable) + fwlog->cfg.options |= LIBIE_FWLOG_OPTION_ARQ_ENA; + else + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; + + ret = libie_fwlog_set(fwlog, &fwlog->cfg); + if (ret) + goto enable_write_error; + + if (enable) + ret = libie_fwlog_register(fwlog); + else + ret = libie_fwlog_unregister(fwlog); + + if (ret) + goto enable_write_error; + + /* if we get here, nothing went wrong; return count since we didn't + * really write anything + */ + ret = (ssize_t)count; + +enable_write_error: + /* This function always consumes all of the written input, or produces + * an error. Check and enforce this. Otherwise, the write operation + * won't complete properly. + */ + if (WARN_ON(ret != (ssize_t)count && ret >= 0)) + ret = -EIO; + + return ret; +} + +static const struct file_operations libie_debugfs_enable_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = libie_debugfs_enable_read, + .write = libie_debugfs_enable_write, +}; + +/** + * libie_debugfs_log_size_read - read from 'log_size' file + * @filp: the opened file + * @buffer: where to write the data for the user to read + * @count: the size of the user's buffer + * @ppos: file position offset + */ +static ssize_t libie_debugfs_log_size_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + char buff[32] = {}; + int index; + + index = fwlog->ring.index; + snprintf(buff, sizeof(buff), "%s\n", libie_fwlog_log_size[index]); + + return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); +} + +/** + * libie_debugfs_log_size_write - write into 'log_size' file + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +libie_debugfs_log_size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + struct device *dev = &fwlog->pdev->dev; + char user_val[8], *cmd_buf; + ssize_t ret; + int index; + + /* don't allow partial writes or invalid input */ + if (*ppos != 0 || count > 5) + return -EINVAL; + + cmd_buf = memdup_user_nul(buf, count); + if (IS_ERR(cmd_buf)) + return PTR_ERR(cmd_buf); + + ret = sscanf(cmd_buf, "%s", user_val); + if (ret != 1) + return -EINVAL; + + index = sysfs_match_string(libie_fwlog_log_size, user_val); + if (index < 0) { + dev_info(dev, "Invalid log size '%s'. The value must be one of 128K, 256K, 512K, 1M, 2M\n", + user_val); + ret = -EINVAL; + goto log_size_write_error; + } else if (fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED) { + dev_info(dev, "FW logging is currently running. Please disable FW logging to change log_size\n"); + ret = -EINVAL; + goto log_size_write_error; + } + + /* free all the buffers and the tracking info and resize */ + libie_fwlog_realloc_rings(fwlog, index); + + /* if we get here, nothing went wrong; return count since we didn't + * really write anything + */ + ret = (ssize_t)count; + +log_size_write_error: + /* This function always consumes all of the written input, or produces + * an error. Check and enforce this. Otherwise, the write operation + * won't complete properly. + */ + if (WARN_ON(ret != (ssize_t)count && ret >= 0)) + ret = -EIO; + + return ret; +} + +static const struct file_operations libie_debugfs_log_size_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = libie_debugfs_log_size_read, + .write = libie_debugfs_log_size_write, +}; + +/** + * libie_debugfs_data_read - read from 'data' file + * @filp: the opened file + * @buffer: where to write the data for the user to read + * @count: the size of the user's buffer + * @ppos: file position offset + */ +static ssize_t libie_debugfs_data_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + int data_copied = 0; + bool done = false; + + if (libie_fwlog_ring_empty(&fwlog->ring)) + return 0; + + while (!libie_fwlog_ring_empty(&fwlog->ring) && !done) { + struct libie_fwlog_data *log; + u16 cur_buf_len; + + log = &fwlog->ring.rings[fwlog->ring.head]; + cur_buf_len = log->data_size; + if (cur_buf_len >= count) { + done = true; + continue; + } + + if (copy_to_user(buffer, log->data, cur_buf_len)) { + /* if there is an error then bail and return whatever + * the driver has copied so far + */ + done = true; + continue; + } + + data_copied += cur_buf_len; + buffer += cur_buf_len; + count -= cur_buf_len; + *ppos += cur_buf_len; + libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); + } + + return data_copied; +} + +/** + * libie_debugfs_data_write - write into 'data' file + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +libie_debugfs_data_write(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + struct device *dev = &fwlog->pdev->dev; + ssize_t ret; + + /* don't allow partial writes */ + if (*ppos != 0) + return 0; + + /* any value is allowed to clear the buffer so no need to even look at + * what the value is + */ + if (!(fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED)) { + fwlog->ring.head = 0; + fwlog->ring.tail = 0; + } else { + dev_info(dev, "Can't clear FW log data while FW log running\n"); + ret = -EINVAL; + goto nr_buffs_write_error; + } + + /* if we get here, nothing went wrong; return count since we didn't + * really write anything + */ + ret = (ssize_t)count; + +nr_buffs_write_error: + /* This function always consumes all of the written input, or produces + * an error. Check and enforce this. Otherwise, the write operation + * won't complete properly. + */ + if (WARN_ON(ret != (ssize_t)count && ret >= 0)) + ret = -EIO; + + return ret; +} + +static const struct file_operations libie_debugfs_data_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = libie_debugfs_data_read, + .write = libie_debugfs_data_write, +}; + +/** + * libie_debugfs_fwlog_init - setup the debugfs directory + * @fwlog: pointer to the fwlog structure + * @root: debugfs root entry on which fwlog director will be registered + */ +static void libie_debugfs_fwlog_init(struct libie_fwlog *fwlog, + struct dentry *root) +{ + struct dentry *fw_modules_dir; + struct dentry **fw_modules; + int i; + + /* allocate space for this first because if it fails then we don't + * need to unwind + */ + fw_modules = kcalloc(LIBIE_NR_FW_LOG_MODULES, sizeof(*fw_modules), + GFP_KERNEL); + if (!fw_modules) + return; + + fwlog->debugfs = debugfs_create_dir("fwlog", root); + if (IS_ERR(fwlog->debugfs)) + goto err_create_module_files; + + fw_modules_dir = debugfs_create_dir("modules", fwlog->debugfs); + if (IS_ERR(fw_modules_dir)) + goto err_create_module_files; + + for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { + fw_modules[i] = debugfs_create_file(libie_fwlog_module_string[i], + 0600, fw_modules_dir, fwlog, + &libie_debugfs_module_fops); + if (IS_ERR(fw_modules[i])) + goto err_create_module_files; + } + + debugfs_create_file("nr_messages", 0600, fwlog->debugfs, fwlog, + &libie_debugfs_nr_messages_fops); + + fwlog->debugfs_modules = fw_modules; + + debugfs_create_file("enable", 0600, fwlog->debugfs, fwlog, + &libie_debugfs_enable_fops); + + debugfs_create_file("log_size", 0600, fwlog->debugfs, fwlog, + &libie_debugfs_log_size_fops); + + debugfs_create_file("data", 0600, fwlog->debugfs, fwlog, + &libie_debugfs_data_fops); + + return; + +err_create_module_files: + debugfs_remove_recursive(fwlog->debugfs); + kfree(fw_modules); +} + +static bool libie_fwlog_ring_full(struct libie_fwlog_ring *rings) +{ + u16 head, tail; + + head = rings->head; + tail = rings->tail; + + if (head < tail && (tail - head == (rings->size - 1))) + return true; + else if (head > tail && (tail == (head - 1))) + return true; + + return false; +} + +/** + * libie_aq_fwlog_get - Get the current firmware logging configuration (0xFF32) + * @fwlog: pointer to the fwlog structure + * @cfg: firmware logging configuration to populate + */ +static int libie_aq_fwlog_get(struct libie_fwlog *fwlog, + struct libie_fwlog_cfg *cfg) +{ + struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aq_desc desc = {0}; + struct libie_aqc_fw_log *cmd; + u16 module_id_cnt; + int status; + void *buf; + int i; + + memset(cfg, 0, sizeof(*cfg)); + + buf = kzalloc(LIBIE_AQ_MAX_BUF_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_query); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); + cmd = libie_aq_raw(&desc); + + cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_QUERY; + + status = fwlog->send_cmd(fwlog->priv, &desc, buf, LIBIE_AQ_MAX_BUF_LEN); + if (status) { + dev_dbg(&fwlog->pdev->dev, "Failed to get FW log configuration\n"); + goto status_out; + } + + module_id_cnt = le16_to_cpu(cmd->ops.cfg.mdl_cnt); + if (module_id_cnt < LIBIE_AQC_FW_LOG_ID_MAX) { + dev_dbg(&fwlog->pdev->dev, "FW returned less than the expected number of FW log module IDs\n"); + } else if (module_id_cnt > LIBIE_AQC_FW_LOG_ID_MAX) { + dev_dbg(&fwlog->pdev->dev, "FW returned more than expected number of FW log module IDs, setting module_id_cnt to software expected max %u\n", + LIBIE_AQC_FW_LOG_ID_MAX); + module_id_cnt = LIBIE_AQC_FW_LOG_ID_MAX; + } + + cfg->log_resolution = le16_to_cpu(cmd->ops.cfg.log_resolution); + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_AQ_EN) + cfg->options |= LIBIE_FWLOG_OPTION_ARQ_ENA; + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_UART_EN) + cfg->options |= LIBIE_FWLOG_OPTION_UART_ENA; + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_QUERY_REGISTERED) + cfg->options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; + + fw_modules = (struct libie_aqc_fw_log_cfg_resp *)buf; + + for (i = 0; i < module_id_cnt; i++) { + struct libie_aqc_fw_log_cfg_resp *fw_module = &fw_modules[i]; + + cfg->module_entries[i].module_id = + le16_to_cpu(fw_module->module_identifier); + cfg->module_entries[i].log_level = fw_module->log_level; + } + +status_out: + kfree(buf); + return status; +} + +/** + * libie_fwlog_set_supported - Set if FW logging is supported by FW + * @fwlog: pointer to the fwlog structure + * + * If FW returns success to the libie_aq_fwlog_get call then it supports FW + * logging, else it doesn't. Set the fwlog_supported flag accordingly. + * + * This function is only meant to be called during driver init to determine if + * the FW support FW logging. + */ +static void libie_fwlog_set_supported(struct libie_fwlog *fwlog) +{ + struct libie_fwlog_cfg *cfg; + int status; + + fwlog->supported = false; + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return; + + status = libie_aq_fwlog_get(fwlog, cfg); + if (status) + dev_dbg(&fwlog->pdev->dev, "libie_aq_fwlog_get failed, FW logging is not supported on this version of FW, status %d\n", + status); + else + fwlog->supported = true; + + kfree(cfg); +} + +/** + * libie_fwlog_init - Initialize FW logging configuration + * @fwlog: pointer to the fwlog structure + * @api: api structure to init fwlog + * + * This function should be called on driver initialization during + * libie_init_hw(). + */ +int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api) +{ + fwlog->api = *api; + libie_fwlog_set_supported(fwlog); + + if (libie_fwlog_supported(fwlog)) { + int status; + + /* read the current config from the FW and store it */ + status = libie_aq_fwlog_get(fwlog, &fwlog->cfg); + if (status) + return status; + + fwlog->ring.rings = kcalloc(LIBIE_FWLOG_RING_SIZE_DFLT, + sizeof(*fwlog->ring.rings), + GFP_KERNEL); + if (!fwlog->ring.rings) { + dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log rings\n"); + return -ENOMEM; + } + + fwlog->ring.size = LIBIE_FWLOG_RING_SIZE_DFLT; + fwlog->ring.index = LIBIE_FWLOG_RING_SIZE_INDEX_DFLT; + + status = libie_fwlog_alloc_ring_buffs(&fwlog->ring); + if (status) { + dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); + libie_fwlog_free_ring_buffs(&fwlog->ring); + kfree(fwlog->ring.rings); + return status; + } + + libie_debugfs_fwlog_init(fwlog, api->debugfs_root); + } else { + dev_warn(&fwlog->pdev->dev, "FW logging is not supported in this NVM image. Please update the NVM to get FW log support\n"); + } + + return 0; +} +EXPORT_SYMBOL_GPL(libie_fwlog_init); + +/** + * libie_fwlog_deinit - unroll FW logging configuration + * @fwlog: pointer to the fwlog structure + * + * This function should be called in libie_deinit_hw(). + */ +void libie_fwlog_deinit(struct libie_fwlog *fwlog) +{ + int status; + + /* make sure FW logging is disabled to not put the FW in a weird state + * for the next driver load + */ + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; + status = libie_fwlog_set(fwlog, &fwlog->cfg); + if (status) + dev_warn(&fwlog->pdev->dev, "Unable to turn off FW logging, status: %d\n", + status); + + kfree(fwlog->debugfs_modules); + + fwlog->debugfs_modules = NULL; + + status = libie_fwlog_unregister(fwlog); + if (status) + dev_warn(&fwlog->pdev->dev, "Unable to unregister FW logging, status: %d\n", + status); + + if (fwlog->ring.rings) { + libie_fwlog_free_ring_buffs(&fwlog->ring); + kfree(fwlog->ring.rings); + } +} +EXPORT_SYMBOL_GPL(libie_fwlog_deinit); + +/** + * libie_get_fwlog_data - copy the FW log data from ARQ event + * @fwlog: fwlog that the FW log event is associated with + * @buf: event buffer pointer + * @len: len of event descriptor + */ +void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len) +{ + struct libie_fwlog_data *log; + + log = &fwlog->ring.rings[fwlog->ring.tail]; + + memset(log->data, 0, PAGE_SIZE); + log->data_size = len; + + memcpy(log->data, buf, log->data_size); + libie_fwlog_ring_increment(&fwlog->ring.tail, fwlog->ring.size); + + if (libie_fwlog_ring_full(&fwlog->ring)) { + /* the rings are full so bump the head to create room */ + libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); + } +} +EXPORT_SYMBOL_GPL(libie_get_fwlog_data); + +void libie_fwlog_reregister(struct libie_fwlog *fwlog) +{ + if (!(fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED)) + return; + + if (libie_fwlog_register(fwlog)) + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_IS_REGISTERED; +} +EXPORT_SYMBOL_GPL(libie_fwlog_reregister); + +MODULE_DESCRIPTION("Intel(R) Ethernet common library"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h index 29420193889a..ab13bd777a28 100644 --- a/include/linux/net/intel/libie/adminq.h +++ b/include/linux/net/intel/libie/adminq.h @@ -265,7 +265,7 @@ enum libie_aqc_fw_logging_mod { LIBIE_AQC_FW_LOG_ID_TSDRV, LIBIE_AQC_FW_LOG_ID_PFREG, LIBIE_AQC_FW_LOG_ID_MDLVER, - LIBIE_AQC_FW_LOG_ID_MAX, + LIBIE_AQC_FW_LOG_ID_MAX }; /* Set FW Logging configuration (indirect 0xFF30) @@ -280,8 +280,8 @@ enum libie_aqc_fw_logging_mod { #define LIBIE_AQC_FW_LOG_AQ_REGISTER BIT(0) #define LIBIE_AQC_FW_LOG_AQ_QUERY BIT(2) -#define LIBIE_AQC_FW_LOG_MIN_RESOLUTION (1) -#define LIBIE_AQC_FW_LOG_MAX_RESOLUTION (128) +#define LIBIE_AQC_FW_LOG_MIN_RESOLUTION 1 +#define LIBIE_AQC_FW_LOG_MAX_RESOLUTION 128 struct libie_aqc_fw_log { u8 cmd_flags; diff --git a/include/linux/net/intel/libie/fwlog.h b/include/linux/net/intel/libie/fwlog.h new file mode 100644 index 000000000000..36b13fabca9e --- /dev/null +++ b/include/linux/net/intel/libie/fwlog.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2022, Intel Corporation. */ + +#ifndef _LIBIE_FWLOG_H_ +#define _LIBIE_FWLOG_H_ + +#include + +/* Only a single log level should be set and all log levels under the set value + * are enabled, e.g. if log level is set to LIBIE_FW_LOG_LEVEL_VERBOSE, then all + * other log levels are included (except LIBIE_FW_LOG_LEVEL_NONE) + */ +enum libie_fwlog_level { + LIBIE_FWLOG_LEVEL_NONE = 0, + LIBIE_FWLOG_LEVEL_ERROR = 1, + LIBIE_FWLOG_LEVEL_WARNING = 2, + LIBIE_FWLOG_LEVEL_NORMAL = 3, + LIBIE_FWLOG_LEVEL_VERBOSE = 4, + LIBIE_FWLOG_LEVEL_INVALID, /* all values >= this entry are invalid */ +}; + +struct libie_fwlog_module_entry { + /* module ID for the corresponding firmware logging event */ + u16 module_id; + /* verbosity level for the module_id */ + u8 log_level; +}; + +struct libie_fwlog_cfg { + /* list of modules for configuring log level */ + struct libie_fwlog_module_entry module_entries[LIBIE_AQC_FW_LOG_ID_MAX]; + /* options used to configure firmware logging */ + u16 options; +#define LIBIE_FWLOG_OPTION_ARQ_ENA BIT(0) +#define LIBIE_FWLOG_OPTION_UART_ENA BIT(1) + /* set before calling libie_fwlog_init() so the PF registers for + * firmware logging on initialization + */ +#define LIBIE_FWLOG_OPTION_REGISTER_ON_INIT BIT(2) + /* set in the libie_aq_fwlog_get() response if the PF is registered for + * FW logging events over ARQ + */ +#define LIBIE_FWLOG_OPTION_IS_REGISTERED BIT(3) + + /* minimum number of log events sent per Admin Receive Queue event */ + u16 log_resolution; +}; + +struct libie_fwlog_data { + u16 data_size; + u8 *data; +}; + +struct libie_fwlog_ring { + struct libie_fwlog_data *rings; + u16 index; + u16 size; + u16 head; + u16 tail; +}; + +#define LIBIE_FWLOG_RING_SIZE_INDEX_DFLT 3 +#define LIBIE_FWLOG_RING_SIZE_DFLT 256 +#define LIBIE_FWLOG_RING_SIZE_MAX 512 + +struct libie_fwlog { + struct libie_fwlog_cfg cfg; + bool supported; /* does hardware support FW logging? */ + struct libie_fwlog_ring ring; + struct dentry *debugfs; + /* keep track of all the dentrys for FW log modules */ + struct dentry **debugfs_modules; + struct_group_tagged(libie_fwlog_api, api, + struct pci_dev *pdev; + int (*send_cmd)(void *, struct libie_aq_desc *, void *, u16); + void *priv; + struct dentry *debugfs_root; + ); +}; + +int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api); +void libie_fwlog_deinit(struct libie_fwlog *fwlog); +void libie_fwlog_reregister(struct libie_fwlog *fwlog); +void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len); +#endif /* _LIBIE_FWLOG_H_ */ -- cgit v1.2.3 From 70f23546d246563da648baedbb0432ba1d6bb357 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 11 Sep 2025 14:58:01 +0000 Subject: bpf: core: introduce main_prog_aux for stream access BPF streams are only valid for the main programs, to make it easier to access streams from subprogs, introduce main_prog_aux in struct bpf_prog_aux. prog->aux->main_prog_aux = prog->aux, for main programs and prog->aux->main_prog_aux = main_prog->aux, for subprograms. Make bpf_prog_find_from_stack() use the added main_prog_aux to return the mainprog when a subprog is found on the stack. Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250911145808.58042-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 6 +++--- kernel/bpf/verifier.c | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8f6e87f0f3a8..d133171c4d2a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1633,6 +1633,7 @@ struct bpf_prog_aux { /* function name for valid attach_btf_id */ const char *attach_func_name; struct bpf_prog **func; + struct bpf_prog_aux *main_prog_aux; void *jit_data; /* JIT specific data. arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e68019459106..1cda2589d4b3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -120,6 +120,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->pages = size / PAGE_SIZE; fp->aux = aux; + fp->aux->main_prog_aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); @@ -3297,9 +3298,8 @@ static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) rcu_read_unlock(); if (!prog) return true; - if (bpf_is_subprog(prog)) - return true; - ctxp->prog = prog; + /* Make sure we return the main prog if we found a subprog */ + ctxp->prog = prog->aux->main_prog_aux->prog; return false; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b8b510a25b03..17fe623400a5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21601,6 +21601,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; + func[i]->aux->main_prog_aux = prog->aux; for (j = 0; j < prog->aux->size_poke_tab; j++) { struct bpf_jit_poke_descriptor *poke; -- cgit v1.2.3 From 5c5240d020615f13331f4e2c559186125eddc7d3 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 11 Sep 2025 14:58:02 +0000 Subject: bpf: Report arena faults to BPF stderr Begin reporting arena page faults and the faulting address to BPF program's stderr, this patch adds support in the arm64 and x86-64 JITs, support for other archs can be added later. The fault handlers receive the 32 bit address in the arena region so the upper 32 bits of user_vm_start is added to it before printing the address. This is what the user would expect to see as this is what is printed by bpf_printk() is you pass it an address returned by bpf_arena_alloc_pages(); Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250911145808.58042-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 60 ++++++++++++++++++++++++++++++ arch/x86/net/bpf_jit_comp.c | 85 ++++++++++++++++++++++++++++++++++++++++--- include/linux/bpf.h | 6 +++ kernel/bpf/arena.c | 30 +++++++++++++++ 4 files changed, 176 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index e6d1fdc1e6f5..008273a53e04 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1066,6 +1066,30 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic) emit(A64_RET(A64_LR), ctx); } +/* + * Metadata encoding for exception handling in JITed code. + * + * Format of `fixup` field in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+-----------+----------+ + * | 31-27 | 26-22 | 21 | 20-16 | 15-0 | + * | | | | | | + * | FIXUP_REG | Unused | ARENA_ACC | ARENA_REG | OFFSET | + * +-----------+--------+-----------+-----------+----------+ + * + * - OFFSET (16 bits): Offset used to compute address for Load/Store instruction. + * - ARENA_REG (5 bits): Register that is used to calculate the address for load/store when + * accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * - FIXUP_REG (5 bits): Destination register for the load instruction (cleared on fault) or set to + * DONT_CLEAR if it is a store instruction. + */ + +#define BPF_FIXUP_OFFSET_MASK GENMASK(15, 0) +#define BPF_FIXUP_ARENA_REG_MASK GENMASK(20, 16) +#define BPF_ARENA_ACCESS BIT(21) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) #define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ @@ -1073,11 +1097,22 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) { int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); + s16 off = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); + int arena_reg = FIELD_GET(BPF_FIXUP_ARENA_REG_MASK, ex->fixup); + bool is_arena = !!(ex->fixup & BPF_ARENA_ACCESS); + bool is_write = (dst_reg == DONT_CLEAR); + unsigned long addr; + + if (is_arena) { + addr = regs->regs[arena_reg] + off; + bpf_prog_report_arena_violation(is_write, addr, regs->pc); + } if (dst_reg != DONT_CLEAR) regs->regs[dst_reg] = 0; /* Skip the faulting instruction */ regs->pc += AARCH64_INSN_SIZE; + return true; } @@ -1087,6 +1122,9 @@ static int add_exception_handler(const struct bpf_insn *insn, int dst_reg) { off_t ins_offset; + s16 off = insn->off; + bool is_arena; + int arena_reg; unsigned long pc; struct exception_table_entry *ex; @@ -1100,6 +1138,9 @@ static int add_exception_handler(const struct bpf_insn *insn, BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; + is_arena = (BPF_MODE(insn->code) == BPF_PROBE_MEM32) || + (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC); + if (!ctx->prog->aux->extable || WARN_ON_ONCE(ctx->exentry_idx >= ctx->prog->aux->num_exentries)) return -EINVAL; @@ -1131,6 +1172,25 @@ static int add_exception_handler(const struct bpf_insn *insn, ex->fixup = FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + if (is_arena) { + ex->fixup |= BPF_ARENA_ACCESS; + /* + * insn->src_reg/dst_reg holds the address in the arena region with upper 32-bits + * being zero because of a preceding addr_space_cast(r, 0x0, 0x1) instruction. + * This address is adjusted with the addition of arena_vm_start (see the + * implementation of BPF_PROBE_MEM32 and BPF_PROBE_ATOMIC) before being used for the + * memory access. Pass the reg holding the unmodified 32-bit address to + * ex_handler_bpf. + */ + if (BPF_CLASS(insn->code) == BPF_LDX) + arena_reg = bpf2a64[insn->src_reg]; + else + arena_reg = bpf2a64[insn->dst_reg]; + + ex->fixup |= FIELD_PREP(BPF_FIXUP_OFFSET_MASK, off) | + FIELD_PREP(BPF_FIXUP_ARENA_REG_MASK, arena_reg); + } + ex->type = EX_TYPE_BPF; ctx->exentry_idx++; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7e3fca164620..8d34a9400a5e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1388,16 +1389,67 @@ static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size, return 0; } +/* + * Metadata encoding for exception handling in JITed code. + * + * Format of `fixup` and `data` fields in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+---------+----------+ + * | 31 | 30-24 | 23-16 | 15-8 | 7-0 | + * | | | | | | + * | ARENA_ACC | Unused | ARENA_REG | DST_REG | INSN_LEN | + * +-----------+--------+-----------+---------+----------+ + * + * - INSN_LEN (8 bits): Length of faulting insn (max x86 insn = 15 bytes (fits in 8 bits)). + * - DST_REG (8 bits): Offset of dst_reg from reg2pt_regs[] (max offset = 112 (fits in 8 bits)). + * This is set to DONT_CLEAR if the insn is a store. + * - ARENA_REG (8 bits): Offset of the register that is used to calculate the + * address for load/store when accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * + * Bit layout of `data` (32-bit): + * + * +--------------+--------+--------------+ + * | 31-16 | 15-8 | 7-0 | + * | | | | + * | ARENA_OFFSET | Unused | EX_TYPE_BPF | + * +--------------+--------+--------------+ + * + * - ARENA_OFFSET (16 bits): Offset used to calculate the address for load/store when + * accessing the arena region. + */ + #define DONT_CLEAR 1 +#define FIXUP_INSN_LEN_MASK GENMASK(7, 0) +#define FIXUP_REG_MASK GENMASK(15, 8) +#define FIXUP_ARENA_REG_MASK GENMASK(23, 16) +#define FIXUP_ARENA_ACCESS BIT(31) +#define DATA_ARENA_OFFSET_MASK GENMASK(31, 16) bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { - u32 reg = x->fixup >> 8; + u32 reg = FIELD_GET(FIXUP_REG_MASK, x->fixup); + u32 insn_len = FIELD_GET(FIXUP_INSN_LEN_MASK, x->fixup); + bool is_arena = !!(x->fixup & FIXUP_ARENA_ACCESS); + bool is_write = (reg == DONT_CLEAR); + unsigned long addr; + s16 off; + u32 arena_reg; + + if (is_arena) { + arena_reg = FIELD_GET(FIXUP_ARENA_REG_MASK, x->fixup); + off = FIELD_GET(DATA_ARENA_OFFSET_MASK, x->data); + addr = *(unsigned long *)((void *)regs + arena_reg) + off; + bpf_prog_report_arena_violation(is_write, addr, regs->ip); + } /* jump over faulting load and clear dest register */ if (reg != DONT_CLEAR) *(unsigned long *)((void *)regs + reg) = 0; - regs->ip += x->fixup & 0xff; + regs->ip += insn_len; + return true; } @@ -2070,6 +2122,7 @@ populate_extable: { struct exception_table_entry *ex; u8 *_insn = image + proglen + (start_of_ldx - temp); + u32 arena_reg, fixup_reg; s64 delta; if (!bpf_prog->aux->extable) @@ -2089,8 +2142,29 @@ populate_extable: ex->data = EX_TYPE_BPF; - ex->fixup = (prog - start_of_ldx) | - ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8); + /* + * src_reg/dst_reg holds the address in the arena region with upper + * 32-bits being zero because of a preceding addr_space_cast(r, + * 0x0, 0x1) instruction. This address is adjusted with the addition + * of arena_vm_start (see the implementation of BPF_PROBE_MEM32 and + * BPF_PROBE_ATOMIC) before being used for the memory access. Pass + * the reg holding the unmodified 32-bit address to + * ex_handler_bpf(). + */ + if (BPF_CLASS(insn->code) == BPF_LDX) { + arena_reg = reg2pt_regs[src_reg]; + fixup_reg = reg2pt_regs[dst_reg]; + } else { + arena_reg = reg2pt_regs[dst_reg]; + fixup_reg = DONT_CLEAR; + } + + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_ARENA_REG_MASK, arena_reg) | + FIELD_PREP(FIXUP_REG_MASK, fixup_reg); + ex->fixup |= FIXUP_ARENA_ACCESS; + + ex->data |= FIELD_PREP(DATA_ARENA_OFFSET_MASK, insn->off); } break; @@ -2208,7 +2282,8 @@ populate_extable: * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" * of 4 bytes will be ignored and rbx will be zero inited. */ - ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8); + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_REG_MASK, reg2pt_regs[dst_reg]); } break; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d133171c4d2a..41f776071ff5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2881,6 +2881,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) @@ -3168,6 +3169,11 @@ static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { } + +static inline void bpf_prog_report_arena_violation(bool write, unsigned long addr, + unsigned long fault_ip) +{ +} #endif /* CONFIG_BPF_SYSCALL */ static __always_inline int diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 5b37753799d2..1074ac4459f2 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -633,3 +633,33 @@ static int __init kfunc_init(void) return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); + +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +{ + struct bpf_stream_stage ss; + struct bpf_prog *prog; + u64 user_vm_start; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it will not + * disappear while we are handling the fault. + */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(fault_ip); + rcu_read_unlock(); + if (!prog) + return; + + /* Use main prog for stream access */ + prog = prog->aux->main_prog_aux->prog; + + user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); + addr += clear_lo32(user_vm_start); + + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n", + write ? "WRITE" : "READ", addr); + bpf_stream_dump_stack(ss); + })); +} -- cgit v1.2.3 From eadaa8b255f36ee39ca97d0815c25eeeb1f5d674 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 9 Sep 2025 16:27:29 +0300 Subject: dma-mapping: introduce new DMA attribute to indicate MMIO memory This patch introduces the DMA_ATTR_MMIO attribute to mark DMA buffers that reside in memory-mapped I/O (MMIO) regions, such as device BARs exposed through the host bridge, which are accessible for peer-to-peer (P2P) DMA. This attribute is especially useful for exporting device memory to other devices for DMA without CPU involvement, and avoids unnecessary or potentially detrimental CPU cache maintenance calls. DMA_ATTR_MMIO is supposed to provide dma_map_resource() functionality without need to call to special function and perform branching when processing generic containers like bio_vec by the callers. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/6f058ec395c5348014860dbc2eed348c17975843.1757423202.git.leonro@nvidia.com --- Documentation/core-api/dma-attributes.rst | 18 ++++++++++++++++++ include/linux/dma-mapping.h | 20 ++++++++++++++++++++ include/trace/events/dma.h | 3 ++- rust/kernel/dma.rs | 3 +++ 4 files changed, 43 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1887d92e8e92..0bdc2be65e57 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,3 +130,21 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_MMIO +------------- + +This attribute indicates the physical address is not normal system +memory. It may not be used with kmap*()/phys_to_virt()/phys_to_page() +functions, it may not be cacheable, and access using CPU load/store +instructions may not be allowed. + +Usually this will be used to describe MMIO addresses, or other non-cacheable +register addresses. When DMA mapping this sort of address we call +the operation Peer to Peer as a one device is DMA'ing to another device. +For PCI devices the p2pdma APIs must be used to determine if +DMA_ATTR_MMIO is appropriate. + +For architectures that require cache flushing for DMA coherence +DMA_ATTR_MMIO will not perform any cache flushing. The address +provided must never be mapped cacheable into the CPU. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 55c03e5fe8cb..4254fd9bdf5d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -58,6 +58,26 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * DMA_ATTR_MMIO - Indicates memory-mapped I/O (MMIO) region for DMA mapping + * + * This attribute indicates the physical address is not normal system + * memory. It may not be used with kmap*()/phys_to_virt()/phys_to_page() + * functions, it may not be cacheable, and access using CPU load/store + * instructions may not be allowed. + * + * Usually this will be used to describe MMIO addresses, or other non-cacheable + * register addresses. When DMA mapping this sort of address we call + * the operation Peer to Peer as a one device is DMA'ing to another device. + * For PCI devices the p2pdma APIs must be used to determine if DMA_ATTR_MMIO + * is appropriate. + * + * For architectures that require cache flushing for DMA coherence + * DMA_ATTR_MMIO will not perform any cache flushing. The address + * provided must never be mapped cacheable into the CPU. + */ +#define DMA_ATTR_MMIO (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index d8ddc27b6a7c..ee90d6f1dcf3 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -31,7 +31,8 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_ATTR_FORCE_CONTIGUOUS, "FORCE_CONTIGUOUS" }, \ { DMA_ATTR_ALLOC_SINGLE_PAGES, "ALLOC_SINGLE_PAGES" }, \ { DMA_ATTR_NO_WARN, "NO_WARN" }, \ - { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }) + { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \ + { DMA_ATTR_MMIO, "MMIO" }) DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, diff --git a/rust/kernel/dma.rs b/rust/kernel/dma.rs index 2bc8ab51ec28..61d9eed7a786 100644 --- a/rust/kernel/dma.rs +++ b/rust/kernel/dma.rs @@ -242,6 +242,9 @@ pub mod attrs { /// Indicates that the buffer is fully accessible at an elevated privilege level (and /// ideally inaccessible or at least read-only at lesser-privileged levels). pub const DMA_ATTR_PRIVILEGED: Attrs = Attrs(bindings::DMA_ATTR_PRIVILEGED); + + /// Indicates that the buffer is MMIO memory. + pub const DMA_ATTR_MMIO: Attrs = Attrs(bindings::DMA_ATTR_MMIO); } /// An abstraction of the `dma_alloc_coherent` API. -- cgit v1.2.3 From e9e81d86fee63c6d5757841ab557019ddf73786f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 9 Sep 2025 16:27:31 +0300 Subject: dma-debug: refactor to use physical addresses for page mapping Convert the DMA debug infrastructure from page-based to physical address-based mapping as a preparation to rely on physical address for DMA mapping routines. The refactoring renames debug_dma_map_page() to debug_dma_map_phys() and changes its signature to accept a phys_addr_t parameter instead of struct page and offset. Similarly, debug_dma_unmap_page() becomes debug_dma_unmap_phys(). A new dma_debug_phy type is introduced to distinguish physical address mappings from other debug entry types. All callers throughout the codebase are updated to pass physical addresses directly, eliminating the need for page-to-physical conversion in the debug layer. This refactoring eliminates the need to convert between page pointers and physical addresses in the debug layer, making the code more efficient and consistent with the DMA mapping API's physical address focus. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky [mszyprow: added a fixup] Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/56d1a6769b68dfcbf8b26a75a7329aeb8e3c3b6a.1757423202.git.leonro@nvidia.com Link: https://lore.kernel.org/all/20250910052618.GH341237@unreal/ --- Documentation/core-api/dma-api.rst | 4 ++-- include/linux/page-flags.h | 1 + kernel/dma/debug.c | 39 +++++++++++++++++++------------------- kernel/dma/debug.h | 16 +++++++--------- kernel/dma/mapping.c | 10 +++++----- 5 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index 3087bea715ed..ca75b3541679 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -761,7 +761,7 @@ example warning message may look like this:: [] find_busiest_group+0x207/0x8a0 [] _spin_lock_irqsave+0x1f/0x50 [] check_unmap+0x203/0x490 - [] debug_dma_unmap_page+0x49/0x50 + [] debug_dma_unmap_phys+0x49/0x50 [] nv_tx_done_optimized+0xc6/0x2c0 [] nv_nic_irq_optimized+0x73/0x2b0 [] handle_IRQ_event+0x34/0x70 @@ -855,7 +855,7 @@ that a driver may be leaking mappings. dma-debug interface debug_dma_mapping_error() to debug drivers that fail to check DMA mapping errors on addresses returned by dma_map_single() and dma_map_page() interfaces. This interface clears a flag set by -debug_dma_map_page() to indicate that dma_mapping_error() has been called by +debug_dma_map_phys() to indicate that dma_mapping_error() has been called by the driver. When driver does unmap, debug_dma_unmap() checks the flag and if this flag is still set, prints warning message that includes call trace that leads up to the unmap. This interface can be called from dma_mapping_error() diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8d3fa3a91ce4..2a1f34617802 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -618,6 +618,7 @@ FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) #else PAGEFLAG_FALSE(HighMem, highmem) #endif +#define PhysHighMem(__p) (PageHighMem(phys_to_page(__p))) /* Does kmap_local_folio() only allow access to one page of the folio? */ #ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index b82399437db0..b275db9ca6a0 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -40,6 +40,7 @@ enum { dma_debug_coherent, dma_debug_resource, dma_debug_noncoherent, + dma_debug_phy, }; enum map_err_types { @@ -143,6 +144,7 @@ static const char *type2name[] = { [dma_debug_coherent] = "coherent", [dma_debug_resource] = "resource", [dma_debug_noncoherent] = "noncoherent", + [dma_debug_phy] = "phy", }; static const char *dir2name[] = { @@ -1054,17 +1056,16 @@ static void check_unmap(struct dma_debug_entry *ref) dma_entry_free(entry); } -static void check_for_stack(struct device *dev, - struct page *page, size_t offset) +static void check_for_stack(struct device *dev, phys_addr_t phys) { void *addr; struct vm_struct *stack_vm_area = task_stack_vm_area(current); if (!stack_vm_area) { /* Stack is direct-mapped. */ - if (PageHighMem(page)) + if (PhysHighMem(phys)) return; - addr = page_address(page) + offset; + addr = phys_to_virt(phys); if (object_is_on_stack(addr)) err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr); } else { @@ -1072,10 +1073,12 @@ static void check_for_stack(struct device *dev, int i; for (i = 0; i < stack_vm_area->nr_pages; i++) { - if (page != stack_vm_area->pages[i]) + if (__phys_to_pfn(phys) != + page_to_pfn(stack_vm_area->pages[i])) continue; - addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + addr = (u8 *)current->stack + i * PAGE_SIZE + + (phys % PAGE_SIZE); err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr); break; } @@ -1204,9 +1207,8 @@ void debug_dma_map_single(struct device *dev, const void *addr, } EXPORT_SYMBOL(debug_dma_map_single); -void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr, - unsigned long attrs) +void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + int direction, dma_addr_t dma_addr, unsigned long attrs) { struct dma_debug_entry *entry; @@ -1221,19 +1223,18 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, return; entry->dev = dev; - entry->type = dma_debug_single; - entry->paddr = page_to_phys(page) + offset; + entry->type = dma_debug_phy; + entry->paddr = phys; entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - check_for_stack(dev, page, offset); + if (!(attrs & DMA_ATTR_MMIO)) { + check_for_stack(dev, phys); - if (!PageHighMem(page)) { - void *addr = page_address(page) + offset; - - check_for_illegal_area(dev, addr, size); + if (!PhysHighMem(phys)) + check_for_illegal_area(dev, phys_to_virt(phys), size); } add_dma_entry(entry, attrs); @@ -1277,11 +1278,11 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL(debug_dma_mapping_error); -void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, +void debug_dma_unmap_phys(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) { struct dma_debug_entry ref = { - .type = dma_debug_single, + .type = dma_debug_phy, .dev = dev, .dev_addr = dma_addr, .size = size, @@ -1305,7 +1306,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, return; for_each_sg(sg, s, nents, i) { - check_for_stack(dev, sg_page(s), s->offset); + check_for_stack(dev, sg_phys(s)); if (!PageHighMem(sg_page(s))) check_for_illegal_area(dev, sg_virt(s), s->length); } diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index 48757ca13f31..bedae973e725 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -9,12 +9,11 @@ #define _KERNEL_DMA_DEBUG_H #ifdef CONFIG_DMA_API_DEBUG -extern void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, +extern void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, dma_addr_t dma_addr, unsigned long attrs); -extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +extern void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction); extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, @@ -62,14 +61,13 @@ extern void debug_dma_free_pages(struct device *dev, struct page *page, size_t size, int direction, dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ -static inline void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) +static inline void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, + dma_addr_t dma_addr, unsigned long attrs) { } -static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +static inline void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction) { } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 56de28a3b179..0b7e16c69bf1 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -157,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); + phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); @@ -165,16 +166,15 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, return DMA_MAPPING_ERROR; if (dma_map_direct(dev, ops) || - arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size)) + arch_dma_map_page_direct(dev, phys + size)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else if (use_dma_iommu(dev)) addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); kmsan_handle_dma(page, offset, size, dir); - trace_dma_map_page(dev, page_to_phys(page) + offset, addr, size, dir, - attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); + trace_dma_map_page(dev, phys, addr, size, dir, attrs); + debug_dma_map_phys(dev, phys, size, dir, addr, attrs); return addr; } @@ -194,7 +194,7 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, else ops->unmap_page(dev, addr, size, dir, attrs); trace_dma_unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir); + debug_dma_unmap_phys(dev, addr, size, dir); } EXPORT_SYMBOL(dma_unmap_page_attrs); -- cgit v1.2.3 From 76bb7c49f50ce7687f98eb35e78798584652dd0e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 9 Sep 2025 16:27:32 +0300 Subject: dma-mapping: rename trace_dma_*map_page to trace_dma_*map_phys As a preparation for following map_page -> map_phys API conversion, let's rename trace_dma_*map_page() to be trace_dma_*map_phys(). Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/c0c02d7d8bd4a148072d283353ba227516a76682.1757423202.git.leonro@nvidia.com --- include/trace/events/dma.h | 4 ++-- kernel/dma/mapping.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index ee90d6f1dcf3..84416c7d6bfa 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -72,7 +72,7 @@ DEFINE_EVENT(dma_map, name, \ size_t size, enum dma_data_direction dir, unsigned long attrs), \ TP_ARGS(dev, phys_addr, dma_addr, size, dir, attrs)) -DEFINE_MAP_EVENT(dma_map_page); +DEFINE_MAP_EVENT(dma_map_phys); DEFINE_MAP_EVENT(dma_map_resource); DECLARE_EVENT_CLASS(dma_unmap, @@ -110,7 +110,7 @@ DEFINE_EVENT(dma_unmap, name, \ enum dma_data_direction dir, unsigned long attrs), \ TP_ARGS(dev, addr, size, dir, attrs)) -DEFINE_UNMAP_EVENT(dma_unmap_page); +DEFINE_UNMAP_EVENT(dma_unmap_phys); DEFINE_UNMAP_EVENT(dma_unmap_resource); DECLARE_EVENT_CLASS(dma_alloc_class, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 0b7e16c69bf1..bd3bb6d59d72 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -173,7 +173,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, else addr = ops->map_page(dev, page, offset, size, dir, attrs); kmsan_handle_dma(page, offset, size, dir); - trace_dma_map_page(dev, phys, addr, size, dir, attrs); + trace_dma_map_phys(dev, phys, addr, size, dir, attrs); debug_dma_map_phys(dev, phys, size, dir, addr, attrs); return addr; @@ -193,7 +193,7 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, iommu_dma_unmap_page(dev, addr, size, dir, attrs); else ops->unmap_page(dev, addr, size, dir, attrs); - trace_dma_unmap_page(dev, addr, size, dir, attrs); + trace_dma_unmap_phys(dev, addr, size, dir, attrs); debug_dma_unmap_phys(dev, addr, size, dir); } EXPORT_SYMBOL(dma_unmap_page_attrs); -- cgit v1.2.3 From 513559f73700966ded094b090c3ecc6dff877ef9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 9 Sep 2025 16:27:33 +0300 Subject: iommu/dma: rename iommu_dma_*map_page to iommu_dma_*map_phys Rename the IOMMU DMA mapping functions to better reflect their actual calling convention. The functions iommu_dma_map_page() and iommu_dma_unmap_page() are renamed to iommu_dma_map_phys() and iommu_dma_unmap_phys() respectively, as they already operate on physical addresses rather than page structures. The calling convention changes from accepting (struct page *page, unsigned long offset) to (phys_addr_t phys), which eliminates the need for page-to-physical address conversion within the functions. This renaming prepares for the broader DMA API conversion from page-based to physical address-based mapping throughout the kernel. All callers are updated to pass physical addresses directly, including dma_map_page_attrs(), scatterlist mapping functions, and DMA page allocation helpers. The change simplifies the code by removing the page_to_phys() + offset calculation that was previously done inside the IOMMU functions. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/ed172f95f8f57782beae04f782813366894e98df.1757423202.git.leonro@nvidia.com --- drivers/iommu/dma-iommu.c | 14 ++++++-------- include/linux/iommu-dma.h | 7 +++---- kernel/dma/mapping.c | 4 ++-- kernel/dma/ops_helpers.c | 6 +++--- 4 files changed, 14 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index e1185ba73e23..aea119f32f96 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1195,11 +1195,9 @@ static inline size_t iova_unaligned(struct iova_domain *iovad, phys_addr_t phys, return iova_offset(iovad, phys | size); } -dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; bool coherent = dev_is_dma_coherent(dev); int prot = dma_info_to_prot(dir, coherent, attrs); struct iommu_domain *domain = iommu_get_dma_domain(dev); @@ -1227,7 +1225,7 @@ dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, return iova; } -void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, +void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { struct iommu_domain *domain = iommu_get_dma_domain(dev); @@ -1346,7 +1344,7 @@ static void iommu_dma_unmap_sg_swiotlb(struct device *dev, struct scatterlist *s int i; for_each_sg(sg, s, nents, i) - iommu_dma_unmap_page(dev, sg_dma_address(s), + iommu_dma_unmap_phys(dev, sg_dma_address(s), sg_dma_len(s), dir, attrs); } @@ -1359,8 +1357,8 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg, sg_dma_mark_swiotlb(sg); for_each_sg(sg, s, nents, i) { - sg_dma_address(s) = iommu_dma_map_page(dev, sg_page(s), - s->offset, s->length, dir, attrs); + sg_dma_address(s) = iommu_dma_map_phys(dev, sg_phys(s), + s->length, dir, attrs); if (sg_dma_address(s) == DMA_MAPPING_ERROR) goto out_unmap; sg_dma_len(s) = s->length; diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index 508beaa44c39..485bdffed988 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -21,10 +21,9 @@ static inline bool use_dma_iommu(struct device *dev) } #endif /* CONFIG_IOMMU_DMA */ -dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs); -void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, +dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index bd3bb6d59d72..90ad728205b9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -169,7 +169,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, arch_dma_map_page_direct(dev, phys + size)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else if (use_dma_iommu(dev)) - addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs); + addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); kmsan_handle_dma(page, offset, size, dir); @@ -190,7 +190,7 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, arch_dma_unmap_page_direct(dev, addr + size)) dma_direct_unmap_page(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) - iommu_dma_unmap_page(dev, addr, size, dir, attrs); + iommu_dma_unmap_phys(dev, addr, size, dir, attrs); else ops->unmap_page(dev, addr, size, dir, attrs); trace_dma_unmap_phys(dev, addr, size, dir, attrs); diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 9afd569eadb9..6f9d604d9d40 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -72,8 +72,8 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, return NULL; if (use_dma_iommu(dev)) - *dma_handle = iommu_dma_map_page(dev, page, 0, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); + *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size, + dir, DMA_ATTR_SKIP_CPU_SYNC); else *dma_handle = ops->map_page(dev, page, 0, size, dir, DMA_ATTR_SKIP_CPU_SYNC); @@ -92,7 +92,7 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, const struct dma_map_ops *ops = get_dma_ops(dev); if (use_dma_iommu(dev)) - iommu_dma_unmap_page(dev, dma_handle, size, dir, + iommu_dma_unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); else if (ops->unmap_page) ops->unmap_page(dev, dma_handle, size, dir, -- cgit v1.2.3 From e53d29f957b36ba1666331956c6ccb047bb157d2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 9 Sep 2025 16:27:35 +0300 Subject: dma-mapping: convert dma_direct_*map_page to be phys_addr_t based Convert the DMA direct mapping functions to accept physical addresses directly instead of page+offset parameters. The functions were already operating on physical addresses internally, so this change eliminates the redundant page-to-physical conversion at the API boundary. The functions dma_direct_map_page() and dma_direct_unmap_page() are renamed to dma_direct_map_phys() and dma_direct_unmap_phys() respectively, with their calling convention changed from (struct page *page, unsigned long offset) to (phys_addr_t phys). Architecture-specific functions arch_dma_map_page_direct() and arch_dma_unmap_page_direct() are similarly renamed to arch_dma_map_phys_direct() and arch_dma_unmap_phys_direct(). The is_pci_p2pdma_page() checks are replaced with DMA_ATTR_MMIO checks to allow integration with dma_direct_map_resource and dma_direct_map_phys() is extended to support MMIO path either. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/bb15a22f76dc2e26683333ff54e789606cfbfcf0.1757423202.git.leonro@nvidia.com --- arch/powerpc/kernel/dma-iommu.c | 4 +-- include/linux/dma-map-ops.h | 8 +++--- kernel/dma/direct.c | 6 ++--- kernel/dma/direct.h | 57 ++++++++++++++++++++++++++--------------- kernel/dma/mapping.c | 8 +++--- 5 files changed, 49 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 4d64a5db50f3..0359ab72cd3b 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -14,7 +14,7 @@ #define can_map_direct(dev, addr) \ ((dev)->bus_dma_limit >= phys_to_dma((dev), (addr))) -bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr) +bool arch_dma_map_phys_direct(struct device *dev, phys_addr_t addr) { if (likely(!dev->bus_dma_limit)) return false; @@ -24,7 +24,7 @@ bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr) #define is_direct_handle(dev, h) ((h) >= (dev)->archdata.dma_offset) -bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle) +bool arch_dma_unmap_phys_direct(struct device *dev, dma_addr_t dma_handle) { if (likely(!dev->bus_dma_limit)) return false; diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 332b80c42b6f..10882d00cb17 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -395,15 +395,15 @@ void *arch_dma_set_uncached(void *addr, size_t size); void arch_dma_clear_uncached(void *addr, size_t size); #ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT -bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr); -bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle); +bool arch_dma_map_phys_direct(struct device *dev, phys_addr_t addr); +bool arch_dma_unmap_phys_direct(struct device *dev, dma_addr_t dma_handle); bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg, int nents); bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, int nents); #else -#define arch_dma_map_page_direct(d, a) (false) -#define arch_dma_unmap_page_direct(d, a) (false) +#define arch_dma_map_phys_direct(d, a) (false) +#define arch_dma_unmap_phys_direct(d, a) (false) #define arch_dma_map_sg_direct(d, s, n) (false) #define arch_dma_unmap_sg_direct(d, s, n) (false) #endif diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 302e89580972..ba7524f169bc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -448,7 +448,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else - dma_direct_unmap_page(dev, sg->dma_address, + dma_direct_unmap_phys(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } } @@ -471,8 +471,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, */ break; case PCI_P2PDMA_MAP_NONE: - sg->dma_address = dma_direct_map_page(dev, sg_page(sg), - sg->offset, sg->length, dir, attrs); + sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), + sg->length, dir, attrs); if (sg->dma_address == DMA_MAPPING_ERROR) { ret = -EIO; goto out_unmap; diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index d2c0b7e632fc..da2fadf45bcd 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -80,42 +80,57 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, arch_dma_mark_clean(paddr, size); } -static inline dma_addr_t dma_direct_map_page(struct device *dev, - struct page *page, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) +static inline dma_addr_t dma_direct_map_phys(struct device *dev, + phys_addr_t phys, size_t size, enum dma_data_direction dir, + unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); + dma_addr_t dma_addr; if (is_swiotlb_force_bounce(dev)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) + goto err_overflow; + return swiotlb_map(dev, phys, size, dir, attrs); } - if (unlikely(!dma_capable(dev, dma_addr, size, true)) || - dma_kmalloc_needs_bounce(dev, size, dir)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; - if (is_swiotlb_active(dev)) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) { + dma_addr = phys; + if (unlikely(!dma_capable(dev, dma_addr, size, false))) + goto err_overflow; + } else { + dma_addr = phys_to_dma(dev, phys); + if (unlikely(!dma_capable(dev, dma_addr, size, true)) || + dma_kmalloc_needs_bounce(dev, size, dir)) { + if (is_swiotlb_active(dev)) + return swiotlb_map(dev, phys, size, dir, attrs); + + goto err_overflow; + } } - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); return dma_addr; + +err_overflow: + dev_WARN_ONCE( + dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, +static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = dma_to_phys(dev, addr); + phys_addr_t phys; + + if (attrs & DMA_ATTR_MMIO) + /* nothing to do: uncached and no swiotlb */ + return; + phys = dma_to_phys(dev, addr); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 90ad728205b9..3ac7d15e095f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -166,8 +166,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, return DMA_MAPPING_ERROR; if (dma_map_direct(dev, ops) || - arch_dma_map_page_direct(dev, phys + size)) - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + arch_dma_map_phys_direct(dev, phys + size)) + addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else @@ -187,8 +187,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops) || - arch_dma_unmap_page_direct(dev, addr + size)) - dma_direct_unmap_page(dev, addr, size, dir, attrs); + arch_dma_unmap_phys_direct(dev, addr + size)) + dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, addr, size, dir, attrs); else -- cgit v1.2.3 From 6eb1e769b2c13a33cb2ca694454a7561d3d72c0a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 9 Sep 2025 16:27:36 +0300 Subject: kmsan: convert kmsan_handle_dma to use physical addresses Convert the KMSAN DMA handling function from page-based to physical address-based interface. The refactoring renames kmsan_handle_dma() parameters from accepting (struct page *page, size_t offset, size_t size) to (phys_addr_t phys, size_t size). The existing semantics where callers are expected to provide only kmap memory is continued here. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/3557cbaf66e935bc794f37d2b891ef75cbf2c80c.1757423202.git.leonro@nvidia.com --- drivers/virtio/virtio_ring.c | 4 ++-- include/linux/kmsan.h | 9 ++++----- kernel/dma/mapping.c | 3 ++- mm/kmsan/hooks.c | 10 ++++++---- tools/virtio/linux/kmsan.h | 2 +- 5 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index f5062061c408..c147145a6593 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -378,7 +378,7 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist * is initialized by the hardware. Explicitly check/unpoison it * depending on the direction. */ - kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); + kmsan_handle_dma(sg_phys(sg), sg->length, direction); *addr = (dma_addr_t)sg_phys(sg); return 0; } @@ -3157,7 +3157,7 @@ dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) { - kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, dir); + kmsan_handle_dma(virt_to_phys(ptr), size, dir); return (dma_addr_t)virt_to_phys(ptr); } diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 2b1432cc16d5..f2fd221107bb 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -182,8 +182,7 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); /** * kmsan_handle_dma() - Handle a DMA data transfer. - * @page: first page of the buffer. - * @offset: offset of the buffer within the first page. + * @phys: physical address of the buffer. * @size: buffer size. * @dir: one of possible dma_data_direction values. * @@ -192,7 +191,7 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); * * initializes the buffer, if it is copied from device; * * does both, if this is a DMA_BIDIRECTIONAL transfer. */ -void kmsan_handle_dma(struct page *page, size_t offset, size_t size, +void kmsan_handle_dma(phys_addr_t phys, size_t size, enum dma_data_direction dir); /** @@ -372,8 +371,8 @@ static inline void kmsan_iounmap_page_range(unsigned long start, { } -static inline void kmsan_handle_dma(struct page *page, size_t offset, - size_t size, enum dma_data_direction dir) +static inline void kmsan_handle_dma(phys_addr_t phys, size_t size, + enum dma_data_direction dir) { } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 3ac7d15e095f..e47bcf7cc43d 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -172,7 +172,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); - kmsan_handle_dma(page, offset, size, dir); + + kmsan_handle_dma(phys, size, dir); trace_dma_map_phys(dev, phys, addr, size, dir, attrs); debug_dma_map_phys(dev, phys, size, dir, addr, attrs); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 97de3d6194f0..fa9475e5ec4e 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -336,14 +336,16 @@ static void kmsan_handle_dma_page(const void *addr, size_t size, } /* Helper function to handle DMA data transfers. */ -void kmsan_handle_dma(struct page *page, size_t offset, size_t size, +void kmsan_handle_dma(phys_addr_t phys, size_t size, enum dma_data_direction dir) { - u64 page_offset, to_go, addr; + struct page *page = phys_to_page(phys); + u64 page_offset, to_go; + void *addr; - if (PageHighMem(page)) + if (PhysHighMem(phys)) return; - addr = (u64)page_address(page) + offset; + addr = page_to_virt(page); /* * The kernel may occasionally give us adjacent DMA pages not belonging * to the same allocation. Process them separately to avoid triggering diff --git a/tools/virtio/linux/kmsan.h b/tools/virtio/linux/kmsan.h index 272b5aa285d5..6cd2e3efd03d 100644 --- a/tools/virtio/linux/kmsan.h +++ b/tools/virtio/linux/kmsan.h @@ -4,7 +4,7 @@ #include -inline void kmsan_handle_dma(struct page *page, size_t offset, size_t size, +inline void kmsan_handle_dma(phys_addr_t phys, size_t size, enum dma_data_direction dir) { } -- cgit v1.2.3 From f7326196a781622b33bfbdabb00f5e72b5fb5679 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 9 Sep 2025 16:27:39 +0300 Subject: dma-mapping: export new dma_*map_phys() interface Introduce new DMA mapping functions dma_map_phys() and dma_unmap_phys() that operate directly on physical addresses instead of page+offset parameters. This provides a more efficient interface for drivers that already have physical addresses available. The new functions are implemented as the primary mapping layer, with the existing dma_map_page_attrs()/dma_map_resource() and dma_unmap_page_attrs()/dma_unmap_resource() functions converted to simple wrappers around the phys-based implementations. In case dma_map_page_attrs(), the struct page is converted to physical address with help of page_to_phys() function and dma_map_resource() provides physical address as is together with addition of DMA_ATTR_MMIO attribute. The old page-based API is preserved in mapping.c to ensure that existing code won't be affected by changing EXPORT_SYMBOL to EXPORT_SYMBOL_GPL variant for dma_*map_phys(). Reviewed-by: Jason Gunthorpe Reviewed-by: Keith Busch Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/54cc52af91777906bbe4a386113437ba0bcfba9c.1757423202.git.leonro@nvidia.com --- drivers/iommu/dma-iommu.c | 14 --------- include/linux/dma-direct.h | 2 -- include/linux/dma-mapping.h | 13 +++++++++ include/linux/iommu-dma.h | 4 --- include/trace/events/dma.h | 2 -- kernel/dma/debug.c | 43 ---------------------------- kernel/dma/debug.h | 21 -------------- kernel/dma/direct.c | 16 ----------- kernel/dma/mapping.c | 69 ++++++++++++++++++++++++--------------------- 9 files changed, 50 insertions(+), 134 deletions(-) (limited to 'include') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 6804aaf034a1..7944a3af4545 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1556,20 +1556,6 @@ void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, __iommu_dma_unmap(dev, start, end - start); } -dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - return __iommu_dma_map(dev, phys, size, - dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO, - dma_get_mask(dev)); -} - -void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - __iommu_dma_unmap(dev, handle, size); -} - static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) { size_t alloc_size = PAGE_ALIGN(size); diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index f3bc0bcd7098..c249912456f9 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -149,7 +149,5 @@ void dma_direct_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_addr, enum dma_data_direction dir); int dma_direct_supported(struct device *dev, u64 mask); -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 4254fd9bdf5d..8248ff9363ee 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -138,6 +138,10 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); +dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, @@ -192,6 +196,15 @@ static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } +static inline dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + return DMA_MAPPING_ERROR; +} +static inline void dma_unmap_phys(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ +} static inline unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index 485bdffed988..a92b3ff9b934 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -42,10 +42,6 @@ size_t iommu_dma_opt_mapping_size(void); size_t iommu_dma_max_mapping_size(struct device *dev); void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle, unsigned long attrs); -dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, - size_t size, enum dma_data_direction dir, unsigned long attrs); -void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, unsigned long attrs); struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); void iommu_dma_free_noncontiguous(struct device *dev, size_t size, diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 84416c7d6bfa..5da59fd8121d 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -73,7 +73,6 @@ DEFINE_EVENT(dma_map, name, \ TP_ARGS(dev, phys_addr, dma_addr, size, dir, attrs)) DEFINE_MAP_EVENT(dma_map_phys); -DEFINE_MAP_EVENT(dma_map_resource); DECLARE_EVENT_CLASS(dma_unmap, TP_PROTO(struct device *dev, dma_addr_t addr, size_t size, @@ -111,7 +110,6 @@ DEFINE_EVENT(dma_unmap, name, \ TP_ARGS(dev, addr, size, dir, attrs)) DEFINE_UNMAP_EVENT(dma_unmap_phys); -DEFINE_UNMAP_EVENT(dma_unmap_resource); DECLARE_EVENT_CLASS(dma_alloc_class, TP_PROTO(struct device *dev, void *virt_addr, dma_addr_t dma_addr, diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index b275db9ca6a0..1e5c64cb6a42 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -38,7 +38,6 @@ enum { dma_debug_single, dma_debug_sg, dma_debug_coherent, - dma_debug_resource, dma_debug_noncoherent, dma_debug_phy, }; @@ -142,7 +141,6 @@ static const char *type2name[] = { [dma_debug_single] = "single", [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", - [dma_debug_resource] = "resource", [dma_debug_noncoherent] = "noncoherent", [dma_debug_phy] = "phy", }; @@ -1446,47 +1444,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size, check_unmap(&ref); } -void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_resource; - entry->dev = dev; - entry->paddr = addr; - entry->size = size; - entry->dev_addr = dma_addr; - entry->direction = direction; - entry->map_err_type = MAP_ERR_NOT_CHECKED; - - add_dma_entry(entry, attrs); -} - -void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - struct dma_debug_entry ref = { - .type = dma_debug_resource, - .dev = dev, - .dev_addr = dma_addr, - .size = size, - .direction = direction, - }; - - if (unlikely(dma_debug_disabled())) - return; - - check_unmap(&ref); -} - void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index bedae973e725..da7be0bddcf6 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -30,14 +30,6 @@ extern void debug_dma_alloc_coherent(struct device *dev, size_t size, extern void debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr); -extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs); - -extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction); - extern void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction); @@ -95,19 +87,6 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size, { } -static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs) -{ -} - -static inline void debug_dma_unmap_resource(struct device *dev, - dma_addr_t dma_addr, size_t size, - int direction) -{ -} - static inline void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index ba7524f169bc..1f9ee9759426 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -497,22 +497,6 @@ out_unmap: return ret; } -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t dma_addr = paddr; - - if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - dev_err_once(dev, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - WARN_ON_ONCE(1); - return DMA_MAPPING_ERROR; - } - - return dma_addr; -} - int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 95eab531e227..fe7472f13b10 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -152,12 +152,10 @@ static inline bool dma_map_direct(struct device *dev, return dma_go_direct(dev, *dev->dma_mask, ops); } -dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, - size_t offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); - phys_addr_t phys = page_to_phys(page) + offset; bool is_mmio = attrs & DMA_ATTR_MMIO; dma_addr_t addr; @@ -177,6 +175,9 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, addr = ops->map_resource(dev, phys, size, dir, attrs); } else { + struct page *page = phys_to_page(phys); + size_t offset = offset_in_page(phys); + /* * The dma_ops API contract for ops->map_page() requires * kmappable memory, while ops->map_resource() does not. @@ -191,9 +192,26 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, return addr; } +EXPORT_SYMBOL_GPL(dma_map_phys); + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + + if (unlikely(attrs & DMA_ATTR_MMIO)) + return DMA_MAPPING_ERROR; + + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(is_zone_device_page(page))) + return DMA_MAPPING_ERROR; + + return dma_map_phys(dev, phys, size, dir, attrs); +} EXPORT_SYMBOL(dma_map_page_attrs); -void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, +void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -213,6 +231,16 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, trace_dma_unmap_phys(dev, addr, size, dir, attrs); debug_dma_unmap_phys(dev, addr, size, dir); } +EXPORT_SYMBOL_GPL(dma_unmap_phys); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + if (unlikely(attrs & DMA_ATTR_MMIO)) + return; + + dma_unmap_phys(dev, addr, size, dir, attrs); +} EXPORT_SYMBOL(dma_unmap_page_attrs); static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, @@ -338,41 +366,18 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr = DMA_MAPPING_ERROR; - - BUG_ON(!valid_dma_direction(dir)); - - if (WARN_ON_ONCE(!dev->dma_mask)) + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; - if (dma_map_direct(dev, ops)) - addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); - else if (use_dma_iommu(dev)) - addr = iommu_dma_map_resource(dev, phys_addr, size, dir, attrs); - else if (ops->map_resource) - addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - - trace_dma_map_resource(dev, phys_addr, addr, size, dir, attrs); - debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs); - return addr; + return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_map_resource); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_map_direct(dev, ops)) - ; /* nothing to do: uncached and no swiotlb */ - else if (use_dma_iommu(dev)) - iommu_dma_unmap_resource(dev, addr, size, dir, attrs); - else if (ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - trace_dma_unmap_resource(dev, addr, size, dir, attrs); - debug_dma_unmap_resource(dev, addr, size, dir); + dma_unmap_phys(dev, addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_unmap_resource); -- cgit v1.2.3 From 5f790208d68fe1526c751dc2af366c7b552b8631 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 10 Sep 2025 22:42:47 +0200 Subject: net: phy: fixed_phy: remove two function stubs Remove stubs for fixed_phy_set_link_update() and fixed_phy_change_carrier() because all callers (actually just one per function) select config symbol FIXED_PHY. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/8729170d-cf39-48d9-aabc-c9aa4acda070@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 6227a1bdefec..d17ff750c708 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -37,16 +37,6 @@ fixed_phy_register(const struct fixed_phy_status *status, static inline void fixed_phy_unregister(struct phy_device *phydev) { } -static inline int fixed_phy_set_link_update(struct phy_device *phydev, - int (*link_update)(struct net_device *, - struct fixed_phy_status *)) -{ - return -ENODEV; -} -static inline int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier) -{ - return -EINVAL; -} #endif /* CONFIG_FIXED_PHY */ #endif /* __PHY_FIXED_H */ -- cgit v1.2.3 From 21446c06b441b9c993870efae71aef4e9aa72ec7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 4 Sep 2025 19:07:23 +0200 Subject: net: bridge: Introduce UAPI for BR_BOOLOPT_FDB_LOCAL_VLAN_0 The previous patches introduced a new option, BR_BOOLOPT_FDB_LOCAL_VLAN_0. When enabled, it has local FDB entries installed only on VLAN 0, instead of duplicating them across all VLANs. In this patch, add the corresponding UAPI toggle, and the code for turning the feature on and off. Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/ea99bfb10f687fa58091e6e1c2f8acc33f47ca45.1757004393.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 3 ++ net/bridge/br.c | 22 ++++++++++ net/bridge/br_fdb.c | 96 ++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 2 + 4 files changed, 123 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 73876c0e2bba..e52f8207ab27 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -823,6 +823,8 @@ struct br_mcast_stats { /* bridge boolean options * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets * BR_BOOLOPT_MCAST_VLAN_SNOOPING - control vlan multicast snooping + * BR_BOOLOPT_FDB_LOCAL_VLAN_0 - local FDB entries installed by the bridge + * driver itself should only be added on VLAN 0 * * IMPORTANT: if adding a new option do not forget to handle * it in br_boolopt_toggle/get and bridge sysfs @@ -832,6 +834,7 @@ enum br_boolopt_id { BR_BOOLOPT_MCAST_VLAN_SNOOPING, BR_BOOLOPT_MST_ENABLE, BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION, + BR_BOOLOPT_FDB_LOCAL_VLAN_0, BR_BOOLOPT_MAX }; diff --git a/net/bridge/br.c b/net/bridge/br.c index c683baa3847f..512872a2ef81 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -259,6 +259,23 @@ static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; +static int +br_toggle_fdb_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + int err; + + if (br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0) == on) + return 0; + + err = br_fdb_toggle_local_vlan_0(br, on, extack); + if (err) + return err; + + br_opt_toggle(br, BROPT_FDB_LOCAL_VLAN_0, on); + return 0; +} + /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device @@ -287,6 +304,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); break; + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + err = br_toggle_fdb_local_vlan_0(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -307,6 +327,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MST_ENABLED); case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + return br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); default: /* shouldn't be called with unsupported options */ WARN_ON(1); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 4a20578517a5..58d22e2b85fc 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -582,6 +582,102 @@ void br_fdb_cleanup(struct work_struct *work) mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } +static void br_fdb_delete_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) + br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); +} + +static void br_fdb_delete_locals_per_vlan(struct net_bridge *br) +{ + struct net_bridge_port *p; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) + br_fdb_delete_locals_per_vlan_port(br, p); + + br_fdb_delete_locals_per_vlan_port(br, NULL); +} + +static int br_fdb_insert_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + int err; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + + err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); + if (err) + return err; + } + + return 0; +} + +static int br_fdb_insert_locals_per_vlan(struct net_bridge *br, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port *p; + int err; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) { + err = br_fdb_insert_locals_per_vlan_port(br, p, extack); + if (err) + goto rollback; + } + + err = br_fdb_insert_locals_per_vlan_port(br, NULL, extack); + if (err) + goto rollback; + + return 0; + +rollback: + NL_SET_ERR_MSG_MOD(extack, "fdb_local_vlan_0 toggle: FDB entry insertion failed"); + br_fdb_delete_locals_per_vlan(br); + return err; +} + +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + if (!on) + return br_fdb_insert_locals_per_vlan(br, extack); + + br_fdb_delete_locals_per_vlan(br); + return 0; +} + static bool __fdb_flush_matches(const struct net_bridge *br, const struct net_bridge_fdb_entry *f, const struct net_bridge_fdb_flush_desc *desc) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 87da287f19fe..16be5d250402 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -844,6 +844,8 @@ void br_fdb_find_delete_local(struct net_bridge *br, void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(struct work_struct *work); +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, -- cgit v1.2.3 From 9e472d9e84b11e9f3c429eba97c2a9e74461a884 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 9 Sep 2025 02:18:50 +0100 Subject: tcp: Destroy TCP-AO, TCP-MD5 keys in .sk_destruct() Currently there are a couple of minor issues with destroying the keys tcp_v4_destroy_sock(): 1. The socket is yet in TCP bind buckets, making it reachable for incoming segments [on another CPU core], potentially available to send late FIN/ACK/RST replies. 2. There is at least one code path, where tcp_done() is called before sending RST [kudos to Bob for investigation]. This is a case of a server, that finished sending its data and just called close(). The socket is in TCP_FIN_WAIT2 and has RCV_SHUTDOWN (set by __tcp_close()) tcp_v4_do_rcv()/tcp_v6_do_rcv() tcp_rcv_state_process() /* LINUX_MIB_TCPABORTONDATA */ tcp_reset() tcp_done_with_error() tcp_done() inet_csk_destroy_sock() /* Destroys AO/MD5 keys */ /* tcp_rcv_state_process() returns SKB_DROP_REASON_TCP_ABORT_ON_DATA */ tcp_v4_send_reset() /* Sends an unsigned RST segment */ tcpdump: > 22:53:15.399377 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 33929, offset 0, flags [DF], proto TCP (6), length 60) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [F.], seq 2185658590, ack 3969644355, win 502, options [nop,nop,md5 valid], length 0 > 22:53:15.399396 00:00:01:01:00:00 > 00:00:b2:1f:00:00, ethertype IPv4 (0x0800), length 86: (tos 0x0, ttl 64, id 51951, offset 0, flags [DF], proto TCP (6), length 72) > 1.0.0.2.49848 > 1.0.0.1.34567: Flags [.], seq 3969644375, ack 2185658591, win 128, options [nop,nop,md5 valid,nop,nop,sack 1 {2185658590:2185658591}], length 0 > 22:53:16.429588 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 60: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658590, win 0, length 0 > 22:53:16.664725 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0 > 22:53:17.289832 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0 Note the signed RSTs later in the dump - those are sent by the server when the fin-wait socket gets removed from hash buckets, by the listener socket. Instead of destroying AO/MD5 info and their keys in inet_csk_destroy_sock(), slightly delay it until the actual socket .sk_destruct(). As shutdown'ed socket can yet send non-data replies, they should be signed in order for the peer to process them. Now it also matches how AO/MD5 gets destructed for TIME-WAIT sockets (in tcp_twsk_destructor()). This seems optimal for TCP-MD5, while for TCP-AO it seems to have an open problem: once RST get sent and socket gets actually destructed, there is no information on the initial sequence numbers. So, in case this last RST gets lost in the network, the server's listener socket won't be able to properly sign another RST. Nothing in RFC 1122 prescribes keeping any local state after non-graceful reset. Luckily, BGP are known to use keep alive(s). While the issue is quite minor/cosmetic, these days monitoring network counters is a common practice and getting invalid signed segments from a trusted BGP peer can get customers worried. Investigated-by: Bob Gilligan Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Signed-off-by: Dmitry Safonov Link: https://patch.msgid.link/20250909-b4-tcp-ao-md5-rst-finwait2-v5-1-9ffaaaf8b236@arista.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 4 ++++ net/ipv4/tcp.c | 27 +++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 33 ++++++++------------------------- net/ipv6/tcp_ipv6.c | 8 ++++++++ 4 files changed, 47 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0fb7923b8367..277914c4d067 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1941,6 +1941,7 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk, } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) +void tcp_md5_destruct_sock(struct sock *sk); #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, @@ -1957,6 +1958,9 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk, } #define tcp_twsk_md5_key(twsk) NULL +static inline void tcp_md5_destruct_sock(struct sock *sk) +{ +} #endif int tcp_md5_alloc_sigpool(void); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9c576dc9a1f7..7c6c143017ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -412,6 +412,33 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) return rate64; } +#ifdef CONFIG_TCP_MD5SIG +static void tcp_md5sig_info_free_rcu(struct rcu_head *head) +{ + struct tcp_md5sig_info *md5sig; + + md5sig = container_of(head, struct tcp_md5sig_info, rcu); + kfree(md5sig); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); +} + +void tcp_md5_destruct_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->md5sig_info) { + struct tcp_md5sig_info *md5sig; + + md5sig = rcu_dereference_protected(tp->md5sig_info, 1); + tcp_clear_md5_list(sk); + rcu_assign_pointer(tp->md5sig_info, NULL); + call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); + } +} +EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); +#endif + /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1e58a8a9ff7a..17176a5d8638 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2494,6 +2494,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; + +static void tcp4_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -2509,23 +2516,12 @@ static int tcp_v4_init_sock(struct sock *sk) #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; + sk->sk_destruct = tcp4_destruct_sock; #endif return 0; } -#ifdef CONFIG_TCP_MD5SIG -static void tcp_md5sig_info_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_info *md5sig; - - md5sig = container_of(head, struct tcp_md5sig_info, rcu); - kfree(md5sig); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} -#endif - static void tcp_release_user_frags(struct sock *sk) { #ifdef CONFIG_PAGE_POOL @@ -2562,19 +2558,6 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefully empty, out_of_order_queue. */ skb_rbtree_purge(&tp->out_of_order_queue); -#ifdef CONFIG_TCP_MD5SIG - /* Clean up the MD5 key list, if any */ - if (tp->md5sig_info) { - struct tcp_md5sig_info *md5sig; - - md5sig = rcu_dereference_protected(tp->md5sig_info, 1); - tcp_clear_md5_list(sk); - call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); - rcu_assign_pointer(tp->md5sig_info, NULL); - } -#endif - tcp_ao_destroy_sock(sk, false); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0562e939b2e3..08dabc47a6e7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2110,6 +2110,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; + +static void tcp6_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet6_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -2125,6 +2132,7 @@ static int tcp_v6_init_sock(struct sock *sk) #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; + sk->sk_destruct = tcp6_destruct_sock; #endif return 0; -- cgit v1.2.3 From 51e547e8c89c661f6fbede4a28b1d33b13625683 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 9 Sep 2025 02:18:51 +0100 Subject: tcp: Free TCP-AO/TCP-MD5 info/keys without RCU Now that the destruction of info/keys is delayed until the socket destructor, it's safe to use kfree() without an RCU callback. The socket is in TCP_CLOSE state either because it never left it, or it's already closed and the refcounter is zero. In any way, no one can discover it anymore, it's safe to release memory straight away. Similar thing was possible for twsk already. Reviewed-by: Kuniyuki Iwashima Signed-off-by: Dmitry Safonov Link: https://patch.msgid.link/20250909-b4-tcp-ao-md5-rst-finwait2-v5-2-9ffaaaf8b236@arista.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ao.h | 1 - net/ipv4/tcp.c | 17 +++-------------- net/ipv4/tcp_ao.c | 5 ++--- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_minisocks.c | 19 +++++-------------- 5 files changed, 12 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index df655ce6987d..1e9e27d6e06b 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -130,7 +130,6 @@ struct tcp_ao_info { u32 snd_sne; u32 rcv_sne; refcount_t refcnt; /* Protects twsk destruction */ - struct rcu_head rcu; }; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7c6c143017ef..7f9c671b1ee0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -413,27 +413,16 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) } #ifdef CONFIG_TCP_MD5SIG -static void tcp_md5sig_info_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_info *md5sig; - - md5sig = container_of(head, struct tcp_md5sig_info, rcu); - kfree(md5sig); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} - void tcp_md5_destruct_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->md5sig_info) { - struct tcp_md5sig_info *md5sig; - md5sig = rcu_dereference_protected(tp->md5sig_info, 1); tcp_clear_md5_list(sk); - rcu_assign_pointer(tp->md5sig_info, NULL); - call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); + kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1)); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); } } EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index bbb8d5f0eae7..31302be78bc4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -268,9 +268,8 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head) kfree_sensitive(key); } -static void tcp_ao_info_free_rcu(struct rcu_head *head) +static void tcp_ao_info_free(struct tcp_ao_info *ao) { - struct tcp_ao_info *ao = container_of(head, struct tcp_ao_info, rcu); struct tcp_ao_key *key; struct hlist_node *n; @@ -310,7 +309,7 @@ void tcp_ao_destroy_sock(struct sock *sk, bool twsk) if (!twsk) tcp_ao_sk_omem_free(sk, ao); - call_rcu(&ao->rcu, tcp_ao_info_free_rcu); + tcp_ao_info_free(ao); } void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 17176a5d8638..2a0602035729 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1503,9 +1503,9 @@ void tcp_clear_md5_list(struct sock *sk) md5sig = rcu_dereference_protected(tp->md5sig_info, 1); hlist_for_each_entry_safe(key, n, &md5sig->head, node) { - hlist_del_rcu(&key->node); + hlist_del(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); - kfree_rcu(key, rcu); + kfree(key); } } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d1c9e4088646..7c2ae07d8d5d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -377,26 +377,17 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } EXPORT_SYMBOL(tcp_time_wait); -#ifdef CONFIG_TCP_MD5SIG -static void tcp_md5_twsk_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_key *key; - - key = container_of(head, struct tcp_md5sig_key, rcu); - kfree(key); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} -#endif - void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) - call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu); + if (twsk->tw_md5_key) { + kfree(twsk->tw_md5_key); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); + } } #endif tcp_ao_destroy_sock(sk, true); -- cgit v1.2.3 From 9b90afa6d613b66ec4e74ae75f9bfa5baf386ecd Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:51 +0200 Subject: gpio: move gpio-mmio-specific fields out of struct gpio_chip With all users of bgpio_init() converted to using the modernized generic GPIO chip API, we can now move the gpio-mmio-specific fields out of struct gpio_chip and into the dedicated struct gpio_generic_chip. To that end: adjust the gpio-mmio driver to the new layout, update the docs, etc. The changes in gpio-mlxbf2.c and gpio-mpc8xxx.c are here and not in their respective conversion commits because the former passes the address of the generic chip's lock to the __releases() annotation and we cannot really hide it while gpio-mpc8xxx.c accesses the shadow registers in a driver-specific workaround and there's no reason to make them available in a public API. Also: drop the relevant task from TODO as it's now done. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-15-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/TODO | 5 - drivers/gpio/gpio-mlxbf2.c | 2 +- drivers/gpio/gpio-mmio.c | 321 ++++++++++++++++++++++--------------------- drivers/gpio/gpio-mpc8xxx.c | 5 +- include/linux/gpio/driver.h | 44 ------ include/linux/gpio/generic.h | 67 ++++++--- 6 files changed, 211 insertions(+), 233 deletions(-) (limited to 'include') diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index b797499e627e..8ed74e05903a 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -131,11 +131,6 @@ Work items: helpers (x86 inb()/outb()) and convert port-mapped I/O drivers to use this with dry-coding and sending to maintainers to test -- Move the MMIO GPIO specific fields out of struct gpio_chip into a - dedicated structure. Currently every GPIO chip has them if gpio-mmio is - enabled in Kconfig even if it itself doesn't register with the helper - library. - ------------------------------------------------------------------------------- Generic regmap GPIO diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c index 7e3b526a6caa..abffce3894fc 100644 --- a/drivers/gpio/gpio-mlxbf2.c +++ b/drivers/gpio/gpio-mlxbf2.c @@ -156,7 +156,7 @@ static int mlxbf2_gpio_lock_acquire(struct mlxbf2_gpio_context *gs) * Release the YU arm_gpio_lock after changing the direction mode. */ static void mlxbf2_gpio_lock_release(struct mlxbf2_gpio_context *gs) - __releases(&gs->chip.gc.bgpio_lock) + __releases(&gs->chip.lock) __releases(yu_arm_gpio_lock_param.lock) { writel(YU_ARM_GPIO_LOCK_RELEASE, yu_arm_gpio_lock_param.io); diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index b4f0ab0daaeb..a3df14d672a9 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -125,20 +125,23 @@ static unsigned long bgpio_read32be(void __iomem *reg) static unsigned long bgpio_line2mask(struct gpio_chip *gc, unsigned int line) { - if (gc->be_bits) - return BIT(gc->bgpio_bits - 1 - line); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (chip->be_bits) + return BIT(chip->bits - 1 - line); return BIT(line); } static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long pinmask = bgpio_line2mask(gc, gpio); - bool dir = !!(gc->bgpio_dir & pinmask); + bool dir = !!(chip->sdir & pinmask); if (dir) - return !!(gc->read_reg(gc->reg_set) & pinmask); - else - return !!(gc->read_reg(gc->reg_dat) & pinmask); + return !!(chip->read_reg(chip->reg_set) & pinmask); + + return !!(chip->read_reg(chip->reg_dat) & pinmask); } /* @@ -148,26 +151,28 @@ static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio) static int bgpio_get_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - unsigned long get_mask = 0; - unsigned long set_mask = 0; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long get_mask = 0, set_mask = 0; /* Make sure we first clear any bits that are zero when we read the register */ *bits &= ~*mask; - set_mask = *mask & gc->bgpio_dir; - get_mask = *mask & ~gc->bgpio_dir; + set_mask = *mask & chip->sdir; + get_mask = *mask & ~chip->sdir; if (set_mask) - *bits |= gc->read_reg(gc->reg_set) & set_mask; + *bits |= chip->read_reg(chip->reg_set) & set_mask; if (get_mask) - *bits |= gc->read_reg(gc->reg_dat) & get_mask; + *bits |= chip->read_reg(chip->reg_dat) & get_mask; return 0; } static int bgpio_get(struct gpio_chip *gc, unsigned int gpio) { - return !!(gc->read_reg(gc->reg_dat) & bgpio_line2mask(gc, gpio)); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + return !!(chip->read_reg(chip->reg_dat) & bgpio_line2mask(gc, gpio)); } /* @@ -176,9 +181,11 @@ static int bgpio_get(struct gpio_chip *gc, unsigned int gpio) static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + /* Make sure we first clear any bits that are zero when we read the register */ *bits &= ~*mask; - *bits |= gc->read_reg(gc->reg_dat) & *mask; + *bits |= chip->read_reg(chip->reg_dat) & *mask; return 0; } @@ -188,6 +195,7 @@ static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long readmask = 0; unsigned long val; int bit; @@ -200,7 +208,7 @@ static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask, readmask |= bgpio_line2mask(gc, bit); /* Read the register */ - val = gc->read_reg(gc->reg_dat) & readmask; + val = chip->read_reg(chip->reg_dat) & readmask; /* * Mirror the result into the "bits" result, this will give line 0 @@ -219,19 +227,20 @@ static int bgpio_set_none(struct gpio_chip *gc, unsigned int gpio, int val) static int bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long mask = bgpio_line2mask(gc, gpio); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); if (val) - gc->bgpio_data |= mask; + chip->sdata |= mask; else - gc->bgpio_data &= ~mask; + chip->sdata &= ~mask; - gc->write_reg(gc->reg_dat, gc->bgpio_data); + chip->write_reg(chip->reg_dat, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return 0; } @@ -239,31 +248,32 @@ static int bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int bgpio_set_with_clear(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long mask = bgpio_line2mask(gc, gpio); if (val) - gc->write_reg(gc->reg_set, mask); + chip->write_reg(chip->reg_set, mask); else - gc->write_reg(gc->reg_clr, mask); + chip->write_reg(chip->reg_clr, mask); return 0; } static int bgpio_set_set(struct gpio_chip *gc, unsigned int gpio, int val) { - unsigned long mask = bgpio_line2mask(gc, gpio); - unsigned long flags; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long mask = bgpio_line2mask(gc, gpio), flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); if (val) - gc->bgpio_data |= mask; + chip->sdata |= mask; else - gc->bgpio_data &= ~mask; + chip->sdata &= ~mask; - gc->write_reg(gc->reg_set, gc->bgpio_data); + chip->write_reg(chip->reg_set, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return 0; } @@ -273,12 +283,13 @@ static void bgpio_multiple_get_masks(struct gpio_chip *gc, unsigned long *set_mask, unsigned long *clear_mask) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); int i; *set_mask = 0; *clear_mask = 0; - for_each_set_bit(i, mask, gc->bgpio_bits) { + for_each_set_bit(i, mask, chip->bits) { if (test_bit(i, bits)) *set_mask |= bgpio_line2mask(gc, i); else @@ -291,25 +302,27 @@ static void bgpio_set_multiple_single_reg(struct gpio_chip *gc, unsigned long *bits, void __iomem *reg) { - unsigned long flags; - unsigned long set_mask, clear_mask; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long flags, set_mask, clear_mask; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); bgpio_multiple_get_masks(gc, mask, bits, &set_mask, &clear_mask); - gc->bgpio_data |= set_mask; - gc->bgpio_data &= ~clear_mask; + chip->sdata |= set_mask; + chip->sdata &= ~clear_mask; - gc->write_reg(reg, gc->bgpio_data); + chip->write_reg(reg, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); } static int bgpio_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - bgpio_set_multiple_single_reg(gc, mask, bits, gc->reg_dat); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + bgpio_set_multiple_single_reg(gc, mask, bits, chip->reg_dat); return 0; } @@ -317,7 +330,9 @@ static int bgpio_set_multiple(struct gpio_chip *gc, unsigned long *mask, static int bgpio_set_multiple_set(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - bgpio_set_multiple_single_reg(gc, mask, bits, gc->reg_set); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + bgpio_set_multiple_single_reg(gc, mask, bits, chip->reg_set); return 0; } @@ -326,21 +341,24 @@ static int bgpio_set_multiple_with_clear(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long set_mask, clear_mask; bgpio_multiple_get_masks(gc, mask, bits, &set_mask, &clear_mask); if (set_mask) - gc->write_reg(gc->reg_set, set_mask); + chip->write_reg(chip->reg_set, set_mask); if (clear_mask) - gc->write_reg(gc->reg_clr, clear_mask); + chip->write_reg(chip->reg_clr, clear_mask); return 0; } static int bgpio_dir_return(struct gpio_chip *gc, unsigned int gpio, bool dir_out) { - if (!gc->bgpio_pinctrl) + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (!chip->pinctrl) return 0; if (dir_out) @@ -375,39 +393,42 @@ static int bgpio_simple_dir_out(struct gpio_chip *gc, unsigned int gpio, static int bgpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); - gc->bgpio_dir &= ~bgpio_line2mask(gc, gpio); + chip->sdir &= ~bgpio_line2mask(gc, gpio); - if (gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); - if (gc->reg_dir_out) - gc->write_reg(gc->reg_dir_out, gc->bgpio_dir); + if (chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); + if (chip->reg_dir_out) + chip->write_reg(chip->reg_dir_out, chip->sdir); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return bgpio_dir_return(gc, gpio, false); } static int bgpio_get_dir(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + /* Return 0 if output, 1 if input */ - if (gc->bgpio_dir_unreadable) { - if (gc->bgpio_dir & bgpio_line2mask(gc, gpio)) + if (chip->dir_unreadable) { + if (chip->sdir & bgpio_line2mask(gc, gpio)) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; } - if (gc->reg_dir_out) { - if (gc->read_reg(gc->reg_dir_out) & bgpio_line2mask(gc, gpio)) + if (chip->reg_dir_out) { + if (chip->read_reg(chip->reg_dir_out) & bgpio_line2mask(gc, gpio)) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; } - if (gc->reg_dir_in) - if (!(gc->read_reg(gc->reg_dir_in) & bgpio_line2mask(gc, gpio))) + if (chip->reg_dir_in) + if (!(chip->read_reg(chip->reg_dir_in) & bgpio_line2mask(gc, gpio))) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; @@ -415,18 +436,19 @@ static int bgpio_get_dir(struct gpio_chip *gc, unsigned int gpio) static void bgpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); - gc->bgpio_dir |= bgpio_line2mask(gc, gpio); + chip->sdir |= bgpio_line2mask(gc, gpio); - if (gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); - if (gc->reg_dir_out) - gc->write_reg(gc->reg_dir_out, gc->bgpio_dir); + if (chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); + if (chip->reg_dir_out) + chip->write_reg(chip->reg_dir_out, chip->sdir); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); } static int bgpio_dir_out_dir_first(struct gpio_chip *gc, unsigned int gpio, @@ -446,31 +468,30 @@ static int bgpio_dir_out_val_first(struct gpio_chip *gc, unsigned int gpio, } static int bgpio_setup_accessors(struct device *dev, - struct gpio_chip *gc, + struct gpio_generic_chip *chip, bool byte_be) { - - switch (gc->bgpio_bits) { + switch (chip->bits) { case 8: - gc->read_reg = bgpio_read8; - gc->write_reg = bgpio_write8; + chip->read_reg = bgpio_read8; + chip->write_reg = bgpio_write8; break; case 16: if (byte_be) { - gc->read_reg = bgpio_read16be; - gc->write_reg = bgpio_write16be; + chip->read_reg = bgpio_read16be; + chip->write_reg = bgpio_write16be; } else { - gc->read_reg = bgpio_read16; - gc->write_reg = bgpio_write16; + chip->read_reg = bgpio_read16; + chip->write_reg = bgpio_write16; } break; case 32: if (byte_be) { - gc->read_reg = bgpio_read32be; - gc->write_reg = bgpio_write32be; + chip->read_reg = bgpio_read32be; + chip->write_reg = bgpio_write32be; } else { - gc->read_reg = bgpio_read32; - gc->write_reg = bgpio_write32; + chip->read_reg = bgpio_read32; + chip->write_reg = bgpio_write32; } break; #if BITS_PER_LONG >= 64 @@ -480,13 +501,13 @@ static int bgpio_setup_accessors(struct device *dev, "64 bit big endian byte order unsupported\n"); return -EINVAL; } else { - gc->read_reg = bgpio_read64; - gc->write_reg = bgpio_write64; + chip->read_reg = bgpio_read64; + chip->write_reg = bgpio_write64; } break; #endif /* BITS_PER_LONG >= 64 */ default: - dev_err(dev, "unsupported data width %u bits\n", gc->bgpio_bits); + dev_err(dev, "unsupported data width %u bits\n", chip->bits); return -EINVAL; } @@ -515,27 +536,25 @@ static int bgpio_setup_accessors(struct device *dev, * - an input direction register (named "dirin") where a 1 bit indicates * the GPIO is an input. */ -static int bgpio_setup_io(struct gpio_chip *gc, - void __iomem *dat, - void __iomem *set, - void __iomem *clr, - unsigned long flags) +static int bgpio_setup_io(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { + struct gpio_chip *gc = &chip->gc; - gc->reg_dat = dat; - if (!gc->reg_dat) + chip->reg_dat = cfg->dat; + if (!chip->reg_dat) return -EINVAL; - if (set && clr) { - gc->reg_set = set; - gc->reg_clr = clr; + if (cfg->set && cfg->clr) { + chip->reg_set = cfg->set; + chip->reg_clr = cfg->clr; gc->set = bgpio_set_with_clear; gc->set_multiple = bgpio_set_multiple_with_clear; - } else if (set && !clr) { - gc->reg_set = set; + } else if (cfg->set && !cfg->clr) { + chip->reg_set = cfg->set; gc->set = bgpio_set_set; gc->set_multiple = bgpio_set_multiple_set; - } else if (flags & BGPIOF_NO_OUTPUT) { + } else if (cfg->flags & BGPIOF_NO_OUTPUT) { gc->set = bgpio_set_none; gc->set_multiple = NULL; } else { @@ -543,10 +562,10 @@ static int bgpio_setup_io(struct gpio_chip *gc, gc->set_multiple = bgpio_set_multiple; } - if (!(flags & BGPIOF_UNREADABLE_REG_SET) && - (flags & BGPIOF_READ_OUTPUT_REG_SET)) { + if (!(cfg->flags & BGPIOF_UNREADABLE_REG_SET) && + (cfg->flags & BGPIOF_READ_OUTPUT_REG_SET)) { gc->get = bgpio_get_set; - if (!gc->be_bits) + if (!chip->be_bits) gc->get_multiple = bgpio_get_set_multiple; /* * We deliberately avoid assigning the ->get_multiple() call @@ -557,7 +576,7 @@ static int bgpio_setup_io(struct gpio_chip *gc, */ } else { gc->get = bgpio_get; - if (gc->be_bits) + if (chip->be_bits) gc->get_multiple = bgpio_get_multiple_be; else gc->get_multiple = bgpio_get_multiple; @@ -566,27 +585,27 @@ static int bgpio_setup_io(struct gpio_chip *gc, return 0; } -static int bgpio_setup_direction(struct gpio_chip *gc, - void __iomem *dirout, - void __iomem *dirin, - unsigned long flags) +static int bgpio_setup_direction(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { - if (dirout || dirin) { - gc->reg_dir_out = dirout; - gc->reg_dir_in = dirin; - if (flags & BGPIOF_NO_SET_ON_INPUT) + struct gpio_chip *gc = &chip->gc; + + if (cfg->dirout || cfg->dirin) { + chip->reg_dir_out = cfg->dirout; + chip->reg_dir_in = cfg->dirin; + if (cfg->flags & BGPIOF_NO_SET_ON_INPUT) gc->direction_output = bgpio_dir_out_dir_first; else gc->direction_output = bgpio_dir_out_val_first; gc->direction_input = bgpio_dir_in; gc->get_direction = bgpio_get_dir; } else { - if (flags & BGPIOF_NO_OUTPUT) + if (cfg->flags & BGPIOF_NO_OUTPUT) gc->direction_output = bgpio_dir_out_err; else gc->direction_output = bgpio_simple_dir_out; - if (flags & BGPIOF_NO_INPUT) + if (cfg->flags & BGPIOF_NO_INPUT) gc->direction_input = bgpio_dir_in_err; else gc->direction_input = bgpio_simple_dir_in; @@ -595,117 +614,101 @@ static int bgpio_setup_direction(struct gpio_chip *gc, return 0; } -static int bgpio_request(struct gpio_chip *chip, unsigned gpio_pin) +static int bgpio_request(struct gpio_chip *gc, unsigned int gpio_pin) { - if (gpio_pin >= chip->ngpio) + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (gpio_pin >= gc->ngpio) return -EINVAL; - if (chip->bgpio_pinctrl) - return gpiochip_generic_request(chip, gpio_pin); + if (chip->pinctrl) + return gpiochip_generic_request(gc, gpio_pin); return 0; } /** - * bgpio_init() - Initialize generic GPIO accessor functions - * @gc: the GPIO chip to set up - * @dev: the parent device of the new GPIO chip (compulsory) - * @sz: the size (width) of the MMIO registers in bytes, typically 1, 2 or 4 - * @dat: MMIO address for the register to READ the value of the GPIO lines, it - * is expected that a 1 in the corresponding bit in this register means the - * line is asserted - * @set: MMIO address for the register to SET the value of the GPIO lines, it is - * expected that we write the line with 1 in this register to drive the GPIO line - * high. - * @clr: MMIO address for the register to CLEAR the value of the GPIO lines, it is - * expected that we write the line with 1 in this register to drive the GPIO line - * low. It is allowed to leave this address as NULL, in that case the SET register - * will be assumed to also clear the GPIO lines, by actively writing the line - * with 0. - * @dirout: MMIO address for the register to set the line as OUTPUT. It is assumed - * that setting a line to 1 in this register will turn that line into an - * output line. Conversely, setting the line to 0 will turn that line into - * an input. - * @dirin: MMIO address for the register to set this line as INPUT. It is assumed - * that setting a line to 1 in this register will turn that line into an - * input line. Conversely, setting the line to 0 will turn that line into - * an output. - * @flags: Different flags that will affect the behaviour of the device, such as - * endianness etc. + * gpio_generic_chip_init() - Initialize a generic GPIO chip. + * @chip: Generic GPIO chip to set up. + * @cfg: Generic GPIO chip configuration. + * + * Returns 0 on success, negative error number on failure. */ -int bgpio_init(struct gpio_chip *gc, struct device *dev, - unsigned long sz, void __iomem *dat, void __iomem *set, - void __iomem *clr, void __iomem *dirout, void __iomem *dirin, - unsigned long flags) +int gpio_generic_chip_init(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { + struct gpio_chip *gc = &chip->gc; + unsigned long flags = cfg->flags; + struct device *dev = cfg->dev; int ret; - if (!is_power_of_2(sz)) + if (!is_power_of_2(cfg->sz)) return -EINVAL; - gc->bgpio_bits = sz * 8; - if (gc->bgpio_bits > BITS_PER_LONG) + chip->bits = cfg->sz * 8; + if (chip->bits > BITS_PER_LONG) return -EINVAL; - raw_spin_lock_init(&gc->bgpio_lock); + raw_spin_lock_init(&chip->lock); gc->parent = dev; gc->label = dev_name(dev); gc->base = -1; gc->request = bgpio_request; - gc->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); + chip->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); ret = gpiochip_get_ngpios(gc, dev); if (ret) - gc->ngpio = gc->bgpio_bits; + gc->ngpio = chip->bits; - ret = bgpio_setup_io(gc, dat, set, clr, flags); + ret = bgpio_setup_io(chip, cfg); if (ret) return ret; - ret = bgpio_setup_accessors(dev, gc, flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); + ret = bgpio_setup_accessors(dev, chip, + flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); if (ret) return ret; - ret = bgpio_setup_direction(gc, dirout, dirin, flags); + ret = bgpio_setup_direction(chip, cfg); if (ret) return ret; if (flags & BGPIOF_PINCTRL_BACKEND) { - gc->bgpio_pinctrl = true; + chip->pinctrl = true; /* Currently this callback is only used for pincontrol */ gc->free = gpiochip_generic_free; } - gc->bgpio_data = gc->read_reg(gc->reg_dat); + chip->sdata = chip->read_reg(chip->reg_dat); if (gc->set == bgpio_set_set && !(flags & BGPIOF_UNREADABLE_REG_SET)) - gc->bgpio_data = gc->read_reg(gc->reg_set); + chip->sdata = chip->read_reg(chip->reg_set); if (flags & BGPIOF_UNREADABLE_REG_DIR) - gc->bgpio_dir_unreadable = true; + chip->dir_unreadable = true; /* * Inspect hardware to find initial direction setting. */ - if ((gc->reg_dir_out || gc->reg_dir_in) && + if ((chip->reg_dir_out || chip->reg_dir_in) && !(flags & BGPIOF_UNREADABLE_REG_DIR)) { - if (gc->reg_dir_out) - gc->bgpio_dir = gc->read_reg(gc->reg_dir_out); - else if (gc->reg_dir_in) - gc->bgpio_dir = ~gc->read_reg(gc->reg_dir_in); + if (chip->reg_dir_out) + chip->sdir = chip->read_reg(chip->reg_dir_out); + else if (chip->reg_dir_in) + chip->sdir = ~chip->read_reg(chip->reg_dir_in); /* * If we have two direction registers, synchronise * input setting to output setting, the library * can not handle a line being input and output at * the same time. */ - if (gc->reg_dir_out && gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); + if (chip->reg_dir_out && chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); } return ret; } -EXPORT_SYMBOL_GPL(bgpio_init); +EXPORT_SYMBOL_GPL(gpio_generic_chip_init); #if IS_ENABLED(CONFIG_GPIO_GENERIC_PLATFORM) diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index dd2cd2cc6e6f..a2a83afb41bb 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -71,7 +71,7 @@ static int mpc8572_gpio_get(struct gpio_chip *gc, unsigned int gpio) mpc8xxx_gc->regs + GPIO_DIR); val = gpio_generic_read_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_DAT) & ~out_mask; - out_shadow = gc->bgpio_data & out_mask; + out_shadow = mpc8xxx_gc->chip.sdata & out_mask; return !!((val | out_shadow) & mpc_pin2mask(gpio)); } @@ -399,7 +399,8 @@ static int mpc8xxx_probe(struct platform_device *pdev) gpio_generic_write_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_IBE, 0xffffffff); /* Also, latch state of GPIOs configured as output by bootloader. */ - gc->bgpio_data = gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->chip.sdata = + gpio_generic_read_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_DAT) & gpio_generic_read_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_DIR); diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 9fcd4a988081..9b14fd20f13e 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -388,28 +388,6 @@ struct gpio_irq_chip { * implies that if the chip supports IRQs, these IRQs need to be threaded * as the chip access may sleep when e.g. reading out the IRQ status * registers. - * @read_reg: reader function for generic GPIO - * @write_reg: writer function for generic GPIO - * @be_bits: if the generic GPIO has big endian bit order (bit 31 is representing - * line 0, bit 30 is line 1 ... bit 0 is line 31) this is set to true by the - * generic GPIO core. It is for internal housekeeping only. - * @reg_dat: data (in) register for generic GPIO - * @reg_set: output set register (out=high) for generic GPIO - * @reg_clr: output clear register (out=low) for generic GPIO - * @reg_dir_out: direction out setting register for generic GPIO - * @reg_dir_in: direction in setting register for generic GPIO - * @bgpio_dir_unreadable: indicates that the direction register(s) cannot - * be read and we need to rely on out internal state tracking. - * @bgpio_pinctrl: the generic GPIO uses a pin control backend. - * @bgpio_bits: number of register bits used for a generic GPIO i.e. - * * 8 - * @bgpio_lock: used to lock chip->bgpio_data. Also, this is needed to keep - * shadowed and real data registers writes together. - * @bgpio_data: shadowed data register for generic GPIO to clear/set bits - * safely. - * @bgpio_dir: shadowed direction register for generic GPIO to clear/set - * direction safely. A "1" in this word means the line is set as - * output. * * A gpio_chip can help platforms abstract various sources of GPIOs so * they can all be accessed through a common programming interface. @@ -475,23 +453,6 @@ struct gpio_chip { const char *const *names; bool can_sleep; -#if IS_ENABLED(CONFIG_GPIO_GENERIC) - unsigned long (*read_reg)(void __iomem *reg); - void (*write_reg)(void __iomem *reg, unsigned long data); - bool be_bits; - void __iomem *reg_dat; - void __iomem *reg_set; - void __iomem *reg_clr; - void __iomem *reg_dir_out; - void __iomem *reg_dir_in; - bool bgpio_dir_unreadable; - bool bgpio_pinctrl; - int bgpio_bits; - raw_spinlock_t bgpio_lock; - unsigned long bgpio_data; - unsigned long bgpio_dir; -#endif /* CONFIG_GPIO_GENERIC */ - #ifdef CONFIG_GPIOLIB_IRQCHIP /* * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib @@ -723,11 +684,6 @@ int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -int bgpio_init(struct gpio_chip *gc, struct device *dev, - unsigned long sz, void __iomem *dat, void __iomem *set, - void __iomem *clr, void __iomem *dirout, void __iomem *dirin, - unsigned long flags); - #define BGPIOF_BIG_ENDIAN BIT(0) #define BGPIOF_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ #define BGPIOF_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index 4c0626b53ec9..162430d96660 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -50,9 +50,44 @@ struct gpio_generic_chip_config { * struct gpio_generic_chip - Generic GPIO chip implementation. * @gc: The underlying struct gpio_chip object, implementing low-level GPIO * chip routines. + * @read_reg: reader function for generic GPIO + * @write_reg: writer function for generic GPIO + * @be_bits: if the generic GPIO has big endian bit order (bit 31 is + * representing line 0, bit 30 is line 1 ... bit 0 is line 31) this + * is set to true by the generic GPIO core. It is for internal + * housekeeping only. + * @reg_dat: data (in) register for generic GPIO + * @reg_set: output set register (out=high) for generic GPIO + * @reg_clr: output clear register (out=low) for generic GPIO + * @reg_dir_out: direction out setting register for generic GPIO + * @reg_dir_in: direction in setting register for generic GPIO + * @dir_unreadable: indicates that the direction register(s) cannot be read and + * we need to rely on out internal state tracking. + * @pinctrl: the generic GPIO uses a pin control backend. + * @bits: number of register bits used for a generic GPIO + * i.e. * 8 + * @lock: used to lock chip->sdata. Also, this is needed to keep + * shadowed and real data registers writes together. + * @sdata: shadowed data register for generic GPIO to clear/set bits safely. + * @sdir: shadowed direction register for generic GPIO to clear/set direction + * safely. A "1" in this word means the line is set as output. */ struct gpio_generic_chip { struct gpio_chip gc; + unsigned long (*read_reg)(void __iomem *reg); + void (*write_reg)(void __iomem *reg, unsigned long data); + bool be_bits; + void __iomem *reg_dat; + void __iomem *reg_set; + void __iomem *reg_clr; + void __iomem *reg_dir_out; + void __iomem *reg_dir_in; + bool dir_unreadable; + bool pinctrl; + int bits; + raw_spinlock_t lock; + unsigned long sdata; + unsigned long sdir; }; static inline struct gpio_generic_chip * @@ -61,20 +96,8 @@ to_gpio_generic_chip(struct gpio_chip *gc) return container_of(gc, struct gpio_generic_chip, gc); } -/** - * gpio_generic_chip_init() - Initialize a generic GPIO chip. - * @chip: Generic GPIO chip to set up. - * @cfg: Generic GPIO chip configuration. - * - * Returns 0 on success, negative error number on failure. - */ -static inline int -gpio_generic_chip_init(struct gpio_generic_chip *chip, - const struct gpio_generic_chip_config *cfg) -{ - return bgpio_init(&chip->gc, cfg->dev, cfg->sz, cfg->dat, cfg->set, - cfg->clr, cfg->dirout, cfg->dirin, cfg->flags); -} +int gpio_generic_chip_init(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg); /** * gpio_generic_chip_set() - Set the GPIO line value of the generic GPIO chip. @@ -110,10 +133,10 @@ gpio_generic_chip_set(struct gpio_generic_chip *chip, unsigned int offset, static inline unsigned long gpio_generic_read_reg(struct gpio_generic_chip *chip, void __iomem *reg) { - if (WARN_ON(!chip->gc.read_reg)) + if (WARN_ON(!chip->read_reg)) return 0; - return chip->gc.read_reg(reg); + return chip->read_reg(reg); } /** @@ -125,23 +148,23 @@ gpio_generic_read_reg(struct gpio_generic_chip *chip, void __iomem *reg) static inline void gpio_generic_write_reg(struct gpio_generic_chip *chip, void __iomem *reg, unsigned long val) { - if (WARN_ON(!chip->gc.write_reg)) + if (WARN_ON(!chip->write_reg)) return; - chip->gc.write_reg(reg, val); + chip->write_reg(reg, val); } #define gpio_generic_chip_lock(gen_gc) \ - raw_spin_lock(&(gen_gc)->gc.bgpio_lock) + raw_spin_lock(&(gen_gc)->lock) #define gpio_generic_chip_unlock(gen_gc) \ - raw_spin_unlock(&(gen_gc)->gc.bgpio_lock) + raw_spin_unlock(&(gen_gc)->lock) #define gpio_generic_chip_lock_irqsave(gen_gc, flags) \ - raw_spin_lock_irqsave(&(gen_gc)->gc.bgpio_lock, flags) + raw_spin_lock_irqsave(&(gen_gc)->lock, flags) #define gpio_generic_chip_unlock_irqrestore(gen_gc, flags) \ - raw_spin_unlock_irqrestore(&(gen_gc)->gc.bgpio_lock, flags) + raw_spin_unlock_irqrestore(&(gen_gc)->lock, flags) DEFINE_LOCK_GUARD_1(gpio_generic_lock, struct gpio_generic_chip, -- cgit v1.2.3 From 121a0f839dbb397af5fabb701cea3e9983223e50 Mon Sep 17 00:00:00 2001 From: Israel Cepeda Date: Thu, 11 Sep 2025 20:13:41 +0200 Subject: usb: misc: Add Intel USBIO bridge driver Add a driver for the Intel USBIO USB IO-expander used by the MIPI cameras on various new (Meteor Lake and later) Intel laptops. This is an USB bridge driver which adds auxbus child devices for the GPIO, I2C and SPI functions of the USBIO chip and which exports IO-functions for the drivers for the auxbus child devices to communicate with the USBIO device's firmware. Co-developed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Israel Cepeda Link: https://lore.kernel.org/r/20250911181343.77398-2-hansg@kernel.org Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 8 + drivers/usb/misc/Kconfig | 14 + drivers/usb/misc/Makefile | 1 + drivers/usb/misc/usbio.c | 749 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/usb/usbio.h | 177 +++++++++++ 5 files changed, 949 insertions(+) create mode 100644 drivers/usb/misc/usbio.c create mode 100644 include/linux/usb/usbio.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index fed6cd812d79..434ac1593332 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12688,6 +12688,14 @@ S: Maintained F: Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst F: drivers/platform/x86/intel/uncore-frequency/ +INTEL USBIO USB I/O EXPANDER DRIVERS +M: Israel Cepeda +M: Hans de Goede +R: Sakari Ailus +S: Maintained +F: drivers/usb/misc/usbio.c +F: include/linux/usb/usbio.h + INTEL VENDOR SPECIFIC EXTENDED CAPABILITIES DRIVER M: David E. Box S: Supported diff --git a/drivers/usb/misc/Kconfig b/drivers/usb/misc/Kconfig index 26eaae32f738..09ac6f1c985f 100644 --- a/drivers/usb/misc/Kconfig +++ b/drivers/usb/misc/Kconfig @@ -179,6 +179,20 @@ config USB_LJCA This driver can also be built as a module. If so, the module will be called usb-ljca. +config USB_USBIO + tristate "Intel USBIO Bridge support" + depends on USB && ACPI + select AUXILIARY_BUS + help + This adds support for Intel USBIO drivers. + This enables the USBIO bridge driver module in charge to talk + to the USB device. Additional drivers such as GPIO_USBIO and + I2C_USBIO must be enabled in order to use the device's full + functionality. + + This driver can also be built as a module. If so, the module + will be called usbio. + source "drivers/usb/misc/sisusbvga/Kconfig" config USB_LD diff --git a/drivers/usb/misc/Makefile b/drivers/usb/misc/Makefile index 0cd5bc8f52fe..494ab0377f35 100644 --- a/drivers/usb/misc/Makefile +++ b/drivers/usb/misc/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_USB_EMI62) += emi62.o obj-$(CONFIG_USB_EZUSB_FX2) += ezusb.o obj-$(CONFIG_APPLE_MFI_FASTCHARGE) += apple-mfi-fastcharge.o obj-$(CONFIG_USB_LJCA) += usb-ljca.o +obj-$(CONFIG_USB_USBIO) += usbio.o obj-$(CONFIG_USB_IDMOUSE) += idmouse.o obj-$(CONFIG_USB_IOWARRIOR) += iowarrior.o obj-$(CONFIG_USB_ISIGHTFW) += isight_firmware.o diff --git a/drivers/usb/misc/usbio.c b/drivers/usb/misc/usbio.c new file mode 100644 index 000000000000..37644dddf157 --- /dev/null +++ b/drivers/usb/misc/usbio.c @@ -0,0 +1,749 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel USBIO Bridge driver + * + * Copyright (c) 2025 Intel Corporation. + * Copyright (c) 2025 Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/************************************* + * USBIO Bridge Protocol Definitions * + *************************************/ + +/* USBIO Control Commands */ +#define USBIO_CTRLCMD_PROTVER 0 +#define USBIO_CTRLCMD_FWVER 1 +#define USBIO_CTRLCMD_HS 2 +#define USBIO_CTRLCMD_ENUMGPIO 16 +#define USBIO_CTRLCMD_ENUMI2C 17 + +/* USBIO Packet Flags */ +#define USBIO_PKTFLAG_ACK BIT(0) +#define USBIO_PKTFLAG_RSP BIT(1) +#define USBIO_PKTFLAG_CMP BIT(2) +#define USBIO_PKTFLAG_ERR BIT(3) + +#define USBIO_PKTFLAGS_REQRESP (USBIO_PKTFLAG_CMP | USBIO_PKTFLAG_ACK) + +#define USBIO_CTRLXFER_TIMEOUT 0 +#define USBIO_BULKXFER_TIMEOUT 100 + +struct usbio_protver { + u8 ver; +} __packed; + +struct usbio_fwver { + u8 major; + u8 minor; + __le16 patch; + __le16 build; +} __packed; + +/*********************************** + * USBIO Bridge Device Definitions * + ***********************************/ + +/** + * struct usbio_device - the usb device exposing IOs + * + * @dev: the device in the usb interface + * @udev: the detected usb device + * @intf: the usb interface + * @quirks: quirks + * @ctrl_mutex: protects ctrl_buf + * @ctrl_pipe: the control transfer pipe + * @ctrlbuf_len: the size of the control transfer pipe + * @ctrlbuf: the buffer used for control transfers + * @bulk_mutex: protects tx_buf, rx_buf and split bulk-transfers getting interrupted + * @tx_pipe: the bulk out pipe + * @txbuf_len: the size of the bulk out pipe + * @txbuf: the buffer used for bulk out transfers + * @rx_pipe: the bulk in pipe + * @rxbuf_len: the size of the bulk in pipe + * @rxdat_len: the data length at rx buffer + * @rxbuf: the buffer used for bulk in transfers + * @urb: the urb to read bulk pipe + * @done: completion object as request is done + * @cli_list: device's client list + * @nr_gpio_banks: Number of GPIO banks + * @gpios: GPIO bank descriptors + * @nr_gpio_banks: Number of I2C busses + * @gpios: I2C bank descriptors + */ +struct usbio_device { + struct device *dev; + struct usb_device *udev; + struct usb_interface *intf; + unsigned long quirks; + + struct mutex ctrl_mutex; + unsigned int ctrl_pipe; + u16 ctrlbuf_len; + void *ctrlbuf; + + struct mutex bulk_mutex; + unsigned int tx_pipe; + u16 txbuf_len; + void *txbuf; + + unsigned int rx_pipe; + u16 rxbuf_len; + u16 rxdat_len; + void *rxbuf; + struct urb *urb; + + struct completion done; + + struct list_head cli_list; + + unsigned int nr_gpio_banks; + struct usbio_gpio_bank_desc gpios[USBIO_MAX_GPIOBANKS]; + + unsigned int nr_i2c_buses; + struct usbio_i2c_bus_desc i2cs[USBIO_MAX_I2CBUSES]; +}; + +/** + * struct usbio_client - represents a usbio client + * + * @auxdev: auxiliary device object + * @mutex: protects @bridge + * @bridge: usbio bridge who service the client + * @link: usbio bridge clients list member + */ +struct usbio_client { + struct auxiliary_device auxdev; + struct mutex mutex; + struct usbio_device *bridge; + struct list_head link; +}; + +#define adev_to_client(adev) container_of_const(adev, struct usbio_client, auxdev) + +static int usbio_ctrl_msg(struct usbio_device *usbio, u8 type, u8 cmd, + const void *obuf, u16 obuf_len, void *ibuf, u16 ibuf_len) +{ + u8 request = USB_TYPE_VENDOR | USB_RECIP_DEVICE; + struct usbio_ctrl_packet *cpkt; + unsigned int pipe; + u16 cpkt_len; + int ret; + + lockdep_assert_held(&usbio->ctrl_mutex); + + if ((obuf_len > (usbio->ctrlbuf_len - sizeof(*cpkt))) || + (ibuf_len > (usbio->ctrlbuf_len - sizeof(*cpkt)))) + return -EMSGSIZE; + + /* Prepare Control Packet Header */ + cpkt = usbio->ctrlbuf; + cpkt->header.type = type; + cpkt->header.cmd = cmd; + if (type == USBIO_PKTTYPE_CTRL || ibuf_len) + cpkt->header.flags = USBIO_PKTFLAGS_REQRESP; + else + cpkt->header.flags = USBIO_PKTFLAG_CMP; + cpkt->len = obuf_len; + + /* Copy the data */ + memcpy(cpkt->data, obuf, obuf_len); + + pipe = usb_sndctrlpipe(usbio->udev, usbio->ctrl_pipe); + cpkt_len = sizeof(*cpkt) + obuf_len; + ret = usb_control_msg(usbio->udev, pipe, 0, request | USB_DIR_OUT, 0, 0, + cpkt, cpkt_len, USBIO_CTRLXFER_TIMEOUT); + dev_dbg(usbio->dev, "control out %d hdr %*phN data %*phN\n", ret, + (int)sizeof(*cpkt), cpkt, (int)cpkt->len, cpkt->data); + + if (ret != cpkt_len) { + dev_err(usbio->dev, "USB control out failed: %d\n", ret); + return (ret < 0) ? ret : -EPROTO; + } + + if (!(cpkt->header.flags & USBIO_PKTFLAG_ACK)) + return 0; + + pipe = usb_rcvctrlpipe(usbio->udev, usbio->ctrl_pipe); + cpkt_len = sizeof(*cpkt) + ibuf_len; + ret = usb_control_msg(usbio->udev, pipe, 0, request | USB_DIR_IN, 0, 0, + cpkt, cpkt_len, USBIO_CTRLXFER_TIMEOUT); + dev_dbg(usbio->dev, "control in %d hdr %*phN data %*phN\n", ret, + (int)sizeof(*cpkt), cpkt, (int)cpkt->len, cpkt->data); + + if (ret < sizeof(*cpkt)) { + dev_err(usbio->dev, "USB control in failed: %d\n", ret); + return (ret < 0) ? ret : -EPROTO; + } + + if (cpkt->header.type != type || cpkt->header.cmd != cmd || + !(cpkt->header.flags & USBIO_PKTFLAG_RSP)) { + dev_err(usbio->dev, "Unexpected reply type: %u, cmd: %u, flags: %u\n", + cpkt->header.type, cpkt->header.cmd, cpkt->header.flags); + return -EPROTO; + } + + if (cpkt->header.flags & USBIO_PKTFLAG_ERR) + return -EREMOTEIO; + + if (ibuf_len < cpkt->len) + return -ENOSPC; + + memcpy(ibuf, cpkt->data, cpkt->len); + + return cpkt->len; +} + +int usbio_control_msg(struct auxiliary_device *adev, u8 type, u8 cmd, + const void *obuf, u16 obuf_len, void *ibuf, u16 ibuf_len) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio; + int ret; + + guard(mutex)(&client->mutex); + + usbio = client->bridge; + if (!usbio) + return -ENODEV; /* Disconnected */ + + ret = usb_autopm_get_interface(usbio->intf); + if (ret) + return ret; + + mutex_lock(&usbio->ctrl_mutex); + + ret = usbio_ctrl_msg(client->bridge, type, cmd, obuf, obuf_len, ibuf, ibuf_len); + + mutex_unlock(&usbio->ctrl_mutex); + usb_autopm_put_interface(usbio->intf); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(usbio_control_msg, "USBIO"); + +static void usbio_bulk_recv(struct urb *urb) +{ + struct usbio_bulk_packet *bpkt = urb->transfer_buffer; + struct usbio_device *usbio = urb->context; + + if (!urb->status) { + if (bpkt->header.flags & USBIO_PKTFLAG_RSP) { + usbio->rxdat_len = urb->actual_length; + complete(&usbio->done); + } + } else if (urb->status != -ENOENT) { + dev_err(usbio->dev, "Bulk in error %d\n", urb->status); + } + + usb_submit_urb(usbio->urb, GFP_ATOMIC); +} + +int usbio_bulk_msg(struct auxiliary_device *adev, u8 type, u8 cmd, bool last, + const void *obuf, u16 obuf_len, void *ibuf, u16 ibuf_len) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio = client->bridge; + struct usbio_bulk_packet *bpkt; + int ret, act = 0; + u16 bpkt_len; + + lockdep_assert_held(&client->mutex); + lockdep_assert_held(&usbio->bulk_mutex); + + if ((obuf_len > (usbio->txbuf_len - sizeof(*bpkt))) || + (ibuf_len > (usbio->txbuf_len - sizeof(*bpkt)))) + return -EMSGSIZE; + + if (ibuf_len) + reinit_completion(&usbio->done); + + /* If no data to send, skip to read */ + if (!obuf_len) + goto read; + + /* Prepare Bulk Packet Header */ + bpkt = usbio->txbuf; + bpkt->header.type = type; + bpkt->header.cmd = cmd; + if (!last) + bpkt->header.flags = 0; + else if (ibuf_len) + bpkt->header.flags = USBIO_PKTFLAGS_REQRESP; + else + bpkt->header.flags = USBIO_PKTFLAG_CMP; + bpkt->len = cpu_to_le16(obuf_len); + + /* Copy the data */ + memcpy(bpkt->data, obuf, obuf_len); + + bpkt_len = sizeof(*bpkt) + obuf_len; + ret = usb_bulk_msg(usbio->udev, usbio->tx_pipe, bpkt, bpkt_len, &act, + USBIO_BULKXFER_TIMEOUT); + dev_dbg(usbio->dev, "bulk out %d hdr %*phN data %*phN\n", act, + (int)sizeof(*bpkt), bpkt, obuf_len, bpkt->data); + + if (ret || act != bpkt_len) { + dev_err(usbio->dev, "Bulk out failed: %d\n", ret); + return ret ?: -EPROTO; + } + + if (!(bpkt->header.flags & USBIO_PKTFLAG_ACK)) + return obuf_len; + +read: + ret = wait_for_completion_timeout(&usbio->done, USBIO_BULKXFER_TIMEOUT); + if (ret <= 0) { + dev_err(usbio->dev, "Bulk in wait failed: %d\n", ret); + return ret ?: -ETIMEDOUT; + } + + act = usbio->rxdat_len; + bpkt = usbio->rxbuf; + bpkt_len = le16_to_cpu(bpkt->len); + dev_dbg(usbio->dev, "bulk in %d hdr %*phN data %*phN\n", act, + (int)sizeof(*bpkt), bpkt, bpkt_len, bpkt->data); + + /* + * Unsupported bulk commands get only an usbio_packet_header with + * the error flag set as reply. Return -EPIPE for this case. + */ + if (act == sizeof(struct usbio_packet_header) && + (bpkt->header.flags & USBIO_PKTFLAG_ERR)) + return -EPIPE; + + if (act < sizeof(*bpkt)) { + dev_err(usbio->dev, "Bulk in short read: %d\n", act); + return -EPROTO; + } + + if (bpkt->header.type != type || bpkt->header.cmd != cmd || + !(bpkt->header.flags & USBIO_PKTFLAG_RSP)) { + dev_err(usbio->dev, + "Unexpected bulk in type 0x%02x cmd 0x%02x flags 0x%02x\n", + bpkt->header.type, bpkt->header.cmd, bpkt->header.flags); + return -EPROTO; + } + + if (bpkt->header.flags & USBIO_PKTFLAG_ERR) + return -EREMOTEIO; + + if (ibuf_len < bpkt_len) + return -ENOSPC; + + memcpy(ibuf, bpkt->data, bpkt_len); + + return bpkt_len; +} +EXPORT_SYMBOL_NS_GPL(usbio_bulk_msg, "USBIO"); + +int usbio_acquire(struct auxiliary_device *adev) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio; + int ret; + + mutex_lock(&client->mutex); + + usbio = client->bridge; + if (!usbio) { + ret = -ENODEV; /* Disconnected */ + goto err_unlock; + } + + ret = usb_autopm_get_interface(usbio->intf); + if (ret) + goto err_unlock; + + mutex_lock(&usbio->bulk_mutex); + + /* Leave client locked until release to avoid abba deadlock issues */ + return 0; + +err_unlock: + mutex_unlock(&client->mutex); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(usbio_acquire, "USBIO"); + +void usbio_release(struct auxiliary_device *adev) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio = client->bridge; + + lockdep_assert_held(&client->mutex); + + mutex_unlock(&usbio->bulk_mutex); + usb_autopm_put_interface(usbio->intf); + mutex_unlock(&client->mutex); +} +EXPORT_SYMBOL_NS_GPL(usbio_release, "USBIO"); + +void usbio_get_txrxbuf_len(struct auxiliary_device *adev, u16 *txbuf_len, u16 *rxbuf_len) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio; + + guard(mutex)(&client->mutex); + + usbio = client->bridge; + if (!usbio) + return; /* Disconnected */ + + *txbuf_len = usbio->txbuf_len; + *rxbuf_len = usbio->rxbuf_len; +} +EXPORT_SYMBOL_NS_GPL(usbio_get_txrxbuf_len, "USBIO"); + +unsigned long usbio_get_quirks(struct auxiliary_device *adev) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio; + + guard(mutex)(&client->mutex); + + usbio = client->bridge; + if (!usbio) + return 0; /* Disconnected */ + + return usbio->quirks; +} +EXPORT_SYMBOL_NS_GPL(usbio_get_quirks, "USBIO"); + +static void usbio_auxdev_release(struct device *dev) +{ + struct auxiliary_device *adev = to_auxiliary_dev(dev); + struct usbio_client *client = adev_to_client(adev); + + mutex_destroy(&client->mutex); + kfree(client); +} + +static int usbio_add_client(struct usbio_device *usbio, char *name, u8 id, void *data) +{ + struct usbio_client *client; + struct auxiliary_device *adev; + int ret; + + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (!client) + return -ENOMEM; + + mutex_init(&client->mutex); + client->bridge = usbio; + adev = &client->auxdev; + adev->name = name; + adev->id = id; + + adev->dev.parent = usbio->dev; + adev->dev.platform_data = data; + adev->dev.release = usbio_auxdev_release; + + ret = auxiliary_device_init(adev); + if (ret) { + usbio_auxdev_release(&adev->dev); + return ret; + } + + ret = auxiliary_device_add(adev); + if (ret) { + auxiliary_device_uninit(adev); + return ret; + } + + list_add_tail(&client->link, &usbio->cli_list); + + return 0; +} + +static int usbio_enum_gpios(struct usbio_device *usbio) +{ + struct usbio_gpio_bank_desc *gpio = usbio->gpios; + + dev_dbg(usbio->dev, "GPIO Banks: %d\n", usbio->nr_gpio_banks); + + for (unsigned int i = 0; i < usbio->nr_gpio_banks; i++) + dev_dbg(usbio->dev, "\tBank%d[%d] map: %#08x\n", + gpio[i].id, gpio[i].pins, gpio[i].bmap); + + usbio_add_client(usbio, USBIO_GPIO_CLIENT, 0, gpio); + + return 0; +} + +static int usbio_enum_i2cs(struct usbio_device *usbio) +{ + struct usbio_i2c_bus_desc *i2c = usbio->i2cs; + + dev_dbg(usbio->dev, "I2C Busses: %d\n", usbio->nr_i2c_buses); + + for (unsigned int i = 0; i < usbio->nr_i2c_buses; i++) { + dev_dbg(usbio->dev, "\tBus%d caps: %#02x\n", i2c[i].id, i2c[i].caps); + usbio_add_client(usbio, USBIO_I2C_CLIENT, i, &i2c[i]); + } + + return 0; +} + +static int usbio_suspend(struct usb_interface *intf, pm_message_t msg) +{ + struct usbio_device *usbio = usb_get_intfdata(intf); + + usb_kill_urb(usbio->urb); + + return 0; +} + +static int usbio_resume(struct usb_interface *intf) +{ + struct usbio_device *usbio = usb_get_intfdata(intf); + + return usb_submit_urb(usbio->urb, GFP_KERNEL); +} + +static void usbio_disconnect(struct usb_interface *intf) +{ + struct usbio_device *usbio = usb_get_intfdata(intf); + struct usbio_client *client; + + /* Wakeup any clients waiting for a reply */ + usbio->rxdat_len = 0; + complete(&usbio->done); + + /* Let clients know the bridge is gone */ + list_for_each_entry(client, &usbio->cli_list, link) { + mutex_lock(&client->mutex); + client->bridge = NULL; + mutex_unlock(&client->mutex); + } + + /* From here on clients will no longer touch struct usbio_device */ + usb_kill_urb(usbio->urb); + usb_free_urb(usbio->urb); + + list_for_each_entry_reverse(client, &usbio->cli_list, link) { + auxiliary_device_delete(&client->auxdev); + auxiliary_device_uninit(&client->auxdev); + } +} + +static int usbio_probe(struct usb_interface *intf, const struct usb_device_id *id) +{ + struct usb_device *udev = interface_to_usbdev(intf); + struct usb_endpoint_descriptor *ep_in, *ep_out; + struct device *dev = &intf->dev; + struct usbio_protver protver; + struct usbio_device *usbio; + struct usbio_fwver fwver; + int ret; + + usbio = devm_kzalloc(dev, sizeof(*usbio), GFP_KERNEL); + if (!usbio) + return -ENOMEM; + + ret = devm_mutex_init(dev, &usbio->ctrl_mutex); + if (ret) + return ret; + + ret = devm_mutex_init(dev, &usbio->bulk_mutex); + if (ret) + return ret; + + usbio->dev = dev; + usbio->udev = udev; + usbio->intf = intf; + usbio->quirks = id ? id->driver_info : 0; + init_completion(&usbio->done); + INIT_LIST_HEAD(&usbio->cli_list); + usb_set_intfdata(intf, usbio); + + usbio->ctrl_pipe = usb_endpoint_num(&udev->ep0.desc); + usbio->ctrlbuf_len = usb_maxpacket(udev, usbio->ctrl_pipe); + usbio->ctrlbuf = devm_kzalloc(dev, usbio->ctrlbuf_len, GFP_KERNEL); + if (!usbio->ctrlbuf) + return -ENOMEM; + + /* Find the first bulk-in and bulk-out endpoints */ + ret = usb_find_common_endpoints(intf->cur_altsetting, &ep_in, &ep_out, + NULL, NULL); + if (ret) { + dev_err(dev, "Cannot find bulk endpoints: %d\n", ret); + return ret; + } + + usbio->tx_pipe = usb_sndbulkpipe(udev, usb_endpoint_num(ep_out)); + + if (usbio->quirks & USBIO_QUIRK_BULK_MAXP_63) + usbio->txbuf_len = 63; + else + usbio->txbuf_len = usb_endpoint_maxp(ep_out); + + usbio->txbuf = devm_kzalloc(dev, usbio->txbuf_len, GFP_KERNEL); + if (!usbio->txbuf) + return -ENOMEM; + + usbio->rx_pipe = usb_rcvbulkpipe(udev, usb_endpoint_num(ep_in)); + + if (usbio->quirks & USBIO_QUIRK_BULK_MAXP_63) + usbio->rxbuf_len = 63; + else + usbio->rxbuf_len = usb_endpoint_maxp(ep_in); + + usbio->rxbuf = devm_kzalloc(dev, usbio->rxbuf_len, GFP_KERNEL); + if (!usbio->rxbuf) + return -ENOMEM; + + usbio->urb = usb_alloc_urb(0, GFP_KERNEL); + if (!usbio->urb) + return -ENOMEM; + + usb_fill_bulk_urb(usbio->urb, udev, usbio->rx_pipe, usbio->rxbuf, + usbio->rxbuf_len, usbio_bulk_recv, usbio); + ret = usb_submit_urb(usbio->urb, GFP_KERNEL); + if (ret) + return dev_err_probe(dev, ret, "Submitting usb urb\n"); + + mutex_lock(&usbio->ctrl_mutex); + + ret = usbio_ctrl_msg(usbio, USBIO_PKTTYPE_CTRL, USBIO_CTRLCMD_HS, NULL, 0, NULL, 0); + if (ret < 0) + goto err_unlock; + + ret = usbio_ctrl_msg(usbio, USBIO_PKTTYPE_CTRL, USBIO_CTRLCMD_PROTVER, NULL, 0, + &protver, sizeof(protver)); + if (ret < 0) + goto err_unlock; + + ret = usbio_ctrl_msg(usbio, USBIO_PKTTYPE_CTRL, USBIO_CTRLCMD_FWVER, NULL, 0, + &fwver, sizeof(fwver)); + if (ret < 0) + goto err_unlock; + + ret = usbio_ctrl_msg(usbio, USBIO_PKTTYPE_CTRL, USBIO_CTRLCMD_ENUMGPIO, NULL, 0, + usbio->gpios, sizeof(usbio->gpios)); + if (ret < 0 || ret % sizeof(struct usbio_gpio_bank_desc)) { + ret = (ret < 0) ? ret : -EPROTO; + goto err_unlock; + } + usbio->nr_gpio_banks = ret / sizeof(struct usbio_gpio_bank_desc); + + ret = usbio_ctrl_msg(usbio, USBIO_PKTTYPE_CTRL, USBIO_CTRLCMD_ENUMI2C, NULL, 0, + usbio->i2cs, sizeof(usbio->i2cs)); + if (ret < 0 || ret % sizeof(struct usbio_i2c_bus_desc)) { + ret = (ret < 0) ? ret : -EPROTO; + goto err_unlock; + } + usbio->nr_i2c_buses = ret / sizeof(struct usbio_i2c_bus_desc); + + mutex_unlock(&usbio->ctrl_mutex); + + dev_dbg(dev, "ProtVer(BCD): %02x FwVer: %d.%d.%d.%d\n", + protver.ver, fwver.major, fwver.minor, + le16_to_cpu(fwver.patch), le16_to_cpu(fwver.build)); + + usbio_enum_gpios(usbio); + usbio_enum_i2cs(usbio); + + return 0; + +err_unlock: + mutex_unlock(&usbio->ctrl_mutex); + usb_kill_urb(usbio->urb); + usb_free_urb(usbio->urb); + + return ret; +} + +static const struct usb_device_id usbio_table[] = { + { USB_DEVICE(0x2ac1, 0x20c1), /* Lattice NX40 */ + .driver_info = USBIO_QUIRK_I2C_MAX_RW_LEN_52 }, + { USB_DEVICE(0x2ac1, 0x20c9), /* Lattice NX33 */ + .driver_info = USBIO_QUIRK_I2C_NO_INIT_ACK | USBIO_QUIRK_I2C_MAX_RW_LEN_52 | + USBIO_QUIRK_I2C_ALLOW_400KHZ }, + { USB_DEVICE(0x2ac1, 0x20cb) }, /* Lattice NX33U */ + { USB_DEVICE(0x06cb, 0x0701), /* Synaptics Sabre */ + .driver_info = USBIO_QUIRK_BULK_MAXP_63 | USBIO_QUIRK_I2C_USE_CHUNK_LEN }, + { } +}; +MODULE_DEVICE_TABLE(usb, usbio_table); + +static struct usb_driver usbio_driver = { + .name = "usbio-bridge", + .probe = usbio_probe, + .disconnect = usbio_disconnect, + .suspend = usbio_suspend, + .resume = usbio_resume, + .id_table = usbio_table, + .supports_autosuspend = 1, +}; +module_usb_driver(usbio_driver); + +struct usbio_match_ids_walk_data { + struct acpi_device *adev; + const struct acpi_device_id *hids; + unsigned int id; +}; + +static int usbio_match_device_ids(struct acpi_device *adev, void *data) +{ + struct usbio_match_ids_walk_data *wd = data; + unsigned int id = 0; + char *uid; + + if (acpi_match_device_ids(adev, wd->hids)) + return 0; + + uid = acpi_device_uid(adev); + if (uid) { + for (int i = 0; i < strlen(uid); i++) { + if (!kstrtouint(&uid[i], 10, &id)) + break; + } + } + + if (!uid || wd->id == id) { + wd->adev = adev; + return 1; + } + + return 0; +} + +void usbio_acpi_bind(struct auxiliary_device *adev, const struct acpi_device_id *hids) +{ + struct device *dev = &adev->dev; + struct acpi_device *parent; + struct usbio_match_ids_walk_data wd = { + .adev = NULL, + .hids = hids, + .id = adev->id, + }; + + parent = ACPI_COMPANION(dev->parent); + if (!parent) + return; + + acpi_dev_for_each_child(parent, usbio_match_device_ids, &wd); + if (wd.adev) + ACPI_COMPANION_SET(dev, wd.adev); +} +EXPORT_SYMBOL_NS_GPL(usbio_acpi_bind, "USBIO"); + +MODULE_DESCRIPTION("Intel USBIO Bridge driver"); +MODULE_AUTHOR("Israel Cepeda "); +MODULE_AUTHOR("Hans de Goede "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/usb/usbio.h b/include/linux/usb/usbio.h new file mode 100644 index 000000000000..6c4e7c246d58 --- /dev/null +++ b/include/linux/usb/usbio.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2025 Intel Corporation. + * + */ + +#ifndef _LINUX_USBIO_H_ +#define _LINUX_USBIO_H_ + +#include +#include +#include +#include + +/*********************** + * USBIO Clients Names * + ***********************/ +#define USBIO_GPIO_CLIENT "usbio-gpio" +#define USBIO_I2C_CLIENT "usbio-i2c" + +/**************** + * USBIO quirks * + ****************/ +#define USBIO_QUIRK_BULK_MAXP_63 BIT(0) /* Force bulk endpoint maxp to 63 */ +#define USBIO_QUIRK_I2C_NO_INIT_ACK BIT(8) /* Do not ask for ack on I2C init */ +#define USBIO_QUIRK_I2C_MAX_RW_LEN_52 BIT(9) /* Set i2c-adapter max r/w len to 52 */ +#define USBIO_QUIRK_I2C_USE_CHUNK_LEN BIT(10) /* Send chunk-len for split xfers */ +#define USBIO_QUIRK_I2C_ALLOW_400KHZ BIT(11) /* Override desc, allowing 400 KHz */ + +/************************** + * USBIO Type Definitions * + **************************/ + +/* USBIO Packet Type */ +#define USBIO_PKTTYPE_CTRL 1 +#define USBIO_PKTTYPE_DBG 2 +#define USBIO_PKTTYPE_GPIO 3 +#define USBIO_PKTTYPE_I2C 4 + +/* USBIO Packet Header */ +struct usbio_packet_header { + u8 type; + u8 cmd; + u8 flags; +} __packed; + +/* USBIO Control Transfer Packet */ +struct usbio_ctrl_packet { + struct usbio_packet_header header; + u8 len; + u8 data[] __counted_by(len); +} __packed; + +/* USBIO Bulk Transfer Packet */ +struct usbio_bulk_packet { + struct usbio_packet_header header; + __le16 len; + u8 data[] __counted_by(len); +} __packed; + +/* USBIO GPIO commands */ +enum usbio_gpio_cmd { + USBIO_GPIOCMD_DEINIT, + USBIO_GPIOCMD_INIT, + USBIO_GPIOCMD_READ, + USBIO_GPIOCMD_WRITE, + USBIO_GPIOCMD_END +}; + +/* USBIO GPIO config */ +enum usbio_gpio_pincfg { + USBIO_GPIO_PINCFG_DEFAULT, + USBIO_GPIO_PINCFG_PULLUP, + USBIO_GPIO_PINCFG_PULLDOWN, + USBIO_GPIO_PINCFG_PUSHPULL +}; + +#define USBIO_GPIO_PINCFG_SHIFT 2 +#define USBIO_GPIO_PINCFG_MASK (0x3 << USBIO_GPIO_PINCFG_SHIFT) +#define USBIO_GPIO_SET_PINCFG(pincfg) \ + (((pincfg) << USBIO_GPIO_PINCFG_SHIFT) & USBIO_GPIO_PINCFG_MASK) + +enum usbio_gpio_pinmode { + USBIO_GPIO_PINMOD_INVAL, + USBIO_GPIO_PINMOD_INPUT, + USBIO_GPIO_PINMOD_OUTPUT, + USBIO_GPIO_PINMOD_MAXVAL +}; + +#define USBIO_GPIO_PINMOD_MASK 0x3 +#define USBIO_GPIO_SET_PINMOD(pin) (pin & USBIO_GPIO_PINMOD_MASK) + +/************************* + * USBIO GPIO Controller * + *************************/ + +#define USBIO_MAX_GPIOBANKS 5 +#define USBIO_GPIOSPERBANK 32 + +struct usbio_gpio_bank_desc { + u8 id; + u8 pins; + __le32 bmap; +} __packed; + +struct usbio_gpio_init { + u8 bankid; + u8 config; + u8 pincount; + u8 pin; +} __packed; + +struct usbio_gpio_rw { + u8 bankid; + u8 pincount; + u8 pin; + __le32 value; +} __packed; + +/* USBIO I2C commands */ +enum usbio_i2c_cmd { + USBIO_I2CCMD_UNINIT, + USBIO_I2CCMD_INIT, + USBIO_I2CCMD_READ, + USBIO_I2CCMD_WRITE, + USBIO_I2CCMD_END +}; + +/************************ + * USBIO I2C Controller * + ************************/ + +#define USBIO_MAX_I2CBUSES 5 + +#define USBIO_I2C_BUS_ADDR_CAP_10B BIT(3) /* 10bit address support */ +#define USBIO_I2C_BUS_MODE_CAP_MASK 0x3 +#define USBIO_I2C_BUS_MODE_CAP_SM 0 /* Standard Mode */ +#define USBIO_I2C_BUS_MODE_CAP_FM 1 /* Fast Mode */ +#define USBIO_I2C_BUS_MODE_CAP_FMP 2 /* Fast Mode+ */ +#define USBIO_I2C_BUS_MODE_CAP_HSM 3 /* High-Speed Mode */ + +struct usbio_i2c_bus_desc { + u8 id; + u8 caps; +} __packed; + +struct usbio_i2c_uninit { + u8 busid; + __le16 config; +} __packed; + +struct usbio_i2c_init { + u8 busid; + __le16 config; + __le32 speed; +} __packed; + +struct usbio_i2c_rw { + u8 busid; + __le16 config; + __le16 size; + u8 data[] __counted_by(size); +} __packed; + +int usbio_control_msg(struct auxiliary_device *adev, u8 type, u8 cmd, + const void *obuf, u16 obuf_len, void *ibuf, u16 ibuf_len); + +int usbio_bulk_msg(struct auxiliary_device *adev, u8 type, u8 cmd, bool last, + const void *obuf, u16 obuf_len, void *ibuf, u16 ibuf_len); + +int usbio_acquire(struct auxiliary_device *adev); +void usbio_release(struct auxiliary_device *adev); +void usbio_get_txrxbuf_len(struct auxiliary_device *adev, u16 *txbuf_len, u16 *rxbuf_len); +unsigned long usbio_get_quirks(struct auxiliary_device *adev); +void usbio_acpi_bind(struct auxiliary_device *adev, const struct acpi_device_id *hids); + +#endif -- cgit v1.2.3 From 7f70b89b2be66c03ddc76d3ad8aebeeec4a9c505 Mon Sep 17 00:00:00 2001 From: Guan-Yu Lin Date: Thu, 11 Sep 2025 14:20:14 +0000 Subject: usb: offload: add apis for offload usage tracking Introduce offload_usage and corresponding apis to track offload usage on each USB device. Offload denotes that there is another co-processor accessing the USB device via the same USB host controller. To optimize power usage, it's essential to monitor whether the USB device is actively used by other co-processor. This information is vital when determining if a USB device can be safely suspended during system power state transitions. Signed-off-by: Guan-Yu Lin Link: https://lore.kernel.org/r/20250911142051.90822-3-guanyulin@google.com Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250911142051.90822-3-guanyulin@google.com --- drivers/usb/core/Makefile | 1 + drivers/usb/core/offload.c | 136 +++++++++++++++++++++++++++++++++++++++++++++ drivers/usb/core/usb.c | 1 + include/linux/usb.h | 18 ++++++ 4 files changed, 156 insertions(+) create mode 100644 drivers/usb/core/offload.c (limited to 'include') diff --git a/drivers/usb/core/Makefile b/drivers/usb/core/Makefile index ac006abd13b3..766000b4939e 100644 --- a/drivers/usb/core/Makefile +++ b/drivers/usb/core/Makefile @@ -9,6 +9,7 @@ usbcore-y += devio.o notify.o generic.o quirks.o devices.o usbcore-y += phy.o port.o usbcore-$(CONFIG_OF) += of.o +usbcore-$(CONFIG_USB_XHCI_SIDEBAND) += offload.o usbcore-$(CONFIG_USB_PCI) += hcd-pci.o usbcore-$(CONFIG_ACPI) += usb-acpi.o diff --git a/drivers/usb/core/offload.c b/drivers/usb/core/offload.c new file mode 100644 index 000000000000..7c699f1b8d2b --- /dev/null +++ b/drivers/usb/core/offload.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * offload.c - USB offload related functions + * + * Copyright (c) 2025, Google LLC. + * + * Author: Guan-Yu Lin + */ + +#include + +#include "usb.h" + +/** + * usb_offload_get - increment the offload_usage of a USB device + * @udev: the USB device to increment its offload_usage + * + * Incrementing the offload_usage of a usb_device indicates that offload is + * enabled on this usb_device; that is, another entity is actively handling USB + * transfers. This information allows the USB driver to adjust its power + * management policy based on offload activity. + * + * Return: 0 on success. A negative error code otherwise. + */ +int usb_offload_get(struct usb_device *udev) +{ + int ret; + + usb_lock_device(udev); + if (udev->state == USB_STATE_NOTATTACHED) { + usb_unlock_device(udev); + return -ENODEV; + } + + if (udev->state == USB_STATE_SUSPENDED || + udev->offload_at_suspend) { + usb_unlock_device(udev); + return -EBUSY; + } + + /* + * offload_usage could only be modified when the device is active, since + * it will alter the suspend flow of the device. + */ + ret = usb_autoresume_device(udev); + if (ret < 0) { + usb_unlock_device(udev); + return ret; + } + + udev->offload_usage++; + usb_autosuspend_device(udev); + usb_unlock_device(udev); + + return ret; +} +EXPORT_SYMBOL_GPL(usb_offload_get); + +/** + * usb_offload_put - drop the offload_usage of a USB device + * @udev: the USB device to drop its offload_usage + * + * The inverse operation of usb_offload_get, which drops the offload_usage of + * a USB device. This information allows the USB driver to adjust its power + * management policy based on offload activity. + * + * Return: 0 on success. A negative error code otherwise. + */ +int usb_offload_put(struct usb_device *udev) +{ + int ret; + + usb_lock_device(udev); + if (udev->state == USB_STATE_NOTATTACHED) { + usb_unlock_device(udev); + return -ENODEV; + } + + if (udev->state == USB_STATE_SUSPENDED || + udev->offload_at_suspend) { + usb_unlock_device(udev); + return -EBUSY; + } + + /* + * offload_usage could only be modified when the device is active, since + * it will alter the suspend flow of the device. + */ + ret = usb_autoresume_device(udev); + if (ret < 0) { + usb_unlock_device(udev); + return ret; + } + + /* Drop the count when it wasn't 0, ignore the operation otherwise. */ + if (udev->offload_usage) + udev->offload_usage--; + usb_autosuspend_device(udev); + usb_unlock_device(udev); + + return ret; +} +EXPORT_SYMBOL_GPL(usb_offload_put); + +/** + * usb_offload_check - check offload activities on a USB device + * @udev: the USB device to check its offload activity. + * + * Check if there are any offload activity on the USB device right now. This + * information could be used for power management or other forms of resource + * management. + * + * The caller must hold @udev's device lock. In addition, the caller should + * ensure downstream usb devices are all either suspended or marked as + * "offload_at_suspend" to ensure the correctness of the return value. + * + * Returns true on any offload activity, false otherwise. + */ +bool usb_offload_check(struct usb_device *udev) __must_hold(&udev->dev->mutex) +{ + struct usb_device *child; + bool active; + int port1; + + usb_hub_for_each_child(udev, port1, child) { + usb_lock_device(child); + active = usb_offload_check(child); + usb_unlock_device(child); + if (active) + return true; + } + + return !!udev->offload_usage; +} +EXPORT_SYMBOL_GPL(usb_offload_check); diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index b88b6271cb30..b6b0b8489523 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -670,6 +670,7 @@ struct usb_device *usb_alloc_dev(struct usb_device *parent, set_dev_node(&dev->dev, dev_to_node(bus->sysdev)); dev->state = USB_STATE_ATTACHED; dev->lpm_disable_count = 1; + dev->offload_usage = 0; atomic_set(&dev->urbnum, 0); INIT_LIST_HEAD(&dev->ep0.urb_list); diff --git a/include/linux/usb.h b/include/linux/usb.h index 70ef00c42d22..e85105939af8 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -636,6 +636,8 @@ struct usb3_lpm_parameters { * @do_remote_wakeup: remote wakeup should be enabled * @reset_resume: needs reset instead of resume * @port_is_suspended: the upstream port is suspended (L2 or U3) + * @offload_at_suspend: offload activities during suspend is enabled. + * @offload_usage: number of offload activities happening on this usb device. * @slot_id: Slot ID assigned by xHCI * @l1_params: best effor service latency for USB2 L1 LPM state, and L1 timeout. * @u1_params: exit latencies for USB3 U1 LPM state, and hub-initiated timeout. @@ -724,6 +726,8 @@ struct usb_device { unsigned do_remote_wakeup:1; unsigned reset_resume:1; unsigned port_is_suspended:1; + unsigned offload_at_suspend:1; + int offload_usage; enum usb_link_tunnel_mode tunnel_mode; struct device_link *usb4_link; @@ -841,6 +845,20 @@ static inline void usb_mark_last_busy(struct usb_device *udev) { } #endif +#if IS_ENABLED(CONFIG_USB_XHCI_SIDEBAND) +int usb_offload_get(struct usb_device *udev); +int usb_offload_put(struct usb_device *udev); +bool usb_offload_check(struct usb_device *udev); +#else + +static inline int usb_offload_get(struct usb_device *udev) +{ return 0; } +static inline int usb_offload_put(struct usb_device *udev) +{ return 0; } +static inline bool usb_offload_check(struct usb_device *udev) +{ return false; } +#endif + extern int usb_disable_lpm(struct usb_device *udev); extern void usb_enable_lpm(struct usb_device *udev); /* Same as above, but these functions lock/unlock the bandwidth_mutex. */ -- cgit v1.2.3 From ef82a4803aabaf623bfcae07981406f1386eabf9 Mon Sep 17 00:00:00 2001 From: Guan-Yu Lin Date: Thu, 11 Sep 2025 14:20:15 +0000 Subject: xhci: sideband: add api to trace sideband usage The existing sideband driver only registers sidebands without tracking their active usage. To address this, sideband will now record its active usage when it creates/removes interrupters. In addition, a new api is introduced to provide a means for other dirvers to fetch sideband activity information on a USB host controller. Signed-off-by: Guan-Yu Lin Link: https://lore.kernel.org/r/20250911142051.90822-4-guanyulin@google.com Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250911142051.90822-4-guanyulin@google.com --- drivers/usb/host/xhci-sideband.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/usb/xhci-sideband.h | 9 +++++++++ 2 files changed, 45 insertions(+) (limited to 'include') diff --git a/drivers/usb/host/xhci-sideband.c b/drivers/usb/host/xhci-sideband.c index d49f9886dd84..e771a476fef2 100644 --- a/drivers/usb/host/xhci-sideband.c +++ b/drivers/usb/host/xhci-sideband.c @@ -266,6 +266,31 @@ xhci_sideband_get_event_buffer(struct xhci_sideband *sb) } EXPORT_SYMBOL_GPL(xhci_sideband_get_event_buffer); +/** + * xhci_sideband_check - check the existence of active sidebands + * @hcd: the host controller driver associated with the target host controller + * + * Allow other drivers, such as usb controller driver, to check if there are + * any sideband activity on the host controller. This information could be used + * for power management or other forms of resource management. The caller should + * ensure downstream usb devices are all either suspended or marked as + * "offload_at_suspend" to ensure the correctness of the return value. + * + * Returns true on any active sideband existence, false otherwise. + */ +bool xhci_sideband_check(struct usb_hcd *hcd) +{ + struct usb_device *udev = hcd->self.root_hub; + bool active; + + usb_lock_device(udev); + active = usb_offload_check(udev); + usb_unlock_device(udev); + + return active; +} +EXPORT_SYMBOL_GPL(xhci_sideband_check); + /** * xhci_sideband_create_interrupter - creates a new interrupter for this sideband * @sb: sideband instance for this usb device @@ -286,6 +311,7 @@ xhci_sideband_create_interrupter(struct xhci_sideband *sb, int num_seg, bool ip_autoclear, u32 imod_interval, int intr_num) { int ret = 0; + struct usb_device *udev; if (!sb || !sb->xhci) return -ENODEV; @@ -304,6 +330,9 @@ xhci_sideband_create_interrupter(struct xhci_sideband *sb, int num_seg, goto out; } + udev = sb->vdev->udev; + ret = usb_offload_get(udev); + sb->ir->ip_autoclear = ip_autoclear; out: @@ -323,6 +352,8 @@ EXPORT_SYMBOL_GPL(xhci_sideband_create_interrupter); void xhci_sideband_remove_interrupter(struct xhci_sideband *sb) { + struct usb_device *udev; + if (!sb || !sb->ir) return; @@ -330,6 +361,11 @@ xhci_sideband_remove_interrupter(struct xhci_sideband *sb) xhci_remove_secondary_interrupter(xhci_to_hcd(sb->xhci), sb->ir); sb->ir = NULL; + udev = sb->vdev->udev; + + if (udev->state != USB_STATE_NOTATTACHED) + usb_offload_put(udev); + mutex_unlock(&sb->mutex); } EXPORT_SYMBOL_GPL(xhci_sideband_remove_interrupter); diff --git a/include/linux/usb/xhci-sideband.h b/include/linux/usb/xhci-sideband.h index 45288c392f6e..005257085dcb 100644 --- a/include/linux/usb/xhci-sideband.h +++ b/include/linux/usb/xhci-sideband.h @@ -11,6 +11,7 @@ #include #include +#include #define EP_CTX_PER_DEV 31 /* FIXME defined twice, from xhci.h */ @@ -83,6 +84,14 @@ xhci_sideband_get_endpoint_buffer(struct xhci_sideband *sb, struct usb_host_endpoint *host_ep); struct sg_table * xhci_sideband_get_event_buffer(struct xhci_sideband *sb); + +#if IS_ENABLED(CONFIG_USB_XHCI_SIDEBAND) +bool xhci_sideband_check(struct usb_hcd *hcd); +#else +static inline bool xhci_sideband_check(struct usb_hcd *hcd) +{ return false; } +#endif /* IS_ENABLED(CONFIG_USB_XHCI_SIDEBAND) */ + int xhci_sideband_create_interrupter(struct xhci_sideband *sb, int num_seg, bool ip_autoclear, u32 imod_interval, int intr_num); -- cgit v1.2.3 From f338529ca9279e3bea392cb53cec8bd292909cb1 Mon Sep 17 00:00:00 2001 From: Sarthak Garg Date: Mon, 8 Sep 2025 16:11:21 +0530 Subject: mmc: core: Parse and use the new max-sd-hs-hz DT property Introduce a new device tree flag to cap the maximum High-Speed (HS) mode frequency for SD cards, accommodating board-specific electrical limitations which cannot support the default 50Mhz HS frequency and others. Signed-off-by: Sarthak Garg Signed-off-by: Ulf Hansson --- drivers/mmc/core/host.c | 2 ++ drivers/mmc/core/sd.c | 2 +- include/linux/mmc/host.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 5f0ec23aeff5..88c95dbfd9cf 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -302,6 +302,8 @@ int mmc_of_parse(struct mmc_host *host) /* f_max is obtained from the optional "max-frequency" property */ device_property_read_u32(dev, "max-frequency", &host->f_max); + device_property_read_u32(dev, "max-sd-hs-hz", &host->max_sd_hs_hz); + /* * Configure CD and WP pins. They are both by default active low to * match the SDHCI spec. If GPIOs are provided for CD and / or WP, the diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index ec02067f03c5..67cd63004829 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -359,7 +359,7 @@ static int mmc_read_switch(struct mmc_card *card) } if (status[13] & SD_MODE_HIGH_SPEED) - card->sw_caps.hs_max_dtr = HIGH_SPEED_MAX_DTR; + card->sw_caps.hs_max_dtr = card->host->max_sd_hs_hz ?: HIGH_SPEED_MAX_DTR; if (card->scr.sda_spec3) { card->sw_caps.sd3_bus_mode = status[13]; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e0d935a4ac1d..e0e2c265e5d1 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -576,6 +576,7 @@ struct mmc_host { int hsq_depth; u32 err_stats[MMC_ERR_MAX]; + u32 max_sd_hs_hz; unsigned long private[] ____cacheline_aligned; }; -- cgit v1.2.3 From d81c041ed5e444233daaa2d4b0f9ae4008c57f59 Mon Sep 17 00:00:00 2001 From: Ling Xu Date: Fri, 12 Sep 2025 14:13:00 +0100 Subject: misc: fastrpc: Remove kernel-side domain checks from capability ioctl Domain ID in the uAPI is misleading. Remove checks and log messages related to 'domain' field in capability structure. Update UAPI to mark the field as unused. Reviewed-by: Dmitry Baryshkov Reviewed-by: Ekansh Gupta Signed-off-by: Ling Xu Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250912131302.303199-3-srini@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 14 +------------- include/uapi/misc/fastrpc.h | 2 +- 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 53e88a1bc430..2dfcf8a66806 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -1723,7 +1723,6 @@ static int fastrpc_get_info_from_kernel(struct fastrpc_ioctl_capability *cap, uint32_t attribute_id = cap->attribute_id; uint32_t *dsp_attributes; unsigned long flags; - uint32_t domain = cap->domain; int err; spin_lock_irqsave(&cctx->lock, flags); @@ -1741,7 +1740,7 @@ static int fastrpc_get_info_from_kernel(struct fastrpc_ioctl_capability *cap, err = fastrpc_get_info_from_dsp(fl, dsp_attributes, FASTRPC_MAX_DSP_ATTRIBUTES); if (err == DSP_UNSUPPORTED_API) { dev_info(&cctx->rpdev->dev, - "Warning: DSP capabilities not supported on domain: %d\n", domain); + "Warning: DSP capabilities not supported\n"); kfree(dsp_attributes); return -EOPNOTSUPP; } else if (err) { @@ -1769,17 +1768,6 @@ static int fastrpc_get_dsp_info(struct fastrpc_user *fl, char __user *argp) return -EFAULT; cap.capability = 0; - if (cap.domain >= FASTRPC_DEV_MAX) { - dev_err(&fl->cctx->rpdev->dev, "Error: Invalid domain id:%d, err:%d\n", - cap.domain, err); - return -ECHRNG; - } - - /* Fastrpc Capablities does not support modem domain */ - if (cap.domain == MDSP_DOMAIN_ID) { - dev_err(&fl->cctx->rpdev->dev, "Error: modem not supported %d\n", err); - return -ECHRNG; - } if (cap.attribute_id >= FASTRPC_MAX_DSP_ATTRIBUTES) { dev_err(&fl->cctx->rpdev->dev, "Error: invalid attribute: %d, err: %d\n", diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h index f33d914d8f46..c6e2925f47e6 100644 --- a/include/uapi/misc/fastrpc.h +++ b/include/uapi/misc/fastrpc.h @@ -134,7 +134,7 @@ struct fastrpc_mem_unmap { }; struct fastrpc_ioctl_capability { - __u32 domain; + __u32 unused; /* deprecated, ignored by the kernel */ __u32 attribute_id; __u32 capability; /* dsp capability */ __u32 reserved[4]; -- cgit v1.2.3 From e9671ddd82eee96146a7359431a4e1f04ac2b076 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 12 Sep 2025 01:47:04 +0800 Subject: dt-bindings: clock: sun55i-a523-ccu: Add missing NPU module clock The main clock controller on the A523/T527 has the NPU's module clock. It was missing from the original submission, likely because that was based on the A523 user manual; the A523 is marketed without the NPU. Reviewed-by: Andre Przywara Acked-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250911174710.3149589-2-wens@kernel.org Signed-off-by: Chen-Yu Tsai --- include/dt-bindings/clock/sun55i-a523-ccu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/sun55i-a523-ccu.h b/include/dt-bindings/clock/sun55i-a523-ccu.h index c8259ac5ada7..54808fcfd556 100644 --- a/include/dt-bindings/clock/sun55i-a523-ccu.h +++ b/include/dt-bindings/clock/sun55i-a523-ccu.h @@ -185,5 +185,6 @@ #define CLK_FANOUT0 176 #define CLK_FANOUT1 177 #define CLK_FANOUT2 178 +#define CLK_NPU 179 #endif /* _DT_BINDINGS_CLK_SUN55I_A523_CCU_H_ */ -- cgit v1.2.3 From 0f610e650d4e979490ccfa4c22ca29ca547f41e7 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 12 Sep 2025 01:47:05 +0800 Subject: dt-bindings: clock: sun55i-a523-ccu: Add A523 MCU CCU clock controller There are four clock controllers in the A523 SoC. The existing binding already covers two of them that are critical for basic operation. The remaining ones are the MCU clock controller and CPU PLL clock controller. Add a description for the MCU CCU. This unit controls and provides clocks to the MCU (RISC-V) subsystem and peripherals meant to operate under low power conditions. Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250911174710.3149589-3-wens@kernel.org Signed-off-by: Chen-Yu Tsai --- .../bindings/clock/allwinner,sun55i-a523-ccu.yaml | 37 ++++++++++++++- include/dt-bindings/clock/sun55i-a523-mcu-ccu.h | 54 ++++++++++++++++++++++ include/dt-bindings/reset/sun55i-a523-mcu-ccu.h | 30 ++++++++++++ 3 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 include/dt-bindings/clock/sun55i-a523-mcu-ccu.h create mode 100644 include/dt-bindings/reset/sun55i-a523-mcu-ccu.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun55i-a523-ccu.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun55i-a523-ccu.yaml index f5f62e9a10a1..58be701a720e 100644 --- a/Documentation/devicetree/bindings/clock/allwinner,sun55i-a523-ccu.yaml +++ b/Documentation/devicetree/bindings/clock/allwinner,sun55i-a523-ccu.yaml @@ -19,6 +19,7 @@ properties: compatible: enum: - allwinner,sun55i-a523-ccu + - allwinner,sun55i-a523-mcu-ccu - allwinner,sun55i-a523-r-ccu reg: @@ -26,11 +27,11 @@ properties: clocks: minItems: 4 - maxItems: 5 + maxItems: 9 clock-names: minItems: 4 - maxItems: 5 + maxItems: 9 required: - "#clock-cells" @@ -63,6 +64,38 @@ allOf: - const: iosc - const: losc-fanout + - if: + properties: + compatible: + enum: + - allwinner,sun55i-a523-mcu-ccu + + then: + properties: + clocks: + items: + - description: High Frequency Oscillator (usually at 24MHz) + - description: Low Frequency Oscillator (usually at 32kHz) + - description: Internal Oscillator + - description: Audio PLL (4x) + - description: Peripherals PLL 0 (300 MHz output) + - description: DSP module clock + - description: MBUS clock + - description: PRCM AHB clock + - description: PRCM APB0 clock + + clock-names: + items: + - const: hosc + - const: losc + - const: iosc + - const: pll-audio0-4x + - const: pll-periph0-300m + - const: dsp + - const: mbus + - const: r-ahb + - const: r-apb0 + - if: properties: compatible: diff --git a/include/dt-bindings/clock/sun55i-a523-mcu-ccu.h b/include/dt-bindings/clock/sun55i-a523-mcu-ccu.h new file mode 100644 index 000000000000..6efc6bc7e11a --- /dev/null +++ b/include/dt-bindings/clock/sun55i-a523-mcu-ccu.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Chen-Yu Tsai + */ + +#ifndef _DT_BINDINGS_CLK_SUN55I_A523_MCU_CCU_H_ +#define _DT_BINDINGS_CLK_SUN55I_A523_MCU_CCU_H_ + +#define CLK_MCU_PLL_AUDIO1 0 +#define CLK_MCU_PLL_AUDIO1_DIV2 1 +#define CLK_MCU_PLL_AUDIO1_DIV5 2 +#define CLK_MCU_AUDIO_OUT 3 +#define CLK_MCU_DSP 4 +#define CLK_MCU_I2S0 5 +#define CLK_MCU_I2S1 6 +#define CLK_MCU_I2S2 7 +#define CLK_MCU_I2S3 8 +#define CLK_MCU_I2S3_ASRC 9 +#define CLK_BUS_MCU_I2S0 10 +#define CLK_BUS_MCU_I2S1 11 +#define CLK_BUS_MCU_I2S2 12 +#define CLK_BUS_MCU_I2S3 13 +#define CLK_MCU_SPDIF_TX 14 +#define CLK_MCU_SPDIF_RX 15 +#define CLK_BUS_MCU_SPDIF 16 +#define CLK_MCU_DMIC 17 +#define CLK_BUS_MCU_DMIC 18 +#define CLK_MCU_AUDIO_CODEC_DAC 19 +#define CLK_MCU_AUDIO_CODEC_ADC 20 +#define CLK_BUS_MCU_AUDIO_CODEC 21 +#define CLK_BUS_MCU_DSP_MSGBOX 22 +#define CLK_BUS_MCU_DSP_CFG 23 +#define CLK_BUS_MCU_NPU_HCLK 24 +#define CLK_BUS_MCU_NPU_ACLK 25 +#define CLK_MCU_TIMER0 26 +#define CLK_MCU_TIMER1 27 +#define CLK_MCU_TIMER2 28 +#define CLK_MCU_TIMER3 29 +#define CLK_MCU_TIMER4 30 +#define CLK_MCU_TIMER5 31 +#define CLK_BUS_MCU_TIMER 32 +#define CLK_BUS_MCU_DMA 33 +#define CLK_MCU_TZMA0 34 +#define CLK_MCU_TZMA1 35 +#define CLK_BUS_MCU_PUBSRAM 36 +#define CLK_MCU_MBUS_DMA 37 +#define CLK_MCU_MBUS 38 +#define CLK_MCU_RISCV 39 +#define CLK_BUS_MCU_RISCV_CFG 40 +#define CLK_BUS_MCU_RISCV_MSGBOX 41 +#define CLK_MCU_PWM0 42 +#define CLK_BUS_MCU_PWM0 43 + +#endif /* _DT_BINDINGS_CLK_SUN55I_A523_MCU_CCU_H_ */ diff --git a/include/dt-bindings/reset/sun55i-a523-mcu-ccu.h b/include/dt-bindings/reset/sun55i-a523-mcu-ccu.h new file mode 100644 index 000000000000..a89a0b44f08b --- /dev/null +++ b/include/dt-bindings/reset/sun55i-a523-mcu-ccu.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Chen-Yu Tsai + */ + +#ifndef _DT_BINDINGS_RST_SUN55I_A523_MCU_CCU_H_ +#define _DT_BINDINGS_RST_SUN55I_A523_MCU_CCU_H_ + +#define RST_BUS_MCU_I2S0 0 +#define RST_BUS_MCU_I2S1 1 +#define RST_BUS_MCU_I2S2 2 +#define RST_BUS_MCU_I2S3 3 +#define RST_BUS_MCU_SPDIF 4 +#define RST_BUS_MCU_DMIC 5 +#define RST_BUS_MCU_AUDIO_CODEC 6 +#define RST_BUS_MCU_DSP_MSGBOX 7 +#define RST_BUS_MCU_DSP_CFG 8 +#define RST_BUS_MCU_NPU 9 +#define RST_BUS_MCU_TIMER 10 +#define RST_BUS_MCU_DSP_DEBUG 11 +#define RST_BUS_MCU_DSP 12 +#define RST_BUS_MCU_DMA 13 +#define RST_BUS_MCU_PUBSRAM 14 +#define RST_BUS_MCU_RISCV_CFG 15 +#define RST_BUS_MCU_RISCV_DEBUG 16 +#define RST_BUS_MCU_RISCV_CORE 17 +#define RST_BUS_MCU_RISCV_MSGBOX 18 +#define RST_BUS_MCU_PWM0 19 + +#endif /* _DT_BINDINGS_RST_SUN55I_A523_MCU_CCU_H_ */ -- cgit v1.2.3 From f15bc37d8c336e79491209b268e73868c44733c4 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 8 Sep 2025 07:35:21 +0000 Subject: iio: add IIO_ALTCURRENT channel type Add support for IIO_ALTCURRENT channel type to distinguish AC current measurements from DC current measurements. This follows the same pattern as IIO_VOLTAGE and IIO_ALTVOLTAGE. Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 1 + include/uapi/linux/iio/types.h | 1 + tools/iio/iio_event_monitor.c | 2 ++ 3 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 38848d753a33..4ff2c44f9324 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -97,6 +97,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_COLORTEMP] = "colortemp", [IIO_CHROMATICITY] = "chromaticity", [IIO_ATTENTION] = "attention", + [IIO_ALTCURRENT] = "altcurrent", }; static const char * const iio_modifier_names[] = { diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 3eb0821af7a4..3c3cc1497a1e 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -52,6 +52,7 @@ enum iio_chan_type { IIO_COLORTEMP, IIO_CHROMATICITY, IIO_ATTENTION, + IIO_ALTCURRENT, }; enum iio_modifier { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index eab7b082f19d..d26aff649f3f 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -64,6 +64,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_COLORTEMP] = "colortemp", [IIO_CHROMATICITY] = "chromaticity", [IIO_ATTENTION] = "attention", + [IIO_ALTCURRENT] = "altcurrent", }; static const char * const iio_ev_type_text[] = { @@ -187,6 +188,7 @@ static bool event_is_known(struct iio_event_data *event) case IIO_COLORTEMP: case IIO_CHROMATICITY: case IIO_ATTENTION: + case IIO_ALTCURRENT: break; default: return false; -- cgit v1.2.3 From 70da02061499ca89ab92f7c4310f815d5fe674ec Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 8 Sep 2025 07:35:22 +0000 Subject: iio: add power and energy measurement modifiers Add new IIO modifiers to support power and energy measurement devices: Power modifiers: - IIO_MOD_ACTIVE: Real power consumed by the load - IIO_MOD_REACTIVE: Power that oscillates between source and load - IIO_MOD_APPARENT: Magnitude of complex power Signal quality modifiers: - IIO_MOD_RMS: Root Mean Square value Additionally adds: - IIO_CHAN_INFO_POWERFACTOR: Power factor channel info type for representing the ratio of active power to apparent power These modifiers enable proper representation of power measurement devices like energy meters and power analyzers. Signed-off-by: Antoniu Miclaus Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 29 +++++++++++++++++++++++++++++ drivers/iio/industrialio-core.c | 5 +++++ include/linux/iio/types.h | 1 + include/uapi/linux/iio/types.h | 4 ++++ tools/iio/iio_event_monitor.c | 8 ++++++++ 5 files changed, 47 insertions(+) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 2fb2cea4b192..78da68826307 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -167,7 +167,18 @@ Description: is required is a consistent labeling. Units after application of scale and offset are millivolts. +What: /sys/bus/iio/devices/iio:deviceX/in_altvoltageY_rms_raw +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Raw (unscaled) Root Mean Square (RMS) voltage measurement from + channel Y. Units after application of scale and offset are + millivolts. + What: /sys/bus/iio/devices/iio:deviceX/in_powerY_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_active_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_reactive_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_apparent_raw KernelVersion: 4.5 Contact: linux-iio@vger.kernel.org Description: @@ -176,6 +187,13 @@ Description: unique to allow association with event codes. Units after application of scale and offset are milliwatts. +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_powerfactor +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Power factor measurement from channel Y. Power factor is the + ratio of active power to apparent power. The value is unitless. + What: /sys/bus/iio/devices/iio:deviceX/in_capacitanceY_raw KernelVersion: 3.2 Contact: linux-iio@vger.kernel.org @@ -1569,6 +1587,9 @@ Description: What: /sys/.../iio:deviceX/in_energy_input What: /sys/.../iio:deviceX/in_energy_raw +What: /sys/.../iio:deviceX/in_energyY_active_raw +What: /sys/.../iio:deviceX/in_energyY_reactive_raw +What: /sys/.../iio:deviceX/in_energyY_apparent_raw KernelVersion: 4.0 Contact: linux-iio@vger.kernel.org Description: @@ -1707,6 +1728,14 @@ Description: component of the signal while the 'q' channel contains the quadrature component. +What: /sys/bus/iio/devices/iio:deviceX/in_altcurrentY_rms_raw +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Raw (unscaled no bias removal etc.) Root Mean Square (RMS) current + measurement from channel Y. Units after application of scale and + offset are milliamps. + What: /sys/.../iio:deviceX/in_energy_en What: /sys/.../iio:deviceX/in_distance_en What: /sys/.../iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_en diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 4ff2c44f9324..88c3d585a1bd 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -153,6 +153,10 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_PITCH] = "pitch", [IIO_MOD_YAW] = "yaw", [IIO_MOD_ROLL] = "roll", + [IIO_MOD_RMS] = "rms", + [IIO_MOD_ACTIVE] = "active", + [IIO_MOD_REACTIVE] = "reactive", + [IIO_MOD_APPARENT] = "apparent", }; /* relies on pairs of these shared then separate */ @@ -190,6 +194,7 @@ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_ZEROPOINT] = "zeropoint", [IIO_CHAN_INFO_TROUGH] = "trough_raw", [IIO_CHAN_INFO_CONVDELAY] = "convdelay", + [IIO_CHAN_INFO_POWERFACTOR] = "powerfactor", }; /** * iio_device_id() - query the unique ID for the device diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index ad2761efcc83..34eebad12d2c 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -70,6 +70,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_ZEROPOINT, IIO_CHAN_INFO_TROUGH, IIO_CHAN_INFO_CONVDELAY, + IIO_CHAN_INFO_POWERFACTOR, }; #endif /* _IIO_TYPES_H_ */ diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 3c3cc1497a1e..6d269b844271 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -109,6 +109,10 @@ enum iio_modifier { IIO_MOD_ROLL, IIO_MOD_LIGHT_UVA, IIO_MOD_LIGHT_UVB, + IIO_MOD_RMS, + IIO_MOD_ACTIVE, + IIO_MOD_REACTIVE, + IIO_MOD_APPARENT, }; enum iio_event_type { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index d26aff649f3f..03ca33869ce8 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -141,6 +141,10 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_PITCH] = "pitch", [IIO_MOD_YAW] = "yaw", [IIO_MOD_ROLL] = "roll", + [IIO_MOD_RMS] = "rms", + [IIO_MOD_ACTIVE] = "active", + [IIO_MOD_REACTIVE] = "reactive", + [IIO_MOD_APPARENT] = "apparent", }; static bool event_is_known(struct iio_event_data *event) @@ -240,6 +244,10 @@ static bool event_is_known(struct iio_event_data *event) case IIO_MOD_PM4: case IIO_MOD_PM10: case IIO_MOD_O2: + case IIO_MOD_RMS: + case IIO_MOD_ACTIVE: + case IIO_MOD_REACTIVE: + case IIO_MOD_APPARENT: break; default: return false; -- cgit v1.2.3 From 3422b4bc606eee2ba7758ea9347c83332eeec3e3 Mon Sep 17 00:00:00 2001 From: Duje Mihanović Date: Thu, 11 Sep 2025 14:43:45 +0200 Subject: iio: adc: Add driver for Marvell 88PM886 PMIC ADC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marvell's 88PM886 PMIC has a so-called General Purpose ADC used for monitoring various system voltages and temperatures. Add the relevant register definitions to the MFD header and a driver for the ADC. Acked-by: Karel Balej # for the PMIC Signed-off-by: Duje Mihanović Signed-off-by: Jonathan Cameron --- MAINTAINERS | 5 + drivers/iio/adc/88pm886-gpadc.c | 393 ++++++++++++++++++++++++++++++++++++++++ drivers/iio/adc/Kconfig | 13 ++ drivers/iio/adc/Makefile | 1 + include/linux/mfd/88pm886.h | 58 ++++++ 5 files changed, 470 insertions(+) create mode 100644 drivers/iio/adc/88pm886-gpadc.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 75615d82593e..5c572204933d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14714,6 +14714,11 @@ F: drivers/regulator/88pm886-regulator.c F: drivers/rtc/rtc-88pm886.c F: include/linux/mfd/88pm886.h +MARVELL 88PM886 PMIC GPADC DRIVER +M: Duje Mihanović +S: Maintained +F: drivers/iio/adc/88pm886-gpadc.c + MARVELL ARMADA 3700 PHY DRIVERS M: Miquel Raynal S: Maintained diff --git a/drivers/iio/adc/88pm886-gpadc.c b/drivers/iio/adc/88pm886-gpadc.c new file mode 100644 index 000000000000..cffe35136685 --- /dev/null +++ b/drivers/iio/adc/88pm886-gpadc.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025, Duje Mihanović + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +struct pm886_gpadc { + struct regmap *map; +}; + +enum pm886_gpadc_channel { + VSC_CHAN, + VCHG_PWR_CHAN, + VCF_OUT_CHAN, + VBAT_CHAN, + VBAT_SLP_CHAN, + VBUS_CHAN, + + GPADC0_CHAN, + GPADC1_CHAN, + GPADC2_CHAN, + GPADC3_CHAN, + + GND_DET1_CHAN, + GND_DET2_CHAN, + MIC_DET_CHAN, + + TINT_CHAN, +}; + +static const int pm886_gpadc_regs[] = { + [VSC_CHAN] = PM886_REG_GPADC_VSC, + [VCHG_PWR_CHAN] = PM886_REG_GPADC_VCHG_PWR, + [VCF_OUT_CHAN] = PM886_REG_GPADC_VCF_OUT, + [VBAT_CHAN] = PM886_REG_GPADC_VBAT, + [VBAT_SLP_CHAN] = PM886_REG_GPADC_VBAT_SLP, + [VBUS_CHAN] = PM886_REG_GPADC_VBUS, + + [GPADC0_CHAN] = PM886_REG_GPADC_GPADC0, + [GPADC1_CHAN] = PM886_REG_GPADC_GPADC1, + [GPADC2_CHAN] = PM886_REG_GPADC_GPADC2, + [GPADC3_CHAN] = PM886_REG_GPADC_GPADC3, + + [GND_DET1_CHAN] = PM886_REG_GPADC_GND_DET1, + [GND_DET2_CHAN] = PM886_REG_GPADC_GND_DET2, + [MIC_DET_CHAN] = PM886_REG_GPADC_MIC_DET, + + [TINT_CHAN] = PM886_REG_GPADC_TINT, +}; + +#define ADC_CHANNEL_VOLTAGE(index, lsb, name) \ +{ \ + .type = IIO_VOLTAGE, \ + .indexed = 1, \ + .channel = index, \ + .address = lsb, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ + BIT(IIO_CHAN_INFO_SCALE), \ + .datasheet_name = name, \ +} + +#define ADC_CHANNEL_RESISTANCE(index, lsb, name) \ +{ \ + .type = IIO_RESISTANCE, \ + .indexed = 1, \ + .channel = index, \ + .address = lsb, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_PROCESSED), \ + .datasheet_name = name, \ +} + +#define ADC_CHANNEL_TEMPERATURE(index, lsb, name) \ +{ \ + .type = IIO_TEMP, \ + .indexed = 1, \ + .channel = index, \ + .address = lsb, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ + BIT(IIO_CHAN_INFO_SCALE) | \ + BIT(IIO_CHAN_INFO_OFFSET), \ + .datasheet_name = name, \ +} + +static const struct iio_chan_spec pm886_gpadc_channels[] = { + ADC_CHANNEL_VOLTAGE(VSC_CHAN, 1367, "vsc"), + ADC_CHANNEL_VOLTAGE(VCHG_PWR_CHAN, 1709, "vchg_pwr"), + ADC_CHANNEL_VOLTAGE(VCF_OUT_CHAN, 1367, "vcf_out"), + ADC_CHANNEL_VOLTAGE(VBAT_CHAN, 1367, "vbat"), + ADC_CHANNEL_VOLTAGE(VBAT_SLP_CHAN, 1367, "vbat_slp"), + ADC_CHANNEL_VOLTAGE(VBUS_CHAN, 1709, "vbus"), + + ADC_CHANNEL_RESISTANCE(GPADC0_CHAN, 342, "gpadc0"), + ADC_CHANNEL_RESISTANCE(GPADC1_CHAN, 342, "gpadc1"), + ADC_CHANNEL_RESISTANCE(GPADC2_CHAN, 342, "gpadc2"), + ADC_CHANNEL_RESISTANCE(GPADC3_CHAN, 342, "gpadc3"), + + ADC_CHANNEL_VOLTAGE(GND_DET1_CHAN, 342, "gnddet1"), + ADC_CHANNEL_VOLTAGE(GND_DET2_CHAN, 342, "gnddet2"), + ADC_CHANNEL_VOLTAGE(MIC_DET_CHAN, 1367, "mic_det"), + + ADC_CHANNEL_TEMPERATURE(TINT_CHAN, 104, "tint"), +}; + +static const struct regmap_config pm886_gpadc_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = PM886_GPADC_MAX_REGISTER, +}; + +static int gpadc_get_raw(struct iio_dev *iio, enum pm886_gpadc_channel chan) +{ + struct pm886_gpadc *gpadc = iio_priv(iio); + __be16 buf; + int ret; + + ret = regmap_bulk_read(gpadc->map, pm886_gpadc_regs[chan], &buf, sizeof(buf)); + if (ret) + return ret; + + return be16_to_cpu(buf) >> 4; +} + +static int +gpadc_set_bias(struct pm886_gpadc *gpadc, enum pm886_gpadc_channel chan, bool on) +{ + unsigned int gpadc_num = chan - GPADC0_CHAN; + unsigned int bits = BIT(gpadc_num + 4) | BIT(gpadc_num); + + return regmap_assign_bits(gpadc->map, PM886_REG_GPADC_CONFIG(0x14), bits, on); +} + +static int +gpadc_find_bias_current(struct iio_dev *iio, struct iio_chan_spec const *chan, + unsigned int *raw_uV, unsigned int *raw_uA) +{ + struct pm886_gpadc *gpadc = iio_priv(iio); + unsigned int gpadc_num = chan->channel - GPADC0_CHAN; + unsigned int reg = PM886_REG_GPADC_CONFIG(0xb + gpadc_num); + unsigned long lsb = chan->address; + int ret; + + for (unsigned int i = 0; i < PM886_GPADC_BIAS_LEVELS; i++) { + ret = regmap_update_bits(gpadc->map, reg, GENMASK(3, 0), i); + if (ret) + return ret; + + /* Wait for the new bias level to apply. */ + fsleep(5 * USEC_PER_MSEC); + + *raw_uA = PM886_GPADC_INDEX_TO_BIAS_uA(i); + *raw_uV = gpadc_get_raw(iio, chan->channel) * lsb; + + /* + * Vendor kernel errors out above 1.25 V, but testing shows + * that the resistance of the battery detection channel (GPADC2 + * on coreprimevelte) reaches about 1.4 MΩ when the battery is + * removed, which can't be measured with such a low upper + * limit. Therefore, to be able to detect the battery without + * ugly externs as used in the vendor fuel gauge driver, + * increase this limit a bit. + */ + if (WARN_ON(*raw_uV > 1500 * (MICRO / MILLI))) + return -EIO; + + /* + * Vendor kernel errors out under 300 mV, but for the same + * reason as above (except the channel hovers around 3.5 kΩ + * with battery present) reduce this limit. + */ + if (*raw_uV < 200 * (MICRO / MILLI)) { + dev_dbg(&iio->dev, "bad bias for chan %d: %d uA @ %d uV\n", + chan->channel, *raw_uA, *raw_uV); + continue; + } + + dev_dbg(&iio->dev, "good bias for chan %d: %d uA @ %d uV\n", + chan->channel, *raw_uA, *raw_uV); + return 0; + } + + dev_err(&iio->dev, "failed to find good bias for chan %d\n", chan->channel); + return -EINVAL; +} + +static int +gpadc_get_resistance_ohm(struct iio_dev *iio, struct iio_chan_spec const *chan) +{ + struct pm886_gpadc *gpadc = iio_priv(iio); + unsigned int raw_uV, raw_uA; + int ret; + + ret = gpadc_set_bias(gpadc, chan->channel, true); + if (ret) + goto out; + + ret = gpadc_find_bias_current(iio, chan, &raw_uV, &raw_uA); + if (ret) + goto out; + + ret = DIV_ROUND_CLOSEST(raw_uV, raw_uA); +out: + gpadc_set_bias(gpadc, chan->channel, false); + return ret; +} + +static int +__pm886_gpadc_read_raw(struct iio_dev *iio, struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + unsigned long lsb = chan->address; + + switch (mask) { + case IIO_CHAN_INFO_RAW: + *val = gpadc_get_raw(iio, chan->channel); + if (*val < 0) + return *val; + + return IIO_VAL_INT; + case IIO_CHAN_INFO_SCALE: + *val = lsb; + + if (chan->type == IIO_VOLTAGE) { + *val2 = MILLI; + return IIO_VAL_FRACTIONAL; + } else { + return IIO_VAL_INT; + } + case IIO_CHAN_INFO_OFFSET: + /* Raw value is 104 millikelvin/LSB, convert it to 104 millicelsius/LSB */ + *val = ABSOLUTE_ZERO_MILLICELSIUS; + *val2 = lsb; + return IIO_VAL_FRACTIONAL; + case IIO_CHAN_INFO_PROCESSED: + *val = gpadc_get_resistance_ohm(iio, chan); + if (*val < 0) + return *val; + + return IIO_VAL_INT; + default: + return -EINVAL; + } +} + +static int pm886_gpadc_read_raw(struct iio_dev *iio, struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + struct device *dev = iio->dev.parent; + int ret; + + ret = pm_runtime_resume_and_get(dev); + if (ret) + return ret; + + ret = __pm886_gpadc_read_raw(iio, chan, val, val2, mask); + + pm_runtime_put_autosuspend(dev); + return ret; +} + +static int pm886_gpadc_hw_enable(struct regmap *map) +{ + const u8 config[] = { + PM886_GPADC_CONFIG1_EN_ALL, + PM886_GPADC_CONFIG2_EN_ALL, + PM886_GPADC_GND_DET2_EN, + }; + int ret; + + /* Enable the ADC block. */ + ret = regmap_set_bits(map, PM886_REG_GPADC_CONFIG(0x6), BIT(0)); + if (ret) + return ret; + + /* Enable all channels. */ + return regmap_bulk_write(map, PM886_REG_GPADC_CONFIG(0x1), config, ARRAY_SIZE(config)); +} + +static int pm886_gpadc_hw_disable(struct regmap *map) +{ + return regmap_clear_bits(map, PM886_REG_GPADC_CONFIG(0x6), BIT(0)); +} + +static const struct iio_info pm886_gpadc_iio_info = { + .read_raw = pm886_gpadc_read_raw, +}; + +static int pm886_gpadc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct pm886_chip *chip = dev_get_drvdata(dev->parent); + struct i2c_client *client = chip->client; + struct pm886_gpadc *gpadc; + struct i2c_client *page; + struct iio_dev *iio; + int ret; + + iio = devm_iio_device_alloc(dev, sizeof(*gpadc)); + if (!iio) + return -ENOMEM; + + gpadc = iio_priv(iio); + dev_set_drvdata(dev, iio); + + page = devm_i2c_new_dummy_device(dev, client->adapter, + client->addr + PM886_PAGE_OFFSET_GPADC); + if (IS_ERR(page)) + return dev_err_probe(dev, PTR_ERR(page), "Failed to initialize GPADC page\n"); + + gpadc->map = devm_regmap_init_i2c(page, &pm886_gpadc_regmap_config); + if (IS_ERR(gpadc->map)) + return dev_err_probe(dev, PTR_ERR(gpadc->map), + "Failed to initialize GPADC regmap\n"); + + iio->name = "88pm886-gpadc"; + iio->modes = INDIO_DIRECT_MODE; + iio->info = &pm886_gpadc_iio_info; + iio->channels = pm886_gpadc_channels; + iio->num_channels = ARRAY_SIZE(pm886_gpadc_channels); + device_set_node(&iio->dev, dev_fwnode(dev->parent)); + + ret = devm_pm_runtime_enable(dev); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable runtime PM\n"); + + pm_runtime_set_autosuspend_delay(dev, 50); + pm_runtime_use_autosuspend(dev); + ret = devm_iio_device_register(dev, iio); + if (ret) + return dev_err_probe(dev, ret, "Failed to register ADC\n"); + + return 0; +} + +static int pm886_gpadc_runtime_resume(struct device *dev) +{ + struct iio_dev *iio = dev_get_drvdata(dev); + struct pm886_gpadc *gpadc = iio_priv(iio); + + return pm886_gpadc_hw_enable(gpadc->map); +} + +static int pm886_gpadc_runtime_suspend(struct device *dev) +{ + struct iio_dev *iio = dev_get_drvdata(dev); + struct pm886_gpadc *gpadc = iio_priv(iio); + + return pm886_gpadc_hw_disable(gpadc->map); +} + +static DEFINE_RUNTIME_DEV_PM_OPS(pm886_gpadc_pm_ops, + pm886_gpadc_runtime_suspend, + pm886_gpadc_runtime_resume, NULL); + +static const struct platform_device_id pm886_gpadc_id[] = { + { "88pm886-gpadc" }, + { } +}; +MODULE_DEVICE_TABLE(platform, pm886_gpadc_id); + +static struct platform_driver pm886_gpadc_driver = { + .driver = { + .name = "88pm886-gpadc", + .pm = pm_ptr(&pm886_gpadc_pm_ops), + }, + .probe = pm886_gpadc_probe, + .id_table = pm886_gpadc_id, +}; +module_platform_driver(pm886_gpadc_driver); + +MODULE_AUTHOR("Duje Mihanović "); +MODULE_DESCRIPTION("Marvell 88PM886 GPADC driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index 09ab25c8cfdb..8c84b2c1d223 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -9,6 +9,19 @@ menu "Analog to digital converters" config IIO_ADC_HELPER tristate +config 88PM886_GPADC + tristate "Marvell 88PM886 GPADC driver" + depends on MFD_88PM886_PMIC + default MFD_88PM886_PMIC + help + Say Y here to enable support for the GPADC (General Purpose ADC) + found on the Marvell 88PM886 PMIC. The GPADC measures various + internal voltages and temperatures, including (but not limited to) + system, battery and USB Vbus. + + To compile this driver as a module, choose M here: the module will be + called 88pm886-gpadc. + config AB8500_GPADC bool "ST-Ericsson AB8500 GPADC driver" depends on AB8500_CORE && REGULATOR_AB8500 diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile index 4d071f290911..611c16430621 100644 --- a/drivers/iio/adc/Makefile +++ b/drivers/iio/adc/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_IIO_ADC_HELPER) += industrialio-adc.o # When adding new entries keep the list in alphabetical order +obj-$(CONFIG_88PM886_GPADC) += 88pm886-gpadc.o obj-$(CONFIG_AB8500_GPADC) += ab8500-gpadc.o obj-$(CONFIG_AD_SIGMA_DELTA) += ad_sigma_delta.o obj-$(CONFIG_AD4000) += ad4000.o diff --git a/include/linux/mfd/88pm886.h b/include/linux/mfd/88pm886.h index 85eca44f39ab..38892ba7b8a4 100644 --- a/include/linux/mfd/88pm886.h +++ b/include/linux/mfd/88pm886.h @@ -10,6 +10,7 @@ #define PM886_IRQ_ONKEY 0 #define PM886_PAGE_OFFSET_REGULATORS 1 +#define PM886_PAGE_OFFSET_GPADC 2 #define PM886_REG_ID 0x00 @@ -70,6 +71,63 @@ #define PM886_LDO_VSEL_MASK 0x0f #define PM886_BUCK_VSEL_MASK 0x7f +/* GPADC enable/disable registers */ +#define PM886_REG_GPADC_CONFIG(n) (n) + +#define PM886_GPADC_VSC_EN BIT(0) +#define PM886_GPADC_VBAT_EN BIT(1) +#define PM886_GPADC_GNDDET1_EN BIT(3) +#define PM886_GPADC_VBUS_EN BIT(4) +#define PM886_GPADC_VCHG_PWR_EN BIT(5) +#define PM886_GPADC_VCF_OUT_EN BIT(6) +#define PM886_GPADC_CONFIG1_EN_ALL \ + (PM886_GPADC_VSC_EN | \ + PM886_GPADC_VBAT_EN | \ + PM886_GPADC_GNDDET1_EN | \ + PM886_GPADC_VBUS_EN | \ + PM886_GPADC_VCHG_PWR_EN | \ + PM886_GPADC_VCF_OUT_EN) + +#define PM886_GPADC_TINT_EN BIT(0) +#define PM886_GPADC_PMODE_EN BIT(1) +#define PM886_GPADC_GPADC0_EN BIT(2) +#define PM886_GPADC_GPADC1_EN BIT(3) +#define PM886_GPADC_GPADC2_EN BIT(4) +#define PM886_GPADC_GPADC3_EN BIT(5) +#define PM886_GPADC_MIC_DET_EN BIT(6) +#define PM886_GPADC_CONFIG2_EN_ALL \ + (PM886_GPADC_TINT_EN | \ + PM886_GPADC_GPADC0_EN | \ + PM886_GPADC_GPADC1_EN | \ + PM886_GPADC_GPADC2_EN | \ + PM886_GPADC_GPADC3_EN | \ + PM886_GPADC_MIC_DET_EN) + +/* No CONFIG3_EN_ALL because this is the only bit there. */ +#define PM886_GPADC_GND_DET2_EN BIT(0) + +/* GPADC channel registers */ +#define PM886_REG_GPADC_VSC 0x40 +#define PM886_REG_GPADC_VCHG_PWR 0x4c +#define PM886_REG_GPADC_VCF_OUT 0x4e +#define PM886_REG_GPADC_TINT 0x50 +#define PM886_REG_GPADC_GPADC0 0x54 +#define PM886_REG_GPADC_GPADC1 0x56 +#define PM886_REG_GPADC_GPADC2 0x58 +#define PM886_REG_GPADC_VBAT 0xa0 +#define PM886_REG_GPADC_GND_DET1 0xa4 +#define PM886_REG_GPADC_GND_DET2 0xa6 +#define PM886_REG_GPADC_VBUS 0xa8 +#define PM886_REG_GPADC_GPADC3 0xaa +#define PM886_REG_GPADC_MIC_DET 0xac +#define PM886_REG_GPADC_VBAT_SLP 0xb0 + +/* VBAT_SLP is the last register and is 2 bytes wide like other channels. */ +#define PM886_GPADC_MAX_REGISTER (PM886_REG_GPADC_VBAT_SLP + 1) + +#define PM886_GPADC_BIAS_LEVELS 16 +#define PM886_GPADC_INDEX_TO_BIAS_uA(i) (1 + (i) * 5) + struct pm886_chip { struct i2c_client *client; unsigned int chip_id; -- cgit v1.2.3 From 1ab40529ad52f339975886a6a9e815dfdcb8d011 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Tue, 15 Jul 2025 21:52:54 +0300 Subject: media: uvcvideo: Move MSXU_CONTROL_METADATA definition to header Move the MSXU_CONTROL_METADATA control definitino to the include/linux/usb/uvc.h header, alongside the corresponding XU GUID. Add a UVC_ prefix to avoid namespace clashes. While at it, add the definition for the other controls for that extension unit, as defined in https://learn.microsoft.com/en-us/windows-hardware/drivers/stream/uvc-extensions-1-5#222-extension-unit-controls. Signed-off-by: Laurent Pinchart Reviewed-by: Ricardo Ribalda Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Hans Verkuil --- drivers/media/usb/uvc/uvc_metadata.c | 11 +++++------ include/linux/usb/uvc.h | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/media/usb/uvc/uvc_metadata.c b/drivers/media/usb/uvc/uvc_metadata.c index 7368400734a3..b35111d08759 100644 --- a/drivers/media/usb/uvc/uvc_metadata.c +++ b/drivers/media/usb/uvc/uvc_metadata.c @@ -171,7 +171,6 @@ static struct uvc_entity *uvc_meta_find_msxu(struct uvc_device *dev) return NULL; } -#define MSXU_CONTROL_METADATA 0x9 static int uvc_meta_detect_msxu(struct uvc_device *dev) { u32 *data __free(kfree) = NULL; @@ -195,7 +194,7 @@ static int uvc_meta_detect_msxu(struct uvc_device *dev) * returns metadata. */ ret = uvc_query_ctrl(dev, UVC_GET_CUR, entity->id, dev->intfnum, - MSXU_CONTROL_METADATA, data, sizeof(*data)); + UVC_MSXU_CONTROL_METADATA, data, sizeof(*data)); if (ret) return 0; @@ -205,23 +204,23 @@ static int uvc_meta_detect_msxu(struct uvc_device *dev) } /* - * Set the value of MSXU_CONTROL_METADATA to the value reported by + * Set the value of UVC_MSXU_CONTROL_METADATA to the value reported by * GET_MAX to enable production of MSXU metadata. The GET_MAX request * reports the maximum size of the metadata, if its value is 0 then MSXU * metadata is not supported. For more information, see * https://learn.microsoft.com/en-us/windows-hardware/drivers/stream/uvc-extensions-1-5#2229-metadata-control */ ret = uvc_query_ctrl(dev, UVC_GET_MAX, entity->id, dev->intfnum, - MSXU_CONTROL_METADATA, data, sizeof(*data)); + UVC_MSXU_CONTROL_METADATA, data, sizeof(*data)); if (ret || !*data) return 0; /* - * If we can set MSXU_CONTROL_METADATA, the device will report + * If we can set UVC_MSXU_CONTROL_METADATA, the device will report * metadata. */ ret = uvc_query_ctrl(dev, UVC_SET_CUR, entity->id, dev->intfnum, - MSXU_CONTROL_METADATA, data, sizeof(*data)); + UVC_MSXU_CONTROL_METADATA, data, sizeof(*data)); if (!ret) dev->quirks |= UVC_QUIRK_MSXU_META; diff --git a/include/linux/usb/uvc.h b/include/linux/usb/uvc.h index ee19e9f915b8..12a57e1d3467 100644 --- a/include/linux/usb/uvc.h +++ b/include/linux/usb/uvc.h @@ -33,6 +33,23 @@ {0xdc, 0x95, 0x3f, 0x0f, 0x32, 0x26, 0x4e, 0x4c, \ 0x92, 0xc9, 0xa0, 0x47, 0x82, 0xf4, 0x3b, 0xc8} +/* https://learn.microsoft.com/en-us/windows-hardware/drivers/stream/uvc-extensions-1-5#222-extension-unit-controls */ +#define UVC_MSXU_CONTROL_FOCUS 0x01 +#define UVC_MSXU_CONTROL_EXPOSURE 0x02 +#define UVC_MSXU_CONTROL_EVCOMPENSATION 0x03 +#define UVC_MSXU_CONTROL_WHITEBALANCE 0x04 +#define UVC_MSXU_CONTROL_FACE_AUTHENTICATION 0x06 +#define UVC_MSXU_CONTROL_CAMERA_EXTRINSICS 0x07 +#define UVC_MSXU_CONTROL_CAMERA_INTRINSICS 0x08 +#define UVC_MSXU_CONTROL_METADATA 0x09 +#define UVC_MSXU_CONTROL_IR_TORCH 0x0a +#define UVC_MSXU_CONTROL_DIGITALWINDOW 0x0b +#define UVC_MSXU_CONTROL_DIGITALWINDOW_CONFIG 0x0c +#define UVC_MSXU_CONTROL_VIDEO_HDR 0x0d +#define UVC_MSXU_CONTROL_FRAMERATE_THROTTLE 0x0e +#define UVC_MSXU_CONTROL_FIELDOFVIEW2_CONFIG 0x0f +#define UVC_MSXU_CONTROL_FIELDOFVIEW2 0x10 + #define UVC_GUID_FORMAT_MJPEG \ { 'M', 'J', 'P', 'G', 0x00, 0x00, 0x10, 0x00, \ 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -- cgit v1.2.3 From 0f99b8bed426b8f5434b10c3f6f6b92d7ce3c467 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 18 Aug 2025 20:15:39 +0000 Subject: media: uvcvideo: Support UVC_CROSXU_CONTROL_IQ_PROFILE The ChromeOS XU provides a control to change the IQ profile for a camera. It can be switched from VIVID (a.k.a. standard) to NONE (a.k.a. natural). Wire it up to the standard v4l2 control. Signed-off-by: Ricardo Ribalda Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Reviewed-by: Laurent Pinchart Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- drivers/media/usb/uvc/uvc_ctrl.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/usb/uvc.h | 5 +++++ 2 files changed, 39 insertions(+) (limited to 'include') diff --git a/drivers/media/usb/uvc/uvc_ctrl.c b/drivers/media/usb/uvc/uvc_ctrl.c index 277209ee40c1..2905505c240c 100644 --- a/drivers/media/usb/uvc/uvc_ctrl.c +++ b/drivers/media/usb/uvc/uvc_ctrl.c @@ -376,6 +376,15 @@ static const struct uvc_control_info uvc_ctrls[] = { | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_AUTO_UPDATE, }, + { + .entity = UVC_GUID_CHROMEOS_XU, + .selector = UVC_CROSXU_CONTROL_IQ_PROFILE, + .index = 3, + .size = 1, + .flags = UVC_CTRL_FLAG_SET_CUR + | UVC_CTRL_FLAG_GET_RANGE + | UVC_CTRL_FLAG_RESTORE, + }, }; static const u32 uvc_control_classes[] = { @@ -384,6 +393,19 @@ static const u32 uvc_control_classes[] = { }; static const int exposure_auto_mapping[] = { 2, 1, 4, 8 }; +static const int cros_colorfx_mapping[] = { + 1, /* V4L2_COLORFX_NONE */ + -1, /* V4L2_COLORFX_BW */ + -1, /* V4L2_COLORFX_SEPIA */ + -1, /* V4L2_COLORFX_NEGATIVE */ + -1, /* V4L2_COLORFX_EMBOSS */ + -1, /* V4L2_COLORFX_SKETCH */ + -1, /* V4L2_COLORFX_SKY_BLUE */ + -1, /* V4L2_COLORFX_GRASS_GREEN */ + -1, /* V4L2_COLORFX_SKIN_WHITEN */ + 0, /* V4L2_COLORFX_VIVID */ +}; + static bool uvc_ctrl_mapping_is_compound(struct uvc_control_mapping *mapping) { @@ -975,6 +997,18 @@ static const struct uvc_control_mapping uvc_ctrl_mappings[] = { .data_type = UVC_CTRL_DATA_TYPE_BITMASK, .name = "Region of Interest Auto Ctrls", }, + { + .id = V4L2_CID_COLORFX, + .entity = UVC_GUID_CHROMEOS_XU, + .selector = UVC_CROSXU_CONTROL_IQ_PROFILE, + .size = 8, + .offset = 0, + .v4l2_type = V4L2_CTRL_TYPE_MENU, + .data_type = UVC_CTRL_DATA_TYPE_ENUM, + .menu_mapping = cros_colorfx_mapping, + .menu_mask = BIT(V4L2_COLORFX_VIVID) | + BIT(V4L2_COLORFX_NONE), + }, }; /* ------------------------------------------------------------------------ diff --git a/include/linux/usb/uvc.h b/include/linux/usb/uvc.h index 12a57e1d3467..22e0dab0809e 100644 --- a/include/linux/usb/uvc.h +++ b/include/linux/usb/uvc.h @@ -29,6 +29,9 @@ #define UVC_GUID_EXT_GPIO_CONTROLLER \ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03} +#define UVC_GUID_CHROMEOS_XU \ + {0x24, 0xe9, 0xd7, 0x74, 0xc9, 0x49, 0x45, 0x4a, \ + 0x98, 0xa3, 0xc8, 0x07, 0x7e, 0x05, 0x1c, 0xa3} #define UVC_GUID_MSXU_1_5 \ {0xdc, 0x95, 0x3f, 0x0f, 0x32, 0x26, 0x4e, 0x4c, \ 0x92, 0xc9, 0xa0, 0x47, 0x82, 0xf4, 0x3b, 0xc8} @@ -50,6 +53,8 @@ #define UVC_MSXU_CONTROL_FIELDOFVIEW2_CONFIG 0x0f #define UVC_MSXU_CONTROL_FIELDOFVIEW2 0x10 +#define UVC_CROSXU_CONTROL_IQ_PROFILE 0x04 + #define UVC_GUID_FORMAT_MJPEG \ { 'M', 'J', 'P', 'G', 0x00, 0x00, 0x10, 0x00, \ 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -- cgit v1.2.3 From f1880f9cc1476171bec3dae8fd56fff5665e22a4 Mon Sep 17 00:00:00 2001 From: Stéphane Grosjean Date: Fri, 12 Sep 2025 10:17:19 +0200 Subject: can: peak: Modification of references to email accounts being deleted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the upcoming deletion of @peak-system.com accounts and following the acquisition of PEAK-System and its brand by HMS-Networks, this fix aims to migrate all address references to @hms-networks.com, as well as to map my personal committer addresses to author addresses, while taking the opportunity to correct the accent on the first ‘e’ of my first name. Signed-off-by: Stéphane Grosjean Link: https://patch.msgid.link/20250912081820.86314-1-stephane.grosjean@free.fr Signed-off-by: Marc Kleine-Budde --- .mailmap | 2 ++ drivers/net/can/peak_canfd/peak_canfd.c | 4 ++-- drivers/net/can/peak_canfd/peak_canfd_user.h | 4 ++-- drivers/net/can/peak_canfd/peak_pciefd_main.c | 6 +++--- drivers/net/can/sja1000/peak_pci.c | 6 +++--- drivers/net/can/sja1000/peak_pcmcia.c | 8 ++++---- drivers/net/can/usb/peak_usb/pcan_usb.c | 6 +++--- drivers/net/can/usb/peak_usb/pcan_usb_core.c | 6 +++--- drivers/net/can/usb/peak_usb/pcan_usb_core.h | 4 ++-- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 3 ++- drivers/net/can/usb/peak_usb/pcan_usb_pro.c | 4 ++-- drivers/net/can/usb/peak_usb/pcan_usb_pro.h | 4 ++-- include/linux/can/dev/peak_canfd.h | 4 ++-- 13 files changed, 32 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/.mailmap b/.mailmap index 4a5f2c8deeff..c393045d9361 100644 --- a/.mailmap +++ b/.mailmap @@ -740,6 +740,8 @@ Sriram Yagnaraman Stanislav Fomichev Stanislav Fomichev Stefan Wahren +Stéphane Grosjean +Stéphane Grosjean Stéphane Witzmann Stephen Hemminger Stephen Hemminger diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index 77292afaed22..b5bc80ac7876 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2007, 2011 Wolfgang Grandegger - * Copyright (C) 2012 Stephane Grosjean * - * Copyright (C) 2016 PEAK System-Technik GmbH + * Copyright (C) 2016-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include diff --git a/drivers/net/can/peak_canfd/peak_canfd_user.h b/drivers/net/can/peak_canfd/peak_canfd_user.h index a72719dc3b74..60c6542028cf 100644 --- a/drivers/net/can/peak_canfd/peak_canfd_user.h +++ b/drivers/net/can/peak_canfd/peak_canfd_user.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* CAN driver for PEAK System micro-CAN based adapters * - * Copyright (C) 2003-2011 PEAK System-Technik GmbH - * Copyright (C) 2011-2013 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #ifndef PEAK_CANFD_USER_H #define PEAK_CANFD_USER_H diff --git a/drivers/net/can/peak_canfd/peak_pciefd_main.c b/drivers/net/can/peak_canfd/peak_pciefd_main.c index 1df3c4b54f03..93558e33bc02 100644 --- a/drivers/net/can/peak_canfd/peak_pciefd_main.c +++ b/drivers/net/can/peak_canfd/peak_pciefd_main.c @@ -1,10 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2007, 2011 Wolfgang Grandegger - * Copyright (C) 2012 Stephane Grosjean * * Derived from the PCAN project file driver/src/pcan_pci.c: * - * Copyright (C) 2001-2006 PEAK System-Technik GmbH + * Copyright (C) 2001-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include @@ -19,7 +19,7 @@ #include "peak_canfd_user.h" -MODULE_AUTHOR("Stephane Grosjean "); +MODULE_AUTHOR("Stéphane Grosjean "); MODULE_DESCRIPTION("Socket-CAN driver for PEAK PCAN PCIe/M.2 FD family cards"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/can/sja1000/peak_pci.c b/drivers/net/can/sja1000/peak_pci.c index da396d641e24..10d88cbda465 100644 --- a/drivers/net/can/sja1000/peak_pci.c +++ b/drivers/net/can/sja1000/peak_pci.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007, 2011 Wolfgang Grandegger - * Copyright (C) 2012 Stephane Grosjean * * Derived from the PCAN project file driver/src/pcan_pci.c: * - * Copyright (C) 2001-2006 PEAK System-Technik GmbH + * Copyright (C) 2001-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include @@ -22,7 +22,7 @@ #include "sja1000.h" -MODULE_AUTHOR("Stephane Grosjean "); +MODULE_AUTHOR("Stéphane Grosjean "); MODULE_DESCRIPTION("Socket-CAN driver for PEAK PCAN PCI family cards"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/can/sja1000/peak_pcmcia.c b/drivers/net/can/sja1000/peak_pcmcia.c index ce18e9e56059..e1610b527d13 100644 --- a/drivers/net/can/sja1000/peak_pcmcia.c +++ b/drivers/net/can/sja1000/peak_pcmcia.c @@ -1,10 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2010-2012 Stephane Grosjean - * * CAN driver for PEAK-System PCAN-PC Card * Derived from the PCAN project file driver/src/pcan_pccard.c - * Copyright (C) 2006-2010 PEAK System-Technik GmbH + * + * Copyright (C) 2006-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include #include @@ -19,7 +19,7 @@ #include #include "sja1000.h" -MODULE_AUTHOR("Stephane Grosjean "); +MODULE_AUTHOR("Stéphane Grosjean "); MODULE_DESCRIPTION("CAN driver for PEAK-System PCAN-PC Cards"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c index 6b293a9056c2..9278a1522aae 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb.c @@ -3,8 +3,8 @@ * CAN driver for PEAK System PCAN-USB adapter * Derived from the PCAN project file driver/src/pcan_usb.c * - * Copyright (C) 2003-2010 PEAK System-Technik GmbH - * Copyright (C) 2011-2012 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean * * Many thanks to Klaus Hitschler */ @@ -919,7 +919,7 @@ static int pcan_usb_init(struct peak_usb_device *dev) CAN_CTRLMODE_LOOPBACK; } else { dev_info(dev->netdev->dev.parent, - "Firmware update available. Please contact support@peak-system.com\n"); + "Firmware update available. Please contact support.peak@hms-networks.com\n"); } return 0; diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c index 117637b9b995..decbf5ed10bd 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c @@ -3,8 +3,8 @@ * CAN driver for PEAK System USB adapters * Derived from the PCAN project file driver/src/pcan_usb_core.c * - * Copyright (C) 2003-2010 PEAK System-Technik GmbH - * Copyright (C) 2010-2012 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean * * Many thanks to Klaus Hitschler */ @@ -24,7 +24,7 @@ #include "pcan_usb_core.h" -MODULE_AUTHOR("Stephane Grosjean "); +MODULE_AUTHOR("Stéphane Grosjean "); MODULE_DESCRIPTION("CAN driver for PEAK-System USB adapters"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.h b/drivers/net/can/usb/peak_usb/pcan_usb_core.h index abab00930b9d..d1c1897d47b9 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.h +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.h @@ -3,8 +3,8 @@ * CAN driver for PEAK System USB adapters * Derived from the PCAN project file driver/src/pcan_usb_core.c * - * Copyright (C) 2003-2010 PEAK System-Technik GmbH - * Copyright (C) 2010-2012 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean * * Many thanks to Klaus Hitschler */ diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index ebefc274b50a..be84191cde56 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -2,7 +2,8 @@ /* * CAN driver for PEAK System PCAN-USB FD / PCAN-USB Pro FD adapter * - * Copyright (C) 2013-2014 Stephane Grosjean + * Copyright (C) 2013-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include #include diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_pro.c b/drivers/net/can/usb/peak_usb/pcan_usb_pro.c index f736196383ac..7be286293b1a 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_pro.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_pro.c @@ -3,8 +3,8 @@ * CAN driver for PEAK System PCAN-USB Pro adapter * Derived from the PCAN project file driver/src/pcan_usbpro.c * - * Copyright (C) 2003-2011 PEAK System-Technik GmbH - * Copyright (C) 2011-2012 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include #include diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_pro.h b/drivers/net/can/usb/peak_usb/pcan_usb_pro.h index 28e740af905d..162c7546d3a8 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_pro.h +++ b/drivers/net/can/usb/peak_usb/pcan_usb_pro.h @@ -3,8 +3,8 @@ * CAN driver for PEAK System PCAN-USB Pro adapter * Derived from the PCAN project file driver/src/pcan_usbpro_fw.h * - * Copyright (C) 2003-2011 PEAK System-Technik GmbH - * Copyright (C) 2011-2012 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #ifndef PCAN_USB_PRO_H #define PCAN_USB_PRO_H diff --git a/include/linux/can/dev/peak_canfd.h b/include/linux/can/dev/peak_canfd.h index f38772fd0c07..d3788a3d0942 100644 --- a/include/linux/can/dev/peak_canfd.h +++ b/include/linux/can/dev/peak_canfd.h @@ -2,8 +2,8 @@ /* * CAN driver for PEAK System micro-CAN based adapters * - * Copyright (C) 2003-2011 PEAK System-Technik GmbH - * Copyright (C) 2011-2013 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #ifndef PUCAN_H #define PUCAN_H -- cgit v1.2.3 From 6eb350a2233100a283f882c023e5ad426d0ed63b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Aug 2025 17:02:30 +0200 Subject: rseq: Protect event mask against membarrier IPI rseq_need_restart() reads and clears task::rseq_event_mask with preemption disabled to guard against the scheduler. But membarrier() uses an IPI and sets the PREEMPT bit in the event mask from the IPI, which leaves that RMW operation unprotected. Use guard(irq) if CONFIG_MEMBARRIER is enabled to fix that. Fixes: 2a36ab717e8f ("rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ") Signed-off-by: Thomas Gleixner Reviewed-by: Boqun Feng Reviewed-by: Mathieu Desnoyers Cc: stable@vger.kernel.org --- include/linux/rseq.h | 11 ++++++++--- kernel/rseq.c | 10 +++++----- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bc8af3eb5598..1fbeb61babeb 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,6 +7,12 @@ #include #include +#ifdef CONFIG_MEMBARRIER +# define RSEQ_EVENT_GUARD irq +#else +# define RSEQ_EVENT_GUARD preempt +#endif + /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. @@ -41,9 +47,8 @@ static inline void rseq_handle_notify_resume(struct ksignal *ksig, static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) + __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); rseq_handle_notify_resume(ksig, regs); } diff --git a/kernel/rseq.c b/kernel/rseq.c index b7a1ec327e81..2452b7366b00 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -342,12 +342,12 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) /* * Load and clear event mask atomically with respect to - * scheduler preemption. + * scheduler preemption and membarrier IPIs. */ - preempt_disable(); - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) { + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + } return !!event_mask; } -- cgit v1.2.3 From 2da6de30e60dd9bb14600eff1cc99df2fa2ddae3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:23:15 -0700 Subject: mm: folio_may_be_lru_cached() unless folio_test_large() mm/swap.c and mm/mlock.c agree to drain any per-CPU batch as soon as a large folio is added: so collect_longterm_unpinnable_folios() just wastes effort when calling lru_add_drain[_all]() on a large folio. But although there is good reason not to batch up PMD-sized folios, we might well benefit from batching a small number of low-order mTHPs (though unclear how that "small number" limitation will be implemented). So ask if folio_may_be_lru_cached() rather than !folio_test_large(), to insulate those particular checks from future change. Name preferred to "folio_is_batchable" because large folios can well be put on a batch: it's just the per-CPU LRU caches, drained much later, which need care. Marked for stable, to counter the increase in lru_add_drain_all()s from "mm/gup: check ref_count instead of lru before migration". Link: https://lkml.kernel.org/r/57d2eaf8-3607-f318-e0c5-be02dce61ad0@google.com Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Signed-off-by: Hugh Dickins Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 ++++++++++ mm/gup.c | 4 ++-- mm/mlock.c | 6 +++--- mm/swap.c | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe6ed2cc3fd..7012a0f758d8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -385,6 +385,16 @@ void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); +static inline bool folio_may_be_lru_cached(struct folio *folio) +{ + /* + * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. + * Holding small numbers of low-order mTHP folios in per-CPU LRU cache + * will be sensible, but nobody has implemented and tested that yet. + */ + return !folio_test_large(folio); +} + extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) diff --git a/mm/gup.c b/mm/gup.c index b47066a54f52..0bc4d140fc07 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2307,13 +2307,13 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (drained == 0 && + if (drained == 0 && folio_may_be_lru_cached(folio) && folio_ref_count(folio) != folio_expected_ref_count(folio) + 1) { lru_add_drain(); drained = 1; } - if (drained == 1 && + if (drained == 1 && folio_may_be_lru_cached(folio) && folio_ref_count(folio) != folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); diff --git a/mm/mlock.c b/mm/mlock.c index a1d93ad33c6d..bb0776f5ef7c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio) */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } diff --git a/mm/swap.c b/mm/swap.c index 6ae2d5680574..b74ebe865dd9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -192,7 +192,7 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, local_lock(&cpu_fbatches.lock); if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) -- cgit v1.2.3 From e6a0deb6fa5b0fc134ee2aa127d1cfc9456d8445 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 13:15:12 -0700 Subject: mm/damon/core: introduce damon_call_control->dealloc_on_cancel Patch series "mm/damon/sysfs: fix refresh_ms control overwriting on multi-kdamonds usages". Automatic esssential DAMON/DAMOS status update feature of DAMON sysfs interface (refresh_ms) is broken [1] for multiple DAMON contexts (kdamonds) use case, since it uses a global single damon_call_control object for all created DAMON contexts. The fields of the object, particularly the list field is over-written for the contexts and it makes unexpected results including user-space hangup and kernel crashes [2]. Fix it by extending damon_call_control for the use case and updating the usage on DAMON sysfs interface to use per-context dynamically allocated damon_call_control object. This patch (of 2): When damon_call_control->repeat is set, damon_call() is executed asynchronously, and is eventually canceled when kdamond finishes. If the damon_call_control object is dynamically allocated, finding the place to deallocate the object is difficult. Introduce a new damon_call_control field, namely dealloc_on_cancel, to ask the kdamond deallocates those dynamically allocated objects when those are canceled. Link: https://lkml.kernel.org/r/20250908201513.60802-3-sj@kernel.org Link: https://lkml.kernel.org/r/20250908201513.60802-2-sj@kernel.org Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work") Signed-off-by: SeongJae Park Cc: Yunjeong Mun Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index f13664c62ddd..9e62b2a85538 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -636,6 +636,7 @@ struct damon_operations { * @data: Data that will be passed to @fn. * @repeat: Repeat invocations. * @return_code: Return code from @fn invocation. + * @dealloc_on_cancel: De-allocate when canceled. * * Control damon_call(), which requests specific kdamond to invoke a given * function. Refer to damon_call() for more details. @@ -645,6 +646,7 @@ struct damon_call_control { void *data; bool repeat; int return_code; + bool dealloc_on_cancel; /* private: internal use only */ /* informs if the kdamond finished handling of the request */ struct completion completion; diff --git a/mm/damon/core.c b/mm/damon/core.c index c2e0b469fd43..08065b363972 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2479,10 +2479,14 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) mutex_lock(&ctx->call_controls_lock); list_del(&control->list); mutex_unlock(&ctx->call_controls_lock); - if (!control->repeat) + if (!control->repeat) { complete(&control->completion); - else + } else if (control->canceled && control->dealloc_on_cancel) { + kfree(control); + continue; + } else { list_add(&control->list, &repeat_controls); + } } control = list_first_entry_or_null(&repeat_controls, struct damon_call_control, list); -- cgit v1.2.3 From 56b060d0a1d3b1fd0429daeac366f00c030fca59 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 5 Aug 2025 13:50:47 -0700 Subject: mempolicy: clarify what zone reclaim means The zone_reclaim_mode API controls the reclaim behavior when a node runs out of memory. Contrary to its user-facing name, it is internally referred to as "node_reclaim_mode". This can be confusing. But because we cannot change the name of the API since it has been in place since at least 2.6, let's try to be more explicit about what the behavior of this API is. Change the description to clarify what zone reclaim entails, and be explicit about the RECLAIM_ZONE bit, whose purpose has led to some confusion in the past already [1] [2]. While at it, also soften the warning about changing these bits. [joshua.hahnjy@gmail.com: remove the reference to the vm.zone_reclaim_mode sysctl as an ABI] Link: https://lkml.kernel.org/r/20250806134404.2000234-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20250805205048.1518453-1-joshua.hahnjy@gmail.com Link: https://lore.kernel.org/linux-mm/1579005573-58923-1-git-send-email-alex.shi@linux.alibaba.com/ [1] Link: https://lore.kernel.org/linux-mm/20200626003459.D8E015CA@viggo.jf.intel.com/ [2] Signed-off-by: Joshua Hahn Acked-by: SeongJae Park Acked-by: David Hildenbrand Reviewed-by: Huang Ying Acked-by: Zi Yan Acked-by: Byungchul Park Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: Mathew Brost Cc: Rakie Kim Signed-off-by: Andrew Morton --- include/uapi/linux/mempolicy.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 1f9bb10d1a47..8fbbe613611a 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -66,10 +66,16 @@ enum { #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ /* - * These bit locations are exposed in the vm.zone_reclaim_mode sysctl - * ABI. New bits are OK, but existing bits can never change. + * Enabling zone reclaim means the page allocator will attempt to fulfill + * the allocation request on the current node by triggering reclaim and + * trying to shrink the current node. + * Fallback allocations on the next candidates in the zonelist are considered + * when reclaim fails to free up enough memory in the current node/zone. + * + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl. + * New bits are OK, but existing bits should not be changed. */ -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_ZONE (1<<0) /* Enable zone reclaim */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ -- cgit v1.2.3 From 337135e6124b6d37d7ef1cd5a6c0b9681938c5ee Mon Sep 17 00:00:00 2001 From: Ruan Shiyang Date: Tue, 29 Jul 2025 11:51:01 +0800 Subject: mm: memory-tiering: fix PGPROMOTE_CANDIDATE counting Goto-san reported confusing pgpromote statistics where the pgpromote_success count significantly exceeded pgpromote_candidate. On a system with three nodes (nodes 0-1: DRAM 4GB, node 2: NVDIMM 4GB): # Enable demotion only echo 1 > /sys/kernel/mm/numa/demotion_enabled numactl -m 0-1 memhog -r200 3500M >/dev/null & pid=$! sleep 2 numactl memhog -r100 2500M >/dev/null & sleep 10 kill -9 $pid # terminate the 1st memhog # Enable promotion echo 2 > /proc/sys/kernel/numa_balancing After a few seconds, we observeed `pgpromote_candidate < pgpromote_success` $ grep -e pgpromote /proc/vmstat pgpromote_success 2579 pgpromote_candidate 0 In this scenario, after terminating the first memhog, the conditions for pgdat_free_space_enough() are quickly met, and triggers promotion. However, these migrated pages are only counted for in PGPROMOTE_SUCCESS, not in PGPROMOTE_CANDIDATE. To solve these confusing statistics, introduce PGPROMOTE_CANDIDATE_NRL to count the missed promotion pages. And also, not counting these pages into PGPROMOTE_CANDIDATE is to avoid changing the existing algorithm or performance of the promotion rate limit. Link: https://lkml.kernel.org/r/20250901090122.124262-1-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/20250729035101.1601407-1-ruansy.fnst@fujitsu.com Fixes: c6833e10008f ("memory tiering: rate limit NUMA migration throughput") Co-developed-by: Li Zhijian Signed-off-by: Li Zhijian Signed-off-by: Ruan Shiyang Reported-by: Yasunori Gotou (Fujitsu) Suggested-by: Huang Ying Acked-by: Vlastimil Babka Reviewed-by: Huang Ying Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Valentin Schneider Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 16 +++++++++++++++- kernel/sched/fair.c | 5 +++-- mm/vmstat.c | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0c5da9141983..9d3ea9085556 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -234,7 +234,21 @@ enum node_stat_item { #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ - PGPROMOTE_CANDIDATE, /* candidate pages to promote */ + /** + * Candidate pages for promotion based on hint fault latency. This + * counter is used to control the promotion rate and adjust the hot + * threshold. + */ + PGPROMOTE_CANDIDATE, + /** + * Not rate-limited (NRL) candidate pages for those can be promoted + * without considering hot threshold because of enough free pages in + * fast-tier node. These promotions bypass the regular hotness checks + * and do NOT influence the promotion rate-limiter or + * threshold-adjustment logic. + * This is for statistics/monitoring purposes. + */ + PGPROMOTE_CANDIDATE_NRL, #endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..82c8d804c54c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1923,11 +1923,13 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, struct pglist_data *pgdat; unsigned long rate_limit; unsigned int latency, th, def_th; + long nr = folio_nr_pages(folio); pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) { /* workload changed, reset hot threshold */ pgdat->nbp_threshold = 0; + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr); return true; } @@ -1941,8 +1943,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, if (latency >= th) return false; - return !numa_promotion_rate_limit(pgdat, rate_limit, - folio_nr_pages(folio)); + return !numa_promotion_rate_limit(pgdat, rate_limit, nr); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); diff --git a/mm/vmstat.c b/mm/vmstat.c index 71cd1ceba191..e74f0b2a1021 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1280,6 +1280,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_NUMA_BALANCING [I(PGPROMOTE_SUCCESS)] = "pgpromote_success", [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate", + [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl", #endif [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd", [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", -- cgit v1.2.3 From 79e1c24285c40cdfa9eb00fe8131d1ba14b84ef1 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Fri, 18 Jul 2025 10:41:32 +0800 Subject: mm: replace (20 - PAGE_SHIFT) with common macros for pages<->MB conversion Replace repeated (20 - PAGE_SHIFT) calculations with standard macros: - MB_TO_PAGES(mb) converts MB to page count - PAGES_TO_MB(pages) converts pages to MB No functional change. [akpm@linux-foundation.org: remove arc's private PAGES_TO_MB, remove its unused PAGES_TO_KB] [akpm@linux-foundation.org: don't include mm.h due to include file ordering mess] Link: https://lkml.kernel.org/r/20250718024134.1304745-1-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Dev Jain Acked-by: David Hildenbrand Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Dietmar Eggemann Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Josh Triplett Cc: Juri Lelli Cc: Kairui Song Cc: Kemeng Shi Cc: Lai jiangshan Cc: Liam Howlett Cc: Mariano Pache Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Neeraj Upadhyay Cc: Nhat Pham Cc: "Paul E . McKenney" Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/arc/include/asm/arcregs.h | 3 --- include/linux/mm.h | 9 +++++++++ kernel/rcu/rcuscale.c | 2 +- kernel/sched/fair.c | 5 ++--- mm/backing-dev.c | 2 +- mm/huge_memory.c | 2 +- mm/swap.c | 2 +- 7 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index a31bbf5c8bbc..d84908a177bd 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -151,9 +151,6 @@ /* Helpers */ #define TO_KB(bytes) ((bytes) >> 10) #define TO_MB(bytes) (TO_KB(bytes) >> 10) -#define PAGES_TO_KB(n_pages) ((n_pages) << (PAGE_SHIFT - 10)) -#define PAGES_TO_MB(n_pages) (PAGES_TO_KB(n_pages) >> 10) - /* *************************************************************** diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ae97a0b8ec7..b626d1bacef5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -69,6 +69,15 @@ static inline void totalram_pages_add(long count) extern void * high_memory; +/* + * Convert between pages and MB + * 20 is the shift for 1MB (2^20 = 1MB) + * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages) + * So (20 - PAGE_SHIFT) converts between pages and MB + */ +#define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT)) +#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index b521d0455992..7484d8ad5767 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -796,7 +796,7 @@ kfree_scale_thread(void *arg) pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), - (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + PAGES_TO_MB(mem_begin - mem_during)); if (shutdown) { smp_mb(); /* Assign before wake. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82c8d804c54c..e256793b9a08 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1495,7 +1495,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) * by the PTE scanner and NUMA hinting faults should be trapped based * on resident pages */ - nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); + nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size); rss = get_mm_rss(p->mm); if (!rss) rss = nr_scan_pages; @@ -1934,8 +1934,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, } def_th = sysctl_numa_balancing_hot_threshold; - rate_limit = sysctl_numa_balancing_promote_rate_limit << \ - (20 - PAGE_SHIFT); + rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit); numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); th = pgdat->nbp_threshold ? : def_th; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..e4d578e6121c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -510,7 +510,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work) /* * Initial write bandwidth: 100 MB/s */ -#define INIT_BW (100 << (20 - PAGE_SHIFT)) +#define INIT_BW MB_TO_PAGES(100) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9c38a95e9f09..2b4ea5a2ce7d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -911,7 +911,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < MB_TO_PAGES(512)) { transparent_hugepage_flags = 0; return 0; } diff --git a/mm/swap.c b/mm/swap.c index 3632dd061beb..cb164f9ef9e3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1096,7 +1096,7 @@ static const struct ctl_table swap_sysctl_table[] = { */ void __init swap_setup(void) { - unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); + unsigned long megs = PAGES_TO_MB(totalram_pages()); /* Use a smaller cluster for small-memory machines */ if (megs < 16) -- cgit v1.2.3 From cc483b328881bbccb55265a86731384d5176fe85 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 4 Aug 2025 16:33:48 -0700 Subject: mm: limit the scope of vma_start_read() Limit the scope of vma_start_read() as it is used only as a helper for higher-level locking functions implemented inside mmap_lock.c and we are about to introduce more complex RCU rules for this function. The change is pure code refactoring and has no functional changes. Link: https://lkml.kernel.org/r/20250804233349.1278678-1-surenb@google.com Suggested-by: Vlastimil Babka Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 85 ----------------------------------------------- mm/mmap_lock.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 85 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 11a078de9150..2c9fffa58714 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -147,91 +147,6 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) } } -/* - * Try to read-lock a vma. The function is allowed to occasionally yield false - * locked result to avoid performance overhead, in which case we fall back to - * using mmap_lock. The function should never yield false unlocked result. - * False locked result is possible if mm_lock_seq overflows or if vma gets - * reused and attached to a different mm before we lock it. - * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got - * detached. - * - * WARNING! The vma passed to this function cannot be used if the function - * fails to lock it because in certain cases RCU lock is dropped and then - * reacquired. Once RCU lock is dropped the vma can be concurently freed. - */ -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - int oldcnt; - - /* - * Check before locking. A race might cause false locked result. - * We can use READ_ONCE() for the mm_lock_seq here, and don't need - * ACQUIRE semantics, because this is just a lockless check whose result - * we don't rely on for anything - the mm_lock_seq read against which we - * need ordering is below. - */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) - return NULL; - - /* - * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() - * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. - * Acquire fence is required here to avoid reordering against later - * vm_lock_seq check and checks inside lock_vma_under_rcu(). - */ - if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) { - /* return EAGAIN if vma got detached from under us */ - return oldcnt ? NULL : ERR_PTR(-EAGAIN); - } - - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); - - /* - * If vma got attached to another mm from under us, that mm is not - * stable and can be freed in the narrow window after vma->vm_refcnt - * is dropped and before rcuwait_wake_up(mm) is called. Grab it before - * releasing vma->vm_refcnt. - */ - if (unlikely(vma->vm_mm != mm)) { - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ - struct mm_struct *other_mm = vma->vm_mm; - - /* - * __mmdrop() is a heavy operation and we don't need RCU - * protection here. Release RCU lock during these operations. - * We reinstate the RCU read lock as the caller expects it to - * be held when this function returns even on error. - */ - rcu_read_unlock(); - mmgrab(other_mm); - vma_refcount_put(vma); - mmdrop(other_mm); - rcu_read_lock(); - return NULL; - } - - /* - * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. - * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq - * modification invalidates all existing locks. - * - * We must use ACQUIRE semantics for the mm_lock_seq so that if we are - * racing with vma_end_write_all(), we only start reading from the VMA - * after it has been unlocked. - * This pairs with RELEASE semantics in vma_end_write_all(). - */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { - vma_refcount_put(vma); - return NULL; - } - - return vma; -} - /* * Use only while holding mmap read lock which guarantees that locking will not * fail (nobody can concurrently write-lock the vma). vma_start_read() should diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index b006cec8e6fe..10826f347a9f 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -127,6 +127,91 @@ void vma_mark_detached(struct vm_area_struct *vma) } } +/* + * Try to read-lock a vma. The function is allowed to occasionally yield false + * locked result to avoid performance overhead, in which case we fall back to + * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. + * + * WARNING! The vma passed to this function cannot be used if the function + * fails to lock it because in certain cases RCU lock is dropped and then + * reacquired. Once RCU lock is dropped the vma can be concurently freed. + */ +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + int oldcnt; + + /* + * Check before locking. A race might cause false locked result. + * We can use READ_ONCE() for the mm_lock_seq here, and don't need + * ACQUIRE semantics, because this is just a lockless check whose result + * we don't rely on for anything - the mm_lock_seq read against which we + * need ordering is below. + */ + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) + return NULL; + + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + return oldcnt ? NULL : ERR_PTR(-EAGAIN); + } + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + + /* + * If vma got attached to another mm from under us, that mm is not + * stable and can be freed in the narrow window after vma->vm_refcnt + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before + * releasing vma->vm_refcnt. + */ + if (unlikely(vma->vm_mm != mm)) { + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *other_mm = vma->vm_mm; + + /* + * __mmdrop() is a heavy operation and we don't need RCU + * protection here. Release RCU lock during these operations. + * We reinstate the RCU read lock as the caller expects it to + * be held when this function returns even on error. + */ + rcu_read_unlock(); + mmgrab(other_mm); + vma_refcount_put(vma); + mmdrop(other_mm); + rcu_read_lock(); + return NULL; + } + + /* + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. + * False unlocked result is impossible because we modify and check + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq + * modification invalidates all existing locks. + * + * We must use ACQUIRE semantics for the mm_lock_seq so that if we are + * racing with vma_end_write_all(), we only start reading from the VMA + * after it has been unlocked. + * This pairs with RELEASE semantics in vma_end_write_all(). + */ + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { + vma_refcount_put(vma); + return NULL; + } + + return vma; +} + /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the -- cgit v1.2.3 From 913fff314547c1922002e655bb25199ee38e8825 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 7 Aug 2025 00:17:47 +0800 Subject: mm, swap: remove fragment clusters counter It was used for calculating the iteration number when the swap allocator wants to scan the whole fragment list. Now the allocator only scans one fragment cluster at a time, so no one uses this counter anymore. Remove it as a cleanup; the performance change is marginal: Build linux kernel using 10G ZRAM, make -j96, defconfig with 2G cgroup memory limit, on top of tmpfs, 64kB mTHP enabled: Before: sys time: 6278.45s After: sys time: 6176.34s Change to 8G ZRAM: Before: sys time: 5572.85s After: sys time: 5531.49s Link: https://lkml.kernel.org/r/20250806161748.76651-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Nhat Pham Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - mm/swapfile.c | 7 ------- 2 files changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe6ed2cc3fd..a060d102e0d1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -310,7 +310,6 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f1110e37f68..5fdb3cb2b8b7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -470,11 +470,6 @@ static void move_cluster(struct swap_info_struct *si, else list_move_tail(&ci->list, list); spin_unlock(&si->lock); - - if (ci->flags == CLUSTER_FLAG_FRAG) - atomic_long_dec(&si->frag_cluster_nr[ci->order]); - else if (new_flags == CLUSTER_FLAG_FRAG) - atomic_long_inc(&si->frag_cluster_nr[ci->order]); ci->flags = new_flags; } @@ -965,7 +960,6 @@ new_cluster: * allocation, but reclaim may drop si->lock and race with another user. */ while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { - atomic_long_dec(&si->frag_cluster_nr[o]); found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), 0, usage); if (found) @@ -3217,7 +3211,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - atomic_long_set(&si->frag_cluster_nr[i], 0); } /* -- cgit v1.2.3 From 61dc4358d37ae0be3220a0fa32cf7f0ccd4f7636 Mon Sep 17 00:00:00 2001 From: "Adrian Huang (Lenovo)" Date: Wed, 6 Aug 2025 22:59:06 +0800 Subject: mm: correct misleading comment on mmap_lock field in mm_struct The comment previously described the offset of mmap_lock as 0x120 (hex), which is misleading. The correct offset is 56 bytes (decimal) from the last cache line boundary. Using '0x120' could confuse readers trying to understand why the count and owner fields reside in separate cachelines. This change also removes an unnecessary space for improved formatting. Link: https://lkml.kernel.org/r/20250806145906.24647-1-adrianhuang0701@gmail.com Signed-off-by: Adrian Huang (Lenovo) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db93..3ed763e7ec6f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1026,10 +1026,10 @@ struct mm_struct { * counters */ /* - * With some kernel config, the current mmap_lock's offset - * inside 'mm_struct' is at 0x120, which is very optimal, as + * Typically the current mmap_lock's offset is 56 bytes from + * the last cacheline boundary, which is very optimal, as * its two hot fields 'count' and 'owner' sit in 2 different - * cachelines, and when mmap_lock is highly contended, both + * cachelines, and when mmap_lock is highly contended, both * of the 2 fields will be accessed frequently, current layout * will help to reduce cache bouncing. * -- cgit v1.2.3 From 4c5d3365882dbbc0784688784904f440d7a4c0f1 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Wed, 6 Aug 2025 14:41:08 +0200 Subject: mm/vmalloc: allow to set node and align in vrealloc Patch series "support large align and nid in Rust allocators", v15. The series provides the ability for Rust allocators to set NUMA node and large alignment. This patch (of 4): Reimplement vrealloc() to be able to set node and alignment should a user need to do so. Rename the function to vrealloc_node_align() to better match what it actually does now and introduce macros for vrealloc() and friends for backward compatibility. With that change we also provide the ability for the Rust part of the kernel to set node and alignment in its allocations. Link: https://lkml.kernel.org/r/20250806124034.1724515-1-vitaly.wool@konsulko.se Link: https://lkml.kernel.org/r/20250806124108.1724561-1-vitaly.wool@konsulko.se Signed-off-by: Vitaly Wool Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Vlastimil Babka Cc: Alice Ryhl Cc: Danilo Krummrich Cc: Herbert Xu Cc: Jann Horn Cc: Kent Overstreet Cc: Liam Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 12 +++++++++--- mm/nommu.c | 3 ++- mm/vmalloc.c | 29 ++++++++++++++++++++++++----- 3 files changed, 35 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 2759dac6be44..eb54b7b3202f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -197,9 +197,15 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1 extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) -void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags) - __realloc_size(2); -#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) +void *__must_check vrealloc_node_align_noprof(const void *p, size_t size, + unsigned long align, gfp_t flags, int nid) __realloc_size(2); +#define vrealloc_node_noprof(_p, _s, _f, _nid) \ + vrealloc_node_align_noprof(_p, _s, 1, _f, _nid) +#define vrealloc_noprof(_p, _s, _f) \ + vrealloc_node_align_noprof(_p, _s, 1, _f, NUMA_NO_NODE) +#define vrealloc_node_align(...) alloc_hooks(vrealloc_node_align_noprof(__VA_ARGS__)) +#define vrealloc_node(...) alloc_hooks(vrealloc_node_noprof(__VA_ARGS__)) +#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/mm/nommu.c b/mm/nommu.c index 8b819fafd57b..6fff462f90d3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -119,7 +119,8 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc_noprof); -void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int node) { return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dbcdceecae1..e299b51bd922 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4089,19 +4089,29 @@ void *vzalloc_node_noprof(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node_noprof); /** - * vrealloc - reallocate virtually contiguous memory; contents remain unchanged + * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents + * remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate + * @align: requested alignment * @flags: the flags for the page level allocator + * @nid: node number of the target node + * + * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * - * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and - * @p is not a %NULL pointer, the object pointed to is freed. + * If the caller wants the new memory to be on specific node *only*, + * __GFP_THISNODE flag should be set, otherwise the function will try to avoid + * reallocation and possibly disregard the specified @nid. * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * + * Requesting an alignment that is bigger than the alignment of the existing + * allocation will fail. + * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. * @@ -4111,7 +4121,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); * Return: pointer to the allocated memory; %NULL if @size is zero or in case of * failure */ -void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) { struct vm_struct *vm = NULL; size_t alloced_size = 0; @@ -4135,6 +4146,12 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) if (WARN(alloced_size < old_size, "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) return NULL; + if (WARN(!IS_ALIGNED((unsigned long)p, align), + "will not reallocate with a bigger alignment (0x%lx)\n", align)) + return NULL; + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(vmalloc_to_page(p))) + goto need_realloc; } /* @@ -4165,8 +4182,10 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) return (void *)p; } +need_realloc: /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ - n = __vmalloc_noprof(size, flags); + n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0)); + if (!n) return NULL; -- cgit v1.2.3 From 2cd8231796b5e7133b1c3d66ad7d2a3c42c97258 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Wed, 6 Aug 2025 14:41:47 +0200 Subject: mm/slub: allow to set node and align in k[v]realloc Reimplement k[v]realloc_node() to be able to set node and alignment should a user need to do so. In order to do that while retaining the maximal backward compatibility, add k[v]realloc_node_align() functions and redefine the rest of API using these new ones. While doing that, we also keep the number of _noprof variants to a minimum, which implies some changes to the existing users of older _noprof functions, that basically being bcachefs. With that change we also provide the ability for the Rust part of the kernel to set node and alignment in its K[v]xxx [re]allocations. Link: https://lkml.kernel.org/r/20250806124147.1724658-1-vitaly.wool@konsulko.se Signed-off-by: Vitaly Wool Reviewed-by: Vlastimil Babka Cc: Alice Ryhl Cc: Danilo Krummrich Cc: Herbert Xu Cc: Jann Horn Cc: Kent Overstreet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- fs/bcachefs/darray.c | 2 +- fs/bcachefs/util.h | 2 +- include/linux/bpfptr.h | 2 +- include/linux/slab.h | 39 ++++++++++++++++++++------------- lib/rhashtable.c | 4 ++-- mm/slub.c | 59 ++++++++++++++++++++++++++++++++++++++------------ 6 files changed, 74 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c index e86d36d23e9e..928e83a1ce42 100644 --- a/fs/bcachefs/darray.c +++ b/fs/bcachefs/darray.c @@ -21,7 +21,7 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_ return -ENOMEM; void *data = likely(bytes < INT_MAX) - ? kvmalloc_noprof(bytes, gfp) + ? kvmalloc_node_align_noprof(bytes, 1, gfp, NUMA_NO_NODE) : vmalloc_noprof(bytes); if (!data) return -ENOMEM; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 6488f098d140..7112fd40ee21 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -61,7 +61,7 @@ static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) { void *p = unlikely(n >= INT_MAX) ? vmalloc_noprof(n) - : kvmalloc_noprof(n, flags & ~__GFP_ZERO); + : kvmalloc_node_align_noprof(n, 1, flags & ~__GFP_ZERO, NUMA_NO_NODE); if (p && (flags & __GFP_ZERO)) memset(p, 0, n); return p; diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h index 1af241525a17..f6e0795db484 100644 --- a/include/linux/bpfptr.h +++ b/include/linux/bpfptr.h @@ -67,7 +67,7 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset, static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len) { - void *p = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN); + void *p = kvmalloc_node_align_noprof(len, 1, GFP_USER | __GFP_NOWARN, NUMA_NO_NODE); if (!p) return ERR_PTR(-ENOMEM); diff --git a/include/linux/slab.h b/include/linux/slab.h index d5a8ab98035c..6dc300bac2a1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -465,9 +465,13 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc_noprof(const void *objp, size_t new_size, - gfp_t flags) __realloc_size(2); -#define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__)) +void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size, + unsigned long align, + gfp_t flags, int nid) __realloc_size(2); +#define krealloc_noprof(_o, _s, _f) krealloc_node_align_noprof(_o, _s, 1, _f, NUMA_NO_NODE) +#define krealloc_node_align(...) alloc_hooks(krealloc_node_align_noprof(__VA_ARGS__)) +#define krealloc_node(_o, _s, _f, _n) krealloc_node_align(_o, _s, 1, _f, _n) +#define krealloc(...) krealloc_node(__VA_ARGS__, NUMA_NO_NODE) void kfree(const void *objp); void kfree_sensitive(const void *objp); @@ -1041,18 +1045,20 @@ static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) #define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) #define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __alloc_size(1); -#define kvmalloc_node_noprof(size, flags, node) \ - __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node) -#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) - -#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) -#define kvmalloc_noprof(_size, _flags) kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) __alloc_size(1); +#define kvmalloc_node_align_noprof(_size, _align, _flags, _node) \ + __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, NULL), _align, _flags, _node) +#define kvmalloc_node_align(...) \ + alloc_hooks(kvmalloc_node_align_noprof(__VA_ARGS__)) +#define kvmalloc_node(_s, _f, _n) kvmalloc_node_align(_s, 1, _f, _n) +#define kvmalloc(...) kvmalloc_node(__VA_ARGS__, NUMA_NO_NODE) #define kvzalloc(_size, _flags) kvmalloc(_size, (_flags)|__GFP_ZERO) #define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node) + #define kmem_buckets_valloc(_b, _size, _flags) \ - alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) + alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), 1, _flags, NUMA_NO_NODE)) static inline __alloc_size(1, 2) void * kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) @@ -1062,7 +1068,7 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - return kvmalloc_node_noprof(bytes, flags, node); + return kvmalloc_node_align_noprof(bytes, 1, flags, node); } #define kvmalloc_array_noprof(...) kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE) @@ -1073,9 +1079,12 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) #define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__)) #define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__)) -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) - __realloc_size(2); -#define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) __realloc_size(2); +#define kvrealloc_node_align(...) \ + alloc_hooks(kvrealloc_node_align_noprof(__VA_ARGS__)) +#define kvrealloc_node(_p, _s, _f, _n) kvrealloc_node_align(_p, _s, 1, _f, _n) +#define kvrealloc(...) kvrealloc_node(__VA_ARGS__, NUMA_NO_NODE) extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 3e555d012ed6..fde0f0e556f8 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -184,8 +184,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, static struct lock_class_key __key; tbl = alloc_hooks_tag(ht->alloc_tag, - kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), - gfp|__GFP_ZERO, NUMA_NO_NODE)); + kvmalloc_node_align_noprof(struct_size(tbl, buckets, nbuckets), + 1, gfp|__GFP_ZERO, NUMA_NO_NODE)); size = nbuckets; diff --git a/mm/slub.c b/mm/slub.c index 30003763d224..8dbeabc6a0f0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4881,7 +4881,7 @@ void kfree(const void *object) EXPORT_SYMBOL(kfree); static __always_inline __realloc_size(2) void * -__do_krealloc(const void *p, size_t new_size, gfp_t flags) +__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid) { void *ret; size_t ks = 0; @@ -4895,6 +4895,16 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) if (!kasan_check_byte(p)) return NULL; + /* + * If reallocation is not necessary (e. g. the new size is less + * than the current allocated size), the current allocation will be + * preserved unless __GFP_THISNODE is set. In the latter case a new + * allocation on the requested node will be attempted. + */ + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(virt_to_page(p))) + goto alloc_new; + if (is_kfence_address(p)) { ks = orig_size = kfence_ksize(p); } else { @@ -4917,6 +4927,10 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) if (new_size > ks) goto alloc_new; + /* If the old object doesn't satisfy the new alignment, allocate a new one */ + if (!IS_ALIGNED((unsigned long)p, align)) + goto alloc_new; + /* Zero out spare memory. */ if (want_init_on_alloc(flags)) { kasan_disable_current(); @@ -4939,7 +4953,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; alloc_new: - ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); + ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_); if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. */ kasan_disable_current(); @@ -4951,14 +4965,19 @@ alloc_new: } /** - * krealloc - reallocate memory. The contents will remain unchanged. + * krealloc_node_align - reallocate memory. The contents will remain unchanged. * @p: object to reallocate memory for. * @new_size: how many bytes of memory are required. + * @align: desired alignment. * @flags: the type of memory to allocate. + * @nid: NUMA node or NUMA_NO_NODE * * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that @@ -4983,7 +5002,8 @@ alloc_new: * * Return: pointer to the allocated memory or %NULL in case of error */ -void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) +void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align, + gfp_t flags, int nid) { void *ret; @@ -4992,13 +5012,13 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) return ZERO_SIZE_PTR; } - ret = __do_krealloc(p, new_size, flags); + ret = __do_krealloc(p, new_size, align, flags, nid); if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) kfree(p); return ret; } -EXPORT_SYMBOL(krealloc_noprof); +EXPORT_SYMBOL(krealloc_node_align_noprof); static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) { @@ -5029,9 +5049,13 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. * @b: which set of kmalloc buckets to allocate from. + * @align: desired alignment. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. * @node: numa node to allocate from * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * @@ -5041,7 +5065,8 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * * Return: pointer to the allocated memory of %NULL in case of failure */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) { void *ret; @@ -5071,7 +5096,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) * about the resulting pointer, and cannot play * protection games. */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } @@ -5115,14 +5140,19 @@ void kvfree_sensitive(const void *addr, size_t len) EXPORT_SYMBOL(kvfree_sensitive); /** - * kvrealloc - reallocate memory; contents remain unchanged + * kvrealloc_node_align - reallocate memory; contents remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate + * @align: desired alignment * @flags: the flags for the page level allocator + * @nid: NUMA node id * * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 * and @p is not a %NULL pointer, the object pointed to is freed. * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that @@ -5136,17 +5166,18 @@ EXPORT_SYMBOL(kvfree_sensitive); * * Return: pointer to the allocated memory or %NULL in case of error */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) { void *n; if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); + return vrealloc_node_align_noprof(p, size, align, flags, nid); - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid); if (!n) { /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_noprof(size, flags); + n = kvmalloc_node_align_noprof(size, align, flags, nid); if (!n) return NULL; @@ -5162,7 +5193,7 @@ void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) return n; } -EXPORT_SYMBOL(kvrealloc_noprof); +EXPORT_SYMBOL(kvrealloc_node_align_noprof); struct detached_freelist { struct slab *slab; -- cgit v1.2.3 From e6d7d3502e00ed6f86e03dcdb282cb7785e55448 Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Tue, 5 Aug 2025 21:39:40 +0900 Subject: mm/damon: update expired description of damos_action Nowadays, damos operation actions support a greater operation set. But comments (also, generated documentation) weren't updated. So fix the comments with current support status. Link: https://lkml.kernel.org/r/20250805123940.13691-1-ekffu200098@gmail.com Signed-off-by: Sang-Heon Jeon Reviewed-by: SeongJae Park Cc: Honggyu Kim Signed-off-by: Andrew Morton --- include/linux/damon.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index f13664c62ddd..d01bfee80bd6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -110,7 +110,7 @@ struct damon_target { * * @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED. * @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD. - * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. + * @DAMOS_PAGEOUT: Reclaim the region. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. @@ -121,10 +121,10 @@ struct damon_target { * @NR_DAMOS_ACTIONS: Total number of DAMOS actions * * The support of each action is up to running &struct damon_operations. - * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except - * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR - * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum - * DAMOS_LRU_DEPRIO, and &DAMOS_STAT. + * Refer to 'Operation Action' section of Documentation/mm/damon/design.rst for + * status of the supports. + * + * Note that DAMOS_PAGEOUT doesn't trigger demotions. */ enum damos_action { DAMOS_WILLNEED, -- cgit v1.2.3 From 95c2908f1a4fd608b1cdbb5acef3572e5d769e1c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 16:39:47 +0200 Subject: mm/migrate: remove MIGRATEPAGE_UNMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migrate_folio_unmap() is the only user of MIGRATEPAGE_UNMAP. We want to remove MIGRATEPAGE_* completely. It's rather weird to have a generic MIGRATEPAGE_UNMAP, documented to be returned from address-space callbacks, when it's only used for an internal helper. Let's start by having only a single "success" return value for migrate_folio_unmap() -- 0 -- by moving the "folio was already freed" check into the single caller. There is a remaining comment for PG_isolated, which we renamed to PG_movable_ops_isolated recently and forgot to update. While we might still run into that case with zsmalloc, it's something we want to get rid of soon. So let's just focus that optimization on real folios only for now by excluding movable_ops pages. Note that concurrent freeing can happen at any time and this "already freed" check is not relevant for correctness. [david@redhat.com: no need to pass "reason" to migrate_folio_unmap(), per Lance] Link: https://lkml.kernel.org/r/3bb725f8-28d7-4aa2-b75f-af40d5cab280@redhat.com Link: https://lkml.kernel.org/r/20250811143949.1117439-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Lance Yang Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Benjamin LaHaise Cc: Byungchul Park Cc: Chris Mason Cc: Christian Brauner Cc: Christophe Leroy Cc: Dave Kleikamp Cc: David Sterba Cc: Eugenio Pé rez Cc: Greg Kroah-Hartman Cc: Gregory Price Cc: "Huang, Ying" Cc: Jan Kara Cc: Jason Wang Cc: Jerrin Shaji George Cc: Josef Bacik Cc: Joshua Hahn Cc: Madhavan Srinivasan Cc: Mathew Brost Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Minchan Kim Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Rakie Kim Cc: Sergey Senozhatsky Cc: Xuan Zhuo Cc: Dave Kleikamp Signed-off-by: Andrew Morton --- include/linux/migrate.h | 1 - mm/migrate.c | 45 ++++++++++++++++++++++----------------------- 2 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9009e27b5f44..302f3e95faea 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -18,7 +18,6 @@ struct migration_target_control; * - zero on page migration success; */ #define MIGRATEPAGE_SUCCESS 0 -#define MIGRATEPAGE_UNMAP 1 /** * struct movable_operations - Driver page migration diff --git a/mm/migrate.c b/mm/migrate.c index 9e5ef39ce73a..2ef467eae31d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1189,7 +1189,7 @@ static void migrate_folio_done(struct folio *src, static int migrate_folio_unmap(new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio **dstp, enum migrate_mode mode, - enum migrate_reason reason, struct list_head *ret) + struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; @@ -1198,16 +1198,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, bool locked = false; bool dst_locked = false; - if (folio_ref_count(src) == 1) { - /* Folio was freed from under us. So we are done. */ - folio_clear_active(src); - folio_clear_unevictable(src); - /* free_pages_prepare() will clear PG_isolated. */ - list_del(&src->lru); - migrate_folio_done(src, reason); - return MIGRATEPAGE_SUCCESS; - } - dst = get_new_folio(src, private); if (!dst) return -ENOMEM; @@ -1297,7 +1287,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (unlikely(page_has_movable_ops(&src->page))) { __migrate_folio_record(dst, old_page_state, anon_vma); - return MIGRATEPAGE_UNMAP; + return 0; } /* @@ -1327,7 +1317,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (!folio_mapped(src)) { __migrate_folio_record(dst, old_page_state, anon_vma); - return MIGRATEPAGE_UNMAP; + return 0; } out: @@ -1870,14 +1860,27 @@ static int migrate_pages_batch(struct list_head *from, continue; } + /* + * If we are holding the last folio reference, the folio + * was freed from under us, so just drop our reference. + */ + if (likely(!page_has_movable_ops(&folio->page)) && + folio_ref_count(folio) == 1) { + folio_clear_active(folio); + folio_clear_unevictable(folio); + list_del(&folio->lru); + migrate_folio_done(folio, reason); + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; + continue; + } + rc = migrate_folio_unmap(get_new_folio, put_new_folio, - private, folio, &dst, mode, reason, - ret_folios); + private, folio, &dst, mode, ret_folios); /* * The rules are: - * Success: folio will be freed - * Unmap: folio will be put on unmap_folios list, - * dst folio put on dst_folios list + * 0: folio will be put on unmap_folios list, + * dst folio put on dst_folios list * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list @@ -1927,11 +1930,7 @@ static int migrate_pages_batch(struct list_head *from, thp_retry += is_thp; nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: - stats->nr_succeeded += nr_pages; - stats->nr_thp_succeeded += is_thp; - break; - case MIGRATEPAGE_UNMAP: + case 0: list_move_tail(&folio->lru, &unmap_folios); list_add_tail(&dst->lru, &dst_folios); break; -- cgit v1.2.3 From fb49a4425cfa163faccd91f913773d3401d3a7d4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 16:39:48 +0200 Subject: treewide: remove MIGRATEPAGE_SUCCESS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At this point MIGRATEPAGE_SUCCESS is misnamed for all folio users, and now that we remove MIGRATEPAGE_UNMAP, it's really the only "success" return value that the code uses and expects. Let's just get rid of MIGRATEPAGE_SUCCESS completely and just use "0" for success. Link: https://lkml.kernel.org/r/20250811143949.1117439-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan [mm] Acked-by: Dave Kleikamp [jfs] Acked-by: David Sterba [btrfs] Acked-by: Greg Kroah-Hartman Reviewed-by: Byungchul Park Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Benjamin LaHaise Cc: Chris Mason Cc: Christian Brauner Cc: Christophe Leroy Cc: Dave Kleikamp Cc: Eugenio Pé rez Cc: Gregory Price Cc: "Huang, Ying" Cc: Jan Kara Cc: Jason Wang Cc: Jerrin Shaji George Cc: Josef Bacik Cc: Joshua Hahn Cc: Madhavan Srinivasan Cc: Mathew Brost Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Minchan Kim Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Rakie Kim Cc: Sergey Senozhatsky Cc: Xuan Zhuo Cc: Lance Yang Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/cmm.c | 2 +- drivers/misc/vmw_balloon.c | 4 ++-- drivers/virtio/virtio_balloon.c | 2 +- fs/aio.c | 2 +- fs/btrfs/inode.c | 4 ++-- fs/hugetlbfs/inode.c | 4 ++-- fs/jfs/jfs_metapage.c | 8 ++++---- include/linux/migrate.h | 10 +-------- mm/migrate.c | 40 +++++++++++++++++------------------- mm/migrate_device.c | 2 +- mm/zsmalloc.c | 4 ++-- 11 files changed, 36 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 5e0a718d1be7..0823fa2da151 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -545,7 +545,7 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, /* balloon page list reference */ put_page(page); - return MIGRATEPAGE_SUCCESS; + return 0; } static void cmm_balloon_compaction_init(void) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 6653fc53c951..6df51ee8db62 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1806,7 +1806,7 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, * the list after acquiring the lock. */ get_page(newpage); - ret = MIGRATEPAGE_SUCCESS; + ret = 0; } /* Update the balloon list under the @pages_lock */ @@ -1817,7 +1817,7 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, * If we succeed just insert it to the list and update the statistics * under the lock. */ - if (ret == MIGRATEPAGE_SUCCESS) { + if (!ret) { balloon_page_insert(&b->b_dev_info, newpage); __count_vm_event(BALLOON_MIGRATE); } diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index e299e18346a3..eae65136cdfb 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -875,7 +875,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, balloon_page_finalize(page); put_page(page); /* balloon reference */ - return MIGRATEPAGE_SUCCESS; + return 0; } #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/fs/aio.c b/fs/aio.c index 7fc7b6221312..059e03cfa088 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -445,7 +445,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, folio_get(dst); rc = folio_migrate_mapping(mapping, dst, src, 1); - if (rc != MIGRATEPAGE_SUCCESS) { + if (rc) { folio_put(dst); goto out_unlock; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd82dcc7b2b7..0bb604dbd673 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7421,7 +7421,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, { int ret = filemap_migrate_folio(mapping, dst, src, mode); - if (ret != MIGRATEPAGE_SUCCESS) + if (ret) return ret; if (folio_test_ordered(src)) { @@ -7429,7 +7429,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, folio_set_ordered(dst); } - return MIGRATEPAGE_SUCCESS; + return 0; } #else #define btrfs_migrate_folio NULL diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 09d4baef29cf..34d496a2b7de 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1052,7 +1052,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, int rc; rc = migrate_huge_page_move_mapping(mapping, dst, src); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (hugetlb_folio_subpool(src)) { @@ -1063,7 +1063,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return 0; } #else #define hugetlbfs_migrate_folio NULL diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index b98cf3bb6c1f..871cf4fb3636 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -169,7 +169,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, } rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; for (i = 0; i < MPS_PER_PAGE; i++) { @@ -199,7 +199,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, } } - return MIGRATEPAGE_SUCCESS; + return 0; } #endif /* CONFIG_MIGRATION */ @@ -242,7 +242,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, return -EAGAIN; rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (unlikely(insert_metapage(dst, mp))) @@ -253,7 +253,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, mp->folio = dst; remove_metapage(src, mp); - return MIGRATEPAGE_SUCCESS; + return 0; } #endif /* CONFIG_MIGRATION */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 302f3e95faea..1f0ac122c3bf 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -12,13 +12,6 @@ typedef void free_folio_t(struct folio *folio, unsigned long private); struct migration_target_control; -/* - * Return values from addresss_space_operations.migratepage(): - * - negative errno on page migration failure; - * - zero on page migration success; - */ -#define MIGRATEPAGE_SUCCESS 0 - /** * struct movable_operations - Driver page migration * @isolate_page: @@ -34,8 +27,7 @@ struct migration_target_control; * @src page. The driver should copy the contents of the * @src page to the @dst page and set up the fields of @dst page. * Both pages are locked. - * If page migration is successful, the driver should - * return MIGRATEPAGE_SUCCESS. + * If page migration is successful, the driver should return 0. * If the driver cannot migrate the page at the moment, it can return * -EAGAIN. The VM interprets this as a temporary migration failure and * will retry it later. Any other error value is a permanent migration diff --git a/mm/migrate.c b/mm/migrate.c index 2ef467eae31d..8e435a078fc3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -231,18 +231,17 @@ static void putback_movable_ops_page(struct page *page) * src and dst are also released by migration core. These pages will not be * folios in the future, so that must be reworked. * - * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error - * code. + * Returns 0 on success, otherwise a negative error code. */ static int migrate_movable_ops_page(struct page *dst, struct page *src, enum migrate_mode mode) { - int rc = MIGRATEPAGE_SUCCESS; + int rc; VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src); VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src); rc = page_movable_ops(src)->migrate_page(dst, src, mode); - if (rc == MIGRATEPAGE_SUCCESS) + if (!rc) ClearPageMovableOpsIsolated(src); return rc; } @@ -587,7 +586,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); - return MIGRATEPAGE_SUCCESS; + return 0; } oldzone = folio_zone(folio); @@ -688,7 +687,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, } local_irq_enable(); - return MIGRATEPAGE_SUCCESS; + return 0; } int folio_migrate_mapping(struct address_space *mapping, @@ -737,7 +736,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, xas_unlock_irq(&xas); - return MIGRATEPAGE_SUCCESS; + return 0; } /* @@ -853,14 +852,14 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst, return rc; rc = __folio_migrate_mapping(mapping, dst, src, expected_count); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (src_private) folio_attach_private(dst, folio_detach_private(src)); folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return 0; } /** @@ -967,7 +966,7 @@ recheck_buffers: } rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) goto unlock_buffers; bh = head; @@ -1071,7 +1070,7 @@ static int fallback_migrate_folio(struct address_space *mapping, * * Return value: * < 0 - error code - * MIGRATEPAGE_SUCCESS - success + * 0 - success */ static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1099,7 +1098,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, else rc = fallback_migrate_folio(mapping, dst, src, mode); - if (rc == MIGRATEPAGE_SUCCESS) { + if (!rc) { /* * For pagecache folios, src->mapping must be cleared before src * is freed. Anonymous folios must stay anonymous until freed. @@ -1449,7 +1448,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ folio_putback_hugetlb(src); - return MIGRATEPAGE_SUCCESS; + return 0; } dst = get_new_folio(src, private); @@ -1512,8 +1511,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) - remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, 0); + remove_migration_ptes(src, !rc ? dst : src, 0); unlock_put_anon: folio_unlock(dst); @@ -1522,7 +1520,7 @@ put_anon: if (anon_vma) put_anon_vma(anon_vma); - if (rc == MIGRATEPAGE_SUCCESS) { + if (!rc) { move_hugetlb_state(src, dst, reason); put_new_folio = NULL; } @@ -1530,7 +1528,7 @@ put_anon: out_unlock: folio_unlock(src); out: - if (rc == MIGRATEPAGE_SUCCESS) + if (!rc) folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); @@ -1640,7 +1638,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, reason, ret_folios); /* * The rules are: - * Success: hugetlb folio will be put back + * 0: hugetlb folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list @@ -1657,7 +1655,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, retry++; nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: + case 0: stats->nr_succeeded += nr_pages; break; default: @@ -1711,7 +1709,7 @@ static void migrate_folios_move(struct list_head *src_folios, reason, ret_folios); /* * The rules are: - * Success: folio will be freed + * 0: folio will be freed * -EAGAIN: stay on the unmap_folios list * Other errno: put on ret_folios list */ @@ -1721,7 +1719,7 @@ static void migrate_folios_move(struct list_head *src_folios, *thp_retry += is_thp; *nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: + case 0: stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; break; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index e05e14d6eacd..abd9f6850db6 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -778,7 +778,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (migrate && migrate->fault_page == page) extra_cnt = 1; r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); - if (r != MIGRATEPAGE_SUCCESS) + if (r) src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; else folio_migrate_flags(newfolio, folio); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 805a10b41266..153783d49d34 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1746,7 +1746,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, * instead. */ if (!zpdesc->zspage) - return MIGRATEPAGE_SUCCESS; + return 0; /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(zpdesc); @@ -1813,7 +1813,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, reset_zpdesc(zpdesc); zpdesc_put(zpdesc); - return MIGRATEPAGE_SUCCESS; + return 0; } static void zs_page_putback(struct page *page) -- cgit v1.2.3 From b22cc9a9c7ff0ad8998d58fdd7122de6038c46a7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:27 +0200 Subject: mm/rmap: convert "enum rmap_level" to "enum pgtable_level" Let's factor it out, and convert all checks for unsupported levels to BUILD_BUG(). The code is written in a way such that force-inlining will optimize out the levels. [nathan@kernel.org: always inline __folio_rmap_sanity_checks()] Link: https://lkml.kernel.org/r/20250814-rmap-fix-build_bug-conversion-v1-1-fb7b10a0b362@kernel.org Link: https://lkml.kernel.org/r/20250811112631.759341-8-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Nathan Chancellor Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Oscar Salvador Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 8 +++++++ include/linux/rmap.h | 62 ++++++++++++++++++++++--------------------------- mm/rmap.c | 56 ++++++++++++++++++++++++-------------------- 3 files changed, 67 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2b80fd456c8b..4f88e460eb9c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1975,6 +1975,14 @@ static inline bool arch_has_pfn_modify_check(void) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; +enum pgtable_level { + PGTABLE_LEVEL_PTE = 0, + PGTABLE_LEVEL_PMD, + PGTABLE_LEVEL_PUD, + PGTABLE_LEVEL_P4D, + PGTABLE_LEVEL_PGD, +}; + #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6cd020eea37a..e8aff6d2deda 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -394,18 +394,8 @@ typedef int __bitwise rmap_t; /* The anonymous (sub)page is exclusive to a single process. */ #define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) -/* - * Internally, we're using an enum to specify the granularity. We make the - * compiler emit specialized code for each granularity. - */ -enum rmap_level { - RMAP_LEVEL_PTE = 0, - RMAP_LEVEL_PMD, - RMAP_LEVEL_PUD, -}; - -static inline void __folio_rmap_sanity_checks(const struct folio *folio, - const struct page *page, int nr_pages, enum rmap_level level) +static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio, + const struct page *page, int nr_pages, enum pgtable_level level) { /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); @@ -427,18 +417,18 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: break; - case RMAP_LEVEL_PMD: + case PGTABLE_LEVEL_PMD: /* * We don't support folios larger than a single PMD yet. So - * when RMAP_LEVEL_PMD is set, we assume that we are creating + * when PGTABLE_LEVEL_PMD is set, we assume that we are creating * a single "entire" mapping of the folio. */ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PUD: /* * Assume that we are creating a single "entire" mapping of the * folio. @@ -447,7 +437,7 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); break; default: - VM_WARN_ON_ONCE(true); + BUILD_BUG(); } /* @@ -567,14 +557,14 @@ static inline void hugetlb_remove_rmap(struct folio *folio) static __always_inline void __folio_dup_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, - enum rmap_level level) + enum pgtable_level level) { const int orig_nr_pages = nr_pages; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { atomic_inc(&folio->_mapcount); break; @@ -587,11 +577,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, } folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; + default: + BUILD_BUG(); } } @@ -609,13 +601,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, static inline void folio_dup_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE); } static __always_inline void folio_dup_file_rmap_pte(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE); } /** @@ -632,7 +624,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PTE); #else WARN_ON_ONCE(true); #endif @@ -640,7 +632,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio, static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, enum rmap_level level) + struct vm_area_struct *src_vma, enum pgtable_level level) { const int orig_nr_pages = nr_pages; bool maybe_pinned; @@ -665,7 +657,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * copying if the folio maybe pinned. */ switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (unlikely(maybe_pinned)) { for (i = 0; i < nr_pages; i++) if (PageAnonExclusive(page + i)) @@ -687,8 +679,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, } while (page++, --nr_pages > 0); folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; @@ -697,6 +689,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; + default: + BUILD_BUG(); } return 0; } @@ -730,7 +724,7 @@ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, - src_vma, RMAP_LEVEL_PTE); + src_vma, PGTABLE_LEVEL_PTE); } static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, @@ -738,7 +732,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma, - RMAP_LEVEL_PTE); + PGTABLE_LEVEL_PTE); } /** @@ -770,7 +764,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma, - src_vma, RMAP_LEVEL_PMD); + src_vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; @@ -778,7 +772,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, } static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) + struct page *page, int nr_pages, enum pgtable_level level) { VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio); @@ -873,7 +867,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, static inline int folio_try_share_anon_rmap_pte(struct folio *folio, struct page *page) { - return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE); + return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE); } /** @@ -904,7 +898,7 @@ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR, - RMAP_LEVEL_PMD); + PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; diff --git a/mm/rmap.c b/mm/rmap.c index 84a8d8b02ef7..0e9c4041f868 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1265,7 +1265,7 @@ static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) static __always_inline void __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; @@ -1274,7 +1274,7 @@ static __always_inline void __folio_add_rmap(struct folio *folio, __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_inc_and_test(&folio->_mapcount); break; @@ -1300,11 +1300,11 @@ static __always_inline void __folio_add_rmap(struct folio *folio, folio_add_large_mapcount(folio, orig_nr_pages, vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { - if (level == RMAP_LEVEL_PMD && first) + if (level == PGTABLE_LEVEL_PMD && first) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_inc_return_large_mapcount(folio, vma); if (nr == 1) @@ -1323,7 +1323,7 @@ static __always_inline void __folio_add_rmap(struct folio *folio, * We only track PMD mappings of PMD-sized * folios separately. */ - if (level == RMAP_LEVEL_PMD) + if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ @@ -1336,6 +1336,8 @@ static __always_inline void __folio_add_rmap(struct folio *folio, } folio_inc_large_mapcount(folio, vma); break; + default: + BUILD_BUG(); } __folio_mod_stat(folio, nr, nr_pmdmapped); } @@ -1427,7 +1429,7 @@ static void __page_check_anon_rmap(const struct folio *folio, static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - unsigned long address, rmap_t flags, enum rmap_level level) + unsigned long address, rmap_t flags, enum pgtable_level level) { int i; @@ -1440,20 +1442,22 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, if (flags & RMAP_EXCLUSIVE) { switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: for (i = 0; i < nr_pages; i++) SetPageAnonExclusive(page + i); break; - case RMAP_LEVEL_PMD: + case PGTABLE_LEVEL_PMD: SetPageAnonExclusive(page); break; - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PUD: /* * Keep the compiler happy, we don't support anonymous * PUD mappings. */ WARN_ON_ONCE(1); break; + default: + BUILD_BUG(); } } @@ -1507,7 +1511,7 @@ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page, rmap_t flags) { __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags, - RMAP_LEVEL_PTE); + PGTABLE_LEVEL_PTE); } /** @@ -1528,7 +1532,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags, - RMAP_LEVEL_PMD); + PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1609,7 +1613,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); @@ -1634,7 +1638,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { - __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); + __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** @@ -1651,7 +1655,7 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); + __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1672,7 +1676,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); + __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif @@ -1680,7 +1684,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; int last = 0, nr = 0, nr_pmdmapped = 0; @@ -1689,7 +1693,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_add_negative(-1, &folio->_mapcount); break; @@ -1719,11 +1723,11 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && atomic_read(mapped); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { last = atomic_add_negative(-1, &folio->_entire_mapcount); - if (level == RMAP_LEVEL_PMD && last) + if (level == PGTABLE_LEVEL_PMD && last) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_dec_return_large_mapcount(folio, vma); if (!nr) { @@ -1743,7 +1747,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); - if (level == RMAP_LEVEL_PMD) + if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ @@ -1757,6 +1761,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && nr < nr_pmdmapped; break; + default: + BUILD_BUG(); } /* @@ -1796,7 +1802,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, void folio_remove_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { - __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); + __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** @@ -1813,7 +1819,7 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); + __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1834,7 +1840,7 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page, { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); + __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif -- cgit v1.2.3 From ec63a44011dccebca24e7ef7e8a9521306de1bc9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:28 +0200 Subject: mm/memory: convert print_bad_pte() to print_bad_page_map() print_bad_pte() looks like something that should actually be a WARN or similar, but historically it apparently has proven to be useful to detect corruption of page tables even on production systems -- report the issue and keep the system running to make it easier to actually detect what is going wrong (e.g., multiple such messages might shed a light). As we want to unify vm_normal_page_*() handling for PTE/PMD/PUD, we'll have to take care of print_bad_pte() as well. Let's prepare for using print_bad_pte() also for non-PTEs by adjusting the implementation and renaming the function to print_bad_page_map(). Provide print_bad_pte() as a simple wrapper. Document the implicit locking requirements for the page table re-walk. To make the function a bit more readable, factor out the ratelimit check into is_bad_page_map_ratelimited() and place the printing of page table content into __print_bad_page_map_pgtable(). We'll now dump information from each level in a single line, and just stop the table walk once we hit something that is not a present page table. The report will now look something like (dumping pgd to pmd values): [ 77.943408] BUG: Bad page map in process XXX pte:80000001233f5867 [ 77.944077] addr:00007fd84bb1c000 vm_flags:08100071 anon_vma: ... [ 77.945186] pgd:10a89f067 p4d:10a89f067 pud:10e5a2067 pmd:105327067 Not using pgdp_get(), because that does not work properly on some arm configs where pgd_t is an array. Note that we are dumping all levels even when levels are folded for simplicity. [david@redhat.com: drop warning] Link: https://lkml.kernel.org/r/923b279c-de33-44dd-a923-2959afad8626@redhat.com Link: https://lkml.kernel.org/r/20250811112631.759341-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Oscar Salvador Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 18 +++++++++ mm/memory.c | 104 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 102 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 4f88e460eb9c..94249e671a7e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1983,6 +1983,24 @@ enum pgtable_level { PGTABLE_LEVEL_PGD, }; +static inline const char *pgtable_level_to_str(enum pgtable_level level) +{ + switch (level) { + case PGTABLE_LEVEL_PTE: + return "pte"; + case PGTABLE_LEVEL_PMD: + return "pmd"; + case PGTABLE_LEVEL_PUD: + return "pud"; + case PGTABLE_LEVEL_P4D: + return "p4d"; + case PGTABLE_LEVEL_PGD: + return "pgd"; + default: + return "unknown"; + } +} + #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) diff --git a/mm/memory.c b/mm/memory.c index 626caedce35e..dc0107354d37 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -491,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) add_mm_counter(mm, i, rss[i]); } -/* - * This function is called to print an error when a bad pte - * is found. For example, we might have a PFN-mapped pte in - * a region that doesn't allow it. - * - * The calling function must still handle the error. - */ -static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, struct page *page) +static bool is_bad_page_map_ratelimited(void) { - pgd_t *pgd = pgd_offset(vma->vm_mm, addr); - p4d_t *p4d = p4d_offset(pgd, addr); - pud_t *pud = pud_offset(p4d, addr); - pmd_t *pmd = pmd_offset(pud, addr); - struct address_space *mapping; - pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; @@ -518,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; - return; + return true; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", @@ -529,15 +515,91 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; + return false; +} + +static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr) +{ + unsigned long long pgdv, p4dv, pudv, pmdv; + p4d_t p4d, *p4dp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pgd_t *pgdp; + + /* + * Although this looks like a fully lockless pgtable walk, it is not: + * see locking requirements for print_bad_page_map(). + */ + pgdp = pgd_offset(mm, addr); + pgdv = pgd_val(*pgdp); + + if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) { + pr_alert("pgd:%08llx\n", pgdv); + return; + } + + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + p4dv = p4d_val(p4d); + + if (!p4d_present(p4d) || p4d_leaf(p4d)) { + pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv); + return; + } + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + pudv = pud_val(pud); + + if (!pud_present(pud) || pud_leaf(pud)) { + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv); + return; + } + + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + pmdv = pmd_val(pmd); + + /* + * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE, + * because the table should already be mapped by the caller and + * doing another map would be bad. print_bad_page_map() should + * already take care of printing the PTE. + */ + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv, + p4dv, pudv, pmdv); +} + +/* + * This function is called to print an error when a bad page table entry (e.g., + * corrupted page table entry) is found. For example, we might have a + * PFN-mapped pte in a region that doesn't allow it. + * + * The calling function must still handle the error. + * + * This function must be called during a proper page table walk, as it will + * re-walk the page table to dump information: the caller MUST prevent page + * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf + * page table lock. + */ +static void print_bad_page_map(struct vm_area_struct *vma, + unsigned long addr, unsigned long long entry, struct page *page, + enum pgtable_level level) +{ + struct address_space *mapping; + pgoff_t index; + + if (is_bad_page_map_ratelimited()) + return; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm, + pgtable_level_to_str(level), entry); + __print_bad_page_map_pgtable(vma->vm_mm, addr); if (page) - dump_page(page, "bad pte"); + dump_page(page, "bad page map"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", @@ -549,6 +611,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +#define print_bad_pte(vma, addr, pte, page) \ + print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE) /* * vm_normal_page -- This function gets the "struct page" associated with a pte. -- cgit v1.2.3 From 2db308160b5a191b494746fd167dbbaaead3fb26 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:30 +0200 Subject: mm: introduce and use vm_normal_page_pud() Let's introduce vm_normal_page_pud(), which ends up being fairly simple because of our new common helpers and there not being a PUD-sized zero folio. Use vm_normal_page_pud() in folio_walk_start() to resolve a TODO, structuring the code like the other (pmd/pte) cases. Defer introducing vm_normal_folio_pud() until really used. Note that we can so far get PUDs with hugetlb, daxfs and PFNMAP entries. Link: https://lkml.kernel.org/r/20250811112631.759341-11-david@redhat.com Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/memory.c | 19 +++++++++++++++++++ mm/pagewalk.c | 20 ++++++++++---------- 3 files changed, 31 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index b626d1bacef5..8ca7d2fa7134 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2360,6 +2360,8 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); +struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t pud); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); diff --git a/mm/memory.c b/mm/memory.c index 78af3f243cee..6f806bf3cc99 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -809,6 +809,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, return page_folio(page); return NULL; } + +/** + * vm_normal_page_pud() - Get the "struct page" associated with a PUD + * @vma: The VMA mapping the @pud. + * @addr: The address where the @pud is mapped. + * @pud: The PUD. + * + * Get the "struct page" associated with a PUD. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ +struct page *vm_normal_page_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t pud) +{ + return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud), + pud_val(pud), PGTABLE_LEVEL_PUD); +} #endif /** diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 648038247a8d..c6753d370ff4 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -902,23 +902,23 @@ struct folio *folio_walk_start(struct folio_walk *fw, fw->pudp = pudp; fw->pud = pud; - /* - * TODO: FW_MIGRATION support for PUD migration entries - * once there are relevant users. - */ - if (!pud_present(pud) || pud_special(pud)) { + if (pud_none(pud)) { spin_unlock(ptl); goto not_found; - } else if (!pud_leaf(pud)) { + } else if (pud_present(pud) && !pud_leaf(pud)) { spin_unlock(ptl); goto pmd_table; + } else if (pud_present(pud)) { + page = vm_normal_page_pud(vma, addr, pud); + if (page) + goto found; } /* - * TODO: vm_normal_page_pud() will be handy once we want to - * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. + * TODO: FW_MIGRATION support for PUD migration entries + * once there are relevant users. */ - page = pud_page(pud); - goto found; + spin_unlock(ptl); + goto not_found; } pmd_table: -- cgit v1.2.3 From 4c89792ea0a224340ff198abc7caffa211baccd6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:31 +0200 Subject: mm: rename vm_ops->find_special_page() to vm_ops->find_normal_page() ... and hide it behind a kconfig option. There is really no need for any !xen code to perform this check. The naming is a bit off: we want to find the "normal" page when a PTE was marked "special". So it's really not "finding a special" page. Improve the documentation, and add a comment in the code where XEN ends up performing the pte_mkspecial() through a hypercall. More details can be found in commit 923b2919e2c3 ("xen/gntdev: mark userspace PTEs as special on x86 PV guests"). Link: https://lkml.kernel.org/r/20250811112631.759341-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Lorenzo Stoakes Reviewed-by: Wei Yang Cc: David Vrabel Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/xen/Kconfig | 1 + drivers/xen/gntdev.c | 5 +++-- include/linux/mm.h | 18 +++++++++++++----- mm/Kconfig | 2 ++ mm/memory.c | 12 ++++++++++-- tools/testing/vma/vma_internal.h | 18 +++++++++++++----- 6 files changed, 42 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 24f485827e03..f9a35ed266ec 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -138,6 +138,7 @@ config XEN_GNTDEV depends on XEN default m select MMU_NOTIFIER + select FIND_NORMAL_PAGE help Allows userspace processes to use grants. diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 1f2160765618..26f13b37c78e 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -321,6 +321,7 @@ static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data) BUG_ON(pgnr >= map->count); pte_maddr = arbitrary_virt_to_machine(pte).maddr; + /* Note: this will perform a pte_mkspecial() through the hypercall. */ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags, map->grants[pgnr].ref, map->grants[pgnr].domid); @@ -528,7 +529,7 @@ static void gntdev_vma_close(struct vm_area_struct *vma) gntdev_put_map(priv, map); } -static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma, +static struct page *gntdev_vma_find_normal_page(struct vm_area_struct *vma, unsigned long addr) { struct gntdev_grant_map *map = vma->vm_private_data; @@ -539,7 +540,7 @@ static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma, static const struct vm_operations_struct gntdev_vmops = { .open = gntdev_vma_open, .close = gntdev_vma_close, - .find_special_page = gntdev_vma_find_special_page, + .find_normal_page = gntdev_vma_find_normal_page, }; /* ------------------------------------------------------------------ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 8ca7d2fa7134..3868ca1a25f9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -657,13 +657,21 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif +#ifdef CONFIG_FIND_NORMAL_PAGE /* - * Called by vm_normal_page() for special PTEs to find the - * page for @addr. This is useful if the default behavior - * (using pte_page()) would not find the correct page. + * Called by vm_normal_page() for special PTEs in @vma at @addr. This + * allows for returning a "normal" page from vm_normal_page() even + * though the PTE indicates that the "struct page" either does not exist + * or should not be touched: "special". + * + * Do not add new users: this really only works when a "normal" page + * was mapped, but then the PTE got changed to something weird (+ + * marked special) that would not make pte_pfn() identify the originally + * inserted page. */ - struct page *(*find_special_page)(struct vm_area_struct *vma, - unsigned long addr); + struct page *(*find_normal_page)(struct vm_area_struct *vma, + unsigned long addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ }; #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/Kconfig b/mm/Kconfig index e443fe8cd6cf..59a04d0b2e27 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1381,6 +1381,8 @@ config PT_RECLAIM Note: now only empty user PTE page table pages will be reclaimed. +config FIND_NORMAL_PAGE + def_bool n source "mm/damon/Kconfig" diff --git a/mm/memory.c b/mm/memory.c index 6f806bf3cc99..002c28795d8b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -639,6 +639,12 @@ static void print_bad_page_map(struct vm_area_struct *vma, * trivial. Secondly, an architecture may not have a spare page table * entry bit, which requires a more complicated scheme, described below. * + * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on + * page table entries that actually map "normal" pages: however, that page + * cannot be looked up through the PFN stored in the page table entry, but + * instead will be looked up through vm_ops->find_normal_page(). So far, this + * only applies to PTEs. + * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. @@ -679,8 +685,10 @@ static inline struct page *__vm_normal_page(struct vm_area_struct *vma, { if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (unlikely(special)) { - if (vma->vm_ops && vma->vm_ops->find_special_page) - return vma->vm_ops->find_special_page(vma, addr); +#ifdef CONFIG_FIND_NORMAL_PAGE + if (vma->vm_ops && vma->vm_ops->find_normal_page) + return vma->vm_ops->find_normal_page(vma, addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 3639aa8dd2b0..cb1c2a8afe26 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -467,13 +467,21 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif +#ifdef CONFIG_FIND_NORMAL_PAGE /* - * Called by vm_normal_page() for special PTEs to find the - * page for @addr. This is useful if the default behavior - * (using pte_page()) would not find the correct page. + * Called by vm_normal_page() for special PTEs in @vma at @addr. This + * allows for returning a "normal" page from vm_normal_page() even + * though the PTE indicates that the "struct page" either does not exist + * or should not be touched: "special". + * + * Do not add new users: this really only works when a "normal" page + * was mapped, but then the PTE got changed to something weird (+ + * marked special) that would not make pte_pfn() identify the originally + * inserted page. */ - struct page *(*find_special_page)(struct vm_area_struct *vma, - unsigned long addr); + struct page *(*find_normal_page)(struct vm_area_struct *vma, + unsigned long addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ }; struct vm_unmapped_area_info { -- cgit v1.2.3 From 2843408ca971d1472a6a8b32ee4647f55ecab598 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 11 Aug 2025 10:41:10 +0200 Subject: mm: rename MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO As all the helper functions has been renamed from *_page to *_folio, rename the MM flag from MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO. No functional changes. Link: https://lkml.kernel.org/r/20250811084113.647267-3-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Hannes Reinecke Cc: Baolin Wang Cc: Christoph Hellwig Cc: "Darrick J. Wong" Cc: Dev Jain Cc: Jens Axboe Cc: Liam Howlett Cc: Luis Chamberalin Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Kiryl Shutsemau Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 +- mm/huge_memory.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3ed763e7ec6f..cf94df4955c7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1758,7 +1758,7 @@ enum { #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3f0c8c2856d3..2801ce9bbde9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -248,13 +248,13 @@ static void put_huge_zero_folio(void) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) return READ_ONCE(huge_zero_folio); if (!get_huge_zero_folio()) return NULL; - if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (test_and_set_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) put_huge_zero_folio(); return READ_ONCE(huge_zero_folio); @@ -262,7 +262,7 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) void mm_put_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) put_huge_zero_folio(); } -- cgit v1.2.3 From 2d8bd8049e89efe42a5397de4effd899e8dd2249 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 11 Aug 2025 10:41:11 +0200 Subject: mm: add persistent huge zero folio Many places in the kernel need to zero out larger chunks, but the maximum segment that can be zeroed out at a time by ZERO_PAGE is limited by PAGE_SIZE. This is especially annoying in block devices and filesystems where multiple ZERO_PAGEs are attached to the bio in different bvecs. With multipage bvec support in block layer, it is much more efficient to send out larger zero pages as a part of single bvec. This concern was raised during the review of adding Large Block Size support to XFS[1][2]. Usually huge_zero_folio is allocated on demand, and it will be deallocated by the shrinker if there are no users of it left. At moment, huge_zero_folio infrastructure refcount is tied to the process lifetime that created it. This might not work for bio layer as the completions can be async and the process that created the huge_zero_folio might no longer be alive. And, one of the main points that came up during discussion is to have something bigger than zero page as a drop-in replacement. Add a config option PERSISTENT_HUGE_ZERO_FOLIO that will result in allocating the huge zero folio during early init and never free the memory by disabling the shrinker. This makes using the huge_zero_folio without having to pass any mm struct and does not tie the lifetime of the zero folio to anything, making it a drop-in replacement for ZERO_PAGE. If PERSISTENT_HUGE_ZERO_FOLIO config option is enabled, then mm_get_huge_zero_folio() will simply return the allocated page instead of dynamically allocating a new PMD page. Use this option carefully in resource constrained systems as it uses one full PMD sized page for zeroing purposes. [1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/ [2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/ Link: https://lkml.kernel.org/r/20250811084113.647267-4-kernel@pankajraghav.com Signed-off-by: David Hildenbrand Signed-off-by: Pankaj Raghav Reviewed-by: Lorenzo Stoakes Co-developed-by: David Hildenbrand Reviewed-by: Hannes Reinecke Cc: Baolin Wang Cc: Christoph Hellwig Cc: "Darrick J. Wong" Cc: Dev Jain Cc: Jens Axboe Cc: Liam Howlett Cc: Luis Chamberalin Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Zi Yan Cc: Kiryl Shutsemau Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 16 ++++++++++++++++ mm/Kconfig | 16 ++++++++++++++++ mm/huge_memory.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 62 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7748489fde1b..bd547857c6c1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -495,6 +495,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); +static inline struct folio *get_persistent_huge_zero_folio(void) +{ + if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return NULL; + + if (unlikely(!huge_zero_folio)) + return NULL; + + return huge_zero_folio; +} + static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); @@ -685,6 +696,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb, { return 0; } + +static inline struct folio *get_persistent_huge_zero_folio(void) +{ + return NULL; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/mm/Kconfig b/mm/Kconfig index 59a04d0b2e27..4108bcd96784 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -823,6 +823,22 @@ config ARCH_WANT_GENERAL_HUGETLB config ARCH_WANTS_THP_SWAP def_bool n +config PERSISTENT_HUGE_ZERO_FOLIO + bool "Allocate a PMD sized folio for zeroing" + depends on TRANSPARENT_HUGEPAGE + help + Enable this option to reduce the runtime refcounting overhead + of the huge zero folio and expand the places in the kernel + that can use huge zero folios. For instance, block I/O benefits + from access to large folios for zeroing memory. + + With this option enabled, the huge zero folio is allocated + once and never freed. One full huge page's worth of memory shall + be used. + + Say Y if your system has lots of memory. Say N if you are + memory constrained. + config MM_ID def_bool n diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2801ce9bbde9..b8bb078a1a34 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -248,6 +248,9 @@ static void put_huge_zero_folio(void) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return huge_zero_folio; + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) return READ_ONCE(huge_zero_folio); @@ -262,6 +265,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) void mm_put_huge_zero_folio(struct mm_struct *mm) { + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return; + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) put_huge_zero_folio(); } @@ -849,16 +855,34 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) static int __init thp_shrinker_init(void) { - huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); - if (!huge_zero_folio_shrinker) - return -ENOMEM; - deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB, "thp-deferred_split"); - if (!deferred_split_shrinker) { - shrinker_free(huge_zero_folio_shrinker); + if (!deferred_split_shrinker) + return -ENOMEM; + + deferred_split_shrinker->count_objects = deferred_split_count; + deferred_split_shrinker->scan_objects = deferred_split_scan; + shrinker_register(deferred_split_shrinker); + + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) { + /* + * Bump the reference of the huge_zero_folio and do not + * initialize the shrinker. + * + * huge_zero_folio will always be NULL on failure. We assume + * that get_huge_zero_folio() will most likely not fail as + * thp_shrinker_init() is invoked early on during boot. + */ + if (!get_huge_zero_folio()) + pr_warn("Allocating persistent huge zero folio failed\n"); + return 0; + } + + huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); + if (!huge_zero_folio_shrinker) { + shrinker_free(deferred_split_shrinker); return -ENOMEM; } @@ -866,10 +890,6 @@ static int __init thp_shrinker_init(void) huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan; shrinker_register(huge_zero_folio_shrinker); - deferred_split_shrinker->count_objects = deferred_split_count; - deferred_split_shrinker->scan_objects = deferred_split_scan; - shrinker_register(deferred_split_shrinker); - return 0; } -- cgit v1.2.3 From 415a0fd62f1899fe2bb81d661e427194b1c97201 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 11 Aug 2025 10:41:12 +0200 Subject: mm: add largest_zero_folio() routine The callers of mm_get_huge_zero_folio() have access to a mm struct and the lifetime of the huge_zero_folio is tied to the lifetime of the mm struct. largest_zero_folio() will give access to huge_zero_folio when PERSISTENT_HUGE_ZERO_FOLIO config option is enabled for callers that do not want to tie the lifetime to a mm struct. This is very useful for filesystem and block layers where the request completions can be async and there is no guarantee on the mm struct lifetime. This function will return a ZERO_PAGE folio if PERSISTENT_HUGE_ZERO_FOLIO is disabled or if we failed to allocate a huge_zero_folio during early init. Link: https://lkml.kernel.org/r/20250811084113.647267-5-kernel@pankajraghav.com Signed-off-by: David Hildenbrand Signed-off-by: Pankaj Raghav Reviewed-by: Lorenzo Stoakes Co-developed-by: David Hildenbrand Reviewed-by: Hannes Reinecke Cc: Baolin Wang Cc: Christoph Hellwig Cc: "Darrick J. Wong" Cc: Dev Jain Cc: Jens Axboe Cc: Liam Howlett Cc: Luis Chamberalin Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Zi Yan Cc: Kiryl Shutsemau Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bd547857c6c1..14d424830fa8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -714,4 +714,26 @@ static inline int split_folio_to_order(struct folio *folio, int new_order) return split_folio_to_list_to_order(folio, NULL, new_order); } +/** + * largest_zero_folio - Get the largest zero size folio available + * + * This function shall be used when mm_get_huge_zero_folio() cannot be + * used as there is no appropriate mm lifetime to tie the huge zero folio + * from the caller. + * + * Deduce the size of the folio with folio_size instead of assuming the + * folio size. + * + * Return: pointer to PMD sized zero folio if CONFIG_PERSISTENT_HUGE_ZERO_FOLIO + * is enabled or a single page sized zero folio + */ +static inline struct folio *largest_zero_folio(void) +{ + struct folio *folio = get_persistent_huge_zero_folio(); + + if (folio) + return folio; + + return page_folio(ZERO_PAGE(0)); +} #endif /* _LINUX_HUGE_MM_H */ -- cgit v1.2.3 From bb6525f2f8c41e89ba3fc506bc1705c68cf845ae Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:10 +0100 Subject: mm: add bitmap mm->flags field Patch series "mm: make mm->flags a bitmap and 64-bit on all arches". We are currently in the bizarre situation where we are constrained on the number of flags we can set in an mm_struct based on whether this is a 32-bit or 64-bit kernel. This is because mm->flags is an unsigned long field, which is 32-bits on a 32-bit system and 64-bits on a 64-bit system. In order to keep things functional across both architectures, we do not permit mm flag bits to be set above flag 31 (i.e. the 32nd bit). This is a silly situation, especially given how profligate we are in storing metadata in mm_struct, so let's convert mm->flags into a bitmap and allow ourselves as many bits as we like. In order to execute this change, we introduce a new opaque type - mm_flags_t - which wraps a bitmap. We go further and mark the bitmap field __private, which forces users to have to use accessors, which allows us to enforce atomicity rules around mm->flags (except on those occasions they are not required - fork, etc.) and makes it far easier to keep track of how mm flags are being utilised. In order to implement this change sensibly and an an iterative way, we start by introducing the type with the same bitsize as the current mm flags (system word size) and place it in union with mm->flags. We are then able to gradually update users as we go without being forced to do everything in a single patch. In the course of working on this series I noticed the MMF_* flag masks encounter a sign extension bug that, due to the 32-bit limit on mm->flags thus far, has not caused any issues in practice, but required fixing for this series. We must make special dispensation for two cases - coredump and initailisation on fork, but of which use masks extensively. Since coredump flags are set in stone, we can safely assume they will remain in the first 32-bits of the flags. We therefore provide special non-atomic accessors for this case that access the first system word of flags, keeping everything there essentially the same. For mm->flags initialisation on fork, we adjust the logic to ensure all bits are cleared correctly, and then adjust the existing intialisation logic, dubbing the implementation utilising flags as legacy. This means we get the same fast operations as we do now, but in future we can also choose to update the forking logic to additionally propagate flags beyond 32-bits across fork. With this change in place we can, in future, decide to have as many bits as we please. Since the size of the bitmap will scale in system word multiples, there should be no issues with changes in alignment in mm_struct. Additionally, the really sensitive field (mmap_lock) is located prior to the flags field so this should have no impact on that either. This patch (of 10): We are currently in the bizarre situation where we are constrained on the number of flags we can set in an mm_struct based on whether this is a 32-bit or 64-bit kernel. This is because mm->flags is an unsigned long field, which is 32-bits on a 32-bit system and 64-bits on a 64-bit system. In order to keep things functional across both architectures, we do not permit mm flag bits to be set above flag 31 (i.e. the 32nd bit). This is a silly situation, especially given how profligate we are in storing metadata in mm_struct, so let's convert mm->flags into a bitmap and allow ourselves as many bits as we like. To keep things manageable, firstly we introduce the bitmap at a system word system as a new field mm->_flags, in union. This means the new bitmap mm->_flags is bitwise exactly identical to the existing mm->flags field. We have an opportunity to also introduce some type safety here, so let's wrap the mm flags field as a struct and declare it as an mm_flags_t typedef to keep it consistent with vm_flags_t for VMAs. We make the internal field privately accessible, in order to force the use of helper functions so we can enforce that accesses are bitwise as required. We therefore introduce accessors prefixed with mm_flags_*() for callers to use. We place the bit parameter first so as to match the parameter ordering of the *_bit() functions. Having this temporary union arrangement allows us to incrementally swap over users of mm->flags patch-by-patch rather than having to do everything in one fell swoop. [lorenzo.stoakes@oracle.com: place __private in correct place, const-ify __mm_flags_get_word] Link: https://lkml.kernel.org/r/d4ba117d-6234-4069-b871-254d152d7d21@lucifer.local Link: https://lkml.kernel.org/r/cover.1755012943.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/9de8dfd9de8c95cd31622d6e52051ba0d1848f5a.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3868ca1a25f9..4ed4a0b9dad6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,6 +34,8 @@ #include #include #include +#include +#include struct mempolicy; struct anon_vma; @@ -720,6 +722,36 @@ static inline void assert_fault_locked(struct vm_fault *vmf) } #endif /* CONFIG_PER_VMA_LOCK */ +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) +{ + return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) +{ + return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_set(int flag, struct mm_struct *mm) +{ + set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_clear(int flag, struct mm_struct *mm) +{ + clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_clear_all(struct mm_struct *mm) +{ + bitmap_zero(ACCESS_PRIVATE(&mm->_flags, __mm_flags), NUM_MM_FLAG_BITS); +} + extern const struct vm_operations_struct vma_dummy_vm_ops; static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cf94df4955c7..0e001dbad455 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -927,6 +928,15 @@ struct mm_cid { }; #endif +/* + * Opaque type representing current mm_struct flag state. Must be accessed via + * mm_flags_xxx() helper functions. + */ +#define NUM_MM_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); +} __private mm_flags_t; + struct kioctx_table; struct iommu_mm_data; struct mm_struct { @@ -1109,7 +1119,11 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; - unsigned long flags; /* Must use atomic bitops to access */ + /* Temporary union while we convert users to mm_flags_t. */ + union { + unsigned long flags; /* Must use atomic bitops to access */ + mm_flags_t _flags; /* Must use mm_flags_* helpers to access */ + }; #ifdef CONFIG_AIO spinlock_t ioctx_lock; @@ -1219,6 +1233,28 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +/* Set the first system word of mm flags, non-atomically. */ +static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + + bitmap_copy(bitmap, &value, BITS_PER_LONG); +} + +/* Obtain a read-only view of the bitmap. */ +static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) +{ + return (const unsigned long *)ACCESS_PRIVATE(&mm->_flags, __mm_flags); +} + +/* Read the first system word of mm flags, non-atomically. */ +static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) +{ + const unsigned long *bitmap = __mm_flags_get_bitmap(mm); + + return bitmap_read(bitmap, 0, BITS_PER_LONG); +} + #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; -- cgit v1.2.3 From 12e423ba4eaed7b1561b677d32e6599f932d03db Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:11 +0100 Subject: mm: convert core mm to mm_flags_*() accessors As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. This will result in the debug output being that of a bitmap output, which will result in a minor change here, but since this is for debug only, this should have no bearing. Otherwise, no functional changes intended. [akpm@linux-foundation.org: fix typo in comment]Link: https://lkml.kernel.org/r/1eb2266f4408798a55bda00cb04545a3203aa572.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 +- include/linux/khugepaged.h | 6 ++++-- include/linux/ksm.h | 6 +++--- include/linux/mm.h | 2 +- include/linux/mman.h | 2 +- include/linux/oom.h | 2 +- mm/debug.c | 4 ++-- mm/gup.c | 10 +++++----- mm/huge_memory.c | 8 ++++---- mm/khugepaged.c | 10 +++++----- mm/ksm.c | 32 ++++++++++++++++---------------- mm/mmap.c | 8 ++++---- mm/oom_kill.c | 26 +++++++++++++------------- mm/util.c | 6 +++--- 14 files changed, 63 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 14d424830fa8..84b7eebe0d68 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -327,7 +327,7 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, * example, s390 kvm. */ return (vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags); + mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index ff6120463745..eb1946a70cff 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -2,6 +2,8 @@ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H +#include + extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; @@ -20,13 +22,13 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) + if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm)) __khugepaged_enter(mm); } static inline void khugepaged_exit(struct mm_struct *mm) { - if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + if (mm_flags_test(MMF_VM_HUGEPAGE, mm)) __khugepaged_exit(mm); } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c17b955e7b0b..22e67ca7cba3 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -56,13 +56,13 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) + if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return __ksm_enter(mm); return 0; @@ -70,7 +70,7 @@ static inline int ksm_execve(struct mm_struct *mm) static inline void ksm_exit(struct mm_struct *mm) { - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGEABLE, mm)) __ksm_exit(mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ed4a0b9dad6..34311ebe62cc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1949,7 +1949,7 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); - if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) + if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)) return false; return folio_maybe_dma_pinned(folio); diff --git a/include/linux/mman.h b/include/linux/mman.h index de9e8e6229a4..0ba8a7e8b90a 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -201,7 +201,7 @@ static inline bool arch_memory_deny_write_exec_supported(void) static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ - if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny. */ diff --git a/include/linux/oom.h b/include/linux/oom.h index 1e0fc6931ce9..7b02bc1d0a7e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -91,7 +91,7 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) */ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) { - if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) + if (unlikely(mm_flags_test(MMF_UNSTABLE, mm))) return VM_FAULT_SIGBUS; return 0; } diff --git a/mm/debug.c b/mm/debug.c index b4388f4dcd4d..64ddb0c4b4be 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,7 +182,7 @@ void dump_mm(const struct mm_struct *mm) "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" - "binfmt %px flags %lx\n" + "binfmt %px flags %*pb\n" #ifdef CONFIG_AIO "ioctx_table %px\n" #endif @@ -211,7 +211,7 @@ void dump_mm(const struct mm_struct *mm) mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, - mm->binfmt, mm->flags, + mm->binfmt, NUM_MM_FLAG_BITS, __mm_flags_get_bitmap(mm), #ifdef CONFIG_AIO mm->ioctx_table, #endif diff --git a/mm/gup.c b/mm/gup.c index adffe663594d..331d22bf7b2d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -475,10 +475,10 @@ EXPORT_SYMBOL_GPL(unpin_folios); * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ -static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) +static inline void mm_set_has_pinned_flag(struct mm_struct *mm) { - if (!test_bit(MMF_HAS_PINNED, mm_flags)) - set_bit(MMF_HAS_PINNED, mm_flags); + if (!mm_flags_test(MMF_HAS_PINNED, mm)) + mm_flags_set(MMF_HAS_PINNED, mm); } #ifdef CONFIG_MMU @@ -1693,7 +1693,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, mmap_assert_locked(mm); if (flags & FOLL_PIN) - mm_set_has_pinned_flag(&mm->flags); + mm_set_has_pinned_flag(mm); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior @@ -3210,7 +3210,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, return -EINVAL; if (gup_flags & FOLL_PIN) - mm_set_has_pinned_flag(¤t->mm->flags); + mm_set_has_pinned_flag(current->mm); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(¤t->mm->mmap_lock); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b8bb078a1a34..a2f476e7419a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -251,13 +251,13 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return huge_zero_folio; - if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) return READ_ONCE(huge_zero_folio); if (!get_huge_zero_folio()) return NULL; - if (test_and_set_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm)) put_huge_zero_folio(); return READ_ONCE(huge_zero_folio); @@ -268,7 +268,7 @@ void mm_put_huge_zero_folio(struct mm_struct *mm) if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return; - if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) put_huge_zero_folio(); } @@ -1145,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, off_sub = (off - ret) & (size - 1); - if (test_bit(MMF_TOPDOWN, ¤t->mm->flags) && !off_sub) + if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub) return ret + size; ret += off_sub; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b40bdfd224c..550eb00116c5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - test_bit(MMF_DISABLE_THP, &mm->flags); + mm_flags_test(MMF_DISABLE_THP, mm); } static bool hugepage_pmd_enabled(void) @@ -445,7 +445,7 @@ void __khugepaged_enter(struct mm_struct *mm) /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) + if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; mm_slot = mm_slot_alloc(mm_slot_cache); @@ -472,7 +472,7 @@ void __khugepaged_enter(struct mm_struct *mm) void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) @@ -497,7 +497,7 @@ void __khugepaged_exit(struct mm_struct *mm) spin_unlock(&khugepaged_mm_lock); if (free) { - clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + mm_flags_clear(MMF_VM_HUGEPAGE, mm); mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { @@ -1459,7 +1459,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) /* * Not strictly needed because the mm exited already. * - * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ diff --git a/mm/ksm.c b/mm/ksm.c index 160787bb121c..2ef29802a49b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1217,8 +1217,8 @@ mm_exiting: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); @@ -2620,8 +2620,8 @@ no_vmas: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2742,7 +2742,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma) vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) && + if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && __ksm_should_add_vma(file, vm_flags)) vm_flags |= VM_MERGEABLE; @@ -2784,16 +2784,16 @@ int ksm_enable_merge_any(struct mm_struct *mm) { int err; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; } - set_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_set(MMF_VM_MERGE_ANY, mm); ksm_add_vmas(mm); return 0; @@ -2815,7 +2815,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) { int err; - if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; err = ksm_del_vmas(mm); @@ -2824,7 +2824,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) return err; } - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); return 0; } @@ -2832,9 +2832,9 @@ int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) return 0; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } @@ -2852,7 +2852,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (!vma_ksm_compatible(vma)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; @@ -2912,7 +2912,7 @@ int __ksm_enter(struct mm_struct *mm) list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - set_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_set(MMF_VM_MERGEABLE, mm); mmgrab(mm); if (needs_wakeup) @@ -2954,8 +2954,8 @@ void __ksm_exit(struct mm_struct *mm) if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); + mm_flags_clear(MMF_VM_MERGEABLE, mm); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); diff --git a/mm/mmap.c b/mm/mmap.c index 7306253cc3b5..7a057e0e8da9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -802,7 +802,7 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { - if (test_bit(MMF_TOPDOWN, &mm->flags)) + if (mm_flags_test(MMF_TOPDOWN, mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -1284,7 +1284,7 @@ void exit_mmap(struct mm_struct *mm) * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); vma_iter_set(&vmi, vma->vm_end); @@ -1859,14 +1859,14 @@ loop_out: mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); mas_store(&vmi.mas, XA_ZERO_ENTRY); /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); } /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); } out: mmap_write_unlock(mm); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 25923cfec9c6..17650f0b516e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/oom_kill.c - * + * * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... @@ -218,7 +218,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_SKIP, &p->mm->flags) || + mm_flags_test(MMF_OOM_SKIP, p->mm) || in_vfork(p)) { task_unlock(p); return LONG_MIN; @@ -325,7 +325,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) * any memory is quite low. */ if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { - if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, task->signal->oom_mm)) goto next; goto abort; } @@ -524,7 +524,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) * should imply barriers already and the reader would hit a page fault * if it stumbled over a reaped memory. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); for_each_vma(vmi, vma) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) @@ -583,7 +583,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * under mmap_lock for reading because it serializes against the * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap(). */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { trace_skip_task_reaping(tsk->pid); goto out_unlock; } @@ -619,7 +619,7 @@ static void oom_reap_task(struct task_struct *tsk) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || - test_bit(MMF_OOM_SKIP, &mm->flags)) + mm_flags_test(MMF_OOM_SKIP, mm)) goto done; pr_info("oom_reaper: unable to reap pid:%d (%s)\n", @@ -634,7 +634,7 @@ done: * Hide this mm from OOM killer because it has been either reaped or * somebody can't call mmap_write_unlock(mm). */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); @@ -670,7 +670,7 @@ static void wake_oom_reaper(struct timer_list *timer) unsigned long flags; /* The victim managed to terminate on its own - see exit_mmap */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { put_task_struct(tsk); return; } @@ -695,7 +695,7 @@ static void wake_oom_reaper(struct timer_list *timer) static void queue_oom_reaper(struct task_struct *tsk) { /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm)) return; get_task_struct(tsk); @@ -892,7 +892,7 @@ static bool task_will_free_mem(struct task_struct *task) * This task has already been drained by the oom reaper so there are * only small chances it will free some more */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, mm)) return false; if (atomic_read(&mm->mm_users) <= 1) @@ -977,7 +977,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) continue; if (is_global_init(p)) { can_oom_reap = false; - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); @@ -1235,7 +1235,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) reap = true; else { /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + if (!mm_flags_test(MMF_OOM_SKIP, mm)) ret = -EINVAL; } task_unlock(p); @@ -1251,7 +1251,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure * possible change in exit_mmap is seen */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) + if (mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); diff --git a/mm/util.c b/mm/util.c index f814e6a59ab1..d235b74f7aff 100644 --- a/mm/util.c +++ b/mm/util.c @@ -471,17 +471,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } #endif #ifdef CONFIG_MMU -- cgit v1.2.3 From 39f8049cd49f7e88f89a33f97f996c7306e8be0b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:15 +0100 Subject: mm: update coredump logic to correctly use bitmap mm flags The coredump logic is slightly different from other users in that it both stores mm flags and additionally sets and gets using masks. Since the MMF_DUMPABLE_* flags must remain as they are for uABI reasons, and of course these are within the first 32-bits of the flags, it is reasonable to provide access to these in the same fashion so this logic can all still keep working as it has been. Therefore, introduce coredump-specific helpers __mm_flags_get_dumpable() and __mm_flags_set_mask_dumpable() for this purpose, and update all core dump users of mm flags to use these. [lorenzo.stoakes@oracle.com: abstract set_mask_bits() invocation to mm_types.h to satisfy ARC] Link: https://lkml.kernel.org/r/0e7ad263-1ff7-446d-81fe-97cff9c0e7ed@lucifer.local Link: https://lkml.kernel.org/r/2a5075f7e3c5b367d988178c79a3063d12ee53a9.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Christian Brauner Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/coredump.c | 4 +++- fs/exec.c | 2 +- fs/pidfs.c | 7 +++++-- fs/proc/base.c | 8 +++++--- include/linux/mm_types.h | 12 ++++++++++++ include/linux/sched/coredump.h | 18 +++++++++++++++++- 6 files changed, 43 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/fs/coredump.c b/fs/coredump.c index 5dce257c67fc..f9d82ffc4b88 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1103,8 +1103,10 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) * We must use the same mm->flags while dumping core to avoid * inconsistency of bit flags, since this flag is not protected * by any locks. + * + * Note that we only care about MMF_DUMP* flags. */ - .mm_flags = mm->flags, + .mm_flags = __mm_flags_get_dumpable(mm), .vma_meta = NULL, .cpu = raw_smp_processor_id(), }; diff --git a/fs/exec.c b/fs/exec.c index 2a1e5e4042a1..dbac0e84cc3e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1999,7 +1999,7 @@ void set_dumpable(struct mm_struct *mm, int value) if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; - set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); + __mm_flags_set_mask_dumpable(mm, value); } SYSCALL_DEFINE3(execve, diff --git a/fs/pidfs.c b/fs/pidfs.c index 108e7527f837..9913c5268fef 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -357,8 +357,11 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) { task_lock(task); - if (task->mm) - kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags); + if (task->mm) { + unsigned long flags = __mm_flags_get_dumpable(task->mm); + + kinfo.coredump_mask = pidfs_coredump_mask(flags); + } task_unlock(task); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 62d35631ba8c..f0c093c58aaf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2962,8 +2962,10 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, ret = 0; mm = get_task_mm(task); if (mm) { + unsigned long flags = __mm_flags_get_dumpable(mm); + len = snprintf(buffer, sizeof(buffer), "%08lx\n", - ((mm->flags & MMF_DUMP_FILTER_MASK) >> + ((flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); @@ -3002,9 +3004,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) - set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm); else - clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm); } mmput(mm); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e001dbad455..9d224075d895 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1255,6 +1255,18 @@ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) return bitmap_read(bitmap, 0, BITS_PER_LONG); } +/* + * Update the first system word of mm flags ONLY, applying the specified mask to + * it, then setting all flags specified by bits. + */ +static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, + unsigned long mask, unsigned long bits) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + + set_mask_bits(bitmap, mask, bits); +} + #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 6eb65ceed213..b7fafe999073 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,6 +8,20 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ +static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm) +{ + /* + * By convention, dumpable bits are contained in first 32 bits of the + * bitmap, so we can simply access this first unsigned long directly. + */ + return __mm_flags_get_word(mm); +} + +static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) +{ + __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value); +} + extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things @@ -22,7 +36,9 @@ static inline int __get_dumpable(unsigned long mm_flags) static inline int get_dumpable(struct mm_struct *mm) { - return __get_dumpable(mm->flags); + unsigned long flags = __mm_flags_get_dumpable(mm); + + return __get_dumpable(flags); } #endif /* _LINUX_SCHED_COREDUMP_H */ -- cgit v1.2.3 From 01f86753a05a3b971d69147d6d074c1a8d29b57d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:16 +0100 Subject: mm: correct sign-extension issue in MMF_* flag masks There is an issue with the mask declarations in linux/mm_types.h, which naively do (1 << bit) operations. Unfortunately this results in the 1 being defaulted as a signed (32-bit) integer. When the compiler expands the MMF_INIT_MASK bitmask it comes up with: (((1 << 2) - 1) | (((1 << 9) - 1) << 2) | (1 << 24) | (1 << 28) | (1 << 30) | (1 << 31)) Which overflows the signed integer to -788,527,105. Implicitly casting this to an unsigned integer results in sign-expansion, and thus this value becomes 0xffffffffd10007ff, rather than the intended 0xd10007ff. While we're limited to a maximum of 32 bits in mm->flags, this isn't an issue as the remaining bits being masked will always be zero. However, now we are moving towards having more bits in this flag, this becomes an issue. Simply resolve this by using the _BITUL() helper to cast the shifted value to an unsigned long. [lorenzo.stoakes@oracle.com: prefer BIT() to _BITUL()] Link: https://lkml.kernel.org/r/a0290c77-cd88-46d6-8d9a-073be7600d88@lucifer.local Link: https://lkml.kernel.org/r/f92194bee8c92a04fd4c9b2c14c7e65229639300.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9d224075d895..de09ae2a0de6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1767,7 +1767,7 @@ enum { * the modes are SUID_DUMP_* defined in linux/sched/coredump.h */ #define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -1782,13 +1782,13 @@ enum { #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS #define MMF_DUMP_FILTER_BITS 9 #define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) + ((BIT(MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) #define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) + (BIT(MMF_DUMP_ANON_PRIVATE) | BIT(MMF_DUMP_ANON_SHARED) | \ + BIT(MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) +# define MMF_DUMP_MASK_DEFAULT_ELF BIT(MMF_DUMP_ELF_HEADERS) #else # define MMF_DUMP_MASK_DEFAULT_ELF 0 #endif @@ -1808,7 +1808,7 @@ enum { #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) +#define MMF_DISABLE_THP_MASK BIT(MMF_DISABLE_THP) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* @@ -1821,16 +1821,15 @@ enum { #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_HAS_MDWE 28 -#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) - +#define MMF_HAS_MDWE_MASK BIT(MMF_HAS_MDWE) #define MMF_HAS_MDWE_NO_INHERIT 29 #define MMF_VM_MERGE_ANY 30 -#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) +#define MMF_VM_MERGE_ANY_MASK BIT(MMF_VM_MERGE_ANY) #define MMF_TOPDOWN 31 /* mm searches top down by default */ -#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) +#define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ -- cgit v1.2.3 From 19148a19da86f1b7d1a1b067c9f656b0f3a60fb1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:17 +0100 Subject: mm: update fork mm->flags initialisation to use bitmap We now need to account for flag initialisation on fork. We retain the existing logic as much as we can, but dub the existing flag mask legacy. These flags are therefore required to fit in the first 32-bits of the flags field. However, further flag propagation upon fork can be implemented in mm_init() on a per-flag basis. We ensure we clear the entire bitmap prior to setting it, and use __mm_flags_get_word() and __mm_flags_set_word() to manipulate these legacy fields efficiently. Link: https://lkml.kernel.org/r/9fb8954a7a0f0184f012a8e66f8565bcbab014ba.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 13 ++++++++++--- kernel/fork.c | 7 +++++-- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index de09ae2a0de6..69ce407b4343 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1831,16 +1831,23 @@ enum { #define MMF_TOPDOWN 31 /* mm searches top down by default */ #define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ +#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) -static inline unsigned long mmf_init_flags(unsigned long flags) +/* Legacy flags must fit within 32 bits. */ +static_assert((u64)MMF_INIT_LEGACY_MASK <= (u64)UINT_MAX); + +/* + * Initialise legacy flags according to masks, propagating selected flags on + * fork. Further flag manipulation can be performed by the caller. + */ +static inline unsigned long mmf_init_legacy_flags(unsigned long flags) { if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) flags &= ~((1UL << MMF_HAS_MDWE) | (1UL << MMF_HAS_MDWE_NO_INHERIT)); - return flags & MMF_INIT_MASK; + return flags & MMF_INIT_LEGACY_MASK; } #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index af673856499d..b04ecba4a709 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1057,11 +1057,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_uprobes_state(mm); hugetlb_count_init(mm); + mm_flags_clear_all(mm); if (current->mm) { - mm->flags = mmf_init_flags(current->mm->flags); + unsigned long flags = __mm_flags_get_word(current->mm); + + __mm_flags_set_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - mm->flags = default_dump_filter; + __mm_flags_set_word(mm, default_dump_filter); mm->def_flags = 0; } -- cgit v1.2.3 From 8166353fb8841390bf3ffe1b923a5ddeb348e4f7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:19 +0100 Subject: mm: replace mm->flags with bitmap entirely and set to 64 bits Now we have updated all users of mm->flags to use the bitmap accessors, repalce it with the bitmap version entirely. We are then able to move to having 64 bits of mm->flags on both 32-bit and 64-bit architectures. We also update the VMA userland tests to ensure that everything remains functional there. No functional changes intended, other than there now being 64 bits of available mm_struct flags. Link: https://lkml.kernel.org/r/e1f6654e016d36c43959764b01355736c5cbcdf8.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++------ include/linux/mm_types.h | 14 +++++--------- tools/testing/vma/vma_internal.h | 19 +++++++++++++++++-- 3 files changed, 28 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 34311ebe62cc..b61e2d4858cf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -724,32 +724,32 @@ static inline void assert_fault_locked(struct vm_fault *vmf) static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { - return test_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) { - return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) { - return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_set(int flag, struct mm_struct *mm) { - set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_clear(int flag, struct mm_struct *mm) { - clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_clear_all(struct mm_struct *mm) { - bitmap_zero(ACCESS_PRIVATE(&mm->_flags, __mm_flags), NUM_MM_FLAG_BITS); + bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS); } extern const struct vm_operations_struct vma_dummy_vm_ops; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 69ce407b4343..05475b5fd516 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -932,7 +932,7 @@ struct mm_cid { * Opaque type representing current mm_struct flag state. Must be accessed via * mm_flags_xxx() helper functions. */ -#define NUM_MM_FLAG_BITS BITS_PER_LONG +#define NUM_MM_FLAG_BITS (64) typedef struct { DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } __private mm_flags_t; @@ -1119,11 +1119,7 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; - /* Temporary union while we convert users to mm_flags_t. */ - union { - unsigned long flags; /* Must use atomic bitops to access */ - mm_flags_t _flags; /* Must use mm_flags_* helpers to access */ - }; + mm_flags_t flags; /* Must use mm_flags_* hlpers to access */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; @@ -1236,7 +1232,7 @@ struct mm_struct { /* Set the first system word of mm flags, non-atomically. */ static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); bitmap_copy(bitmap, &value, BITS_PER_LONG); } @@ -1244,7 +1240,7 @@ static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value /* Obtain a read-only view of the bitmap. */ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) { - return (const unsigned long *)ACCESS_PRIVATE(&mm->_flags, __mm_flags); + return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); } /* Read the first system word of mm flags, non-atomically. */ @@ -1262,7 +1258,7 @@ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, unsigned long mask, unsigned long bits) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); set_mask_bits(bitmap, mask, bits); } diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index cb1c2a8afe26..f13354bf0a1e 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -249,6 +249,14 @@ struct mutex {}; #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = {} +#define DECLARE_BITMAP(name, bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#define NUM_MM_FLAG_BITS (64) +typedef struct { + __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); +} mm_flags_t; + struct mm_struct { struct maple_tree mm_mt; int map_count; /* number of VMAs */ @@ -260,7 +268,7 @@ struct mm_struct { unsigned long def_flags; - unsigned long flags; /* Must use atomic bitops to access */ + mm_flags_t flags; /* Must use mm_flags_* helpers to access */ }; struct vm_area_struct; @@ -1333,6 +1341,13 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } +# define ACCESS_PRIVATE(p, member) ((p)->member) + +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + /* * Denies creating a writable executable mapping or gaining executable permissions. * @@ -1363,7 +1378,7 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ - if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + if (mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny. */ -- cgit v1.2.3 From 0f9ab62a6e44ef51ea3e7f1c552b447cf4eb20ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Aug 2025 10:30:08 +0200 Subject: mempool: rename struct mempool_s to struct mempool Drop the pointless _s prefix and align to the usual struct naming to prepare for actually using the struct instead of the typedef so that random headers don't need to include mempool.h for just having a pointer to the mempool. Link: https://lkml.kernel.org/r/20250812083105.371295-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Harry Yoo Reviewed-by: Vlastimil Babka Cc: Christoph Lameter (Ampere) Cc: David Rientjes Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/blkdev.h | 2 +- include/linux/mempool.h | 2 +- include/linux/netfs.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fe1797bbec42..28ceaeffc0c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -199,7 +199,7 @@ struct gendisk { unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_lock; - struct mempool_s *zone_wplugs_pool; + struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 7b151441341b..34941a4b9026 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -15,7 +15,7 @@ struct kmem_cache; typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data); typedef void (mempool_free_t)(void *element, void *pool_data); -typedef struct mempool_s { +typedef struct mempool { spinlock_t lock; int min_nr; /* nr of elements at *elements */ int curr_nr; /* Current nr of elements at *elements */ diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 98c96d649bf9..72ee7d210a74 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -21,7 +21,7 @@ #include enum netfs_sreq_ref_trace; -typedef struct mempool_s mempool_t; +typedef struct mempool mempool_t; struct folio_queue; /** -- cgit v1.2.3 From 85b8cec15034e07500a6e5b8a5aea8185a3d775a Mon Sep 17 00:00:00 2001 From: Chris Li Date: Tue, 12 Aug 2025 00:10:59 -0700 Subject: mm: swap.h: Remove deleted field from comments The comment for struct swap_info_struct.lock incorrectly mentions fields that have already been deleted from the structure. Update the comments to accurately reflect the current struct swap_info_struct. There is no functional change. Link: https://lkml.kernel.org/r/20250812-swap-scan-list-v3-2-6d73504d267b@kernel.org Signed-off-by: Chris Li Reviewed-by: Kairui Song Acked-by: Nhat Pham Reviewed-by: Barry Song Cc: Baoquan He Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index a060d102e0d1..c2da85cb7fe7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -320,11 +320,8 @@ struct swap_info_struct { struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like - * swap_map, lowest_bit, highest_bit, - * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc, - * highest_alloc, free/discard cluster - * list. other fields are only changed + * swap_map, inuse_pages and all cluster + * lists. other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If -- cgit v1.2.3 From ec45783fce52f358c9e8680d2837bc0d477f16ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Aug 2025 16:57:55 +0200 Subject: memcg: optimize exit to user space memcg uses TIF_NOTIFY_RESUME to handle reclaiming on exit to user space. TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is utilized by other entities as well. This results in a unconditional mem_cgroup_handle_over_high() call for every invocation of resume_user_mode_work(), which is a pointless exercise as most of the time there is no reclaim work to do. Especially since RSEQ is used by glibc, TIF_NOTIFY_RESUME is raised quite frequently and the empty calls show up in exit path profiling. Optimize this by doing a quick check of the reclaim condition before invoking it. [akpm@linux-foundation.org: remove now-unneeded test of memcg_nr_pages_over_high==0, per Shakeel] Link: https://lkml.kernel.org/r/87tt2b6zgs.ffs@tglx Signed-off-by: Thomas Gleixner Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Michal Hocko Cc: Muchun Song Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 +++++++- mm/memcontrol.c | 7 ++----- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 785173aa0739..9fa3afc90dd5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -900,7 +900,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } -void mem_cgroup_handle_over_high(gfp_t gfp_mask); +void __mem_cgroup_handle_over_high(gfp_t gfp_mask); + +static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) +{ + if (unlikely(current->memcg_nr_pages_over_high)) + __mem_cgroup_handle_over_high(gfp_mask); +} unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8dd7fbed5a94..9712a751690f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2203,7 +2203,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * try_charge() (context permitting), as well as from the userland * return path where reclaim is always able to block. */ -void mem_cgroup_handle_over_high(gfp_t gfp_mask) +void __mem_cgroup_handle_over_high(gfp_t gfp_mask) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2213,9 +2213,6 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) struct mem_cgroup *memcg; bool in_retry = false; - if (likely(!nr_pages)) - return; - memcg = get_mem_cgroup_from_mm(current->mm); current->memcg_nr_pages_over_high = 0; @@ -2486,7 +2483,7 @@ done_restock: if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && gfpflags_allow_blocking(gfp_mask)) - mem_cgroup_handle_over_high(gfp_mask); + __mem_cgroup_handle_over_high(gfp_mask); return 0; } -- cgit v1.2.3 From 9dc21bbd62edeae6f63e6f25e1edb7167452457b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:53 +0100 Subject: prctl: extend PR_SET_THP_DISABLE to optionally exclude VM_HUGEPAGE Patch series "prctl: extend PR_SET_THP_DISABLE to only provide THPs when advised", v5. This will allow individual processes to opt-out of THP = "always" into THP = "madvise", without affecting other workloads on the system. This has been extensively discussed on the mailing list and has been summarized very well by David in the first patch which also includes the links to alternatives, please refer to the first patch commit message for the motivation for this series. Patch 1 adds the PR_THP_DISABLE_EXCEPT_ADVISED flag to implement this, along with the MMF changes. Patch 2 is a cleanup patch for tva_flags that will allow the forced collapse case to be transmitted to vma_thp_disabled (which is done in patch 3). Patch 4 adds documentation for PR_SET_THP_DISABLE/PR_GET_THP_DISABLE. Patches 6-7 implement the selftests for PR_SET_THP_DISABLE for completely disabling THPs (old behaviour) and only enabling it at advise (PR_THP_DISABLE_EXCEPT_ADVISED). This patch (of 7): People want to make use of more THPs, for example, moving from the "never" system policy to "madvise", or from "madvise" to "always". While this is great news for every THP desperately waiting to get allocated out there, apparently there are some workloads that require a bit of care during that transition: individual processes may need to opt-out from this behavior for various reasons, and this should be permitted without needing to make all other workloads on the system similarly opt-out. The following scenarios are imaginable: (1) Switch from "none" system policy to "madvise"/"always", but keep THPs disabled for selected workloads. (2) Stay at "none" system policy, but enable THPs for selected workloads, making only these workloads use the "madvise" or "always" policy. (3) Switch from "madvise" system policy to "always", but keep the "madvise" policy for selected workloads: allocate THPs only when advised. (4) Stay at "madvise" system policy, but enable THPs even when not advised for selected workloads -- "always" policy. Once can emulate (2) through (1), by setting the system policy to "madvise"/"always" while disabling THPs for all processes that don't want THPs. It requires configuring all workloads, but that is a user-space problem to sort out. (4) can be emulated through (3) in a similar way. Back when (1) was relevant in the past, as people started enabling THPs, we added PR_SET_THP_DISABLE, so relevant workloads that were not ready yet (i.e., used by Redis) were able to just disable THPs completely. Redis still implements the option to use this interface to disable THPs completely. With PR_SET_THP_DISABLE, we added a way to force-disable THPs for a workload -- a process, including fork+exec'ed process hierarchy. That essentially made us support (1): simply disable THPs for all workloads that are not ready for THPs yet, while still enabling THPs system-wide. The quest for handling (3) and (4) started, but current approaches (completely new prctl, options to set other policies per process, alternatives to prctl -- mctrl, cgroup handling) don't look particularly promising. Likely, the future will use bpf or something similar to implement better policies, in particular to also make better decisions about THP sizes to use, but this will certainly take a while as that work just started. Long story short: a simple enable/disable is not really suitable for the future, so we're not willing to add completely new toggles. While we could emulate (3)+(4) through (1)+(2) by simply disabling THPs completely for these processes, this is a step backwards, because these processes can no longer allocate THPs in regions where THPs were explicitly advised: regions flagged as VM_HUGEPAGE. Apparently, that imposes a problem for relevant workloads, because "not THPs" is certainly worse than "THPs only when advised". Could we simply relax PR_SET_THP_DISABLE, to "disable THPs unless not explicitly advised by the app through MAD_HUGEPAGE"? *maybe*, but this would change the documented semantics quite a bit, and the versatility to use it for debugging purposes, so I am not 100% sure that is what we want -- although it would certainly be much easier. So instead, as an easy way forward for (3) and (4), add an option to make PR_SET_THP_DISABLE disable *less* THPs for a process. In essence, this patch: (A) Adds PR_THP_DISABLE_EXCEPT_ADVISED, to be used as a flag in arg3 of prctl(PR_SET_THP_DISABLE) when disabling THPs (arg2 != 0). prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED). (B) Makes prctl(PR_GET_THP_DISABLE) return 3 if PR_THP_DISABLE_EXCEPT_ADVISED was set while disabling. Previously, it would return 1 if THPs were disabled completely. Now it returns the set flags as well: 3 if PR_THP_DISABLE_EXCEPT_ADVISED was set. (C) Renames MMF_DISABLE_THP to MMF_DISABLE_THP_COMPLETELY, to express the semantics clearly. Fortunately, there are only two instances outside of prctl() code. (D) Adds MMF_DISABLE_THP_EXCEPT_ADVISED to express "no THP except for VMAs with VM_HUGEPAGE" -- essentially "thp=madvise" behavior Fortunately, we only have to extend vma_thp_disabled(). (E) Indicates "THP_enabled: 0" in /proc/pid/status only if THPs are disabled completely Only indicating that THPs are disabled when they are really disabled completely, not only partially. For now, we don't add another interface to obtained whether THPs are disabled partially (PR_THP_DISABLE_EXCEPT_ADVISED was set). If ever required, we could add a new entry. The documented semantics in the man page for PR_SET_THP_DISABLE "is inherited by a child created via fork(2) and is preserved across execve(2)" is maintained. This behavior, for example, allows for disabling THPs for a workload through the launching process (e.g., systemd where we fork() a helper process to then exec()). For now, MADV_COLLAPSE will *fail* in regions without VM_HUGEPAGE and VM_NOHUGEPAGE. As MADV_COLLAPSE is a clear advise that user space thinks a THP is a good idea, we'll enable that separately next (requiring a bit of cleanup first). There is currently not way to prevent that a process will not issue PR_SET_THP_DISABLE itself to re-enable THP. There are not really known users for re-enabling it, and it's against the purpose of the original interface. So if ever required, we could investigate just forbidding to re-enable them, or make this somehow configurable. Link: https://lkml.kernel.org/r/20250815135549.130506-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20250815135549.130506-2-usamaarif642@gmail.com Acked-by: Zi Yan Acked-by: Usama Arif Tested-by: Usama Arif Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Usama Arif Cc: Arnd Bergmann Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 5 ++-- fs/proc/array.c | 2 +- include/linux/huge_mm.h | 20 +++++++++---- include/linux/mm_types.h | 13 ++++----- include/uapi/linux/prctl.h | 10 +++++++ kernel/sys.c | 59 ++++++++++++++++++++++++++++++-------- mm/khugepaged.c | 2 +- 7 files changed, 82 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 2971551b7235..915a3e44bc12 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -291,8 +291,9 @@ It's slow but very precise. HugetlbPages size of hugetlb memory portions CoreDumping process's memory is currently being dumped (killing the process may lead to a corrupted core) - THP_enabled process is allowed to use THP (returns 0 when - PR_SET_THP_DISABLE is set on the process + THP_enabled process is allowed to use THP (returns 0 when + PR_SET_THP_DISABLE is set on the process to disable + THP completely, not just partially) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/array.c b/fs/proc/array.c index c286dc12325e..d84b291dd1ed 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -422,7 +422,7 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); if (thp_enabled) - thp_enabled = !mm_flags_test(MMF_DISABLE_THP, mm); + thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 84b7eebe0d68..22b8b067b295 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -318,16 +318,26 @@ struct thpsize { (transparent_hugepage_flags & \ (1<vm_mm)) + return true; /* - * Explicitly disabled through madvise or prctl, or some - * architectures may disable THP for some mappings, for - * example, s390 kvm. + * Are THPs disabled only for VMAs where we didn't get an explicit + * advise to use them? */ - return (vm_flags & VM_NOHUGEPAGE) || - mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); + if (vm_flags & VM_HUGEPAGE) + return false; + return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 05475b5fd516..d247da2fdb52 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1792,19 +1792,16 @@ enum { #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +#define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK BIT(MMF_DISABLE_THP) +#define MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */ +#define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \ + BIT(MMF_DISABLE_THP_EXCEPT_ADVISED)) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index ed3aed264aeb..150b6deebfb1 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -177,7 +177,17 @@ struct prctl_mm_map { #define PR_GET_TID_ADDRESS 40 +/* + * Flags for PR_SET_THP_DISABLE are only applicable when disabling. Bit 0 + * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively + * return "1" when no flags were specified for PR_SET_THP_DISABLE. + */ #define PR_SET_THP_DISABLE 41 +/* + * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / + * VM_HUGEPAGE). + */ +# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 /* diff --git a/kernel/sys.c b/kernel/sys.c index 605f7fe9a143..a46d9b75880b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2452,6 +2452,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len) return sizeof(mm->saved_auxv); } +static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + /* If disabled, we return "1 | flags", otherwise 0. */ + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm)) + return 1; + else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm)) + return 1 | PR_THP_DISABLE_EXCEPT_ADVISED; + return 0; +} + +static int prctl_set_thp_disable(bool thp_disable, unsigned long flags, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg4 || arg5) + return -EINVAL; + + /* Flags are only allowed when disabling. */ + if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED)) + return -EINVAL; + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + if (thp_disable) { + if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } else { + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + } else { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + mmap_write_unlock(current->mm); + return 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2625,20 +2670,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = !!mm_flags_test(MMF_DISABLE_THP, me->mm); + error = prctl_get_thp_disable(arg2, arg3, arg4, arg5); break; case PR_SET_THP_DISABLE: - if (arg3 || arg4 || arg5) - return -EINVAL; - if (mmap_write_lock_killable(me->mm)) - return -EINTR; - if (arg2) - mm_flags_set(MMF_DISABLE_THP, me->mm); - else - mm_flags_clear(MMF_DISABLE_THP, me->mm); - mmap_write_unlock(me->mm); + error = prctl_set_thp_disable(arg2, arg3, arg4, arg5); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 550eb00116c5..1a416b865997 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - mm_flags_test(MMF_DISABLE_THP, mm); + mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) -- cgit v1.2.3 From 1f1c061089dcd274befa0c76fb9f6e253a8368c0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:54 +0100 Subject: mm/huge_memory: convert "tva_flags" to "enum tva_type" When determining which THP orders are eligible for a VMA mapping, we have previously specified tva_flags, however it turns out it is really not necessary to treat these as flags. Rather, we distinguish between distinct modes. The only case where we previously combined flags was with TVA_ENFORCE_SYSFS, but we can avoid this by observing that this is the default, except for MADV_COLLAPSE or an edge cases in collapse_pte_mapped_thp() and hugepage_vma_revalidate(), and adding a mode specifically for this case - TVA_FORCED_COLLAPSE. We have: * smaps handling for showing "THPeligible" * Pagefault handling * khugepaged handling * Forced collapse handling: primarily MADV_COLLAPSE, but also for an edge case in collapse_pte_mapped_thp() Disregarding the edge cases, we only want to ignore sysfs settings only when we are forcing a collapse through MADV_COLLAPSE, otherwise we want to enforce it, hence this patch does the following flag to enum conversions: * TVA_SMAPS | TVA_ENFORCE_SYSFS -> TVA_SMAPS * TVA_IN_PF | TVA_ENFORCE_SYSFS -> TVA_PAGEFAULT * TVA_ENFORCE_SYSFS -> TVA_KHUGEPAGED * 0 -> TVA_FORCED_COLLAPSE With this change, we immediately know if we are in the forced collapse case, which will be valuable next. Link: https://lkml.kernel.org/r/20250815135549.130506-3-usamaarif642@gmail.com Signed-off-by: David Hildenbrand Signed-off-by: Usama Arif Acked-by: Usama Arif Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 ++-- include/linux/huge_mm.h | 30 ++++++++++++++++++------------ mm/huge_memory.c | 8 ++++---- mm/khugepaged.c | 17 ++++++++--------- mm/memory.c | 14 ++++++-------- 5 files changed, 38 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e8e7bef34531..ced01cf3c5ab 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1369,8 +1369,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - !!thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS, + THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 22b8b067b295..92ea0b9771fa 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -94,12 +94,15 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define THP_ORDERS_ALL \ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) -#define TVA_SMAPS (1 << 0) /* Will be used for procfs */ -#define TVA_IN_PF (1 << 1) /* Page fault handler */ -#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */ +enum tva_type { + TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */ + TVA_PAGEFAULT, /* Serving a page fault. */ + TVA_KHUGEPAGED, /* Khugepaged collapse. */ + TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */ +}; -#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ - (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) +#define thp_vma_allowable_order(vma, vm_flags, type, order) \ + (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order))) #define split_folio(f) split_folio_to_list(f, NULL) @@ -264,14 +267,14 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check * @vm_flags: use these vm_flags instead of vma->vm_flags - * @tva_flags: Which TVA flags to honour + * @type: TVA type * @orders: bitfield of all orders to consider * * Calculates the intersection of the requested hugepage orders and the allowed @@ -285,11 +288,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - /* Optimization to check if required orders are enabled early. */ - if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) { + /* + * Optimization to check if required orders are enabled early. Only + * forced collapse ignores sysfs configs. + */ + if (type != TVA_FORCED_COLLAPSE && vma_is_anonymous(vma)) { unsigned long mask = READ_ONCE(huge_anon_orders_always); if (vm_flags & VM_HUGEPAGE) @@ -303,7 +309,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } - return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); + return __thp_vma_allowable_orders(vma, vm_flags, type, orders); } struct thpsize { @@ -547,7 +553,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a2f476e7419a..899d9ac86ecd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -99,12 +99,12 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - bool smaps = tva_flags & TVA_SMAPS; - bool in_pf = tva_flags & TVA_IN_PF; - bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; + const bool smaps = type == TVA_SMAPS; + const bool in_pf = type == TVA_PAGEFAULT; + const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1a416b865997..d3d4f116e14b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -474,8 +474,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, - PMD_ORDER)) + if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } @@ -921,7 +920,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; - unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; + enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : + TVA_FORCED_COLLAPSE; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; @@ -932,7 +932,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1533,9 +1533,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's - * analogously elide sysfs THP settings here. + * analogously elide sysfs THP settings here and force collapse. */ - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2432,8 +2432,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, - TVA_ENFORCE_SYSFS, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; @@ -2767,7 +2766,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/memory.c b/mm/memory.c index 002c28795d8b..7b1e8f137fa3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4515,8 +4515,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); @@ -5063,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -6254,8 +6254,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -6289,8 +6288,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; -- cgit v1.2.3 From 8cdc4d27019356b0304308eb799484c899b62a87 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:55 +0100 Subject: mm/huge_memory: respect MADV_COLLAPSE with PR_THP_DISABLE_EXCEPT_ADVISED Let's allow for making MADV_COLLAPSE succeed on areas that neither have VM_HUGEPAGE nor VM_NOHUGEPAGE when we have THP disabled unless explicitly advised (PR_THP_DISABLE_EXCEPT_ADVISED). MADV_COLLAPSE is a clear advice that we want to collapse. Note that we still respect the VM_NOHUGEPAGE flag, just like MADV_COLLAPSE always does. So consequently, MADV_COLLAPSE is now only refused on VM_NOHUGEPAGE with PR_THP_DISABLE_EXCEPT_ADVISED, including for shmem. Link: https://lkml.kernel.org/r/20250815135549.130506-4-usamaarif642@gmail.com Co-developed-by: Usama Arif Signed-off-by: Usama Arif Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 +++++++- include/uapi/linux/prctl.h | 2 +- mm/huge_memory.c | 5 +++-- mm/memory.c | 6 ++++-- mm/shmem.c | 2 +- 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 92ea0b9771fa..1ac0d06fb3c1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -329,7 +329,7 @@ struct thpsize { * through madvise or prctl. */ static inline bool vma_thp_disabled(struct vm_area_struct *vma, - vm_flags_t vm_flags) + vm_flags_t vm_flags, bool forced_collapse) { /* Are THPs disabled for this VMA? */ if (vm_flags & VM_NOHUGEPAGE) @@ -343,6 +343,12 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, */ if (vm_flags & VM_HUGEPAGE) return false; + /* + * Forcing a collapse (e.g., madv_collapse), is a clear advice to + * use THPs. + */ + if (forced_collapse) + return false; return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 150b6deebfb1..51c4e8c82b1e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -185,7 +185,7 @@ struct prctl_mm_map { #define PR_SET_THP_DISABLE 41 /* * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / - * VM_HUGEPAGE). + * VM_HUGEPAGE, MADV_COLLAPSE). */ # define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 899d9ac86ecd..d89992b65acc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -104,7 +104,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, { const bool smaps = type == TVA_SMAPS; const bool in_pf = type == TVA_PAGEFAULT; - const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; + const bool enforce_sysfs = !forced_collapse; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ @@ -122,7 +123,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ diff --git a/mm/memory.c b/mm/memory.c index 7b1e8f137fa3..d9de6c056179 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5332,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any - * PMD mappings if THPs are disabled. + * PMD mappings if THPs are disabled. As we already have a THP, + * behave as if we are forcing a collapse. */ - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, + /* forced_collapse=*/ true)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b..d945de3a7f0e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1817,7 +1817,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; - if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, -- cgit v1.2.3 From 53fbef56e07df822ea3029109ffca25328c2e5ac Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:51 +0100 Subject: mm: introduce memdesc_flags_t Patch series "Add and use memdesc_flags_t". At some point struct page will be separated from struct slab and struct folio. This is a step towards that by introducing a type for the 'flags' word of all three structures. This gives us a certain amount of type safety by establishing that some of these unsigned longs are different from other unsigned longs in that they contain things like node ID, section number and zone number in the upper bits. That lets us have functions that can be easily called by anyone who has a slab, folio or page (but not easily by anyone else) to get the node or zone. There's going to be some unusual merge problems with this as some odd bits of the kernel decide they want to print out the flags value or something similar by writing page->flags and now they'll need to write page->flags.f instead. That's most of the churn here. Maybe we should be removing these things from the debug output? This patch (of 11): Wrap the unsigned long flags in a typedef. In upcoming patches, this will provide a strong hint that you can't just pass a random unsigned long to functions which take this as an argument. [willy@infradead.org: s/flags/flags.f/ in several architectures] Link: https://lkml.kernel.org/r/aKMgPRLD-WnkPxYm@casper.infradead.org [nicola.vetrini@gmail.com: mips: fix compilation error] Link: https://lore.kernel.org/lkml/CA+G9fYvkpmqGr6wjBNHY=dRp71PLCoi2341JxOudi60yqaeUdg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20250825214245.1838158-1-nicola.vetrini@gmail.com Link: https://lkml.kernel.org/r/20250805172307.1302730-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250805172307.1302730-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- arch/arc/mm/cache.c | 8 +++---- arch/arc/mm/tlb.c | 2 +- arch/arm/include/asm/hugetlb.h | 2 +- arch/arm/mm/copypage-v4mc.c | 2 +- arch/arm/mm/copypage-v6.c | 2 +- arch/arm/mm/copypage-xscale.c | 2 +- arch/arm/mm/dma-mapping.c | 2 +- arch/arm/mm/fault-armv.c | 2 +- arch/arm/mm/flush.c | 10 ++++----- arch/arm64/include/asm/hugetlb.h | 6 ++--- arch/arm64/include/asm/mte.h | 16 +++++++------- arch/arm64/mm/flush.c | 8 +++---- arch/csky/abiv1/cacheflush.c | 6 ++--- arch/mips/include/asm/cacheflush.h | 6 ++--- arch/nios2/mm/cacheflush.c | 6 ++--- arch/openrisc/include/asm/cacheflush.h | 2 +- arch/openrisc/mm/cache.c | 2 +- arch/parisc/kernel/cache.c | 6 ++--- arch/powerpc/include/asm/cacheflush.h | 4 ++-- arch/powerpc/include/asm/kvm_ppc.h | 4 ++-- arch/powerpc/mm/book3s64/hash_utils.c | 4 ++-- arch/powerpc/mm/pgtable.c | 12 +++++----- arch/riscv/include/asm/cacheflush.h | 4 ++-- arch/riscv/include/asm/hugetlb.h | 2 +- arch/riscv/mm/cacheflush.c | 4 ++-- arch/s390/include/asm/hugetlb.h | 2 +- arch/s390/kernel/uv.c | 12 +++++----- arch/s390/mm/gmap.c | 2 +- arch/s390/mm/hugetlbpage.c | 2 +- arch/sh/include/asm/hugetlb.h | 2 +- arch/sh/mm/cache-sh4.c | 2 +- arch/sh/mm/cache-sh7705.c | 2 +- arch/sh/mm/cache.c | 14 ++++++------ arch/sh/mm/kmap.c | 2 +- arch/sparc/mm/init_64.c | 10 ++++----- arch/x86/mm/pat/memtype.c | 6 ++--- arch/xtensa/mm/cache.c | 12 +++++----- fs/fuse/dev.c | 2 +- fs/gfs2/glops.c | 2 +- fs/jffs2/file.c | 4 ++-- fs/nilfs2/page.c | 2 +- fs/proc/page.c | 4 ++-- fs/ubifs/file.c | 6 ++--- include/linux/mm.h | 32 +++++++++++++-------------- include/linux/mm_inline.h | 12 +++++----- include/linux/mm_types.h | 8 +++++-- include/linux/mmzone.h | 2 +- include/linux/page-flags.h | 40 +++++++++++++++++----------------- include/linux/pgalloc_tag.h | 7 +++--- include/trace/events/page_ref.h | 4 ++-- mm/filemap.c | 8 +++---- mm/huge_memory.c | 4 ++-- mm/memory-failure.c | 12 +++++----- mm/mmzone.c | 4 ++-- mm/page_alloc.c | 12 +++++----- mm/swap.c | 8 +++---- mm/vmscan.c | 18 +++++++-------- mm/workingset.c | 2 +- 58 files changed, 195 insertions(+), 190 deletions(-) (limited to 'include') diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 9106ceac323c..7d2f93dc1e91 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -704,7 +704,7 @@ static inline void arc_slc_enable(void) void flush_dcache_folio(struct folio *folio) { - clear_bit(PG_dc_clean, &folio->flags); + clear_bit(PG_dc_clean, &folio->flags.f); return; } EXPORT_SYMBOL(flush_dcache_folio); @@ -889,8 +889,8 @@ void copy_user_highpage(struct page *to, struct page *from, copy_page(kto, kfrom); - clear_bit(PG_dc_clean, &dst->flags); - clear_bit(PG_dc_clean, &src->flags); + clear_bit(PG_dc_clean, &dst->flags.f); + clear_bit(PG_dc_clean, &src->flags.f); kunmap_atomic(kto); kunmap_atomic(kfrom); @@ -900,7 +900,7 @@ void clear_user_page(void *to, unsigned long u_vaddr, struct page *page) { struct folio *folio = page_folio(page); clear_page(to); - clear_bit(PG_dc_clean, &folio->flags); + clear_bit(PG_dc_clean, &folio->flags.f); } EXPORT_SYMBOL(clear_user_page); diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index cae4a7aae0ed..ed6915ba76ec 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -488,7 +488,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, */ if (vma->vm_flags & VM_EXEC) { struct folio *folio = page_folio(page); - int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags); + int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags.f); if (dirty) { unsigned long offset = offset_in_folio(folio, paddr); nr = folio_nr_pages(folio); diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index b766c4b373f6..700055b1ccb3 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -17,7 +17,7 @@ static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index 7ddd82b9fe8b..ed843bb22020 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -67,7 +67,7 @@ void v4_mc_copy_user_highpage(struct page *to, struct page *from, struct folio *src = page_folio(from); void *kto = kmap_atomic(to); - if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + if (!test_and_set_bit(PG_dcache_clean, &src->flags.f)) __flush_dcache_folio(folio_flush_mapping(src), src); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c index a1a71f36d850..0710dba5c0bf 100644 --- a/arch/arm/mm/copypage-v6.c +++ b/arch/arm/mm/copypage-v6.c @@ -73,7 +73,7 @@ static void v6_copy_user_highpage_aliasing(struct page *to, unsigned int offset = CACHE_COLOUR(vaddr); unsigned long kfrom, kto; - if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + if (!test_and_set_bit(PG_dcache_clean, &src->flags.f)) __flush_dcache_folio(folio_flush_mapping(src), src); /* FIXME: not highmem safe */ diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index f1e29d3e8193..e16af68d709f 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -87,7 +87,7 @@ void xscale_mc_copy_user_highpage(struct page *to, struct page *from, struct folio *src = page_folio(from); void *kto = kmap_atomic(to); - if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + if (!test_and_set_bit(PG_dcache_clean, &src->flags.f)) __flush_dcache_folio(folio_flush_mapping(src), src); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 88c2d68a69c9..08641a936394 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -718,7 +718,7 @@ static void __dma_page_dev_to_cpu(struct page *page, unsigned long off, if (size < sz) break; if (!offset) - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); offset = 0; size -= sz; if (!size) diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 39fd5df73317..91e488767783 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -203,7 +203,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, folio = page_folio(pfn_to_page(pfn)); mapping = folio_flush_mapping(folio); - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) __flush_dcache_folio(mapping, folio); if (mapping) { if (cache_is_vivt()) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 5219158d54cf..19470d938b23 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -304,7 +304,7 @@ void __sync_icache_dcache(pte_t pteval) else mapping = NULL; - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) __flush_dcache_folio(mapping, folio); if (pte_exec(pteval)) @@ -343,8 +343,8 @@ void flush_dcache_folio(struct folio *folio) return; if (!cache_ops_need_broadcast() && cache_is_vipt_nonaliasing()) { - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); return; } @@ -352,14 +352,14 @@ void flush_dcache_folio(struct folio *folio) if (!cache_ops_need_broadcast() && mapping && !folio_mapped(folio)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else { __flush_dcache_folio(mapping, folio); if (mapping && cache_is_vivt()) __flush_dcache_aliases(mapping, folio); else if (mapping) __flush_icache_all(); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL(flush_dcache_folio); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 2a8155c4a882..44c1f757bfcf 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -21,12 +21,12 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h); static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); #ifdef CONFIG_ARM64_MTE if (system_supports_mte()) { - clear_bit(PG_mte_tagged, &folio->flags); - clear_bit(PG_mte_lock, &folio->flags); + clear_bit(PG_mte_tagged, &folio->flags.f); + clear_bit(PG_mte_lock, &folio->flags.f); } #endif } diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 6567df8ec8ca..3b5069f4683d 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -48,12 +48,12 @@ static inline void set_page_mte_tagged(struct page *page) * before the page flags update. */ smp_wmb(); - set_bit(PG_mte_tagged, &page->flags); + set_bit(PG_mte_tagged, &page->flags.f); } static inline bool page_mte_tagged(struct page *page) { - bool ret = test_bit(PG_mte_tagged, &page->flags); + bool ret = test_bit(PG_mte_tagged, &page->flags.f); VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); @@ -82,7 +82,7 @@ static inline bool try_page_mte_tagging(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); - if (!test_and_set_bit(PG_mte_lock, &page->flags)) + if (!test_and_set_bit(PG_mte_lock, &page->flags.f)) return true; /* @@ -90,7 +90,7 @@ static inline bool try_page_mte_tagging(struct page *page) * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ - smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged)); + smp_cond_load_acquire(&page->flags.f, VAL & (1UL << PG_mte_tagged)); return false; } @@ -173,13 +173,13 @@ static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) * before the folio flags update. */ smp_wmb(); - set_bit(PG_mte_tagged, &folio->flags); + set_bit(PG_mte_tagged, &folio->flags.f); } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { - bool ret = test_bit(PG_mte_tagged, &folio->flags); + bool ret = test_bit(PG_mte_tagged, &folio->flags.f); VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); @@ -196,7 +196,7 @@ static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); - if (!test_and_set_bit(PG_mte_lock, &folio->flags)) + if (!test_and_set_bit(PG_mte_lock, &folio->flags.f)) return true; /* @@ -204,7 +204,7 @@ static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ - smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged)); + smp_cond_load_acquire(&folio->flags.f, VAL & (1UL << PG_mte_tagged)); return false; } diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 013eead9b695..fbf08b543c3f 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -53,11 +53,11 @@ void __sync_icache_dcache(pte_t pte) { struct folio *folio = page_folio(pte_page(pte)); - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { sync_icache_aliases((unsigned long)folio_address(folio), (unsigned long)folio_address(folio) + folio_size(folio)); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL_GPL(__sync_icache_dcache); @@ -69,8 +69,8 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_folio(struct folio *folio) { - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); } EXPORT_SYMBOL(flush_dcache_folio); diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 171e8fb32285..4bc0aad3cf8a 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -25,12 +25,12 @@ void flush_dcache_folio(struct folio *folio) mapping = folio_flush_mapping(folio); if (mapping && !folio_mapped(folio)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else { dcache_wbinv_all(); if (mapping) icache_inv_all(); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL(flush_dcache_folio); @@ -56,7 +56,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, return; folio = page_folio(pfn_to_page(pfn)); - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) dcache_wbinv_all(); if (folio_flush_mapping(folio)) { diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index 1f14132b3fc9..5d283ef89d90 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -37,11 +37,11 @@ #define PG_dcache_dirty PG_arch_1 #define folio_test_dcache_dirty(folio) \ - test_bit(PG_dcache_dirty, &(folio)->flags) + test_bit(PG_dcache_dirty, &(folio)->flags.f) #define folio_set_dcache_dirty(folio) \ - set_bit(PG_dcache_dirty, &(folio)->flags) + set_bit(PG_dcache_dirty, &(folio)->flags.f) #define folio_clear_dcache_dirty(folio) \ - clear_bit(PG_dcache_dirty, &(folio)->flags) + clear_bit(PG_dcache_dirty, &(folio)->flags.f) extern void (*flush_cache_all)(void); extern void (*__flush_cache_all)(void); diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c index 0ee9c5f02e08..8321182eb927 100644 --- a/arch/nios2/mm/cacheflush.c +++ b/arch/nios2/mm/cacheflush.c @@ -187,7 +187,7 @@ void flush_dcache_folio(struct folio *folio) /* Flush this page if there are aliases. */ if (mapping && !mapping_mapped(mapping)) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } else { __flush_dcache_folio(folio); if (mapping) { @@ -195,7 +195,7 @@ void flush_dcache_folio(struct folio *folio) flush_aliases(mapping, folio); flush_icache_range(start, start + folio_size(folio)); } - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL(flush_dcache_folio); @@ -227,7 +227,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, return; folio = page_folio(pfn_to_page(pfn)); - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) __flush_dcache_folio(folio); mapping = folio_flush_mapping(folio); diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index 0e60af486ec1..cd8f971c0fec 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -75,7 +75,7 @@ static inline void sync_icache_dcache(struct page *page) static inline void flush_dcache_folio(struct folio *folio) { - clear_bit(PG_dc_clean, &folio->flags); + clear_bit(PG_dc_clean, &folio->flags.f); } #define flush_dcache_folio flush_dcache_folio diff --git a/arch/openrisc/mm/cache.c b/arch/openrisc/mm/cache.c index 0f265b8e73ec..f33df46dae4e 100644 --- a/arch/openrisc/mm/cache.c +++ b/arch/openrisc/mm/cache.c @@ -83,7 +83,7 @@ void update_cache(struct vm_area_struct *vma, unsigned long address, { unsigned long pfn = pte_val(*pte) >> PAGE_SHIFT; struct folio *folio = page_folio(pfn_to_page(pfn)); - int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags); + int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags.f); /* * Since icaches do not snoop for updated data on OpenRISC, we diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 37ca484cc495..4c5240d3a3c7 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -122,10 +122,10 @@ void __update_cache(pte_t pte) pfn = folio_pfn(folio); nr = folio_nr_pages(folio); if (folio_flush_mapping(folio) && - test_bit(PG_dcache_dirty, &folio->flags)) { + test_bit(PG_dcache_dirty, &folio->flags.f)) { while (nr--) flush_kernel_dcache_page_addr(pfn_va(pfn + nr)); - clear_bit(PG_dcache_dirty, &folio->flags); + clear_bit(PG_dcache_dirty, &folio->flags.f); } else if (parisc_requires_coherency()) while (nr--) flush_kernel_dcache_page_addr(pfn_va(pfn + nr)); @@ -481,7 +481,7 @@ void flush_dcache_folio(struct folio *folio) pgoff_t pgoff; if (mapping && !mapping_mapped(mapping)) { - set_bit(PG_dcache_dirty, &folio->flags); + set_bit(PG_dcache_dirty, &folio->flags.f); return; } diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index f2656774aaa9..1fea42928f64 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -40,8 +40,8 @@ static inline void flush_dcache_folio(struct folio *folio) if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) return; /* avoid an atomic op if possible */ - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); } #define flush_dcache_folio flush_dcache_folio diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ca3829d47ab7..0953f2daa466 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -939,9 +939,9 @@ static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn) /* Clear i-cache for new pages */ folio = page_folio(pfn_to_page(pfn)); - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 4693c464fc5a..3aee3af614af 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1562,11 +1562,11 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) folio = page_folio(pte_page(pte)); /* page is dirty */ - if (!test_bit(PG_dcache_clean, &folio->flags) && + if (!test_bit(PG_dcache_clean, &folio->flags.f) && !folio_test_reserved(folio)) { if (trap == INTERRUPT_INST_STORAGE) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } else pp |= HPTE_R_N; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index dfaa9fd86f7e..56d7e8960e77 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -87,9 +87,9 @@ static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) struct folio *folio = maybe_pte_to_folio(pte); if (!folio) return pte; - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } return pte; @@ -127,13 +127,13 @@ static inline pte_t set_pte_filter(pte_t pte, unsigned long addr) return pte; /* If the page clean, we move on */ - if (test_bit(PG_dcache_clean, &folio->flags)) + if (test_bit(PG_dcache_clean, &folio->flags.f)) return pte; /* If it's an exec fault, we flush the cache and make it clean */ if (is_exec_fault()) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); return pte; } @@ -175,12 +175,12 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, goto bail; /* If the page is already clean, we move on */ - if (test_bit(PG_dcache_clean, &folio->flags)) + if (test_bit(PG_dcache_clean, &folio->flags.f)) goto bail; /* Clean the page and set PG_dcache_clean */ flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); bail: return pte_mkexec(pte); diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 6086b38d5427..0092513c3376 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -23,8 +23,8 @@ static inline void local_flush_icache_range(unsigned long start, static inline void flush_dcache_folio(struct folio *folio) { - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); } #define flush_dcache_folio flush_dcache_folio #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 446126497768..0872d43fc0c0 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -7,7 +7,7 @@ static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 4ca5aafce22e..d83a612464f6 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -101,9 +101,9 @@ void flush_icache_pte(struct mm_struct *mm, pte_t pte) { struct folio *folio = page_folio(pte_page(pte)); - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { flush_icache_mm(mm, false); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } #endif /* CONFIG_MMU */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 931fcc413598..69131736daaa 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -39,7 +39,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 47f574cd1728..93b2a01bae40 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -144,7 +144,7 @@ int uv_destroy_folio(struct folio *folio) folio_get(folio); rc = uv_destroy(folio_to_phys(folio)); if (!rc) - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); return rc; } @@ -193,7 +193,7 @@ int uv_convert_from_secure_folio(struct folio *folio) folio_get(folio); rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); return rc; } @@ -289,7 +289,7 @@ static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) expected = expected_folio_refs(folio) + 1; if (!folio_ref_freeze(folio, expected)) return -EBUSY; - set_bit(PG_arch_1, &folio->flags); + set_bit(PG_arch_1, &folio->flags.f); /* * If the UVC does not succeed or fail immediately, we don't want to * loop for long, or we might get stall notifications. @@ -483,18 +483,18 @@ int arch_make_folio_accessible(struct folio *folio) * convert_to_secure. * As secure pages are never large folios, both variants can co-exists. */ - if (!test_bit(PG_arch_1, &folio->flags)) + if (!test_bit(PG_arch_1, &folio->flags.f)) return 0; rc = uv_pin_shared(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); return 0; } rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); return 0; } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index c7defe4ed1f6..8ff6bba107e8 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2272,7 +2272,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, start = pmd_val(*pmd) & HPAGE_MASK; end = start + HPAGE_SIZE; __storage_key_init_range(start, end); - set_bit(PG_arch_1, &folio->flags); + set_bit(PG_arch_1, &folio->flags.f); cond_resched(); return 0; } diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index e88c02c9e642..72e8fa136af5 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -155,7 +155,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) paddr = rste & PMD_MASK; } - if (!test_and_set_bit(PG_arch_1, &folio->flags)) + if (!test_and_set_bit(PG_arch_1, &folio->flags.f)) __storage_key_init_range(paddr, paddr + size); } diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 4a92e6e4d627..974512f359f0 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -14,7 +14,7 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index 46393b00137e..83fb34b39ca7 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -114,7 +114,7 @@ static void sh4_flush_dcache_folio(void *arg) struct address_space *mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else #endif { diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c index b509a407588f..71f8be9fc8e0 100644 --- a/arch/sh/mm/cache-sh7705.c +++ b/arch/sh/mm/cache-sh7705.c @@ -138,7 +138,7 @@ static void sh7705_flush_dcache_folio(void *arg) struct address_space *mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else { unsigned long pfn = folio_pfn(folio); unsigned int i, nr = folio_nr_pages(folio); diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c index 6ebdeaff3021..c3f028bed049 100644 --- a/arch/sh/mm/cache.c +++ b/arch/sh/mm/cache.c @@ -64,14 +64,14 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, struct folio *folio = page_folio(page); if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && - test_bit(PG_dcache_clean, &folio->flags)) { + test_bit(PG_dcache_clean, &folio->flags.f)) { void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(vto, src, len); kunmap_coherent(vto); } else { memcpy(dst, src, len); if (boot_cpu_data.dcache.n_aliases) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } if (vma->vm_flags & VM_EXEC) @@ -85,14 +85,14 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, struct folio *folio = page_folio(page); if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && - test_bit(PG_dcache_clean, &folio->flags)) { + test_bit(PG_dcache_clean, &folio->flags.f)) { void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(dst, vfrom, len); kunmap_coherent(vfrom); } else { memcpy(dst, src, len); if (boot_cpu_data.dcache.n_aliases) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } } @@ -105,7 +105,7 @@ void copy_user_highpage(struct page *to, struct page *from, vto = kmap_atomic(to); if (boot_cpu_data.dcache.n_aliases && folio_mapped(src) && - test_bit(PG_dcache_clean, &src->flags)) { + test_bit(PG_dcache_clean, &src->flags.f)) { vfrom = kmap_coherent(from, vaddr); copy_page(vto, vfrom); kunmap_coherent(vfrom); @@ -148,7 +148,7 @@ void __update_cache(struct vm_area_struct *vma, if (pfn_valid(pfn)) { struct folio *folio = page_folio(pfn_to_page(pfn)); - int dirty = !test_and_set_bit(PG_dcache_clean, &folio->flags); + int dirty = !test_and_set_bit(PG_dcache_clean, &folio->flags.f); if (dirty) __flush_purge_region(folio_address(folio), folio_size(folio)); @@ -162,7 +162,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr) if (pages_do_alias(addr, vmaddr)) { if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && - test_bit(PG_dcache_clean, &folio->flags)) { + test_bit(PG_dcache_clean, &folio->flags.f)) { void *kaddr; kaddr = kmap_coherent(page, vmaddr); diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c index fa50e8f6e7a9..c9f32d5a54b8 100644 --- a/arch/sh/mm/kmap.c +++ b/arch/sh/mm/kmap.c @@ -31,7 +31,7 @@ void *kmap_coherent(struct page *page, unsigned long addr) enum fixed_addresses idx; unsigned long vaddr; - BUG_ON(!test_bit(PG_dcache_clean, &folio->flags)); + BUG_ON(!test_bit(PG_dcache_clean, &folio->flags.f)); preempt_disable(); pagefault_disable(); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 7ed58bf3aaca..df9f7c444c39 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -224,7 +224,7 @@ inline void flush_dcache_folio_impl(struct folio *folio) ((1UL<flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) + (((folio)->flags.f >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) static inline void set_dcache_dirty(struct folio *folio, int this_cpu) { @@ -243,7 +243,7 @@ static inline void set_dcache_dirty(struct folio *folio, int this_cpu) "bne,pn %%xcc, 1b\n\t" " nop" : /* no outputs */ - : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags) + : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags.f) : "g1", "g7"); } @@ -265,7 +265,7 @@ static inline void clear_dcache_dirty_cpu(struct folio *folio, unsigned long cpu " nop\n" "2:" : /* no outputs */ - : "r" (cpu), "r" (mask), "r" (&folio->flags), + : "r" (cpu), "r" (mask), "r" (&folio->flags.f), "i" (PG_dcache_cpu_mask), "i" (PG_dcache_cpu_shift) : "g1", "g7"); @@ -292,7 +292,7 @@ static void flush_dcache(unsigned long pfn) struct folio *folio = page_folio(page); unsigned long pg_flags; - pg_flags = folio->flags; + pg_flags = folio->flags.f; if (pg_flags & (1UL << PG_dcache_dirty)) { int cpu = ((pg_flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask); @@ -480,7 +480,7 @@ void flush_dcache_folio(struct folio *folio) mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) { - bool dirty = test_bit(PG_dcache_dirty, &folio->flags); + bool dirty = test_bit(PG_dcache_dirty, &folio->flags.f); if (dirty) { int dirty_cpu = dcache_dirty_cpu(folio); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index c09284302dd3..b68200a0e0c6 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -126,7 +126,7 @@ __setup("debugpat", pat_debug_setup); static inline enum page_cache_mode get_page_memtype(struct page *pg) { - unsigned long pg_flags = pg->flags & _PGMT_MASK; + unsigned long pg_flags = pg->flags.f & _PGMT_MASK; if (pg_flags == _PGMT_WB) return _PAGE_CACHE_MODE_WB; @@ -161,10 +161,10 @@ static inline void set_page_memtype(struct page *pg, break; } - old_flags = READ_ONCE(pg->flags); + old_flags = READ_ONCE(pg->flags.f); do { new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; - } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&pg->flags.f, &old_flags, new_flags)); } #else static inline enum page_cache_mode get_page_memtype(struct page *pg) diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index 23be0e7516ce..5354df52d61f 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -134,8 +134,8 @@ void flush_dcache_folio(struct folio *folio) */ if (mapping && !mapping_mapped(mapping)) { - if (!test_bit(PG_arch_1, &folio->flags)) - set_bit(PG_arch_1, &folio->flags); + if (!test_bit(PG_arch_1, &folio->flags.f)) + set_bit(PG_arch_1, &folio->flags.f); return; } else { @@ -232,7 +232,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, #if (DCACHE_WAY_SIZE > PAGE_SIZE) - if (!folio_test_reserved(folio) && test_bit(PG_arch_1, &folio->flags)) { + if (!folio_test_reserved(folio) && test_bit(PG_arch_1, &folio->flags.f)) { unsigned long phys = folio_pfn(folio) * PAGE_SIZE; unsigned long tmp; @@ -247,10 +247,10 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, } preempt_enable(); - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); } #else - if (!folio_test_reserved(folio) && !test_bit(PG_arch_1, &folio->flags) + if (!folio_test_reserved(folio) && !test_bit(PG_arch_1, &folio->flags.f) && (vma->vm_flags & VM_EXEC) != 0) { for (i = 0; i < nr; i++) { void *paddr = kmap_local_folio(folio, i * PAGE_SIZE); @@ -258,7 +258,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, __invalidate_icache_page((unsigned long)paddr); kunmap_local(paddr); } - set_bit(PG_arch_1, &folio->flags); + set_bit(PG_arch_1, &folio->flags.f); } #endif } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e80cd8f2c049..8a89f0aa1d4d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -935,7 +935,7 @@ static int fuse_check_folio(struct folio *folio) { if (folio_mapped(folio) || folio->mapping != NULL || - (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & + (folio->flags.f & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_lru | diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index fe0faad4892f..0c0a80b3baca 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -40,7 +40,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page " "state 0x%lx\n", bh, (unsigned long long)bh->b_blocknr, bh->b_state, - bh->b_folio->mapping, bh->b_folio->flags); + bh->b_folio->mapping, bh->b_folio->flags.f); fs_err(sdp, "AIL glock %u:%llu mapping %p\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gfs2_glock2aspace(gl)); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index dd3dff95cb24..b697f3c259ef 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -230,7 +230,7 @@ static int jffs2_write_begin(const struct kiocb *iocb, goto release_sem; } } - jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags); + jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags.f); release_sem: mutex_unlock(&c->alloc_sem); @@ -259,7 +259,7 @@ static int jffs2_write_end(const struct kiocb *iocb, jffs2_dbg(1, "%s(): ino #%lu, page at 0x%llx, range %d-%d, flags %lx\n", __func__, inode->i_ino, folio_pos(folio), - start, end, folio->flags); + start, end, folio->flags.f); /* We need to avoid deadlock with page_cache_read() in jffs2_garbage_collect_pass(). So the folio must be diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 806b056d2260..56c4da417b6a 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -167,7 +167,7 @@ void nilfs_folio_bug(struct folio *folio) printk(KERN_CRIT "NILFS_FOLIO_BUG(%p): cnt=%d index#=%llu flags=0x%lx " "mapping=%p ino=%lu\n", folio, folio_ref_count(folio), - (unsigned long long)folio->index, folio->flags, m, ino); + (unsigned long long)folio->index, folio->flags.f, m, ino); head = folio_buffers(folio); if (head) { diff --git a/fs/proc/page.c b/fs/proc/page.c index ba3568e97fd1..771e0b6bc630 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -163,7 +163,7 @@ u64 stable_page_flags(const struct page *page) snapshot_page(&ps, page); folio = &ps.folio_snapshot; - k = folio->flags; + k = folio->flags.f; mapping = (unsigned long)folio->mapping; is_anon = mapping & FOLIO_MAPPING_ANON; @@ -238,7 +238,7 @@ u64 stable_page_flags(const struct page *page) if (u & (1 << KPF_HUGE)) u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); else - u |= kpf_copy_bit(ps.page_snapshot.flags, KPF_HWPOISON, PG_hwpoison); + u |= kpf_copy_bit(ps.page_snapshot.flags.f, KPF_HWPOISON, PG_hwpoison); #endif u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e75a6cec67be..ca41ce8208c4 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -107,7 +107,7 @@ static int do_readpage(struct folio *folio) size_t offset = 0; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, folio->index, i_size, folio->flags); + inode->i_ino, folio->index, i_size, folio->flags.f); ubifs_assert(c, !folio_test_checked(folio)); ubifs_assert(c, !folio->private); @@ -600,7 +600,7 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, pgoff_t end_index; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, folio->index, i_size, folio->flags); + inode->i_ino, folio->index, i_size, folio->flags.f); end_index = (i_size - 1) >> PAGE_SHIFT; if (!i_size || folio->index > end_index) { @@ -988,7 +988,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc) int err, len = folio_size(folio); dbg_gen("ino %lu, pg %lu, pg flags %#lx", - inode->i_ino, folio->index, folio->flags); + inode->i_ino, folio->index, folio->flags.f); ubifs_assert(c, folio->private != NULL); /* Is the folio fully outside @i_size? (truncate in progress) */ diff --git a/include/linux/mm.h b/include/linux/mm.h index b61e2d4858cf..da562f23f50c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1024,7 +1024,7 @@ static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; - if (!test_bit(PG_head, &folio->flags)) + if (!test_bit(PG_head, &folio->flags.f)) return 0; return folio_large_order(folio); } @@ -1554,7 +1554,7 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) */ static inline int page_zone_id(struct page *page) { - return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; + return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -1562,7 +1562,7 @@ int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { - return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; + return (PF_POISONED_CHECK(page)->flags.f >> NODES_PGSHIFT) & NODES_MASK; } #endif @@ -1637,14 +1637,14 @@ static inline void page_cpupid_reset_last(struct page *page) #else static inline int folio_last_cpupid(struct folio *folio) { - return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; + return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { - page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; + page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ @@ -1740,7 +1740,7 @@ static inline u8 page_kasan_tag(const struct page *page) u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { - tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; } @@ -1755,12 +1755,12 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag) return; tag ^= 0xff; - old_flags = READ_ONCE(page->flags); + old_flags = READ_ONCE(page->flags.f); do { flags = old_flags; flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; - } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) @@ -1804,13 +1804,13 @@ static inline pg_data_t *folio_pgdat(const struct folio *folio) #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { - page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); - page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; + page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } static inline unsigned long page_to_section(const struct page *page) { - return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; + return (page->flags.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif @@ -2015,14 +2015,14 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio) static inline void set_page_zone(struct page *page, enum zone_type zone) { - page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); - page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; + page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT); + page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { - page->flags &= ~(NODES_MASK << NODES_PGSHIFT); - page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; + page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT); + page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, @@ -2064,7 +2064,7 @@ static inline long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; - if (!test_bit(PG_head, &folio->flags)) + if (!test_bit(PG_head, &folio->flags.f)) return 1; return folio_large_nr_pages(folio); } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 89b518ff097e..150302b4a905 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -143,7 +143,7 @@ static inline int lru_tier_from_refs(int refs, bool workingset) static inline int folio_lru_refs(struct folio *folio) { - unsigned long flags = READ_ONCE(folio->flags); + unsigned long flags = READ_ONCE(folio->flags.f); if (!(flags & BIT(PG_referenced))) return 0; @@ -156,7 +156,7 @@ static inline int folio_lru_refs(struct folio *folio) static inline int folio_lru_gen(struct folio *folio) { - unsigned long flags = READ_ONCE(folio->flags); + unsigned long flags = READ_ONCE(folio->flags.f); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } @@ -268,7 +268,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); + set_mask_bits(&folio->flags.f, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ @@ -293,7 +293,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; - flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); + flags = set_mask_bits(&folio->flags.f, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); @@ -304,9 +304,9 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, static inline void folio_migrate_refs(struct folio *new, struct folio *old) { - unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; + unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK; - set_mask_bits(&new->flags, LRU_REFS_MASK, refs); + set_mask_bits(&new->flags.f, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d247da2fdb52..d934a3a5b443 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -34,6 +34,10 @@ struct address_space; struct futex_private_hash; struct mem_cgroup; +typedef struct { + unsigned long f; +} memdesc_flags_t; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -72,7 +76,7 @@ struct mem_cgroup; #endif struct page { - unsigned long flags; /* Atomic flags, some possibly + memdesc_flags_t flags; /* Atomic flags, some possibly * updated asynchronously */ /* * Five words (20/40 bytes) are available in this union. @@ -383,7 +387,7 @@ struct folio { union { struct { /* public: */ - unsigned long flags; + memdesc_flags_t flags; union { struct list_head lru; /* private: avoid cluttering the output */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9d3ea9085556..990560cd99ee 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1186,7 +1186,7 @@ static inline bool zone_is_empty(struct zone *zone) static inline enum zone_type page_zonenum(const struct page *page) { ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; + return (page->flags.f >> ZONES_PGSHIFT) & ZONES_MASK; } static inline enum zone_type folio_zonenum(const struct folio *folio) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8d3fa3a91ce4..d53a86e68c89 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -217,7 +217,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && - test_bit(PG_head, &page->flags)) { + test_bit(PG_head, &page->flags.f)) { /* * We can safely access the field of the @page[1] with PG_head * because the @page is a compound page composed with at least @@ -325,14 +325,14 @@ static __always_inline int PageTail(const struct page *page) static __always_inline int PageCompound(const struct page *page) { - return test_bit(PG_head, &page->flags) || + return test_bit(PG_head, &page->flags.f) || READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { - return READ_ONCE(page->flags) == PAGE_POISON_PATTERN; + return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM @@ -349,8 +349,8 @@ static const unsigned long *const_folio_flags(const struct folio *folio, const struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); - VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); - return &page[n].flags; + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); + return &page[n].flags.f; } static unsigned long *folio_flags(struct folio *folio, unsigned n) @@ -358,8 +358,8 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); - VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); - return &page[n].flags; + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); + return &page[n].flags.f; } /* @@ -449,37 +449,37 @@ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(const struct page *page) \ -{ return test_bit(PG_##lname, &policy(page, 0)->flags); } +{ return test_bit(PG_##lname, &policy(page, 0)->flags.f); } #define SETPAGEFLAG(uname, lname, policy) \ FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ -{ set_bit(PG_##lname, &policy(page, 1)->flags); } +{ set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define CLEARPAGEFLAG(uname, lname, policy) \ FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ -{ clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __SETPAGEFLAG(uname, lname, policy) \ __FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ -{ __set_bit(PG_##lname, &policy(page, 1)->flags); } +{ __set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __CLEARPAGEFLAG(uname, lname, policy) \ __FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ -{ __clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ __clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTSETFLAG(uname, lname, policy) \ FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ -{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTCLEARFLAG(uname, lname, policy) \ FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ -{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ @@ -846,7 +846,7 @@ static __always_inline bool folio_test_head(const struct folio *folio) static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); - return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); + return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); } __SETPAGEFLAG(Head, head, PF_ANY) @@ -1170,28 +1170,28 @@ static __always_inline int PageAnonExclusive(const struct page *page) */ if (PageHuge(page)) page = compound_head(page); - return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } #ifdef CONFIG_MMU @@ -1241,7 +1241,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ static inline int folio_has_private(const struct folio *folio) { - return !!(folio->flags & PAGE_FLAGS_PRIVATE); + return !!(folio->flags.f & PAGE_FLAGS_PRIVATE); } #undef PF_ANY diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 8a7f4f802c57..38a82d65e58e 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -107,7 +107,8 @@ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, if (static_key_enabled(&mem_profiling_compressed)) { pgalloc_tag_idx idx; - idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask; + idx = (page->flags.f >> alloc_tag_ref_offs) & + alloc_tag_ref_mask; idx_to_ref(idx, ref); handle->page = page; } else { @@ -149,11 +150,11 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code idx = (unsigned long)ref_to_idx(ref); idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs; do { - old_flags = READ_ONCE(page->flags); + old_flags = READ_ONCE(page->flags.f); flags = old_flags; flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs); flags |= idx; - } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); } else { if (WARN_ON(!handle.ref || !ref)) return; diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h index fe33a255b7d0..ea6b5c4baf3d 100644 --- a/include/trace/events/page_ref.h +++ b/include/trace/events/page_ref.h @@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template, TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->flags = page->flags; + __entry->flags = page->flags.f; __entry->count = page_ref_count(page); __entry->mapcount = atomic_read(&page->_mapcount); __entry->mapping = page->mapping; @@ -77,7 +77,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template, TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->flags = page->flags; + __entry->flags = page->flags.f; __entry->count = page_ref_count(page); __entry->mapcount = atomic_read(&page->_mapcount); __entry->mapping = page->mapping; diff --git a/mm/filemap.c b/mm/filemap.c index 1a388b11cfa9..f3a6c24897f4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1140,10 +1140,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { - if (test_bit(key->bit_nr, &key->folio->flags)) + if (test_bit(key->bit_nr, &key->folio->flags.f)) return -1; if (flags & WQ_FLAG_CUSTOM) { - if (test_and_set_bit(key->bit_nr, &key->folio->flags)) + if (test_and_set_bit(key->bit_nr, &key->folio->flags.f)) return -1; flags |= WQ_FLAG_DONE; } @@ -1226,9 +1226,9 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(bit_nr, &folio->flags)) + if (test_and_set_bit(bit_nr, &folio->flags.f)) return false; - } else if (test_bit(bit_nr, &folio->flags)) + } else if (test_bit(bit_nr, &folio->flags.f)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d89992b65acc..aac5f0a2cb54 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3303,8 +3303,8 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * unreferenced sub-pages of an anonymous THP: we can simply drop * PG_anon_exclusive (-> PG_mappedtodisk) for these here. */ - new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - new_folio->flags |= (folio->flags & + new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; + new_folio->flags.f |= (folio->flags.f & ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_swapcache) | diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fc30ca4804bf..c15ffee7d32b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1707,10 +1707,10 @@ static int identify_page_state(unsigned long pfn, struct page *p, * carried out only if the first check can't determine the page status. */ for (ps = error_states;; ps++) - if ((p->flags & ps->mask) == ps->res) + if ((p->flags.f & ps->mask) == ps->res) break; - page_flags |= (p->flags & (1UL << PG_dirty)); + page_flags |= (p->flags.f & (1UL << PG_dirty)); if (!ps->mask) for (ps = error_states;; ps++) @@ -2137,7 +2137,7 @@ retry: return action_result(pfn, MF_MSG_FREE_HUGE, res); } - page_flags = folio->flags; + page_flags = folio->flags.f; if (!hwpoison_user_mappings(folio, p, pfn, flags)) { folio_unlock(folio); @@ -2398,7 +2398,7 @@ try_again: * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page * status correctly, we save a copy of the page flags at this time. */ - page_flags = folio->flags; + page_flags = folio->flags.f; /* * __munlock_folio() may clear a writeback folio's LRU flag without @@ -2744,13 +2744,13 @@ static int soft_offline_in_use_page(struct page *page) putback_movable_pages(&pagelist); pr_info("%#lx: %s migration failed %ld, type %pGp\n", - pfn, msg_page[huge], ret, &page->flags); + pfn, msg_page[huge], ret, &page->flags.f); if (ret > 0) ret = -EBUSY; } } else { pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n", - pfn, msg_page[huge], page_count(page), &page->flags); + pfn, msg_page[huge], page_count(page), &page->flags.f); ret = -EBUSY; } return ret; diff --git a/mm/mmzone.c b/mm/mmzone.c index f9baa8882fbf..0c8f181d9d50 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -99,14 +99,14 @@ int folio_xchg_last_cpupid(struct folio *folio, int cpupid) unsigned long old_flags, flags; int last_cpupid; - old_flags = READ_ONCE(folio->flags); + old_flags = READ_ONCE(folio->flags.f); do { flags = old_flags; last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&folio->flags.f, &old_flags, flags))); return last_cpupid; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2ee21e46f0fb..ca9e6b9633f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -950,7 +950,7 @@ static inline void __free_one_page(struct page *page, bool to_tail; VM_BUG_ON(!zone_is_initialized(zone)); - VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); @@ -1043,7 +1043,7 @@ static inline bool page_expected_state(struct page *page, page->memcg_data | #endif page_pool_page_is_pp(page) | - (page->flags & check_flags))) + (page->flags.f & check_flags))) return false; return true; @@ -1059,7 +1059,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & flags)) { + if (unlikely(page->flags.f & flags)) { if (flags == PAGE_FLAGS_CHECK_AT_PREP) bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; else @@ -1358,7 +1358,7 @@ __always_inline bool free_pages_prepare(struct page *page, int i; if (compound) { - page[1].flags &= ~PAGE_FLAGS_SECOND; + page[1].flags.f &= ~PAGE_FLAGS_SECOND; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 0; #endif @@ -1372,7 +1372,7 @@ __always_inline bool free_pages_prepare(struct page *page, continue; } } - (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; } } if (folio_test_anon(folio)) { @@ -1391,7 +1391,7 @@ __always_inline bool free_pages_prepare(struct page *page, } page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); page_table_check_free(page, order); pgalloc_tag_sub(page, 1 << order); diff --git a/mm/swap.c b/mm/swap.c index cb164f9ef9e3..6dd22a904b37 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -387,14 +387,14 @@ static void __lru_cache_activate_folio(struct folio *folio) static void lru_gen_inc_refs(struct folio *folio) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return; } @@ -406,7 +406,7 @@ static void lru_gen_inc_refs(struct folio *folio) } new_flags = old_flags + BIT(LRU_REFS_PGOFF); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) @@ -418,7 +418,7 @@ static bool lru_gen_clear_refs(struct folio *folio) if (gen < 0) return true; - set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0); lrugen = &folio_lruvec(folio)->lrugen; /* whether can do without shuffling under the LRU lock */ diff --git a/mm/vmscan.c b/mm/vmscan.c index b9a1cfeb2ddf..e336577c4454 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -888,11 +888,11 @@ static bool lru_gen_set_refs(struct folio *folio) { /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return false; } - set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); return true; } #else @@ -3257,13 +3257,13 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) /* promote pages accessed through page tables */ static int folio_update_gen(struct folio *folio, int gen) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return -1; } @@ -3274,7 +3274,7 @@ static int folio_update_gen(struct folio *folio, int gen) new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } @@ -3285,7 +3285,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); @@ -3302,7 +3302,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai /* for folio_end_writeback() */ if (reclaiming) new_flags |= BIT(PG_reclaim); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); @@ -4553,7 +4553,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); @@ -4766,7 +4766,7 @@ retry: /* don't add rejected folios to the oldest generation */ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) - set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); diff --git a/mm/workingset.c b/mm/workingset.c index 6e7f4cb1b9a7..68a76a91111f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -318,7 +318,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else - set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } -- cgit v1.2.3 From 56d578c1300f7efe9605b75714173dd3fda16fe2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:52 +0100 Subject: mm: convert page_to_section() to memdesc_section() Pass in the memdesc_flags_t instead of a pointer to the page. This will allow us to remove a few conversions to struct page in upcoming patches. Link: https://lkml.kernel.org/r/20250805172307.1302730-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/asm-generic/memory_model.h | 2 +- include/linux/mm.h | 4 ++-- mm/sparse.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 74d0077cc5fa..efa6610acbc7 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -53,7 +53,7 @@ static inline int pfn_valid(unsigned long pfn) */ #define __page_to_pfn(pg) \ ({ const struct page *__pg = (pg); \ - int __sec = page_to_section(__pg); \ + int __sec = memdesc_section(__pg->flags); \ (unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec))); \ }) diff --git a/include/linux/mm.h b/include/linux/mm.h index da562f23f50c..82617c4cfa24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1808,9 +1808,9 @@ static inline void set_page_section(struct page *page, unsigned long section) page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } -static inline unsigned long page_to_section(const struct page *page) +static inline unsigned long memdesc_section(memdesc_flags_t mdf) { - return (page->flags.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; + return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif diff --git a/mm/sparse.c b/mm/sparse.c index e6075b622407..7cb42cbfc7f9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -45,7 +45,7 @@ static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; int page_to_nid(const struct page *page) { - return section_to_node_table[page_to_section(page)]; + return section_to_node_table[memdesc_section(page->flags)]; } EXPORT_SYMBOL(page_to_nid); -- cgit v1.2.3 From eb00fdd84ddabd6948d26595bb5e8c1302220d37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:53 +0100 Subject: mm: introduce memdesc_nid() Remove a conversion from folio to page by passing the folio->flags (which are a copy of the page->flags) to the new memdesc_nid() function. Link: https://lkml.kernel.org/r/20250805172307.1302730-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mm.h | 21 +++++++++++++-------- mm/sparse.c | 6 +++--- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 82617c4cfa24..00c8a54127d3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1558,17 +1558,22 @@ static inline int page_zone_id(struct page *page) } #ifdef NODE_NOT_IN_PAGE_FLAGS -int page_to_nid(const struct page *page); +int memdesc_nid(memdesc_flags_t mdf); #else -static inline int page_to_nid(const struct page *page) +static inline int memdesc_nid(memdesc_flags_t mdf) { - return (PF_POISONED_CHECK(page)->flags.f >> NODES_PGSHIFT) & NODES_MASK; + return (mdf.f >> NODES_PGSHIFT) & NODES_MASK; } #endif +static inline int page_to_nid(const struct page *page) +{ + return memdesc_nid(PF_POISONED_CHECK(page)->flags); +} + static inline int folio_nid(const struct folio *folio) { - return page_to_nid(&folio->page); + return memdesc_nid(folio->flags); } #ifdef CONFIG_NUMA_BALANCING @@ -1791,14 +1796,14 @@ static inline pg_data_t *page_pgdat(const struct page *page) return NODE_DATA(page_to_nid(page)); } -static inline struct zone *folio_zone(const struct folio *folio) +static inline pg_data_t *folio_pgdat(const struct folio *folio) { - return page_zone(&folio->page); + return NODE_DATA(folio_nid(folio)); } -static inline pg_data_t *folio_pgdat(const struct folio *folio) +static inline struct zone *folio_zone(const struct folio *folio) { - return page_pgdat(&folio->page); + return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)]; } #ifdef SECTION_IN_PAGE_FLAGS diff --git a/mm/sparse.c b/mm/sparse.c index 7cb42cbfc7f9..17c50a6415c2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -43,11 +43,11 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; #endif -int page_to_nid(const struct page *page) +int memdesc_nid(memdesc_flags_t mdf) { - return section_to_node_table[memdesc_section(page->flags)]; + return section_to_node_table[memdesc_section(mdf)]; } -EXPORT_SYMBOL(page_to_nid); +EXPORT_SYMBOL(memdesc_nid); static void set_section_nid(unsigned long section_nr, int nid) { -- cgit v1.2.3 From 4aff03fbe508780394039053bebfc4f4800b286e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:54 +0100 Subject: mm: introduce memdesc_zonenum() Remove a conversion from folio to page by passing the folio->flags (which are a copy of the page->flags) to the new memdesc_zonenum() function. Link: https://lkml.kernel.org/r/20250805172307.1302730-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 990560cd99ee..80a3b6642603 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1183,15 +1183,20 @@ static inline bool zone_is_empty(struct zone *zone) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) +static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags) +{ + ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT); + return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK; +} + static inline enum zone_type page_zonenum(const struct page *page) { - ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags.f >> ZONES_PGSHIFT) & ZONES_MASK; + return memdesc_zonenum(page->flags); } static inline enum zone_type folio_zonenum(const struct folio *folio) { - return page_zonenum(&folio->page); + return memdesc_zonenum(folio->flags); } #ifdef CONFIG_ZONE_DEVICE -- cgit v1.2.3 From 89ef6ad6fa849b780b5a5caae9068261603e1738 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:57 +0100 Subject: mm: introduce memdesc_is_zone_device() Remove the conversion from folio to page in folio_is_zone_device() by introducing memdesc_is_zone_device() which takes a memdesc_flags_t from either a page or a folio. Link: https://lkml.kernel.org/r/20250805172307.1302730-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 80a3b6642603..fe13ad175fed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1200,14 +1200,14 @@ static inline enum zone_type folio_zonenum(const struct folio *folio) } #ifdef CONFIG_ZONE_DEVICE -static inline bool is_zone_device_page(const struct page *page) +static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { - return page_zonenum(page) == ZONE_DEVICE; + return memdesc_zonenum(mdf) == ZONE_DEVICE; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { - VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page); + VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page); return page_folio(page)->pgmap; } @@ -1222,9 +1222,9 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page) static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { - if (is_zone_device_page(a) != is_zone_device_page(b)) + if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags)) return false; - if (!is_zone_device_page(a)) + if (!memdesc_is_zone_device(a->flags)) return true; return page_pgmap(a) == page_pgmap(b); } @@ -1232,7 +1232,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else -static inline bool is_zone_device_page(const struct page *page) +static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return false; } @@ -1247,9 +1247,14 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page) } #endif +static inline bool is_zone_device_page(const struct page *page) +{ + return memdesc_is_zone_device(page->flags); +} + static inline bool folio_is_zone_device(const struct folio *folio) { - return is_zone_device_page(&folio->page); + return memdesc_is_zone_device(folio->flags); } static inline bool is_zone_movable_page(const struct page *page) -- cgit v1.2.3 From 7cfe9cafb6adebc13a246bebafcd69cd37add4e6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:58 +0100 Subject: mm: reimplement folio_is_device_private() For callers of folio_is_device_private(), we save a folio->page->folio conversion. Callers of is_device_private_page() simply move the conversion of page->folio from the implementation of page_pgmap() to is_device_private_page(). Link: https://lkml.kernel.org/r/20250805172307.1302730-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memremap.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 4aa151914eab..5d18cb7a70e5 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -157,16 +157,17 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) return 1 << pgmap->vmemmap_shift; } -static inline bool is_device_private_page(const struct page *page) +static inline bool folio_is_device_private(const struct folio *folio) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && - is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE; + folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_PRIVATE; } -static inline bool folio_is_device_private(const struct folio *folio) +static inline bool is_device_private_page(const struct page *page) { - return is_device_private_page(&folio->page); + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + folio_is_device_private(page_folio(page)); } static inline bool is_pci_p2pdma_page(const struct page *page) -- cgit v1.2.3 From bd0dbbb3fd902c7eea7eb166d91bda4530a8de96 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:59 +0100 Subject: mm: reimplement folio_is_device_coherent() For callers of folio_is_device_coherent(), we save a folio->page->folio conversion. Callers of is_device_coherent_page() simply move the conversion of page->folio from the implementation of page_pgmap() to is_device_coherent_page(). Link: https://lkml.kernel.org/r/20250805172307.1302730-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memremap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 5d18cb7a70e5..06d29794abe6 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -177,15 +177,15 @@ static inline bool is_pci_p2pdma_page(const struct page *page) page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; } -static inline bool is_device_coherent_page(const struct page *page) +static inline bool folio_is_device_coherent(const struct folio *folio) { - return is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_COHERENT; + return folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_COHERENT; } -static inline bool folio_is_device_coherent(const struct folio *folio) +static inline bool is_device_coherent_page(const struct page *page) { - return is_device_coherent_page(&folio->page); + return folio_is_device_coherent(page_folio(page)); } static inline bool is_fsdax_page(const struct page *page) -- cgit v1.2.3 From c995ac3aa3747ec2a0373e5f319a22e0cb31d613 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:23:00 +0100 Subject: mm: reimplement folio_is_fsdax() For callers of folio_is_fsdax(), we save a folio->page->folio conversion. Callers of is_fsdax_page() simply move the conversion of page->folio from the implementation of page_pgmap() to is_fsdax_page(). Link: https://lkml.kernel.org/r/20250805172307.1302730-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memremap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 06d29794abe6..450d4bb6835c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -188,15 +188,15 @@ static inline bool is_device_coherent_page(const struct page *page) return folio_is_device_coherent(page_folio(page)); } -static inline bool is_fsdax_page(const struct page *page) +static inline bool folio_is_fsdax(const struct folio *folio) { - return is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX; + return folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_FS_DAX; } -static inline bool folio_is_fsdax(const struct folio *folio) +static inline bool is_fsdax_page(const struct page *page) { - return is_fsdax_page(&folio->page); + return folio_is_fsdax(page_folio(page)); } #ifdef CONFIG_ZONE_DEVICE -- cgit v1.2.3 From 88df6ab2f34b60837ebdab64b2514f356d5ebb65 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:23:01 +0100 Subject: mm: add folio_is_pci_p2pdma() Reimplement is_pci_p2pdma_page() in terms of folio_is_pci_p2pdma(). Moves the page_folio() call from inside page_pgmap() to is_pci_p2pdma_page(). This removes a page_folio() call from try_grab_folio() which already has a folio and can pass it in. Link: https://lkml.kernel.org/r/20250805172307.1302730-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Shakeel Butt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memremap.h | 10 ++++++++-- mm/gup.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 450d4bb6835c..aa1b6aa877a0 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -170,11 +170,17 @@ static inline bool is_device_private_page(const struct page *page) folio_is_device_private(page_folio(page)); } +static inline bool folio_is_pci_p2pdma(const struct folio *folio) +{ + return IS_ENABLED(CONFIG_PCI_P2PDMA) && + folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && - is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; + folio_is_pci_p2pdma(page_folio(page)); } static inline bool folio_is_device_coherent(const struct folio *folio) diff --git a/mm/gup.c b/mm/gup.c index 331d22bf7b2d..b2a78f029127 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -148,7 +148,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio))) return -EREMOTEIO; if (flags & FOLL_GET) -- cgit v1.2.3 From 7bebb41b96b5a898134b757fda520b7b990a91fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Aug 2025 08:10:10 +0200 Subject: mm: remove write_cache_pages No users left. Link: https://lkml.kernel.org/r/20250818061017.1526853-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Kent Overstreet Cc: Konstantin Komarov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/writeback.h | 6 ------ mm/page-writeback.c | 30 ------------------------------ 2 files changed, 36 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a2848d731a46..2a7e134d03ee 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -360,12 +360,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb); struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error); -typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, - void *data); - -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e248d1c3969..7e1e798e7213 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2590,36 +2590,6 @@ done: } EXPORT_SYMBOL_GPL(writeback_iter); -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * Return: %0 on success, negative error code otherwise - * - * Note: please use writeback_iter() instead. - */ -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) -{ - struct folio *folio = NULL; - int error; - - while ((folio = writeback_iter(mapping, wbc, folio, &error))) { - error = writepage(folio, wbc, data); - if (error == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - error = 0; - } - } - - return error; -} -EXPORT_SYMBOL(write_cache_pages); - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; -- cgit v1.2.3 From 0cd01c4a5cc140efb9fc203dd05ffccf3c2197d0 Mon Sep 17 00:00:00 2001 From: gaoxiang17 Date: Thu, 21 Aug 2025 06:38:55 +0800 Subject: mm/cma: add 'available count' and 'total count' to trace_cma_alloc_start This makes cma info more intuitive during debugging. Show up in the trace as: 279.814717: cma_alloc_start: name=reserved request_count=4 available_count=8096 total_count=8192 align=0 309.790580: cma_alloc_start: name=reserved request_count=4 available_count=8092 total_count=8192 align=0 317.046609: cma_alloc_start: name=reserved request_count=4 available_count=8088 total_count=8192 align=0 Link: https://lkml.kernel.org/r/8a79284879c529f467478552825154b018076e95.1755729178.git.gaoxiang17@xiaomi.com Signed-off-by: gaoxiang17 Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/cma.h | 19 +++++++++++++------ mm/cma.c | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 383c09f583ac..37195edf2498 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -38,25 +38,32 @@ TRACE_EVENT(cma_release, TRACE_EVENT(cma_alloc_start, - TP_PROTO(const char *name, unsigned long count, unsigned int align), + TP_PROTO(const char *name, unsigned long request_count, unsigned long available_count, + unsigned long total_count, unsigned int align), - TP_ARGS(name, count, align), + TP_ARGS(name, request_count, available_count, total_count, align), TP_STRUCT__entry( __string(name, name) - __field(unsigned long, count) + __field(unsigned long, request_count) + __field(unsigned long, available_count) + __field(unsigned long, total_count) __field(unsigned int, align) ), TP_fast_assign( __assign_str(name); - __entry->count = count; + __entry->request_count = request_count; + __entry->available_count = available_count; + __entry->total_count = total_count; __entry->align = align; ), - TP_printk("name=%s count=%lu align=%u", + TP_printk("name=%s request_count=%lu available_count=%lu total_count=%lu align=%u", __get_str(name), - __entry->count, + __entry->request_count, + __entry->available_count, + __entry->total_count, __entry->align) ); diff --git a/mm/cma.c b/mm/cma.c index 2ffa4befb99a..e56ec64d0567 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -864,7 +864,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, if (!count) return page; - trace_cma_alloc_start(name, count, align); + trace_cma_alloc_start(name, count, cma->available_count, cma->count, align); for (r = 0; r < cma->nranges; r++) { page = NULL; -- cgit v1.2.3 From dfd04add595b97758c8ad1ee970554b7af5c57dd Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Mon, 25 Aug 2025 09:59:26 -0300 Subject: kmem/tracing: add kmem name to kmem_cache_alloc tracepoint The kmem_cache_free tracepoint includes a "name" field, which allows for easy identification and filtering of specific kmem's. However, the kmem_cache_alloc tracepoint lacks this field, making it difficult to pair corresponding alloc and free events for analysis. Add the "name" field to kmem_cache_alloc to enable consistent tracking and correlation of kmem alloc and free events. Link: https://lkml.kernel.org/r/20250825125927.59816-1-wander@redhat.com Signed-off-by: Wander Lairson Costa Cc: David Hildenbrand Cc: David Rientjes Cc: Martin Liu Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 474358773abe..7f93e754da5c 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -22,6 +22,7 @@ TRACE_EVENT(kmem_cache_alloc, TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) + __string( name, s->name ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) @@ -32,6 +33,7 @@ TRACE_EVENT(kmem_cache_alloc, TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; + __assign_str(name); __entry->bytes_req = s->object_size; __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; @@ -41,9 +43,10 @@ TRACE_EVENT(kmem_cache_alloc, (s->flags & SLAB_ACCOUNT)) : false; ), - TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", + TP_printk("call_site=%pS ptr=%p name=%s bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", (void *)__entry->call_site, __entry->ptr, + __get_str(name), __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), -- cgit v1.2.3 From ef49b7b39d50b9e4f9d63e64f5d8acafe3c71158 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Aug 2025 15:13:44 +0000 Subject: maple_tree: fix MAPLE_PARENT_RANGE32 and parent pointer docs MAPLE_PARENT_RANGE32 should be 0x02 as a 32 bit node is indicated by the bit pattern 0b010 which is the hex value 0x02. There are no users currently, so there is no associated bug with this wrong value. Fix typo Note -> Node and replace x with b to indicate binary values. Link: https://lkml.kernel.org/r/20250826151344.403286-1-sidhartha.kumar@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 16 ++++++++-------- lib/maple_tree.c | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index bafe143b1f78..41e633264e51 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -57,17 +57,17 @@ * MT_FLAGS_ALLOC_RANGE flag. * * Node types: - * 0x??1 = Root - * 0x?00 = 16 bit nodes - * 0x010 = 32 bit nodes - * 0x110 = 64 bit nodes + * 0b??1 = Root + * 0b?00 = 16 bit nodes + * 0b010 = 32 bit nodes + * 0b110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location - * 0x??1 : Root - * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 - * 0x010 : 32 bit values, type in 0-2, slot in 3-6 - * 0x110 : 64 bit values, type in 0-2, slot in 3-6 + * 0b??1 : Root + * 0b?00 : 16 bit values, type in 0-1, slot in 2-6 + * 0b010 : 32 bit values, type in 0-2, slot in 3-6 + * 0b110 : 64 bit values, type in 0-2, slot in 3-6 */ /* diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b4ee2d29d7a9..c57a4615bdff 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -405,11 +405,11 @@ static __always_inline bool mt_is_alloc(struct maple_tree *mt) * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. * - * Note types: - * 0x??1 = Root - * 0x?00 = 16 bit nodes - * 0x010 = 32 bit nodes - * 0x110 = 64 bit nodes + * Node types: + * 0b??1 = Root + * 0b?00 = 16 bit nodes + * 0b010 = 32 bit nodes + * 0b110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root @@ -427,7 +427,7 @@ static __always_inline bool mt_is_alloc(struct maple_tree *mt) #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 -#define MAPLE_PARENT_RANGE32 0x04 +#define MAPLE_PARENT_RANGE32 0x02 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* -- cgit v1.2.3 From cf1dec76ba8a00b20e51d205f3c9f5c45bc96df2 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Aug 2025 14:55:35 -0700 Subject: mm/filemap: add AS_KERNEL_FILE Patch series "introduce kernel file mapped folios", v4. Btrfs currently tracks its metadata pages in the page cache, using a fake inode (fs_info->btree_inode) with offsets corresponding to where the metadata is stored in the filesystem's full logical address space. A consequence of this is that when btrfs uses filemap_add_folio(), this usage is charged to the cgroup of whichever task happens to be running at the time. These folios don't belong to any particular user cgroup, so I don't think it makes much sense for them to be charged in that way. Some negative consequences as a result: - A task can be holding some important btrfs locks, then need to lookup some metadata and go into reclaim, extending the duration it holds that lock for, and unfairly pushing its own reclaim pain onto other cgroups. - If that cgroup goes into reclaim, it might reclaim these folios a different non-reclaiming cgroup might need soon. This is naturally offset by LRU reclaim, but still. We have two options for how to manage such file pages: 1. charge them to the root cgroup. 2. don't charge them to any cgroup at all. 2. breaks the invariant that every mapped page has a cgroup. This is workable, but unnecessarily risky. Therefore, go with 1. A very similar proposal to use the root cgroup was previously made by Qu, where he eventually proposed the idea of setting it per address_space. This makes good sense for the btrfs use case, as the behavior should apply to all use of the address_space, not select allocations. I.e., if someone adds another filemap_add_folio() call using btrfs's btree_inode, we would almost certainly want to account that to the root cgroup as well. This patch (of 3): Add the flag AS_KERNEL_FILE to the address_space to indicate that this mapping's memory is exempt from the usual memcg accounting. [boris@bur.io: fix CONFIG_MEMCG build for AS_KERNEL_FILE] Link: https://lkml.kernel.org/r/6de59ddeec81b5c294d337c001ba0061631d4ec6.1755816635.git.boris@bur.io Link: https://lore.kernel.org/linux-mm/b5fef5372ae454a7b6da4f2f75c427aeab6a07d6.1727498749.git.wqu@suse.com/ Link: https://lkml.kernel.org/r/f09c4e2c90351d4cb30a1969f7a863b9238bd291.1755812945.git.boris@bur.io Signed-off-by: Boris Burkov Suggested-by: Qu Wenruo Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 ++ include/linux/pagemap.h | 2 ++ mm/filemap.c | 6 ++++++ 3 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9fa3afc90dd5..e693978b2022 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1059,6 +1059,8 @@ extern int mem_cgroup_init(void); #define MEM_CGROUP_ID_SHIFT 0 +#define root_mem_cgroup (NULL) + static inline struct mem_cgroup *folio_memcg(struct folio *folio) { return NULL; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 12a12dae727d..f0dfdfb13cd9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -211,6 +211,8 @@ enum mapping_flags { folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, + AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't + account usage to user cgroups */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, diff --git a/mm/filemap.c b/mm/filemap.c index 6e954156bb77..92ea20356f22 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -960,8 +960,14 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, { void *shadow = NULL; int ret; + struct mem_cgroup *tmp; + bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags); + if (kernel_file) + tmp = set_active_memcg(root_mem_cgroup); ret = mem_cgroup_charge(folio, NULL, gfp); + if (kernel_file) + set_active_memcg(tmp); if (ret) return ret; -- cgit v1.2.3 From e3a9ac4e866ea746475b4026819a8c08ec1142e6 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Aug 2025 14:55:36 -0700 Subject: mm: add vmstat for kernel_file pages Kernel file pages are tricky to track because they are indistinguishable from files whose usage is accounted to the root cgroup. To maintain good accounting, introduce a vmstat counter tracking kernel file pages. Confirmed that these work as expected at a high level by mounting a btrfs using AS_KERNEL_FILE for metadata pages, and seeing the counter rise with fs usage then go back to a minimal level after drop_caches and finally down to 0 after unmounting the fs. Link: https://lkml.kernel.org/r/08ff633e3a005ed5f7691bfd9f58a5df8e474339.1755812945.git.boris@bur.io Signed-off-by: Boris Burkov Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Tested-by: syzbot@syzkaller.appspotmail.com Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qu Wenruo Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + mm/filemap.c | 7 +++++++ mm/vmstat.c | 1 + 3 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fe13ad175fed..f3272ef5131b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -259,6 +259,7 @@ enum node_stat_item { NR_HUGETLB, #endif NR_BALLOON_PAGES, + NR_KERNEL_FILE_PAGES, NR_VM_NODE_STAT_ITEMS }; diff --git a/mm/filemap.c b/mm/filemap.c index 92ea20356f22..cd9387b0a5b5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -190,6 +190,9 @@ static void filemap_unaccount_folio(struct address_space *mapping, __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } + if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, -nr); /* * At this point folio must be either written or cleaned by @@ -989,6 +992,10 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); + if (kernel_file) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, + folio_nr_pages(folio)); } return ret; } diff --git a/mm/vmstat.c b/mm/vmstat.c index e74f0b2a1021..e522decf6a72 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1290,6 +1290,7 @@ const char * const vmstat_text[] = { [I(NR_HUGETLB)] = "nr_hugetlb", #endif [I(NR_BALLOON_PAGES)] = "nr_balloon_pages", + [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages", #undef I /* system-wide enum vm_stat_item counters */ -- cgit v1.2.3 From 98c94f1035fc0c82ab008854a165df2c20c0cb6a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 27 Aug 2025 07:01:05 +0000 Subject: mm/pageblock-flags: remove PB_migratetype_bits/PB_migrate_end enum pageblock_bits defines the meaning of pageblock bits. Currently PB_migratetype_bits says the lowest 3 bits represents migratetype and PB_migrate_end/MIGRATETYPE_MASK's definition rely on it with magical computation. Remove the definition of PB_migratetype_bits/PB_migrate_end. Use PB_migrate_[0|1|2] to represent lowest bits for migratetype. Then we can simplify related definition. Also, MIGRATETYPE_AND_ISO_MASK is MIGRATETYPE_MASK add isolation bit. Use MIGRATETYPE_MASK in the definition of MIGRATETYPE_AND_ISO_MASK looks cleaner. No functional change intended. Link: https://lkml.kernel.org/r/20250827070105.16864-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Zi Yan Cc: Vlastimil Babka Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 12 +++++------- mm/page_alloc.c | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 6a44be0f39f4..e046278a01fa 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -13,12 +13,11 @@ #include -#define PB_migratetype_bits 3 /* Bit indices that affect a whole block of pages */ enum pageblock_bits { - PB_migrate, - PB_migrate_end = PB_migrate + PB_migratetype_bits - 1, - /* 3 bits required for migrate types */ + PB_migrate_0, + PB_migrate_1, + PB_migrate_2, PB_compact_skip,/* If set the block is skipped by compaction */ #ifdef CONFIG_MEMORY_ISOLATION @@ -37,11 +36,10 @@ enum pageblock_bits { #define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS)) -#define MIGRATETYPE_MASK ((1UL << (PB_migrate_end + 1)) - 1) +#define MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2)) #ifdef CONFIG_MEMORY_ISOLATION -#define MIGRATETYPE_AND_ISO_MASK \ - (((1UL << (PB_migrate_end + 1)) - 1) | BIT(PB_migrate_isolate)) +#define MIGRATETYPE_AND_ISO_MASK (MIGRATETYPE_MASK | BIT(PB_migrate_isolate)) #else #define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c524b80a3b72..2df6ee6998ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -355,7 +355,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit) { - return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS; + return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS; } static __always_inline void @@ -370,7 +370,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn, #else BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); #endif - BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits)); + BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK); VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); bitmap = get_pageblock_bitmap(page, pfn); -- cgit v1.2.3 From 09a616cbb371e6b843e536f00e38d6b43d796ac4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:32 -0700 Subject: mm/damon/core: add damon_ctx->addr_unit Patch series "mm/damon: support ARM32 with LPAE", v3. Previously, DAMON's physical address space monitoring only supported memory ranges below 4GB on LPAE-enabled systems. This was due to the use of 'unsigned long' in 'struct damon_addr_range', which is 32-bit on ARM32 even with LPAE enabled[1]. To add DAMON support for ARM32 with LPAE enabled, a new core layer parameter called 'addr_unit' was introduced[2]. Operations set layer can translate a core layer address to the real address by multiplying the parameter value to the core layer address. Support of the parameter is up to each operations layer implementation, though. For example, operations set implementations for virtual address space can simply ignore the parameter. Add the support on paddr, which is the DAMON operations set implementation for the physical address space, as we have a clear use case for that. This patch (of 11): In some cases, some of the real address that handled by the underlying operations set cannot be handled by DAMON since it uses only 'unsinged long' as the address type. Using DAMON for physical address space monitoring of 32 bit ARM devices with large physical address extension (LPAE) is one example[1]. Add a parameter name 'addr_unit' to core layer to help such cases. DAMON core API callers can set it as the scale factor that will be used by the operations set for translating the core layer's addresses to the real address by multiplying the parameter value to the core layer address. Support of the parameter is up to each operations set layer. The support from the physical address space operations set (paddr) will be added with following commits. Link: https://lkml.kernel.org/r/20250828171242.59810-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250828171242.59810-2-sj@kernel.org Link: https://lore.kernel.org/20250408075553.959388-1-zuoze1@huawei.com [1] Link: https://lore.kernel.org/all/20250416042551.158131-1-sj@kernel.org/ [2] Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index d01bfee80bd6..6fa52f7495d9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -746,7 +746,7 @@ struct damon_attrs { * Accesses to other fields must be protected by themselves. * * @ops: Set of monitoring operations for given use cases. - * + * @addr_unit: Scale factor for core to ops address conversion. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ @@ -788,6 +788,7 @@ struct damon_ctx { struct mutex kdamond_lock; struct damon_operations ops; + unsigned long addr_unit; struct list_head adaptive_targets; struct list_head schemes; diff --git a/mm/damon/core.c b/mm/damon/core.c index 52ecc3a4426f..e098389a2f73 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -544,6 +544,8 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; + ctx->addr_unit = 1; + INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -1245,6 +1247,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) return err; } dst->ops = src->ops; + dst->addr_unit = src->addr_unit; return 0; } -- cgit v1.2.3 From d8f867fa0825fb3e358457566d7326d8aab2406a Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Thu, 28 Aug 2025 10:12:42 -0700 Subject: mm/damon: add damon_ctx->min_sz_region Adopting addr_unit would make DAMON_MINREGION 'addr_unit * 4096' bytes and cause data alignment issues[1]. Add damon_ctx->min_sz_region to change DAMON_MIN_REGION from a global macro value to per-context variable. Link: https://lkml.kernel.org/r/20250828171242.59810-12-sj@kernel.org Link: https://lore.kernel.org/all/527714dd-0e33-43ab-bbbd-d89670ba79e7@huawei.com [1] Signed-off-by: Quanmin Yan Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++- mm/damon/core.c | 67 +++++++++++++++++++++++++------------------- mm/damon/sysfs.c | 8 ++++-- mm/damon/tests/core-kunit.h | 21 ++++++++------ mm/damon/tests/vaddr-kunit.h | 2 +- mm/damon/vaddr.c | 2 +- 6 files changed, 61 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 6fa52f7495d9..ec8716292c09 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -747,6 +747,7 @@ struct damon_attrs { * * @ops: Set of monitoring operations for given use cases. * @addr_unit: Scale factor for core to ops address conversion. + * @min_sz_region: Minimum region size. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ @@ -789,6 +790,7 @@ struct damon_ctx { struct damon_operations ops; unsigned long addr_unit; + unsigned long min_sz_region; struct list_head adaptive_targets; struct list_head schemes; @@ -877,7 +879,7 @@ static inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges); + unsigned int nr_ranges, unsigned long min_sz_region); void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); diff --git a/mm/damon/core.c b/mm/damon/core.c index e098389a2f73..12f136a121a2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -201,6 +201,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * @t: the given target. * @ranges: array of new monitoring target ranges. * @nr_ranges: length of @ranges. + * @min_sz_region: minimum region size. * * This function adds new regions to, or modify existing regions of a * monitoring target to fit in specific ranges. @@ -208,7 +209,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * Return: 0 if success, or negative error code otherwise. */ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges) + unsigned int nr_ranges, unsigned long min_sz_region) { struct damon_region *r, *next; unsigned int i; @@ -245,16 +246,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, /* no region intersects with this range */ newr = damon_new_region( ALIGN_DOWN(range->start, - DAMON_MIN_REGION), - ALIGN(range->end, DAMON_MIN_REGION)); + min_sz_region), + ALIGN(range->end, min_sz_region)); if (!newr) return -ENOMEM; damon_insert_region(newr, damon_prev_region(r), r, t); } else { /* resize intersecting regions to fit in this range */ first->ar.start = ALIGN_DOWN(range->start, - DAMON_MIN_REGION); - last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + min_sz_region); + last->ar.end = ALIGN(range->end, min_sz_region); /* fill possible holes in the range */ err = damon_fill_regions_holes(first, last, t); @@ -545,6 +546,7 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.max_nr_regions = 1000; ctx->addr_unit = 1; + ctx->min_sz_region = DAMON_MIN_REGION; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -1127,8 +1129,8 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx) * * If @src has no region, @dst keeps current regions. */ -static int damon_commit_target_regions( - struct damon_target *dst, struct damon_target *src) +static int damon_commit_target_regions(struct damon_target *dst, + struct damon_target *src, unsigned long src_min_sz_region) { struct damon_region *src_region; struct damon_addr_range *ranges; @@ -1145,18 +1147,19 @@ static int damon_commit_target_regions( i = 0; damon_for_each_region(src_region, src) ranges[i++] = src_region->ar; - err = damon_set_regions(dst, ranges, i); + err = damon_set_regions(dst, ranges, i, src_min_sz_region); kfree(ranges); return err; } static int damon_commit_target( struct damon_target *dst, bool dst_has_pid, - struct damon_target *src, bool src_has_pid) + struct damon_target *src, bool src_has_pid, + unsigned long src_min_sz_region) { int err; - err = damon_commit_target_regions(dst, src); + err = damon_commit_target_regions(dst, src, src_min_sz_region); if (err) return err; if (dst_has_pid) @@ -1178,7 +1181,8 @@ static int damon_commit_targets( if (src_target) { err = damon_commit_target( dst_target, damon_target_has_pid(dst), - src_target, damon_target_has_pid(src)); + src_target, damon_target_has_pid(src), + src->min_sz_region); if (err) return err; } else { @@ -1201,7 +1205,8 @@ static int damon_commit_targets( if (!new_target) return -ENOMEM; err = damon_commit_target(new_target, false, - src_target, damon_target_has_pid(src)); + src_target, damon_target_has_pid(src), + src->min_sz_region); if (err) { damon_destroy_target(new_target, NULL); return err; @@ -1248,6 +1253,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) } dst->ops = src->ops; dst->addr_unit = src->addr_unit; + dst->min_sz_region = src->min_sz_region; return 0; } @@ -1280,8 +1286,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) if (ctx->attrs.min_nr_regions) sz /= ctx->attrs.min_nr_regions; - if (sz < DAMON_MIN_REGION) - sz = DAMON_MIN_REGION; + if (sz < ctx->min_sz_region) + sz = ctx->min_sz_region; return sz; } @@ -1625,6 +1631,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * @t: The target of the region. * @rp: The pointer to the region. * @s: The scheme to be applied. + * @min_sz_region: minimum region size. * * If a quota of a scheme has exceeded in a quota charge window, the scheme's * action would applied to only a part of the target access pattern fulfilling @@ -1642,7 +1649,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * Return: true if the region should be entirely skipped, false otherwise. */ static bool damos_skip_charged_region(struct damon_target *t, - struct damon_region **rp, struct damos *s) + struct damon_region **rp, struct damos *s, unsigned long min_sz_region) { struct damon_region *r = *rp; struct damos_quota *quota = &s->quota; @@ -1664,11 +1671,11 @@ static bool damos_skip_charged_region(struct damon_target *t, if (quota->charge_addr_from && r->ar.start < quota->charge_addr_from) { sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); + r->ar.start, min_sz_region); if (!sz_to_skip) { - if (damon_sz_region(r) <= DAMON_MIN_REGION) + if (damon_sz_region(r) <= min_sz_region) return true; - sz_to_skip = DAMON_MIN_REGION; + sz_to_skip = min_sz_region; } damon_split_region_at(t, r, sz_to_skip); r = damon_next_region(r); @@ -1693,7 +1700,8 @@ static void damos_update_stat(struct damos *s, } static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos_filter *filter) + struct damon_region *r, struct damos_filter *filter, + unsigned long min_sz_region) { bool matched = false; struct damon_target *ti; @@ -1710,8 +1718,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, matched = target_idx == filter->target_idx; break; case DAMOS_FILTER_TYPE_ADDR: - start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION); - end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION); + start = ALIGN_DOWN(filter->addr_range.start, min_sz_region); + end = ALIGN_DOWN(filter->addr_range.end, min_sz_region); /* inside the range */ if (start <= r->ar.start && r->ar.end <= end) { @@ -1747,7 +1755,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, s->core_filters_allowed = false; damos_for_each_filter(filter, s) { - if (damos_filter_match(ctx, t, r, filter)) { + if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) { if (filter->allow) s->core_filters_allowed = true; return !filter->allow; @@ -1882,7 +1890,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); + c->min_sz_region); if (!sz) goto update_stat; damon_split_region_at(t, r, sz); @@ -1930,7 +1938,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - if (damos_skip_charged_region(t, &r, s)) + if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) continue; if (!damos_valid_target(c, t, r, s)) @@ -2324,7 +2332,8 @@ static void damon_split_region_at(struct damon_target *t, } /* Split every region in the given target into 'nr_subs' regions */ -static void damon_split_regions_of(struct damon_target *t, int nr_subs) +static void damon_split_regions_of(struct damon_target *t, int nr_subs, + unsigned long min_sz_region) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -2334,13 +2343,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs) sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && - sz_region > 2 * DAMON_MIN_REGION; i++) { + sz_region > 2 * min_sz_region; i++) { /* * Randomly select size of left sub-region to be at * least 10 percent and at most 90% of original region */ sz_sub = ALIGN_DOWN(damon_rand(1, 10) * - sz_region / 10, DAMON_MIN_REGION); + sz_region / 10, min_sz_region); /* Do not allow blank region */ if (sz_sub == 0 || sz_sub >= sz_region) continue; @@ -2380,7 +2389,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(t, nr_subregions); + damon_split_regions_of(t, nr_subregions, ctx->min_sz_region); last_nr_regions = nr_regions; } @@ -2769,7 +2778,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1); + return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION); } /* diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 98bf15d403b2..0ed404c89f80 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1329,7 +1329,8 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, } static int damon_sysfs_set_regions(struct damon_target *t, - struct damon_sysfs_regions *sysfs_regions) + struct damon_sysfs_regions *sysfs_regions, + unsigned long min_sz_region) { struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN); @@ -1351,7 +1352,7 @@ static int damon_sysfs_set_regions(struct damon_target *t, if (ranges[i - 1].end > ranges[i].start) goto out; } - err = damon_set_regions(t, ranges, sysfs_regions->nr); + err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region); out: kfree(ranges); return err; @@ -1372,7 +1373,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, /* caller will destroy targets */ return -EINVAL; } - return damon_sysfs_set_regions(t, sys_target->regions); + return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region); } static int damon_sysfs_add_targets(struct damon_ctx *ctx, @@ -1430,6 +1431,7 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, if (err) return err; ctx->addr_unit = sys_ctx->addr_unit; + ctx->min_sz_region = max(DAMON_MIN_REGION / sys_ctx->addr_unit, 1); err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 5f5dc9db2e90..51369e35298b 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -230,14 +230,14 @@ static void damon_test_split_regions_of(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); - damon_split_regions_of(t, 2); + damon_split_regions_of(t, 2, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); - damon_split_regions_of(t, 4); + damon_split_regions_of(t, 4, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); damon_destroy_ctx(c); @@ -303,7 +303,7 @@ static void damon_test_set_regions(struct kunit *test) damon_add_region(r1, t); damon_add_region(r2, t); - damon_set_regions(t, &range, 1); + damon_set_regions(t, &range, 1, DAMON_MIN_REGION); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); damon_for_each_region(r, t) { @@ -450,25 +450,29 @@ static void damos_test_filter_out(struct kunit *test) damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 2; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ r->ar.start = DAMON_MIN_REGION * 6; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); @@ -481,7 +485,8 @@ static void damos_test_filter_out(struct kunit *test) /* region started in the range */ r->ar.start = DAMON_MIN_REGION * 2; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index d2b37ccf2cc0..fce38dd53cf8 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -141,7 +141,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, damon_add_region(r, t); } - damon_set_regions(t, three_regions, 3); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 66ef9869eafe..8c048f9b129e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; - damon_set_regions(t, three_regions, 3); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); } } -- cgit v1.2.3 From 1e332f303ae93ba4d38b480b1bb5a08f833306f6 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 28 Aug 2025 15:03:11 +0200 Subject: pagevec.h: add `const` to pointer parameters of getter functions For improved const-correctness. Link: https://lkml.kernel.org/r/20250828130311.772993-1-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: SeongJae Park Reviewed-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 5d3a0cccc6bf..63be5a451627 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -51,12 +51,12 @@ static inline void folio_batch_reinit(struct folio_batch *fbatch) fbatch->i = 0; } -static inline unsigned int folio_batch_count(struct folio_batch *fbatch) +static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) { return fbatch->nr; } -static inline unsigned int folio_batch_space(struct folio_batch *fbatch) +static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) { return PAGEVEC_SIZE - fbatch->nr; } -- cgit v1.2.3 From 39b44c8c73312ac535ffdf7c8ecd37ea07d4ef86 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 28 Aug 2025 10:48:20 +0200 Subject: huge_mm.h: disallow is_huge_zero_folio(NULL) Calling is_huge_zero_folio(NULL) should not be legal - it makes no sense, and a different (theoretical) implementation may dereference the pointer. But currently, lacking any explicit documentation, this call is possible. But if somebody really passes NULL, the function should not return true - this isn't the huge zero folio after all! However, if the `huge_zero_folio` hasn't been allocated yet, it's NULL, and is_huge_zero_folio(NULL) just happens to return true, which is a lie. This weird side effect prevented me from reproducing a kernel crash that occurred when the elements of a folio_batch were NULL - since folios_put_refs() skips huge zero folios, this sometimes causes a crash, but sometimes does not. For debugging, it is better to reveal such bugs reliably and not hide them behind random preconditions like "has the huge zero folio already been created?" To improve detection of such bugs, David Hildenbrand suggested adding a VM_WARN_ON_ONCE(). Link: https://lkml.kernel.org/r/20250828084820.570118-1-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Dev Jain Cc: Kairui Song Cc: Kemeng Shi Cc: Liam Howlett Cc: Mariano Pache Cc: Nhat Pham Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1ac0d06fb3c1..29ef70022da1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -501,6 +501,8 @@ extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_folio(const struct folio *folio) { + VM_WARN_ON_ONCE(!folio); + return READ_ONCE(huge_zero_folio) == folio; } -- cgit v1.2.3 From f367474b5884edbc42661e7fecf784cb131dd25d Mon Sep 17 00:00:00 2001 From: Brian Mak Date: Tue, 5 Aug 2025 14:15:27 -0700 Subject: x86/kexec: carry forward the boot DTB on kexec Currently, the kexec_file_load syscall on x86 does not support passing a device tree blob to the new kernel. Some embedded x86 systems use device trees. On these systems, failing to pass a device tree to the new kernel causes a boot failure. To add support for this, we copy the behavior of ARM64 and PowerPC and copy the current boot's device tree blob for use in the new kernel. We do this on x86 by passing the device tree blob as a setup_data entry in accordance with the x86 boot protocol. This behavior is gated behind the KEXEC_FILE_FORCE_DTB flag. Link: https://lkml.kernel.org/r/20250805211527.122367-3-makb@juniper.net Signed-off-by: Brian Mak Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Betkov Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Rob Herring Cc: Saravana Kannan Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- arch/x86/kernel/kexec-bzimage64.c | 47 ++++++++++++++++++++++++++++++++++++--- include/linux/kexec.h | 5 ++++- include/uapi/linux/kexec.h | 4 ++++ kernel/kexec_file.c | 1 + 4 files changed, 53 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 24a41f0e0cf1..c3244ac680d1 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -212,6 +214,28 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, } #endif /* CONFIG_EFI */ +#ifdef CONFIG_OF_FLATTREE +static void setup_dtb(struct boot_params *params, + unsigned long params_load_addr, + unsigned int dtb_setup_data_offset) +{ + struct setup_data *sd = (void *)params + dtb_setup_data_offset; + unsigned long setup_data_phys, dtb_len; + + dtb_len = fdt_totalsize(initial_boot_params); + sd->type = SETUP_DTB; + sd->len = dtb_len; + + /* Carry over current boot DTB with setup_data */ + memcpy(sd->data, initial_boot_params, dtb_len); + + /* Add setup data */ + setup_data_phys = params_load_addr + dtb_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} +#endif /* CONFIG_OF_FLATTREE */ + static void setup_ima_state(const struct kimage *image, struct boot_params *params, unsigned long params_load_addr, @@ -336,6 +360,17 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, sizeof(struct efi_setup_data); #endif +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) { + setup_dtb(params, params_load_addr, setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); + } else { + pr_debug("Not carrying over DTB, force_dtb = %d\n", + image->force_dtb); + } +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { /* Setup IMA log buffer state */ setup_ima_state(image, params, params_load_addr, @@ -529,6 +564,12 @@ static void *bzImage64_load(struct kimage *image, char *kernel, sizeof(struct setup_data) + RNG_SEED_LENGTH; +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) + kbuf.bufsz += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct ima_setup_data); @@ -537,7 +578,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct kho_data); - params = kzalloc(kbuf.bufsz, GFP_KERNEL); + params = kvzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -647,7 +688,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ldata; out_free_params: - kfree(params); + kvfree(params); return ERR_PTR(ret); } @@ -659,7 +700,7 @@ static int bzImage64_cleanup(void *loader_data) if (!ldata) return 0; - kfree(ldata->bootparams_buf); + kvfree(ldata->bootparams_buf); ldata->bootparams_buf = NULL; return 0; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 39fe3e6cd282..ff7e231b0485 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -395,6 +395,9 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; + + /* Force carrying over the DTB from the current boot */ + bool force_dtb; #endif #ifdef CONFIG_CRASH_HOTPLUG @@ -461,7 +464,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ - KEXEC_FILE_NO_CMA) + KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 8958ebfcff94..55749cb0b81d 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -22,12 +22,16 @@ * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd * fd field. + * KEXEC_FILE_FORCE_DTB : Force carrying over the current boot's DTB to the new + * kernel on x86. This is already the default behavior on + * some other architectures, like ARM64 and PowerPC. */ #define KEXEC_FILE_UNLOAD 0x00000001 #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 #define KEXEC_FILE_NO_CMA 0x00000010 +#define KEXEC_FILE_FORCE_DTB 0x00000020 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 91d46502a817..eb62a9794242 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); -- cgit v1.2.3 From c8a09fc9664f79eeb66cdf4a2a34d5b6a239b727 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 14 Jul 2025 10:17:09 +0200 Subject: ida: remove the ida_simple_xxx() API All users of the ida_simple_xxx() have been converted. In Linux 6.11-rc2, the only callers are in tools/testing/. So it is now time to remove the definition of this old and deprecated ida_simple_get() and ida_simple_remove(). Link: https://lkml.kernel.org/r/aa205f45fef70a9c948b6a98bad06da58e4de776.1752480043.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/idr.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/idr.h b/include/linux/idr.h index 2267902d29a7..789e23e67444 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -334,14 +334,6 @@ static inline void ida_init(struct ida *ida) xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } -/* - * ida_simple_get() and ida_simple_remove() are deprecated. Use - * ida_alloc() and ida_free() instead respectively. - */ -#define ida_simple_get(ida, start, end, gfp) \ - ida_alloc_range(ida, start, (end) - 1, gfp) -#define ida_simple_remove(ida, id) ida_free(ida, id) - static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); -- cgit v1.2.3 From baa96bcb180e979a821ce3ade87a3d2349b2d640 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 14 Jul 2025 10:17:10 +0200 Subject: nvmem: update a comment related to struct nvmem_config Update a comment to match the function used in nvmem_register(). ida_simple_get() was replaced by ida_alloc() in commit 1eb51d6a4fce ("nvmem: switch to simpler IDA interface") Link: https://lkml.kernel.org/r/27a9dec93a9f79140b11a77df38b1b45bd342e09.1752480043.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/nvmem-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 615a560d9edb..f3b13da78aac 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -103,7 +103,7 @@ struct nvmem_cell_info { * * Note: A default "nvmem" name will be assigned to the device if * no name is specified in its configuration. In such case "" is - * generated with ida_simple_get() and provided id field is ignored. + * generated with ida_alloc() and provided id field is ignored. * * Note: Specifying name and setting id to -1 implies a unique device * whose name is provided as-is (kept unaltered). -- cgit v1.2.3 From 2a8c51bc9391ff3c701f06ae1b678419f843dc1a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Aug 2025 00:55:07 -0700 Subject: list.h: add missing kernel-doc for basic macros kernel-doc for the basic LIST_HEAD() and LIST_HEAD_INIT() macros has been missing forever (i.e., since git). Add them for completeness. Link: https://lkml.kernel.org/r/20250819075507.113639-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Nicolas Frattaroli Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/list.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/list.h b/include/linux/list.h index e7e28afd28f8..ca63bdea6c1a 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -20,8 +20,16 @@ * using the generic single-entry routines. */ +/** + * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself + * @name: name of the list_head + */ #define LIST_HEAD_INIT(name) { &(name), &(name) } +/** + * LIST_HEAD - definition of a &struct list_head with initialization values + * @name: name of the list_head + */ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) -- cgit v1.2.3 From 2683df6539cbc3f0eeeba11154bc0cbf042a5cee Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 25 Aug 2025 10:57:00 +0800 Subject: panic: add note that 'panic_print' parameter is deprecated Just like for 'panic_print's systcl interface, add similar note for setup of kernel cmdline parameter and parameter under /sys/module/kernel/. Also add __core_param_cb() macro, which enables to add special get/set operation for a kernel parameter. Link: https://lkml.kernel.org/r/20250825025701.81921-4-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Askar Safin Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/moduleparam.h | 13 +++++++++++++ kernel/panic.c | 19 ++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 3a25122d83e2..6907aedc4f74 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -349,6 +349,19 @@ static inline void kernel_param_unlock(struct module *mod) __module_param_call("", name, ¶m_ops_##type, &var, perm, \ -1, KERNEL_PARAM_FL_UNSAFE) +/** + * __core_param_cb - similar like core_param, with a set/get ops instead of type. + * @name: the name of the cmdline and sysfs parameter (often the same as var) + * @var: the variable + * @ops: the set & get operations for this parameter. + * @perm: visibility in sysfs + * + * Ideally this should be called 'core_param_cb', but the name has been + * used for module core parameter, so add the '__' prefix + */ +#define __core_param_cb(name, ops, arg, perm) \ + __module_param_call("", name, ops, arg, perm, -1, 0) + #endif /* !MODULE */ /** diff --git a/kernel/panic.c b/kernel/panic.c index 72fcbb5a071b..12a10e17ab4a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -937,12 +937,29 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif core_param(panic, panic_timeout, int, 0644); -core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); core_param(panic_console_replay, panic_console_replay, bool, 0644); +static int panic_print_set(const char *val, const struct kernel_param *kp) +{ + pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return param_set_ulong(val, kp); +} + +static int panic_print_get(char *val, const struct kernel_param *kp) +{ + pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return param_get_ulong(val, kp); +} + +static const struct kernel_param_ops panic_print_ops = { + .set = panic_print_set, + .get = panic_print_get, +}; +__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644); + static int __init oops_setup(char *s) { if (!s) -- cgit v1.2.3 From d0d9c7235548f1d772f1e48c9d5742c65d81c705 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:29 +0800 Subject: panic: introduce helper functions for panic state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "panic: introduce panic status function family", v2. This series introduces a family of helper functions to manage panic state and updates existing code to use them. Before this series, panic state helpers were scattered and inconsistent. For example, panic_in_progress() was defined in printk/printk.c, not in panic.c or panic.h. As a result, developers had to look in unexpected places to understand or re-use panic state logic. Other checks were open- coded, duplicating logic across panic, crash, and watchdog paths. The new helpers centralize the functionality in panic.c/panic.h: - panic_try_start() - panic_reset() - panic_in_progress() - panic_on_this_cpu() - panic_on_other_cpu() Patches 1–8 add the helpers and convert panic/crash and printk/nbcon code to use them. Patch 9 fixes a bug in the watchdog subsystem by skipping checks when a panic is in progress, avoiding interference with the panic CPU. Together, this makes panic state handling simpler, more discoverable, and more robust. This patch (of 9): This patch introduces four new helper functions to abstract the management of the panic_cpu variable. These functions will be used in subsequent patches to refactor existing code. The direct use of panic_cpu can be error-prone and ambiguous, as it requires manual checks to determine which CPU is handling the panic. The new helpers clarify intent: panic_try_start(): Atomically sets the current CPU as the panicking CPU. panic_reset(): Reset panic_cpu to PANIC_CPU_INVALID. panic_in_progress(): Checks if a panic has been triggered. panic_on_this_cpu(): Returns true if the current CPU is the panic originator. panic_on_other_cpu(): Returns true if a panic is on another CPU. This change lays the groundwork for improved code readability and robustness in the panic handling subsystem. Link: https://lkml.kernel.org/r/20250825022947.1596226-1-wangjinchao600@gmail.com Link: https://lkml.kernel.org/r/20250825022947.1596226-2-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) b Signed-off-by: Andrew Morton --- include/linux/panic.h | 6 ++++++ kernel/panic.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 5 ----- 3 files changed, 59 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/panic.h b/include/linux/panic.h index 7be742628c25..6f972a66c13e 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -43,6 +43,12 @@ void abort(void); extern atomic_t panic_cpu; #define PANIC_CPU_INVALID -1 +bool panic_try_start(void); +void panic_reset(void); +bool panic_in_progress(void); +bool panic_on_this_cpu(void); +bool panic_on_other_cpu(void); + /* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. diff --git a/kernel/panic.c b/kernel/panic.c index 24bca263f896..010a1bfc4843 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -299,6 +299,59 @@ void __weak crash_smp_send_stop(void) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +bool panic_try_start(void) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); + + return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu); +} +EXPORT_SYMBOL(panic_try_start); + +void panic_reset(void) +{ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_reset); + +bool panic_in_progress(void) +{ + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_in_progress); + +/* Return true if a panic is in progress on the current CPU. */ +bool panic_on_this_cpu(void) +{ + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); +} +EXPORT_SYMBOL(panic_on_this_cpu); + +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. + */ +bool panic_on_other_cpu(void) +{ + return (panic_in_progress() && !this_cpu_in_panic()); +} +EXPORT_SYMBOL(panic_on_other_cpu); + /* * A variant of panic() called from NMI context. We return if we've already * panicked on this CPU. If another CPU already panicked, loop in diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0efbcdda9aab..5fe35f377b79 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -345,11 +345,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -static bool panic_in_progress(void) -{ - return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); -} - /* Return true if a panic is in progress on the current CPU. */ bool this_cpu_in_panic(void) { -- cgit v1.2.3 From c6be36e2997662f423edfa3979a63935873ff648 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:35 +0800 Subject: panic/printk: replace this_cpu_in_panic() with panic_on_this_cpu() The helper this_cpu_in_panic() duplicated logic already provided by panic_on_this_cpu(). Remove this_cpu_in_panic() and switch all users to panic_on_this_cpu(). This simplifies the code and avoids having two helpers for the same check. Link: https://lkml.kernel.org/r/20250825022947.1596226-8-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- include/linux/printk.h | 2 -- kernel/panic.c | 2 +- kernel/printk/nbcon.c | 2 +- kernel/printk/printk.c | 15 ++------------- kernel/printk/printk_ringbuffer.c | 2 +- lib/dump_stack.c | 2 +- 6 files changed, 6 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index 5d22b803f51e..45c663124c9b 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -330,8 +330,6 @@ static inline bool pr_flush(int timeout_ms, bool reset_on_progress) #endif -bool this_cpu_in_panic(void); - #ifdef CONFIG_SMP extern int __printk_cpu_sync_try_get(void); extern void __printk_cpu_sync_wait(void); diff --git a/kernel/panic.c b/kernel/panic.c index c4ef86fc643f..a8b1bf60e09f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -348,7 +348,7 @@ EXPORT_SYMBOL(panic_on_this_cpu); */ bool panic_on_other_cpu(void) { - return (panic_in_progress() && !this_cpu_in_panic()); + return (panic_in_progress() && !panic_on_this_cpu()); } EXPORT_SYMBOL(panic_on_other_cpu); diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 7490865e2f44..c6d1a4a747e9 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -1394,7 +1394,7 @@ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; - if (this_cpu_in_panic()) + if (panic_on_this_cpu()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5fe35f377b79..faa8b1f0585b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -17,6 +17,7 @@ * 01Mar01 Andrew Morton */ +#include "linux/panic.h" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -345,18 +346,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -/* Return true if a panic is in progress on the current CPU. */ -bool this_cpu_in_panic(void) -{ - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); -} - /* * Return true if a panic is in progress on a remote CPU. * @@ -365,7 +354,7 @@ bool this_cpu_in_panic(void) */ bool other_cpu_in_panic(void) { - return (panic_in_progress() && !this_cpu_in_panic()); + return (panic_in_progress() && !panic_on_this_cpu()); } /* diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index d9fb053cff67..e2a1b2d34d2b 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2143,7 +2143,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && + if (panic_on_this_cpu() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; diff --git a/lib/dump_stack.c b/lib/dump_stack.c index b3a85fe8b673..f0c78b5b5324 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -102,7 +102,7 @@ static void __dump_stack(const char *log_lvl) */ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) { - bool in_panic = this_cpu_in_panic(); + bool in_panic = panic_on_this_cpu(); unsigned long flags; /* -- cgit v1.2.3 From 7b1e502eb17c23fa6459a19bfe5974bffdb95574 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 6 Sep 2025 21:38:57 -0700 Subject: kernel.h: add comments for enum system_states Provide some basic comments about the system_states and what they imply. Also convert the comments to kernel-doc format. Split the enum declaration from the definition of the system_state variable so that kernel-doc notation works cleanly with it. This is picked up by Documentation/driver-api/basics.rst so it does not need further inclusion in the kernel docbooks. Link: https://lkml.kernel.org/r/20250907043857.2941203-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Acked-by: Rafael J. Wysocki # v1 Reviewed-by: Mauro Carvalho Chehab [v5] Cc: "Brown, Len" Cc: Greg Kroah-Hartman Cc: James Bottomley Cc: Jani Nikula Cc: Jonathan Corbet Cc: Mauro Carvalho Chehab Cc: Pavel Machek Signed-off-by: Andrew Morton --- include/linux/kernel.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 989315dabb86..5b46924fdff5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -164,11 +164,23 @@ extern int root_mountflags; extern bool early_boot_irqs_disabled; -/* - * Values used for system_state. Ordering of the states must not be changed +/** + * enum system_states - Values used for system_state. + * + * @SYSTEM_BOOTING: %0, no init needed + * @SYSTEM_SCHEDULING: system is ready for scheduling; OK to use RCU + * @SYSTEM_FREEING_INITMEM: system is freeing all of initmem; almost running + * @SYSTEM_RUNNING: system is up and running + * @SYSTEM_HALT: system entered clean system halt state + * @SYSTEM_POWER_OFF: system entered shutdown/clean power off state + * @SYSTEM_RESTART: system entered emergency power off or normal restart + * @SYSTEM_SUSPEND: system entered suspend or hibernate state + * + * Note: + * Ordering of the states must not be changed * as code checks for <, <=, >, >= STATE. */ -extern enum system_states { +enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, SYSTEM_FREEING_INITMEM, @@ -177,7 +189,8 @@ extern enum system_states { SYSTEM_POWER_OFF, SYSTEM_RESTART, SYSTEM_SUSPEND, -} system_state; +}; +extern enum system_states system_state; /* * General tracing related utility functions - trace_printk(), -- cgit v1.2.3 From d6d5116391857fc78fad9aa42317b36e4ce17b58 Mon Sep 17 00:00:00 2001 From: Evangelos Petrongonas Date: Thu, 21 Aug 2025 17:58:59 +0000 Subject: kexec: introduce is_kho_boot() Patch series "efi: Fix EFI boot with kexec handover (KHO)", v3. This patch series fixes a kernel panic that occurs when booting with both EFI and KHO (Kexec HandOver) enabled. The issue arises because EFI's `reserve_regions()` clears all memory regions with `memblock_remove(0, PHYS_ADDR_MAX)` before rebuilding them from EFI data. This destroys KHO scratch regions that were set up early during device tree scanning, causing a panic as the kernel has no valid memory regions for early allocations. The first patch introduces `is_kho_boot()` to allow early boot components to reliably detect if the kernel was booted via KHO-enabled kexec. The existing `kho_is_enabled()` only checks the command line and doesn't verify if an actual KHO FDT was passed. The second patch modifies EFI's `reserve_regions()` to selectively remove only non-KHO memory regions when KHO is active, preserving the critical scratch regions while still allowing EFI to rebuild its memory map. This patch (of 3): During early initialisation, after a kexec, other components, like EFI need to know if a KHO enabled kexec is performed. The `kho_is_enabled` function is not enough as in the early stages, it only reflects whether the cmdline has KHO enabled, not if an actual KHO FDT exists. Extend the KHO API with `is_kho_boot()` to provide a way for components to check if a KHO enabled kexec is performed. Link: https://lkml.kernel.org/r/cover.1755721529.git.epetron@amazon.de Link: https://lkml.kernel.org/r/7dc6674a76bf6e68cca0222ccff32427699cc02e.1755721529.git.epetron@amazon.de Signed-off-by: Evangelos Petrongonas Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Ard Biesheuvel Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 6 ++++++ kernel/kexec_handover.c | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 348844cffb13..559d13a3bc44 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -40,6 +40,7 @@ struct kho_serialization; #ifdef CONFIG_KEXEC_HANDOVER bool kho_is_enabled(void); +bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); int kho_preserve_phys(phys_addr_t phys, size_t size); @@ -60,6 +61,11 @@ static inline bool kho_is_enabled(void) return false; } +static inline bool is_kho_boot(void) +{ + return false; +} + static inline int kho_preserve_folio(struct folio *folio) { return -EOPNOTSUPP; diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index ecd1ac210dbd..49a39aee6a8e 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -951,6 +951,26 @@ static const void *kho_get_fdt(void) return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL; } +/** + * is_kho_boot - check if current kernel was booted via KHO-enabled + * kexec + * + * This function checks if the current kernel was loaded through a kexec + * operation with KHO enabled, by verifying that a valid KHO FDT + * was passed. + * + * Note: This function returns reliable results only after + * kho_populate() has been called during early boot. Before that, + * it may return false even if KHO data is present. + * + * Return: true if booted via KHO-enabled kexec, false otherwise + */ +bool is_kho_boot(void) +{ + return !!kho_get_fdt(); +} +EXPORT_SYMBOL_GPL(is_kho_boot); + /** * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. * @name: the name of the sub FDT passed to kho_add_subtree(). -- cgit v1.2.3 From e19ceeb1c0f63e3e15b197c5f34797134b51ba0e Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Thu, 28 Aug 2025 08:35:58 +0000 Subject: platform/chrome: Centralize common cros_ec_device initialization Move the common initialization from protocol device drivers into central cros_ec_device_alloc(). This removes duplicated code from each driver's probe function. The buffer sizes are now calculated once, using the maximum possible overhead required by any of the transport protocols, ensuring the allocated buffers are sufficient for all cases. Link: https://lore.kernel.org/r/20250828083601.856083-3-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec.c | 9 +++++++++ drivers/platform/chrome/cros_ec_i2c.c | 5 ----- drivers/platform/chrome/cros_ec_ishtp.c | 4 ---- drivers/platform/chrome/cros_ec_lpc.c | 4 ---- drivers/platform/chrome/cros_ec_rpmsg.c | 4 ---- drivers/platform/chrome/cros_ec_spi.c | 5 ----- drivers/platform/chrome/cros_ec_uart.c | 4 ---- include/linux/platform_data/cros_ec_proto.h | 14 ++++++++++---- 8 files changed, 19 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c index 25283a148ab9..da049068b6e9 100644 --- a/drivers/platform/chrome/cros_ec.c +++ b/drivers/platform/chrome/cros_ec.c @@ -38,6 +38,15 @@ struct cros_ec_device *cros_ec_device_alloc(struct device *dev) if (!ec_dev) return NULL; + ec_dev->din_size = sizeof(struct ec_host_response) + + sizeof(struct ec_response_get_protocol_info) + + EC_MAX_RESPONSE_OVERHEAD; + ec_dev->dout_size = sizeof(struct ec_host_request) + + sizeof(struct ec_params_rwsig_action) + + EC_MAX_REQUEST_OVERHEAD; + + ec_dev->dev = dev; + return ec_dev; } EXPORT_SYMBOL(cros_ec_device_alloc); diff --git a/drivers/platform/chrome/cros_ec_i2c.c b/drivers/platform/chrome/cros_ec_i2c.c index ee3c5130ec3f..def1144a077e 100644 --- a/drivers/platform/chrome/cros_ec_i2c.c +++ b/drivers/platform/chrome/cros_ec_i2c.c @@ -297,16 +297,11 @@ static int cros_ec_i2c_probe(struct i2c_client *client) return -ENOMEM; i2c_set_clientdata(client, ec_dev); - ec_dev->dev = dev; ec_dev->priv = client; ec_dev->irq = client->irq; ec_dev->cmd_xfer = cros_ec_cmd_xfer_i2c; ec_dev->pkt_xfer = cros_ec_pkt_xfer_i2c; ec_dev->phys_name = client->adapter->name; - ec_dev->din_size = sizeof(struct ec_host_response_i2c) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request_i2c) + - sizeof(struct ec_params_rwsig_action); err = cros_ec_register(ec_dev); if (err) { diff --git a/drivers/platform/chrome/cros_ec_ishtp.c b/drivers/platform/chrome/cros_ec_ishtp.c index c102a796670c..4e74e702c5a2 100644 --- a/drivers/platform/chrome/cros_ec_ishtp.c +++ b/drivers/platform/chrome/cros_ec_ishtp.c @@ -550,14 +550,10 @@ static int cros_ec_dev_init(struct ishtp_cl_data *client_data) client_data->ec_dev = ec_dev; dev->driver_data = ec_dev; - ec_dev->dev = dev; ec_dev->priv = client_data->cros_ish_cl; ec_dev->cmd_xfer = NULL; ec_dev->pkt_xfer = cros_ec_pkt_xfer_ish; ec_dev->phys_name = dev_name(dev); - ec_dev->din_size = sizeof(struct cros_ish_in_msg) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct cros_ish_out_msg) + sizeof(struct ec_params_rwsig_action); return cros_ec_register(ec_dev); } diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c index 30fa89b81666..78cfff80cdea 100644 --- a/drivers/platform/chrome/cros_ec_lpc.c +++ b/drivers/platform/chrome/cros_ec_lpc.c @@ -642,14 +642,10 @@ static int cros_ec_lpc_probe(struct platform_device *pdev) return -ENOMEM; platform_set_drvdata(pdev, ec_dev); - ec_dev->dev = dev; ec_dev->phys_name = dev_name(dev); ec_dev->cmd_xfer = cros_ec_cmd_xfer_lpc; ec_dev->pkt_xfer = cros_ec_pkt_xfer_lpc; ec_dev->cmd_readmem = cros_ec_lpc_readmem; - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); ec_dev->priv = ec_lpc; /* diff --git a/drivers/platform/chrome/cros_ec_rpmsg.c b/drivers/platform/chrome/cros_ec_rpmsg.c index 9ac2b923db6d..09bd9e49464e 100644 --- a/drivers/platform/chrome/cros_ec_rpmsg.c +++ b/drivers/platform/chrome/cros_ec_rpmsg.c @@ -224,14 +224,10 @@ static int cros_ec_rpmsg_probe(struct rpmsg_device *rpdev) if (!ec_rpmsg) return -ENOMEM; - ec_dev->dev = dev; ec_dev->priv = ec_rpmsg; ec_dev->cmd_xfer = cros_ec_cmd_xfer_rpmsg; ec_dev->pkt_xfer = cros_ec_pkt_xfer_rpmsg; ec_dev->phys_name = dev_name(&rpdev->dev); - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); dev_set_drvdata(dev, ec_dev); ec_rpmsg->rpdev = rpdev; diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c index c778300a4145..28fa82f8cb07 100644 --- a/drivers/platform/chrome/cros_ec_spi.c +++ b/drivers/platform/chrome/cros_ec_spi.c @@ -757,16 +757,11 @@ static int cros_ec_spi_probe(struct spi_device *spi) cros_ec_spi_dt_probe(ec_spi, dev); spi_set_drvdata(spi, ec_dev); - ec_dev->dev = dev; ec_dev->priv = ec_spi; ec_dev->irq = spi->irq; ec_dev->cmd_xfer = cros_ec_cmd_xfer_spi; ec_dev->pkt_xfer = cros_ec_pkt_xfer_spi; ec_dev->phys_name = dev_name(&ec_spi->spi->dev); - ec_dev->din_size = EC_MSG_PREAMBLE_COUNT + - sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); ec_spi->last_transfer_ns = ktime_get_ns(); diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c index 1a7511b1bbe3..d5b37414ff12 100644 --- a/drivers/platform/chrome/cros_ec_uart.c +++ b/drivers/platform/chrome/cros_ec_uart.c @@ -276,14 +276,10 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) /* Initialize ec_dev for cros_ec */ ec_dev->phys_name = dev_name(dev); - ec_dev->dev = dev; ec_dev->priv = ec_uart; ec_dev->irq = ec_uart->irq; ec_dev->cmd_xfer = NULL; ec_dev->pkt_xfer = cros_ec_uart_pkt_xfer; - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); serdev_device_set_client_ops(serdev, &cros_ec_uart_client_ops); diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 3ec24f445c29..4d96cffbb9e3 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -33,12 +33,18 @@ /* * Max bus-specific overhead incurred by request/responses. - * I2C requires 1 additional byte for requests. - * I2C requires 2 additional bytes for responses. - * SPI requires up to 32 additional bytes for responses. + * + * Request: + * - I2C requires 1 byte (see struct ec_host_request_i2c). + * - ISHTP requires 4 bytes (see struct cros_ish_out_msg). + * + * Response: + * - I2C requires 2 bytes (see struct ec_host_response_i2c). + * - ISHTP requires 4 bytes (see struct cros_ish_in_msg). + * - SPI requires 32 bytes (see EC_MSG_PREAMBLE_COUNT). */ #define EC_PROTO_VERSION_UNKNOWN 0 -#define EC_MAX_REQUEST_OVERHEAD 1 +#define EC_MAX_REQUEST_OVERHEAD 4 #define EC_MAX_RESPONSE_OVERHEAD 32 /* -- cgit v1.2.3 From 56cb557279d70397cefb497e0f06bdd6fd685f8e Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Thu, 28 Aug 2025 08:36:00 +0000 Subject: platform/chrome: cros_ec: Add a flag to track registration state Introduce a `registered` flag to the `struct cros_ec_device` to allow callers to determine if the device has been fully registered and is ready for use. This is a preparatory step to prevent race conditions where other drivers might try to access the device before it is fully registered or after it has been unregistered. Link: https://lore.kernel.org/r/20250828083601.856083-5-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec.c | 7 +++++++ drivers/platform/chrome/cros_ec_proto.c | 15 +++++++++++++++ include/linux/platform_data/cros_ec_proto.h | 4 ++++ 3 files changed, 26 insertions(+) (limited to 'include') diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c index 61bcef8741db..1da79e3d215b 100644 --- a/drivers/platform/chrome/cros_ec.c +++ b/drivers/platform/chrome/cros_ec.c @@ -9,6 +9,7 @@ * battery charging and regulator control, firmware update. */ +#include #include #include #include @@ -316,6 +317,9 @@ int cros_ec_register(struct cros_ec_device *ec_dev) goto exit; } + scoped_guard(mutex, &ec_dev->lock) + ec_dev->registered = true; + dev_info(dev, "Chrome EC device registered\n"); /* @@ -343,6 +347,9 @@ EXPORT_SYMBOL(cros_ec_register); */ void cros_ec_unregister(struct cros_ec_device *ec_dev) { + scoped_guard(mutex, &ec_dev->lock) + ec_dev->registered = false; + if (ec_dev->mkbp_event_supported) blocking_notifier_chain_unregister(&ec_dev->event_notifier, &ec_dev->notifier_ready); diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index 3e94a0a82173..1d8d9168ec1a 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -3,6 +3,7 @@ // // Copyright (C) 2015 Google, Inc +#include #include #include #include @@ -1153,5 +1154,19 @@ int cros_ec_get_cmd_versions(struct cros_ec_device *ec_dev, u16 cmd) } EXPORT_SYMBOL_GPL(cros_ec_get_cmd_versions); +/** + * cros_ec_device_registered - Return if the ec_dev is registered. + * + * @ec_dev: EC device + * + * Return: true if registered. Otherwise, false. + */ +bool cros_ec_device_registered(struct cros_ec_device *ec_dev) +{ + guard(mutex)(&ec_dev->lock); + return ec_dev->registered; +} +EXPORT_SYMBOL_GPL(cros_ec_device_registered); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ChromeOS EC communication protocol helpers"); diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 4d96cffbb9e3..de14923720a5 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -128,6 +128,7 @@ struct cros_ec_command { * @dout_size: Size of dout buffer to allocate (zero to use static dout). * @wake_enabled: True if this device can wake the system from sleep. * @suspended: True if this device had been suspended. + * @registered: True if this device had been registered. * @cmd_xfer: Send command to EC and get response. * Returns the number of bytes received if the communication * succeeded, but that doesn't mean the EC was happy with the @@ -186,6 +187,7 @@ struct cros_ec_device { int dout_size; bool wake_enabled; bool suspended; + bool registered; int (*cmd_xfer)(struct cros_ec_device *ec, struct cros_ec_command *msg); int (*pkt_xfer)(struct cros_ec_device *ec, @@ -278,6 +280,8 @@ int cros_ec_cmd_readmem(struct cros_ec_device *ec_dev, u8 offset, u8 size, void int cros_ec_get_cmd_versions(struct cros_ec_device *ec_dev, u16 cmd); +bool cros_ec_device_registered(struct cros_ec_device *ec_dev); + /** * cros_ec_get_time_ns() - Return time in ns. * -- cgit v1.2.3 From e68f150bc11d0a05cbe984a4e5c0f72a95cae07d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 18 Aug 2025 09:46:15 +0300 Subject: memblock: drop for_each_free_mem_pfn_range_in_zone_from() for_each_free_mem_pfn_range_in_zone_from() and its "backend" implementation __next_mem_pfn_range_in_zone() were only used by deferred initialization of the memory map. Remove them as they are not used anymore. Reviewed-by: Wei Yang Reviewed-by: David Hildenbrand Signed-off-by: Mike Rapoport (Microsoft) --- .clang-format | 1 - include/linux/memblock.h | 22 ----------------- mm/memblock.c | 64 ------------------------------------------------ 3 files changed, 87 deletions(-) (limited to 'include') diff --git a/.clang-format b/.clang-format index 48405c54ef27..f371a13b4d19 100644 --- a/.clang-format +++ b/.clang-format @@ -294,7 +294,6 @@ ForEachMacros: - 'for_each_fib6_node_rt_rcu' - 'for_each_fib6_walker_rt' - 'for_each_file_lock' - - 'for_each_free_mem_pfn_range_in_zone_from' - 'for_each_free_mem_range' - 'for_each_free_mem_range_reverse' - 'for_each_func_rsrc' diff --git a/include/linux/memblock.h b/include/linux/memblock.h index fcda8481de9a..221118b5a16e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -324,28 +324,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, - unsigned long *out_spfn, - unsigned long *out_epfn); - -/** - * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific - * free memblock areas from a given point - * @i: u64 used as loop variable - * @zone: zone in which all of the memory blocks reside - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * - * Walks over free (memory && !reserved) areas of memblock in a specific - * zone, continuing from current position. Available as soon as memblock is - * initialized. - */ -#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ - for (; i != U64_MAX; \ - __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) - -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** * for_each_free_mem_range - iterate through free memblock areas diff --git a/mm/memblock.c b/mm/memblock.c index 117d963e677c..120a501a887a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1445,70 +1445,6 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return 0; } -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -/** - * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() - * - * @idx: pointer to u64 loop variable - * @zone: zone in which all of the memory blocks reside - * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL - * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL - * - * This function is meant to be a zone/pfn specific wrapper for the - * for_each_mem_range type iterators. Specifically they are used in the - * deferred memory init routines and as such we were duplicating much of - * this logic throughout the code. So instead of having it in multiple - * locations it seemed like it would make more sense to centralize this to - * one new iterator that does everything they need. - */ -void __init_memblock -__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, - unsigned long *out_spfn, unsigned long *out_epfn) -{ - int zone_nid = zone_to_nid(zone); - phys_addr_t spa, epa; - - __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, - &memblock.memory, &memblock.reserved, - &spa, &epa, NULL); - - while (*idx != U64_MAX) { - unsigned long epfn = PFN_DOWN(epa); - unsigned long spfn = PFN_UP(spa); - - /* - * Verify the end is at least past the start of the zone and - * that we have at least one PFN to initialize. - */ - if (zone->zone_start_pfn < epfn && spfn < epfn) { - /* if we went too far just stop searching */ - if (zone_end_pfn(zone) <= spfn) { - *idx = U64_MAX; - break; - } - - if (out_spfn) - *out_spfn = max(zone->zone_start_pfn, spfn); - if (out_epfn) - *out_epfn = min(zone_end_pfn(zone), epfn); - - return; - } - - __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, - &memblock.memory, &memblock.reserved, - &spa, &epa, NULL); - } - - /* signal end of iteration */ - if (out_spfn) - *out_spfn = ULONG_MAX; - if (out_epfn) - *out_epfn = 0; -} - -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ - /** * memblock_alloc_range_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes -- cgit v1.2.3 From fdae0ab67d57d480dc61e9fb45678bbdc3786711 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Sep 2025 12:19:42 +0000 Subject: net: use NUMA drop counters for softnet_data.dropped Hosts under DOS attack can suffer from false sharing in enqueue_to_backlog() : atomic_inc(&sd->dropped). This is because sd->dropped can be touched from many cpus, possibly residing on different NUMA nodes. Generalize the sk_drop_counters infrastucture added in commit c51613fa276f ("net: add sk->sk_drop_counters") and use it to replace softnet_data.dropped with NUMA friendly softnet_data.drop_counters. This adds 64 bytes per cpu, maybe more in the future if we increase the number of counters (currently 2) per 'struct numa_drop_counters'. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250909121942.1202585-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 2 +- include/linux/netdevice.h | 28 +++++++++++++++++++++++++++- include/linux/udp.h | 2 +- include/net/raw.h | 2 +- include/net/sock.h | 37 ++++++++++++------------------------- net/core/dev.c | 2 +- net/core/net-procfs.c | 3 ++- 7 files changed, 45 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 261d02efb615..f43314517396 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -295,7 +295,7 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; struct ipv6_pinfo inet6; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f3a3b761abfb..f5a840c07cf1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3459,6 +3459,32 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } +struct numa_drop_counters { + atomic_t drops0 ____cacheline_aligned_in_smp; + atomic_t drops1 ____cacheline_aligned_in_smp; +}; + +static inline int numa_drop_read(const struct numa_drop_counters *ndc) +{ + return atomic_read(&ndc->drops0) + atomic_read(&ndc->drops1); +} + +static inline void numa_drop_add(struct numa_drop_counters *ndc, int val) +{ + int n = numa_node_id() % 2; + + if (n) + atomic_add(val, &ndc->drops1); + else + atomic_add(val, &ndc->drops0); +} + +static inline void numa_drop_reset(struct numa_drop_counters *ndc) +{ + atomic_set(&ndc->drops0, 0); + atomic_set(&ndc->drops1, 0); +} + /* * Incoming packets are placed on per-CPU queues */ @@ -3504,7 +3530,7 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; - atomic_t dropped ____cacheline_aligned_in_smp; + struct numa_drop_counters drop_counters; /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; diff --git a/include/linux/udp.h b/include/linux/udp.h index 981506be1e15..6ed008ab1665 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { * the last UDP socket cacheline. */ struct hlist_node tunnel_list; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/raw.h b/include/net/raw.h index d52709139060..66c0ffeada2e 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,7 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/include/net/sock.h b/include/net/sock.h index 896bec2d2176..0fd465935334 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -102,11 +102,6 @@ struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; -struct socket_drop_counters { - atomic_t drops0 ____cacheline_aligned_in_smp; - atomic_t drops1 ____cacheline_aligned_in_smp; -}; - /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr @@ -287,7 +282,7 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter - * @sk_drop_counters: optional pointer to socket_drop_counters + * @sk_drop_counters: optional pointer to numa_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner @@ -456,7 +451,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct socket_drop_counters *sk_drop_counters; + struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -2698,18 +2693,12 @@ struct sock_skb_cb { static inline void sk_drops_add(struct sock *sk, int segs) { - struct socket_drop_counters *sdc = sk->sk_drop_counters; + struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { - int n = numa_node_id() % 2; - - if (n) - atomic_add(segs, &sdc->drops1); - else - atomic_add(segs, &sdc->drops0); - } else { + if (ndc) + numa_drop_add(ndc, segs); + else atomic_add(segs, &sk->sk_drops); - } } static inline void sk_drops_inc(struct sock *sk) @@ -2719,23 +2708,21 @@ static inline void sk_drops_inc(struct sock *sk) static inline int sk_drops_read(const struct sock *sk) { - const struct socket_drop_counters *sdc = sk->sk_drop_counters; + const struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { + if (ndc) { DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); - return atomic_read(&sdc->drops0) + atomic_read(&sdc->drops1); + return numa_drop_read(ndc); } return atomic_read(&sk->sk_drops); } static inline void sk_drops_reset(struct sock *sk) { - struct socket_drop_counters *sdc = sk->sk_drop_counters; + struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { - atomic_set(&sdc->drops0, 0); - atomic_set(&sdc->drops1, 0); - } + if (ndc) + numa_drop_reset(ndc); atomic_set(&sk->sk_drops, 0); } diff --git a/net/core/dev.c b/net/core/dev.c index 1d1650d9ecff..2522d9d8f0e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5248,7 +5248,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: - atomic_inc(&sd->dropped); + numa_drop_add(&sd->drop_counters, 1); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 4f0f0709a1cb..70e0e9a3b650 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -145,7 +145,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->processed), + numa_drop_read(&sd->drop_counters), READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ -- cgit v1.2.3 From 66048f8b3cc7e462953c04285183cdee43a1cb89 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 2 Sep 2025 14:29:33 +0800 Subject: net/cls_cgroup: Fix task_get_classid() during qdisc run During recent testing with the netem qdisc to inject delays into TCP traffic, we observed that our CLS BPF program failed to function correctly due to incorrect classid retrieval from task_get_classid(). The issue manifests in the following call stack: bpf_get_cgroup_classid+5 cls_bpf_classify+507 __tcf_classify+90 tcf_classify+217 __dev_queue_xmit+798 bond_dev_queue_xmit+43 __bond_start_xmit+211 bond_start_xmit+70 dev_hard_start_xmit+142 sch_direct_xmit+161 __qdisc_run+102 <<<<< Issue location __dev_xmit_skb+1015 __dev_queue_xmit+637 neigh_hh_output+159 ip_finish_output2+461 __ip_finish_output+183 ip_finish_output+41 ip_output+120 ip_local_out+94 __ip_queue_xmit+394 ip_queue_xmit+21 __tcp_transmit_skb+2169 tcp_write_xmit+959 __tcp_push_pending_frames+55 tcp_push+264 tcp_sendmsg_locked+661 tcp_sendmsg+45 inet_sendmsg+67 sock_sendmsg+98 sock_write_iter+147 vfs_write+786 ksys_write+181 __x64_sys_write+25 do_syscall_64+56 entry_SYSCALL_64_after_hwframe+100 The problem occurs when multiple tasks share a single qdisc. In such cases, __qdisc_run() may transmit skbs created by different tasks. Consequently, task_get_classid() retrieves an incorrect classid since it references the current task's context rather than the skb's originating task. Given that dev_queue_xmit() always executes with bh disabled, we can use softirq_count() instead to obtain the correct classid. The simple steps to reproduce this issue: 1. Add network delay to the network interface: such as: tc qdisc add dev bond0 root netem delay 1.5ms 2. Build two distinct net_cls cgroups, each with a network-intensive task 3. Initiate parallel TCP streams from both tasks to external servers. Under this specific condition, the issue reliably occurs. The kernel eventually dequeues an SKB that originated from Task-A while executing in the context of Task-B. It is worth noting that it will change the established behavior for a slightly different scenario: prior to this patch the skb will be classified with the 'new' task A classid, now with the old/original one. The bpf_get_cgroup_classid_curr() function is a more appropriate choice for this case. Signed-off-by: Yafang Shao Cc: Daniel Borkmann Cc: Thomas Graf Cc: Sebastian Andrzej Siewior Cc: Nikolay Aleksandrov Link: https://patch.msgid.link/20250902062933.30087-1-laoar.shao@gmail.com Signed-off-by: Jakub Kicinski --- include/net/cls_cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 7e78e7d6f015..668aeee9b3f6 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -63,7 +63,7 @@ static inline u32 task_get_classid(const struct sk_buff *skb) * calls by looking at the number of nested bh disable calls because * softirqs always disables bh. */ - if (in_serving_softirq()) { + if (softirq_count()) { struct sock *sk = skb_to_full_sk(skb); /* If there is an sock_cgroup_classid we'll use that. */ -- cgit v1.2.3 From ae1c658b33d4bec20c037aebba583a68375d4773 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 11 Sep 2025 15:08:31 +0200 Subject: net: phy: introduce phy_id_compare_model() PHY ID helper Similar to phy_id_compare_vendor(), introduce the equivalent phy_id_compare_model() helper for the generic PHY ID Model mask. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Christian Marangi Link: https://patch.msgid.link/20250911130840.23569-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 04553419adc3..6f3b25cb7f4e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1308,6 +1308,19 @@ static inline bool phy_id_compare_vendor(u32 id, u32 vendor_mask) return phy_id_compare(id, vendor_mask, PHY_ID_MATCH_VENDOR_MASK); } +/** + * phy_id_compare_model - compare @id with @model mask + * @id: PHY ID + * @model_mask: PHY Model mask + * + * Return: true if the bits from @id match @model using the + * generic PHY Model mask. + */ +static inline bool phy_id_compare_model(u32 id, u32 model_mask) +{ + return phy_id_compare(id, model_mask, PHY_ID_MATCH_MODEL_MASK); +} + /** * phydev_id_compare - compare @id with the PHY's Clause 22 ID * @phydev: the PHY device -- cgit v1.2.3 From a9888628cb2c768202a4530e2816da1889cc3165 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Sep 2025 18:54:15 +0200 Subject: net: dst_metadata: fix IP_DF bit not extracted from tunnel headers Both OVS and TC flower allow extracting and matching on the DF bit of the outer IP header via OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT in the OVS_KEY_ATTR_TUNNEL and TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT in the TCA_FLOWER_KEY_ENC_FLAGS respectively. Flow dissector extracts this information as FLOW_DIS_F_TUNNEL_DONT_FRAGMENT from the tunnel info key. However, the IP_TUNNEL_DONT_FRAGMENT_BIT in the tunnel key is never actually set, because the tunneling code doesn't actually extract it from the IP header. OAM and CRIT_OPT are extracted by the tunnel implementation code, same code also sets the KEY flag, if present. UDP tunnel core takes care of setting the CSUM flag if the checksum is present in the UDP header, but the DONT_FRAGMENT is not handled at any layer. Fix that by checking the bit and setting the corresponding flag while populating the tunnel info in the IP layer where it belongs. Not using __assign_bit as we don't really need to clear the bit in a just initialized field. It also doesn't seem like using __assign_bit will make the code look better. Clearly, users didn't rely on this functionality for anything very important until now. The reason why this doesn't break OVS logic is that it only matches on what kernel previously parsed out and if kernel consistently reports this bit as zero, OVS will only match on it to be zero, which sort of works. But it is still a bug that the uAPI reports and allows matching on the field that is not actually checked in the packet. And this is causing misleading -df reporting in OVS datapath flows, while the tunnel traffic actually has the bit set in most cases. This may also cause issues if a hardware properly implements support for tunnel flag matching as it will disagree with the implementation in a software path of TC flower. Fixes: 7d5437c709de ("openvswitch: Add tunneling interface.") Fixes: 1d17568e74de ("net/sched: cls_flower: add support for matching tunnel control flags") Signed-off-by: Ilya Maximets Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250909165440.229890-2-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/net/dst_metadata.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4160731dcb6e..1fc2fb03ce3f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -3,6 +3,7 @@ #define __NET_DST_METADATA_H 1 #include +#include #include #include #include @@ -220,9 +221,15 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, int md_size) { const struct iphdr *iph = ip_hdr(skb); + struct metadata_dst *tun_dst; + + tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, flags, tunnel_id, md_size); - return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, flags, tunnel_id, md_size); + if (tun_dst && (iph->frag_off & htons(IP_DF))) + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + tun_dst->u.tun_info.key.tun_flags); + return tun_dst; } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, -- cgit v1.2.3 From 3afaff7a0ce97457c8ab46862f2c06603a89962e Mon Sep 17 00:00:00 2001 From: Akhilesh Patil Date: Mon, 11 Aug 2025 17:42:53 +0530 Subject: include/linux/rv.h: remove redundant include file Remove redundant include to clean up the code. Move all unique include files inside CONFIG_RV as they are only needed when CONFIG_RV is enabled. Arrange include files alphabetically. Fixes: 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") [1] Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202507312017.oyD08TL5-lkp@intel.com/ Signed-off-by: Akhilesh Patil Reviewed-by: Gabriele Monaco Link: https://lore.kernel.org/r/aJneRbHGlNFg7lr9@bhairav-test.ee.iitb.ac.in Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rv.h b/include/linux/rv.h index 14410a42faef..9520aab34bcb 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -7,16 +7,14 @@ #ifndef _LINUX_RV_H #define _LINUX_RV_H -#include -#include - #define MAX_DA_NAME_LEN 32 #define MAX_DA_RETRY_RACING_EVENTS 3 #ifdef CONFIG_RV +#include #include +#include #include -#include /* * Deterministic automaton per-object variables. -- cgit v1.2.3 From e7c9b66b106989aeb17b167f5bbea9a108d26c0d Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 17 Jul 2025 17:11:16 +0200 Subject: pwm: Provide a gpio device for waveform drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A PWM is a more general concept than an output-only GPIO. When using duty_length = period_length the PWM looks like an active GPIO, with duty_length = 0 like an inactive GPIO. With the waveform abstraction there is enough control over the configuration to ensure that PWMs that cannot generate a constant signal at both levels error out. The pwm-pca9685 driver already provides a gpio chip. When this driver is converted to the waveform callbacks, the gpio part can just be dropped. Signed-off-by: Uwe Kleine-König Reviewed-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20250717151117.1828585-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/Kconfig | 9 +++++++ drivers/pwm/core.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pwm.h | 3 +++ 3 files changed, 84 insertions(+) (limited to 'include') diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index f00ce973dddf..2aa24608a8df 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -38,6 +38,15 @@ config PWM_DEBUG It is expected to introduce some runtime overhead and diagnostic output to the kernel log, so only enable while working on a driver. +config PWM_PROVIDE_GPIO + bool "Provide a GPIO chip for each PWM chip" + depends on GPIOLIB + help + Most PWMs can emit both a constant active high and a constant active + low signal and so they can be used as GPIO. Say Y here to let each + PWM chip provide a GPIO chip and so be easily plugged into consumers + that know how to handle GPIOs but not PWMs. + config PWM_AB8500 tristate "AB8500 PWM support" depends on AB8500_CORE && ARCH_U8500 diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 2570ad6a7f59..ea2ccf42e814 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -2391,6 +2391,51 @@ static const struct file_operations pwm_cdev_fileops = { static dev_t pwm_devt; +static int pwm_gpio_request(struct gpio_chip *gc, unsigned int offset) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + struct pwm_device *pwm; + + pwm = pwm_request_from_chip(chip, offset, "pwm-gpio"); + if (IS_ERR(pwm)) + return PTR_ERR(pwm); + + return 0; +} + +static void pwm_gpio_free(struct gpio_chip *gc, unsigned int offset) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + + pwm_put(&chip->pwms[offset]); +} + +static int pwm_gpio_get_direction(struct gpio_chip *gc, unsigned int offset) +{ + return GPIO_LINE_DIRECTION_OUT; +} + +static int pwm_gpio_set(struct gpio_chip *gc, unsigned int offset, int value) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + struct pwm_device *pwm = &chip->pwms[offset]; + int ret; + struct pwm_waveform wf = { + .period_length_ns = 1, + }; + + ret = pwm_round_waveform_might_sleep(pwm, &wf); + if (ret < 0) + return ret; + + if (value) + wf.duty_length_ns = wf.period_length_ns; + else + wf.duty_length_ns = 0; + + return pwm_set_waveform_might_sleep(pwm, &wf, true); +} + /** * __pwmchip_add() - register a new PWM chip * @chip: the PWM chip to add @@ -2457,9 +2502,33 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner) if (ret) goto err_device_add; + if (IS_ENABLED(CONFIG_PWM_PROVIDE_GPIO) && chip->ops->write_waveform) { + struct device *parent = pwmchip_parent(chip); + + chip->gpio = (typeof(chip->gpio)){ + .label = dev_name(parent), + .parent = parent, + .request = pwm_gpio_request, + .free = pwm_gpio_free, + .get_direction = pwm_gpio_get_direction, + .set = pwm_gpio_set, + .base = -1, + .ngpio = chip->npwm, + .can_sleep = true, + }; + + ret = gpiochip_add_data(&chip->gpio, chip); + if (ret) + goto err_gpiochip_add; + } + return 0; +err_gpiochip_add: + + cdev_device_del(&chip->cdev, &chip->dev); err_device_add: + scoped_guard(pwmchip, chip) chip->operational = false; @@ -2480,6 +2549,9 @@ EXPORT_SYMBOL_GPL(__pwmchip_add); */ void pwmchip_remove(struct pwm_chip *chip) { + if (IS_ENABLED(CONFIG_PWM_PROVIDE_GPIO) && chip->ops->write_waveform) + gpiochip_remove(&chip->gpio); + pwmchip_sysfs_unexport(chip); scoped_guard(mutex, &pwm_lock) { diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 8cafc483db53..549ac4aaad59 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -321,6 +322,7 @@ struct pwm_ops { * @npwm: number of PWMs controlled by this chip * @of_xlate: request a PWM device given a device tree PWM specifier * @atomic: can the driver's ->apply() be called in atomic context + * @gpio: &struct gpio_chip to operate this PWM chip's lines as GPO * @uses_pwmchip_alloc: signals if pwmchip_allow was used to allocate this chip * @operational: signals if the chip can be used (or is already deregistered) * @nonatomic_lock: mutex for nonatomic chips @@ -340,6 +342,7 @@ struct pwm_chip { bool atomic; /* only used internally by the PWM framework */ + struct gpio_chip gpio; bool uses_pwmchip_alloc; bool operational; union { -- cgit v1.2.3 From 09f37134464cc03baf5cb8eab2d99db27ee73217 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:00 -0500 Subject: x86,fs/resctrl: Consolidate monitor event descriptions There are currently only three monitor events, all associated with the RDT_RESOURCE_L3 resource. Growing support for additional events will be easier with some restructuring to have a single point in file system code where all attributes of all events are defined. Place all event descriptions into an array mon_event_all[]. Doing this has the beneficial side effect of removing the need for rdt_resource::evt_list. Add resctrl_event_id::QOS_FIRST_EVENT for a lower bound on range checks for event ids and as the starting index to scan mon_event_all[]. Drop the code that builds evt_list and change the two places where the list is scanned to scan mon_event_all[] instead using a new helper macro for_each_mon_event(). Architecture code now informs file system code which events are available with resctrl_enable_mon_event(). Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/core.c | 12 ++++++-- fs/resctrl/internal.h | 13 ++++++-- fs/resctrl/monitor.c | 63 +++++++++++++++++++------------------- fs/resctrl/rdtgroup.c | 11 ++++--- include/linux/resctrl.h | 4 +-- include/linux/resctrl_types.h | 12 +++++--- 6 files changed, 66 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 187d527ef73b..7fcae25874fe 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -864,12 +864,18 @@ static __init bool get_rdt_mon_resources(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + } if (!rdt_mon_features) return false; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9a8cf6f11151..7a57366d1abc 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -52,19 +52,26 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) } /** - * struct mon_evt - Entry in the event list of a resource + * struct mon_evt - Properties of a monitor event * @evtid: event id + * @rid: resource id for this event * @name: name of the event * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list + * @enabled: true if the event is enabled */ struct mon_evt { enum resctrl_event_id evtid; + enum resctrl_res_level rid; char *name; bool configurable; - struct list_head list; + bool enabled; }; +extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; + +#define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ + mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 7326c28a7908..d5bf3e0334b6 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -842,38 +842,39 @@ out_unlock: mutex_unlock(&rdtgroup_mutex); } -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, -}; - -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, -}; - -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, -}; - /* - * Initialize the event list for the resource. - * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. + * All available events. Architecture code marks the ones that + * are supported by a system using resctrl_enable_mon_event() + * to set .enabled. */ -static void l3_mon_evt_init(struct rdt_resource *r) +struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { + [QOS_L3_OCCUP_EVENT_ID] = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_TOTAL_EVENT_ID] = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_LOCAL_EVENT_ID] = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, +}; + +void resctrl_enable_mon_event(enum resctrl_event_id eventid) { - INIT_LIST_HEAD(&r->evt_list); + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) + return; + if (mon_event_all[eventid].enabled) { + pr_warn("Duplicate enable for event %d\n", eventid); + return; + } - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); + mon_event_all[eventid].enabled = true; } /** @@ -900,15 +901,13 @@ int resctrl_mon_resource_init(void) if (ret) return ret; - l3_mon_evt_init(r); - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 5f0b7cfa1cc2..ab943a5907c5 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1152,7 +1152,9 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, struct rdt_resource *r = rdt_kn_parent_priv(of->kn); struct mon_evt *mevt; - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; seq_printf(seq, "%s\n", mevt->name); if (mevt->configurable) seq_printf(seq, "%s_config\n", mevt->name); @@ -3054,10 +3056,9 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, struct mon_evt *mevt; int ret, domid; - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; domid = do_sum ? d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 6fb4894b8cfd..2944042bd84c 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -269,7 +269,6 @@ enum resctrl_schema_fmt { * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. - * @evt_list: List of monitoring events * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth * monitoring events can be configured. * @cdp_capable: Is the CDP feature available on this resource @@ -287,7 +286,6 @@ struct rdt_resource { struct list_head mon_domains; char *name; enum resctrl_schema_fmt schema_fmt; - struct list_head evt_list; unsigned int mbm_cfg_mask; bool cdp_capable; }; @@ -372,6 +370,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +void resctrl_enable_mon_event(enum resctrl_event_id eventid); + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); /** diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index a25fb9c4070d..2dadbc54e4b3 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,11 +34,15 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) -/* - * Event IDs, the values match those used to program IA32_QM_EVTSEL before - * reading IA32_QM_CTR on RDT systems. - */ +/* Event IDs */ enum resctrl_event_id { + /* Must match value of first event below */ + QOS_FIRST_EVENT = 0x01, + + /* + * These values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems. + */ QOS_L3_OCCUP_EVENT_ID = 0x01, QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, -- cgit v1.2.3 From d257cc2e5c8bb8236cb161360d6c0529fd442712 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:01 -0500 Subject: x86,fs/resctrl: Replace architecture event enabled checks The resctrl file system now has complete knowledge of the status of every event. So there is no need for per-event function calls to check. Replace each of the resctrl_arch_is_{event}enabled() calls with resctrl_is_mon_event_enabled(QOS_{EVENT}). No functional change. Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/include/asm/resctrl.h | 15 --------------- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- arch/x86/kernel/cpu/resctrl/monitor.c | 4 ++-- fs/resctrl/ctrlmondata.c | 4 ++-- fs/resctrl/monitor.c | 16 +++++++++++----- fs/resctrl/rdtgroup.c | 18 +++++++++--------- include/linux/resctrl.h | 2 ++ 7 files changed, 28 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index feb93b50e990..b1dd5d6b87db 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -84,21 +84,6 @@ static inline void resctrl_arch_disable_mon(void) static_branch_dec_cpuslocked(&rdt_enable_key); } -static inline bool resctrl_arch_is_llc_occupancy_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_total_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_local_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); -} - /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7fcae25874fe..1a319ce9328c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -402,13 +402,13 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { size_t tsize; - if (resctrl_arch_is_mbm_total_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { tsize = sizeof(*hw_dom->arch_mbm_total); hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_total) return -ENOMEM; } - if (resctrl_arch_is_mbm_local_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { tsize = sizeof(*hw_dom->arch_mbm_local); hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_local) { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c261558276cd..61d38517e2bf 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -207,11 +207,11 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) memset(hw_dom->arch_mbm_total, 0, sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) memset(hw_dom->arch_mbm_local, 0, sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); } diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 3c39cfacb251..42b281b3852f 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -473,12 +473,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, rdt_last_cmd_clear(); if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; else ret = -EINVAL; } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; else ret = -EINVAL; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index d5bf3e0334b6..b0b1dcc36795 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -336,7 +336,7 @@ void free_rmid(u32 closid, u32 rmid) entry = __rmid_entry(idx); - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); else list_add_tail(&entry->list, &rmid_free_lru); @@ -635,10 +635,10 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. */ - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); } @@ -877,6 +877,12 @@ void resctrl_enable_mon_event(enum resctrl_event_id eventid) mon_event_all[eventid].enabled = true; } +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) +{ + return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS && + mon_event_all[eventid].enabled; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. * @@ -912,9 +918,9 @@ int resctrl_mon_resource_init(void) RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) + else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index ab943a5907c5..2ca8e66c0d20 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -123,8 +123,8 @@ void rdt_staged_configs_clear(void) static bool resctrl_is_mbm_enabled(void) { - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); + return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) || + resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } static bool resctrl_is_mbm_event(int e) @@ -196,7 +196,7 @@ static int closid_alloc(void) lockdep_assert_held(&rdtgroup_mutex); if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { + resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { cleanest_closid = resctrl_find_cleanest_closid(); if (cleanest_closid < 0) return cleanest_closid; @@ -4048,7 +4048,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. There is no way to know @@ -4084,12 +4084,12 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; - if (resctrl_arch_is_llc_occupancy_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_arch_is_mbm_total_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { tsize = sizeof(*d->mbm_total); d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_total) { @@ -4097,7 +4097,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain return -ENOMEM; } } - if (resctrl_arch_is_mbm_local_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { tsize = sizeof(*d->mbm_local); d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_local) { @@ -4142,7 +4142,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) RESCTRL_PICK_ANY_CPU); } - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); /* @@ -4217,7 +4217,7 @@ void resctrl_offline_cpu(unsigned int cpu) cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0, cpu); } - if (resctrl_arch_is_llc_occupancy_enabled() && + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && cpu == d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); cqm_setup_limbo_handler(d, 0, cpu); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 2944042bd84c..40aba6b5d4f0 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -372,6 +372,8 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); void resctrl_enable_mon_event(enum resctrl_event_id eventid); +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); /** -- cgit v1.2.3 From 8810c6e7cca8fbfce7652b53e05acc465e671d28 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Tue, 9 Sep 2025 10:00:04 +0000 Subject: KVM: arm64: vgic-init: Remove vgic_ready() macro It is now used only within kvm_vgic_map_resources(). vgic_dist::ready is already written directly by this function, so it is clearer to bypass the macro for reads as well. Signed-off-by: Keir Fraser Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-init.c | 5 ++--- include/kvm/arm_vgic.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 1e680ad6e863..3f207b5f80a5 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -554,7 +554,6 @@ int vgic_lazy_init(struct kvm *kvm) * Also map the virtual CPU interface into the VM. * v2 calls vgic_init() if not already done. * v3 and derivatives return an error if the VGIC is not initialized. - * vgic_ready() returns true if this function has succeeded. */ int kvm_vgic_map_resources(struct kvm *kvm) { @@ -563,12 +562,12 @@ int kvm_vgic_map_resources(struct kvm *kvm) gpa_t dist_base; int ret = 0; - if (likely(vgic_ready(kvm))) + if (likely(dist->ready)) return 0; mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->arch.config_lock); - if (vgic_ready(kvm)) + if (dist->ready) goto out; if (!irqchip_in_kernel(kvm)) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 404883c7af6e..e7ffaf4bf2e7 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -406,7 +406,6 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) -#define vgic_ready(k) ((k)->arch.vgic.ready) #define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) -- cgit v1.2.3 From 7788255aba6545a27b8d143c5256536f8dfb2c0a Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Tue, 9 Sep 2025 10:00:06 +0000 Subject: KVM: Implement barriers before accessing kvm->buses[] on SRCU read paths This ensures that, if a VCPU has "observed" that an IO registration has occurred, the instruction currently being trapped or emulated will also observe the IO registration. At the same time, enforce that kvm_get_bus() is used only on the update side, ensuring that a long-term reference cannot be obtained by an SRCU reader. Signed-off-by: Keir Fraser Signed-off-by: Marc Zyngier --- arch/x86/kvm/vmx/vmx.c | 7 +++++++ include/linux/kvm_host.h | 10 +++++++--- virt/kvm/kvm_main.c | 32 ++++++++++++++++++++++++++------ 3 files changed, 40 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa157fe5b7b3..0bdf9405969a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5785,6 +5785,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; + /* + * Ensure that any updates to kvm->buses[] observed by the + * previous instruction (emulated or otherwise) are also + * visible to the instruction KVM is about to emulate. + */ + smp_rmb(); + if (!kvm_emulate_instruction(vcpu, 0)) return 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15656b7fba6c..e7d6111cf254 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -966,11 +966,15 @@ static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); } +/* + * Get a bus reference under the update-side lock. No long-term SRCU reader + * references are permitted, to avoid stale reads vs concurrent IO + * registrations. + */ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { - return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, - lockdep_is_held(&kvm->slots_lock) || - !refcount_read(&kvm->users_count)); + return rcu_dereference_protected(kvm->buses[idx], + lockdep_is_held(&kvm->slots_lock)); } static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c07dd423458..870ad8ea93a7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1103,6 +1103,14 @@ void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) { } +/* Called only on cleanup and destruction paths when there are no users. */ +static inline struct kvm_io_bus *kvm_get_bus_for_destruction(struct kvm *kvm, + enum kvm_bus idx) +{ + return rcu_dereference_protected(kvm->buses[idx], + !refcount_read(&kvm->users_count)); +} + static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); @@ -1228,7 +1236,7 @@ out_err_no_disable: out_err_no_arch_destroy_vm: WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) - kfree(kvm_get_bus(kvm, i)); + kfree(kvm_get_bus_for_destruction(kvm, i)); kvm_free_irq_routing(kvm); out_err_no_irq_routing: cleanup_srcu_struct(&kvm->irq_srcu); @@ -1276,7 +1284,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { - struct kvm_io_bus *bus = kvm_get_bus(kvm, i); + struct kvm_io_bus *bus = kvm_get_bus_for_destruction(kvm, i); if (bus) kvm_io_bus_destroy(bus); @@ -5843,6 +5851,18 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } +static struct kvm_io_bus *kvm_get_bus_srcu(struct kvm *kvm, enum kvm_bus idx) +{ + /* + * Ensure that any updates to kvm_buses[] observed by the previous vCPU + * machine instruction are also visible to the vCPU machine instruction + * that triggered this call. + */ + smp_mb__after_srcu_read_lock(); + + return srcu_dereference(kvm->buses[idx], &kvm->srcu); +} + int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { @@ -5855,7 +5875,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; r = __kvm_io_bus_write(vcpu, bus, &range, val); @@ -5874,7 +5894,7 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; @@ -5924,7 +5944,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; r = __kvm_io_bus_read(vcpu, bus, &range, val); @@ -6033,7 +6053,7 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, srcu_idx = srcu_read_lock(&kvm->srcu); - bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + bus = kvm_get_bus_srcu(kvm, bus_idx); if (!bus) goto out_unlock; -- cgit v1.2.3 From 7d9a0273c45962e9a6bc06f3b87eef7c431c1853 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Tue, 9 Sep 2025 10:00:07 +0000 Subject: KVM: Avoid synchronize_srcu() in kvm_io_bus_register_dev() Device MMIO registration may happen quite frequently during VM boot, and the SRCU synchronization each time has a measurable effect on VM startup time. In our experiments it can account for around 25% of a VM's startup time. Replace the synchronization with a deferred free of the old kvm_io_bus structure. Tested-by: Li RongQing Signed-off-by: Keir Fraser Signed-off-by: Marc Zyngier --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e7d6111cf254..103be35caf0d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -206,6 +206,7 @@ struct kvm_io_range { struct kvm_io_bus { int dev_count; int ioeventfd_count; + struct rcu_head rcu; struct kvm_io_range range[]; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 870ad8ea93a7..bcef324ccbf2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1320,6 +1320,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_free_memslots(kvm, &kvm->__memslots[i][1]); } cleanup_srcu_struct(&kvm->irq_srcu); + srcu_barrier(&kvm->srcu); cleanup_srcu_struct(&kvm->srcu); #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES xa_destroy(&kvm->mem_attr_array); @@ -5952,6 +5953,13 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, } EXPORT_SYMBOL_GPL(kvm_io_bus_read); +static void __free_bus(struct rcu_head *rcu) +{ + struct kvm_io_bus *bus = container_of(rcu, struct kvm_io_bus, rcu); + + kfree(bus); +} + int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) { @@ -5990,8 +5998,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, memcpy(new_bus->range + i + 1, bus->range + i, (bus->dev_count - i) * sizeof(struct kvm_io_range)); rcu_assign_pointer(kvm->buses[bus_idx], new_bus); - synchronize_srcu_expedited(&kvm->srcu); - kfree(bus); + call_srcu(&kvm->srcu, &bus->rcu, __free_bus); return 0; } -- cgit v1.2.3 From 83b039877310ae1eb614eef17b780df1e10d9fb5 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:03 -0500 Subject: x86,fs/resctrl: Prepare for more monitor events There's a rule in computer programming that objects appear zero, once, or many times. So code accordingly. There are two MBM events and resctrl is coded with a lot of if (local) do one thing if (total) do a different thing Change the rdt_mon_domain and rdt_hw_mon_domain structures to hold arrays of pointers to per event data instead of explicit fields for total and local bandwidth. Simplify by coding for many events using loops on which are enabled. Move resctrl_is_mbm_event() to so it can be used more widely. Also provide a for_each_mbm_event_id() helper macro. Cleanup variable names in functions touched to consistently use "eventid" for those with type enum resctrl_event_id. Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/core.c | 40 +++++++++++++++------------ arch/x86/kernel/cpu/resctrl/internal.h | 8 +++--- arch/x86/kernel/cpu/resctrl/monitor.c | 36 ++++++++++++------------ fs/resctrl/monitor.c | 13 ++++----- fs/resctrl/rdtgroup.c | 50 +++++++++++++++++----------------- include/linux/resctrl.h | 23 +++++++++++++--- include/linux/resctrl_types.h | 3 ++ 7 files changed, 96 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 5d14f9a14eda..fbf019c1ff11 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -365,8 +365,10 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { - kfree(hw_dom->arch_mbm_total); - kfree(hw_dom->arch_mbm_local); + int idx; + + for_each_mbm_idx(idx) + kfree(hw_dom->arch_mbm_states[idx]); kfree(hw_dom); } @@ -400,25 +402,27 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * */ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { - size_t tsize; - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { - tsize = sizeof(*hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_total) - return -ENOMEM; - } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { - tsize = sizeof(*hw_dom->arch_mbm_local); - hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_local) { - kfree(hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = NULL; - return -ENOMEM; - } + size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + for_each_mbm_idx(idx) { + kfree(hw_dom->arch_mbm_states[idx]); + hw_dom->arch_mbm_states[idx] = NULL; + } + + return -ENOMEM; } static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5e3c41b36437..58dca892a5df 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -54,15 +54,15 @@ struct rdt_hw_ctrl_domain { * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share * a resource for a monitor function * @d_resctrl: Properties exposed to the resctrl file system - * @arch_mbm_total: arch private state for MBM total bandwidth - * @arch_mbm_local: arch private state for MBM local bandwidth + * @arch_mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct arch_mbm_state + * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. */ struct rdt_hw_mon_domain { struct rdt_mon_domain d_resctrl; - struct arch_mbm_state *arch_mbm_total; - struct arch_mbm_state *arch_mbm_local; + struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 07f8ab097cbe..f01db2034d08 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -161,18 +161,14 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do u32 rmid, enum resctrl_event_id eventid) { - switch (eventid) { - case QOS_L3_OCCUP_EVENT_ID: - return NULL; - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &hw_dom->arch_mbm_total[rmid]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &hw_dom->arch_mbm_local[rmid]; - default: - /* Never expect to get here */ - WARN_ON_ONCE(1); + struct arch_mbm_state *state; + + if (!resctrl_is_mbm_event(eventid)) return NULL; - } + + state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; + + return state ? &state[rmid] : NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -201,14 +197,16 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) - memset(hw_dom->arch_mbm_total, 0, - sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) - memset(hw_dom->arch_mbm_local, 0, - sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + memset(hw_dom->arch_mbm_states[idx], 0, + sizeof(*hw_dom->arch_mbm_states[0]) * r->num_rmid); + } } static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b0b1dcc36795..e0dfa5fb969e 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -346,15 +346,14 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *state; - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: + if (!resctrl_is_mbm_event(evtid)) return NULL; - } + + state = d->mbm_states[MBM_STATE_IDX(evtid)]; + + return state ? &state[idx] : NULL; } static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 2ca8e66c0d20..a6047e9345cd 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -127,12 +127,6 @@ static bool resctrl_is_mbm_enabled(void) resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - /* * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap * of free CLOSIDs. @@ -4020,9 +4014,13 @@ static void rdtgroup_setup_default(void) static void domain_destroy_mon_state(struct rdt_mon_domain *d) { + int idx; + bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } } void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4082,32 +4080,34 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; + size_t tsize = sizeof(*d->mbm_states[0]); + enum resctrl_event_id eventid; + int idx; if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } - } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + bitmap_free(d->rmid_busy_llc); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } + + return -ENOMEM; } int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 40aba6b5d4f0..478d7a935ca3 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -161,8 +161,9 @@ struct rdt_ctrl_domain { * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold - * @mbm_total: saved state for MBM total bandwidth - * @mbm_local: saved state for MBM local bandwidth + * @mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct mbm_state + * indexed by RMID on x86 or combined CLOSID, RMID on Arm. * @mbm_over: worker to periodically read MBM h/w counters * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters @@ -172,8 +173,7 @@ struct rdt_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; + struct mbm_state *mbm_states[QOS_NUM_L3_MBM_EVENTS]; struct delayed_work mbm_over; struct delayed_work cqm_limbo; int mbm_work_cpu; @@ -376,6 +376,21 @@ bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); +static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) +{ + return (eventid >= QOS_L3_MBM_TOTAL_EVENT_ID && + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/* Iterate over all memory bandwidth events */ +#define for_each_mbm_event_id(eventid) \ + for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID; eventid++) + +/* Iterate over memory bandwidth arrays in domain structures */ +#define for_each_mbm_idx(idx) \ + for (idx = 0; idx < QOS_NUM_L3_MBM_EVENTS; idx++) + /** * resctrl_arch_mon_event_config_write() - Write the config for an event. * @config_info: struct resctrl_mon_config_info describing the resource, domain diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index 2dadbc54e4b3..d98351663c2c 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -51,4 +51,7 @@ enum resctrl_event_id { QOS_NUM_EVENTS, }; +#define QOS_NUM_L3_MBM_EVENTS (QOS_L3_MBM_LOCAL_EVENT_ID - QOS_L3_MBM_TOTAL_EVENT_ID + 1) +#define MBM_STATE_IDX(evt) ((evt) - QOS_L3_MBM_TOTAL_EVENT_ID) + #endif /* __LINUX_RESCTRL_TYPES_H */ -- cgit v1.2.3 From 5ad68c8f965fed78c61f2ac7aea933f06bb50032 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:06 -0500 Subject: x86,fs/resctrl: Consolidate monitoring related data from rdt_resource The cache allocation and memory bandwidth allocation feature properties are consolidated into struct resctrl_cache and struct resctrl_membw respectively. In preparation for more monitoring properties that will clobber the existing resource struct more, re-organize the monitoring specific properties to also be in a separate structure. Also convert "bandwidth sources" terminology to "memory transactions" to have consistency within resctrl for related monitoring features. [ bp: Massage commit message. ] Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- arch/x86/kernel/cpu/resctrl/monitor.c | 10 +++++----- fs/resctrl/rdtgroup.c | 6 +++--- include/linux/resctrl.h | 18 +++++++++++++----- 4 files changed, 23 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b07b12a05886..267e9206a999 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -107,7 +107,7 @@ u32 resctrl_arch_system_num_rmid_idx(void) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return r->num_rmid; + return r->mon.num_rmid; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) @@ -541,7 +541,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { mon_domain_free(hw_dom); return; } diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f01db2034d08..2558b1bdef8b 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -130,7 +130,7 @@ static int logical_rmid_to_physical_rmid(int cpu, int lrmid) if (snc_nodes_per_l3_cache == 1) return lrmid; - return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; } static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) @@ -205,7 +205,7 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * continue; idx = MBM_STATE_IDX(eventid); memset(hw_dom->arch_mbm_states[idx], 0, - sizeof(*hw_dom->arch_mbm_states[0]) * r->num_rmid); + sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); } } @@ -344,7 +344,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; - r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; + r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -359,7 +359,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - threshold = resctrl_rmid_realloc_limit / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; /* * Because num_rmid may not be a power of two, round the value @@ -373,7 +373,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } r->mon_capable = true; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index a6047e9345cd..b6ab10704993 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1135,7 +1135,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->num_rmid); + seq_printf(seq, "%d\n", r->mon.num_rmid); return 0; } @@ -1731,9 +1731,9 @@ next: } /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { + if ((val & r->mon.mbm_cfg_mask) != val) { rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); + r->mon.mbm_cfg_mask); return -EINVAL; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 478d7a935ca3..fe2af6cb96d4 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -255,38 +255,46 @@ enum resctrl_schema_fmt { RESCTRL_SCHEMA_RANGE, }; +/** + * struct resctrl_mon - Monitoring related data of a resctrl resource. + * @num_rmid: Number of RMIDs available. + * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth + * monitoring events can be configured. + */ +struct resctrl_mon { + int num_rmid; + unsigned int mbm_cfg_mask; +}; + /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine - * @num_rmid: Number of RMIDs available * @ctrl_scope: Scope of this resource for control functions * @mon_scope: Scope of this resource for monitor functions * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. + * @mon: Monitoring related data. * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. - * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth - * monitoring events can be configured. * @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { int rid; bool alloc_capable; bool mon_capable; - int num_rmid; enum resctrl_scope ctrl_scope; enum resctrl_scope mon_scope; struct resctrl_cache cache; struct resctrl_membw membw; + struct resctrl_mon mon; struct list_head ctrl_domains; struct list_head mon_domains; char *name; enum resctrl_schema_fmt schema_fmt; - unsigned int mbm_cfg_mask; bool cdp_capable; }; -- cgit v1.2.3 From 13390861b426e936db20d675804a5b405622bc79 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:07 -0500 Subject: x86,fs/resctrl: Detect Assignable Bandwidth Monitoring feature details ABMC feature details are reported via CPUID Fn8000_0020_EBX_x5. Bits Description 15:0 MAX_ABMC Maximum Supported Assignable Bandwidth Monitoring Counter ID + 1 The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). Detect the feature and number of assignable counters supported. For backward compatibility, upon detecting the assignable counter feature, enable the mbm_total_bytes and mbm_local_bytes events that users are familiar with as part of original L3 MBM support. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] --- arch/x86/kernel/cpu/resctrl/core.c | 7 +++++-- arch/x86/kernel/cpu/resctrl/monitor.c | 11 ++++++++--- fs/resctrl/monitor.c | 7 +++++++ include/linux/resctrl.h | 4 ++++ 4 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 267e9206a999..2e68aa02ad3f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -883,6 +883,8 @@ static __init bool get_rdt_mon_resources(void) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); ret = true; } + if (rdt_cpu_has(X86_FEATURE_ABMC)) + ret = true; if (!ret) return false; @@ -978,7 +980,7 @@ static enum cpuhp_state rdt_online; /* Runs once on the BSP during boot. */ void resctrl_cpu_detect(struct cpuinfo_x86 *c) { - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; c->x86_cache_mbm_width_offset = -1; @@ -990,7 +992,8 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) || + cpu_has(c, X86_FEATURE_ABMC)) { u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2558b1bdef8b..0a695ce68f46 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -339,6 +339,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; + u32 eax, ebx, ecx, edx; snc_nodes_per_l3_cache = snc_get_config(); @@ -368,14 +369,18 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); - if (rdt_cpu_has(X86_FEATURE_BMEC)) { - u32 eax, ebx, ecx, edx; - + if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } + if (rdt_cpu_has(X86_FEATURE_ABMC)) { + r->mon.mbm_cntr_assignable = true; + cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); + r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; + } + r->mon_capable = true; return 0; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index e0dfa5fb969e..b578451de2b5 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -922,6 +922,13 @@ int resctrl_mon_resource_init(void) else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + if (r->mon.mbm_cntr_assignable) { + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + } + return 0; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index fe2af6cb96d4..eb80cc233be4 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -260,10 +260,14 @@ enum resctrl_schema_fmt { * @num_rmid: Number of RMIDs available. * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth * monitoring events can be configured. + * @num_mbm_cntrs: Number of assignable counters. + * @mbm_cntr_assignable:Is system capable of supporting counter assignment? */ struct resctrl_mon { int num_rmid; unsigned int mbm_cfg_mask; + int num_mbm_cntrs; + bool mbm_cntr_assignable; }; /** -- cgit v1.2.3 From faebbc58cde9d8f6050ac152c34c88195ed4abaa Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:08 -0500 Subject: x86/resctrl: Add support to enable/disable AMD ABMC feature Add the functionality to enable/disable the AMD ABMC feature. The AMD ABMC feature is enabled by setting enabled bit(0) in the L3_QOS_EXT_CFG MSR. When the state of ABMC is changed, the MSR needs to be updated on all the logical processors in the QOS Domain. Hardware counters will reset when ABMC state is changed. [ bp: Massage commit message. ] Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/resctrl/internal.h | 5 ++++ arch/x86/kernel/cpu/resctrl/monitor.c | 45 ++++++++++++++++++++++++++++++++++ include/linux/resctrl.h | 20 +++++++++++++++ 4 files changed, 71 insertions(+) (limited to 'include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..e4945e5c92ef 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1223,6 +1223,7 @@ /* - AMD: */ #define MSR_IA32_MBA_BW_BASE 0xc0000200 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff #define MSR_IA32_EVT_CFG_BASE 0xc0000400 /* AMD-V MSRs */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 58dca892a5df..a79a487e639c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,9 @@ struct arch_mbm_state { u64 prev_msr; }; +/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ +#define ABMC_ENABLE_BIT 0 + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -102,6 +105,7 @@ struct msr_param { * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource + * @mbm_cntr_assign_enabled: ABMC feature is enabled * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -115,6 +119,7 @@ struct rdt_hw_resource { unsigned int mon_scale; unsigned int mbm_width; bool cdp_enabled; + bool mbm_cntr_assign_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 0a695ce68f46..cce35a0ad455 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -399,3 +399,48 @@ void __init intel_rdt_mbm_apply_quirk(void) mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; mbm_cf = mbm_cf_table[cf_index].cf; } + +static void resctrl_abmc_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); +} + +/* + * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs + * associated with all monitor domains. + */ +static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd, + &enable, 1); + resctrl_arch_reset_rmid_all(r, d); + } +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (r->mon.mbm_cntr_assignable && + hw_res->mbm_cntr_assign_enabled != enable) { + _resctrl_abmc_enable(r, enable); + hw_res->mbm_cntr_assign_enabled = enable; + } + + return 0; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; +} diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index eb80cc233be4..919806122c50 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -445,6 +445,26 @@ static inline u32 resctrl_get_config_index(u32 closid, bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l); int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); +/** + * resctrl_arch_mbm_cntr_assign_enabled() - Check if MBM counter assignment + * mode is enabled. + * @r: Pointer to the resource structure. + * + * Return: + * true if the assignment mode is enabled, false otherwise. + */ +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); + +/** + * resctrl_arch_mbm_cntr_assign_set() - Configure the MBM counter assignment mode. + * @r: Pointer to the resource structure. + * @enable: Set to true to enable, false to disable the assignment mode. + * + * Return: + * 0 on success, < 0 on error. + */ +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. -- cgit v1.2.3 From 4d32c24a74f2c12ff440d381ba01de574f6631ce Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:11 -0500 Subject: fs/resctrl: Introduce mbm_cntr_cfg to track assignable counters per domain The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Counters are assigned/unassigned at monitoring domain level. Manage a monitoring domain's hardware counters using a per monitoring domain array of struct mbm_cntr_cfg that is indexed by the hardware counter ID. A hardware counter's configuration contains the MBM event ID and points to the monitoring group that it is assigned to, with a NULL pointer meaning that the hardware counter is available for assignment. There is no direct way to determine which hardware counters are assigned to a particular monitoring group. Check every entry of every hardware counter configuration array in every monitoring domain to query which MBM events of a monitoring group is tracked by hardware. Such queries are acceptable because of a very small number of assignable counters (32 to 64). Suggested-by: Peter Newman Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- fs/resctrl/rdtgroup.c | 8 ++++++++ include/linux/resctrl.h | 15 +++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 9d95d01da3f9..61f7b68f2273 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -4029,6 +4029,7 @@ static void domain_destroy_mon_state(struct rdt_mon_domain *d) { int idx; + kfree(d->cntr_cfg); bitmap_free(d->rmid_busy_llc); for_each_mbm_idx(idx) { kfree(d->mbm_states[idx]); @@ -4112,6 +4113,13 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain goto cleanup; } + if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) { + tsize = sizeof(*d->cntr_cfg); + d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL); + if (!d->cntr_cfg) + goto cleanup; + } + return 0; cleanup: bitmap_free(d->rmid_busy_llc); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 919806122c50..e013caba6641 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -156,6 +156,18 @@ struct rdt_ctrl_domain { u32 *mbps_val; }; +/** + * struct mbm_cntr_cfg - Assignable counter configuration. + * @evtid: MBM event to which the counter is assigned. Only valid + * if @rdtgroup is not NULL. + * @rdtgrp: resctrl group assigned to the counter. NULL if the + * counter is free. + */ +struct mbm_cntr_cfg { + enum resctrl_event_id evtid; + struct rdtgroup *rdtgrp; +}; + /** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types @@ -168,6 +180,8 @@ struct rdt_ctrl_domain { * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters * @cqm_work_cpu: worker CPU for CQM h/w counters + * @cntr_cfg: array of assignable counters' configuration (indexed + * by counter ID) */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; @@ -178,6 +192,7 @@ struct rdt_mon_domain { struct delayed_work cqm_limbo; int mbm_work_cpu; int cqm_work_cpu; + struct mbm_cntr_cfg *cntr_cfg; }; /** -- cgit v1.2.3 From ebebda853633de389ba2c6737f8ca38405713e90 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:14 -0500 Subject: fs/resctrl: Introduce event configuration field in struct mon_evt When supported, mbm_event counter assignment mode allows the user to configure events to track specific types of memory transactions. Introduce an evt_cfg field in struct mon_evt to define the type of memory transactions tracked by a monitoring event. Also add a helper function to get the evt_cfg value. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- fs/resctrl/internal.h | 5 +++++ fs/resctrl/monitor.c | 10 ++++++++++ include/linux/resctrl.h | 2 ++ 3 files changed, 17 insertions(+) (limited to 'include') diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 4f372e80bf37..1cddfff007a2 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -56,6 +56,10 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * @evtid: event id * @rid: resource id for this event * @name: name of the event + * @evt_cfg: Event configuration value that represents the + * memory transactions (e.g., READS_TO_LOCAL_MEM, + * READS_TO_REMOTE_MEM) being tracked by @evtid. + * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable * @enabled: true if the event is enabled */ @@ -63,6 +67,7 @@ struct mon_evt { enum resctrl_event_id evtid; enum resctrl_res_level rid; char *name; + u32 evt_cfg; bool configurable; bool enabled; }; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 1fa82a62b2e5..f714e7baaea6 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -882,6 +882,11 @@ bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) mon_event_all[eventid].enabled; } +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) +{ + return mon_event_all[evtid].evt_cfg; +} + int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { @@ -1023,6 +1028,11 @@ int resctrl_mon_resource_init(void) resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("available_mbm_cntrs", diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index e013caba6641..87daa4ca312d 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -409,6 +409,8 @@ static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); } +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id eventid); + /* Iterate over all memory bandwidth events */ #define for_each_mbm_event_id(eventid) \ for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ -- cgit v1.2.3 From feb8ae81b2378b75a99c81d315602ac8918ed382 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 12 Sep 2025 21:54:53 +0200 Subject: ACPICA: Allow to skip Global Lock initialization Introduce acpi_gbl_use_global_lock, which allows to skip the Global Lock initialization. This is useful for systems without Global Lock (such as loong_arch), so as to avoid error messages during boot phase: ACPI Error: Could not enable global_lock event (20240827/evxfevnt-182) ACPI Error: No response from Global Lock hardware, disabling lock (20240827/evglock-59) Link: https://github.com/acpica/acpica/commit/463cb0fe Signed-off-by: Huacai Chen Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evglock.c | 4 ++++ include/acpi/acpixf.h | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/drivers/acpi/acpica/evglock.c b/drivers/acpi/acpica/evglock.c index fa3e0d00d1ca..df2a4ab0e0da 100644 --- a/drivers/acpi/acpica/evglock.c +++ b/drivers/acpi/acpica/evglock.c @@ -42,6 +42,10 @@ acpi_status acpi_ev_init_global_lock_handler(void) return_ACPI_STATUS(AE_OK); } + if (!acpi_gbl_use_global_lock) { + return_ACPI_STATUS(AE_OK); + } + /* Attempt installation of the global lock handler */ status = acpi_install_fixed_event_handler(ACPI_EVENT_GLOBAL, diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index b49396aa4058..97c25ae8a36e 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -213,6 +213,12 @@ ACPI_INIT_GLOBAL(u8, acpi_gbl_osi_data, 0); */ ACPI_INIT_GLOBAL(u8, acpi_gbl_reduced_hardware, FALSE); +/* + * ACPI Global Lock is mainly used for systems with SMM, so no-SMM systems + * (such as loong_arch) may not have and not use Global Lock. + */ +ACPI_INIT_GLOBAL(u8, acpi_gbl_use_global_lock, TRUE); + /* * Maximum timeout for While() loop iterations before forced method abort. * This mechanism is intended to prevent infinite loops during interpreter -- cgit v1.2.3 From 12fd607554c4efb4856959f0e5823f541bc0e701 Mon Sep 17 00:00:00 2001 From: Ahmed Salem Date: Fri, 12 Sep 2025 21:55:35 +0200 Subject: ACPICA: Apply ACPI_NONSTRING Add ACPI_NONSTRING for destination char arrays without a terminating NUL character. This is a follow-up to commit 2b82118845e0 ("ACPICA: Apply ACPI_NONSTRING") where a few more destination arrays were missed. Link: https://github.com/acpica/acpica/commit/f359e5ed Fixes: 2b82118845e0 ("ACPICA: Apply ACPI_NONSTRING") Signed-off-by: Ahmed Salem Signed-off-by: Rafael J. Wysocki --- include/acpi/actbl.h | 2 +- tools/power/acpi/os_specific/service_layers/oslinuxtbl.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index 243097a3da63..8a67d4ea6e3f 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -73,7 +73,7 @@ struct acpi_table_header { char oem_id[ACPI_OEM_ID_SIZE] ACPI_NONSTRING; /* ASCII OEM identification */ char oem_table_id[ACPI_OEM_TABLE_ID_SIZE] ACPI_NONSTRING; /* ASCII OEM table identification */ u32 oem_revision; /* OEM revision number */ - char asl_compiler_id[ACPI_NAMESEG_SIZE]; /* ASCII ASL compiler vendor ID */ + char asl_compiler_id[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; /* ASCII ASL compiler vendor ID */ u32 asl_compiler_revision; /* ASL compiler version */ }; diff --git a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c index 9741e7503591..de93067a5da3 100644 --- a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c +++ b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c @@ -995,7 +995,7 @@ static acpi_status osl_list_customized_tables(char *directory) { void *table_dir; u32 instance; - char temp_name[ACPI_NAMESEG_SIZE]; + char temp_name[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; char *filename; acpi_status status = AE_OK; @@ -1312,7 +1312,7 @@ osl_get_customized_table(char *pathname, { void *table_dir; u32 current_instance = 0; - char temp_name[ACPI_NAMESEG_SIZE]; + char temp_name[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; char table_filename[PATH_MAX]; char *filename; acpi_status status; -- cgit v1.2.3 From e2c80b3c23782d809b4e15da7d8a0135a8d350b5 Mon Sep 17 00:00:00 2001 From: Saket Dumbre Date: Fri, 12 Sep 2025 22:01:52 +0200 Subject: ACPICA: Print error messages for too few or too many arguments Fix Issue #1027 by displaying error messages when there are too few or too many arguments in the caller vs the definition of an ASL/AML method. Link: https://github.com/acpica/acpica/commit/cbc243e4 Reported-by: Peter Williams Signed-off-by: Rafael J. Wysocki Tested-by: Hans de Goede Signed-off-by: Saket Dumbre --- drivers/acpi/acpica/dsmethod.c | 11 +++++++++-- include/acpi/acexcep.h | 10 ++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/dsmethod.c b/drivers/acpi/acpica/dsmethod.c index b2f756b7078d..45ec32e81903 100644 --- a/drivers/acpi/acpica/dsmethod.c +++ b/drivers/acpi/acpica/dsmethod.c @@ -483,10 +483,17 @@ acpi_ds_call_control_method(struct acpi_thread_state *thread, } if (this_walk_state->num_operands < obj_desc->method.param_count) { - ACPI_ERROR((AE_INFO, "Missing argument for method [%4.4s]", + ACPI_ERROR((AE_INFO, "Missing argument(s) for method [%4.4s]", acpi_ut_get_node_name(method_node))); - return_ACPI_STATUS(AE_AML_UNINITIALIZED_ARG); + return_ACPI_STATUS(AE_AML_TOO_FEW_ARGUMENTS); + } + + else if (this_walk_state->num_operands > obj_desc->method.param_count) { + ACPI_ERROR((AE_INFO, "Too many arguments for method [%4.4s]", + acpi_ut_get_node_name(method_node))); + + return_ACPI_STATUS(AE_AML_TOO_MANY_ARGUMENTS); } /* Init for new method, possibly wait on method mutex */ diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index 53c98f5fe3c3..a2db36d18419 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -173,8 +173,10 @@ struct acpi_exception_info { #define AE_AML_TARGET_TYPE EXCEP_AML (0x0023) #define AE_AML_PROTOCOL EXCEP_AML (0x0024) #define AE_AML_BUFFER_LENGTH EXCEP_AML (0x0025) +#define AE_AML_TOO_FEW_ARGUMENTS EXCEP_AML (0x0026) +#define AE_AML_TOO_MANY_ARGUMENTS EXCEP_AML (0x0027) -#define AE_CODE_AML_MAX 0x0025 +#define AE_CODE_AML_MAX 0x0027 /* * Internal exceptions used for control @@ -353,7 +355,11 @@ static const struct acpi_exception_info acpi_gbl_exception_names_aml[] = { "A target operand of an incorrect type was encountered"), EXCEP_TXT("AE_AML_PROTOCOL", "Violation of a fixed ACPI protocol"), EXCEP_TXT("AE_AML_BUFFER_LENGTH", - "The length of the buffer is invalid/incorrect") + "The length of the buffer is invalid/incorrect"), + EXCEP_TXT("AE_AML_TOO_FEW_ARGUMENTS", + "There are fewer than expected method arguments"), + EXCEP_TXT("AE_AML_TOO_MANY_ARGUMENTS", + "There are too many arguments for this method") }; static const struct acpi_exception_info acpi_gbl_exception_names_ctrl[] = { -- cgit v1.2.3 From b0fb6891b8ad55cb5767edf94351ffe4f0b9a404 Mon Sep 17 00:00:00 2001 From: Saket Dumbre Date: Fri, 12 Sep 2025 22:02:32 +0200 Subject: ACPICA: Update version to 20250807 Link: https://github.com/acpica/acpica/commit/0845a773 Signed-off-by: Saket Dumbre Signed-off-by: Rafael J. Wysocki --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 97c25ae8a36e..e65a2afe9250 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -12,7 +12,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20250404 +#define ACPI_CA_VERSION 0x20250807 #include #include -- cgit v1.2.3 From eddf12041ddd8c40289015f42285c5ac614b5c30 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 12 Sep 2025 22:04:04 +0200 Subject: ACPICA: CEDT: Add Back-Invalidate restriction to CXL Window This is added in newer version (3.0+) of the CXL Spec to support the HDM-DB coherency model. Link: https://github.com/acpica/acpica/commit/a6886da1 Signed-off-by: Davidlohr Bueso Signed-off-by: Rafael J. Wysocki --- include/acpi/actbl1.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 99fd1588ff38..0b4c332df25c 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -565,6 +565,7 @@ struct acpi_cedt_cfmws_target_element { #define ACPI_CEDT_CFMWS_RESTRICT_VOLATILE (1<<2) #define ACPI_CEDT_CFMWS_RESTRICT_PMEM (1<<3) #define ACPI_CEDT_CFMWS_RESTRICT_FIXED (1<<4) +#define ACPI_CEDT_CFMWS_RESTRICT_BI (1<<5) /* 2: CXL XOR Interleave Math Structure */ -- cgit v1.2.3 From f7a4fb22312646329ba21bc58958fd83fb9fc15d Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:15 -0500 Subject: x86,fs/resctrl: Implement resctrl_arch_config_cntr() to assign a counter with ABMC The ABMC feature allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Implement an x86 architecture-specific handler to configure a counter. This architecture specific handler is called by resctrl fs when a counter is assigned or unassigned as well as when an already assigned counter's configuration should be updated. Configure counters by writing to the L3_QOS_ABMC_CFG MSR, specifying the counter ID, bandwidth source (RMID), and event configuration. The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] --- arch/x86/kernel/cpu/resctrl/monitor.c | 36 +++++++++++++++++++++++++++++++++++ include/linux/resctrl.h | 19 ++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'include') diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index cce35a0ad455..ed295a6c5e66 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -444,3 +444,39 @@ bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; } + +static void resctrl_abmc_config_one_amd(void *info) +{ + union l3_qos_abmc_cfg *abmc_cfg = info; + + wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full); +} + +/* + * Send an IPI to the domain to assign the counter to RMID, event pair. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + union l3_qos_abmc_cfg abmc_cfg = { 0 }; + struct arch_mbm_state *am; + + abmc_cfg.split.cfg_en = 1; + abmc_cfg.split.cntr_en = assign ? 1 : 0; + abmc_cfg.split.cntr_id = cntr_id; + abmc_cfg.split.bw_src = rmid; + if (assign) + abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid); + + smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1); + + /* + * The hardware counter is reset (because cfg_en == 1) so there is no + * need to record initial non-zero counts. + */ + am = get_arch_mbm_state(hw_dom, rmid, evtid); + if (am) + memset(am, 0, sizeof(*am)); +} diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 87daa4ca312d..50e38445183a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -594,6 +594,25 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * */ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); +/** + * resctrl_arch_config_cntr() - Configure the counter with its new RMID + * and event details. + * @r: Resource structure. + * @d: The domain in which counter with ID @cntr_id should be configured. + * @evtid: Monitoring event type (e.g., QOS_L3_MBM_TOTAL_EVENT_ID + * or QOS_L3_MBM_LOCAL_EVENT_ID). + * @rmid: RMID. + * @closid: CLOSID. + * @cntr_id: Counter ID to configure. + * @assign: True to assign the counter or update an existing assignment, + * false to unassign the counter. + * + * This can be called from any CPU. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; -- cgit v1.2.3 From 862314fd1f93d96eddb0559a807c66cb1f6ee520 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:19 -0500 Subject: fs/resctrl: Introduce counter ID read, reset calls in mbm_event mode When supported, "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor the bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Introduce the architecture calls resctrl_arch_cntr_read() and resctrl_arch_reset_cntr() to read and reset event counters when "mbm_event" mode is supported. Function names match existing resctrl_arch_rmid_read() and resctrl_arch_reset_rmid(). Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- include/linux/resctrl.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 50e38445183a..04152654827d 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -613,6 +613,44 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign); +/** + * resctrl_arch_cntr_read() - Read the event data corresponding to the counter ID + * assigned to the RMID, event pair for this resource + * and domain. + * @r: Resource that the counter should be read from. + * @d: Domain that the counter should be read from. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to read. + * @eventid: The MBM event to which @cntr_id is assigned. + * @val: Result of the counter read in bytes. + * + * Called on a CPU that belongs to domain @d when "mbm_event" mode is enabled. + * Called from a non-migrateable process context via smp_call_on_cpu() unless all + * CPUs are nohz_full, in which case it is called via IPI (smp_call_function_any()). + * + * Return: + * 0 on success, or -EIO, -EINVAL etc on error. + */ +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val); + +/** + * resctrl_arch_reset_cntr() - Reset any private state associated with counter ID. + * @r: The domain's resource. + * @d: The counter ID's domain. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to reset. + * @eventid: The MBM event to which @cntr_id is assigned. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; -- cgit v1.2.3 From ea274cbeaf8f0667267b347e3f84797439cdab4e Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:23 -0500 Subject: fs/resctrl: Add event configuration directory under info/L3_MON/ The "mbm_event" counter assignment mode allows the user to assign a hardware counter to an RMID, event pair and monitor the bandwidth as long as it is assigned. The user can specify the memory transaction(s) for the counter to track. When this mode is supported, the /sys/fs/resctrl/info/L3_MON/event_configs directory contains a sub-directory for each MBM event that can be assigned to a counter. The MBM event sub-directory contains a file named "event_filter" that is used to view and modify which memory transactions the MBM event is configured with. Create /sys/fs/resctrl/info/L3_MON/event_configs directory on resctrl mount and pre-populate it with directories for the two existing MBM events: mbm_total_bytes and mbm_local_bytes. Create the "event_filter" file within each MBM event directory with the needed *show() that displays the memory transactions with which the MBM event is configured. Example: $ mount -t resctrl resctrl /sys/fs/resctrl $ cd /sys/fs/resctrl/ $ cat info/L3_MON/event_configs/mbm_total_bytes/event_filter local_reads,remote_reads,local_non_temporal_writes, remote_non_temporal_writes,local_reads_slow_memory, remote_reads_slow_memory,dirty_victim_writes_all $ cat info/L3_MON/event_configs/mbm_local_bytes/event_filter local_reads,local_non_temporal_writes,local_reads_slow_memory Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 33 ++++++++++++++++++++ fs/resctrl/internal.h | 4 +++ fs/resctrl/monitor.c | 56 +++++++++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 59 +++++++++++++++++++++++++++++++++-- include/linux/resctrl_types.h | 3 ++ 5 files changed, 153 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 4c24c5f3f4c1..ddd95f1472e6 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -310,6 +310,39 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs 0=30;1=30 +"event_configs": + Directory that exists when "mbm_event" counter assignment mode is supported. + Contains a sub-directory for each MBM event that can be assigned to a counter. + + Two MBM events are supported by default: mbm_local_bytes and mbm_total_bytes. + Each MBM event's sub-directory contains a file named "event_filter" that is + used to view and modify which memory transactions the MBM event is configured + with. The file is accessible only when "mbm_event" counter assignment mode is + enabled. + + List of memory transaction types supported: + + ========================== ======================================================== + Name Description + ========================== ======================================================== + dirty_victim_writes_all Dirty Victims from the QOS domain to all types of memory + remote_reads_slow_memory Reads to slow memory in the non-local NUMA domain + local_reads_slow_memory Reads to slow memory in the local NUMA domain + remote_non_temporal_writes Non-temporal writes to non-local NUMA domain + local_non_temporal_writes Non-temporal writes to local NUMA domain + remote_reads Reads to memory in the non-local NUMA domain + local_reads Reads to memory in the local NUMA domain + ========================== ======================================================== + + For example:: + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes, + local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 2f1f2efe2f40..9bf2e2fd5c19 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -241,6 +241,8 @@ struct rdtgroup { #define RFTYPE_DEBUG BIT(10) +#define RFTYPE_ASSIGN_CONFIG BIT(11) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) @@ -403,6 +405,8 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 55327056596e..7179f9865a48 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -972,6 +972,61 @@ u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) return mon_event_all[evtid].evt_cfg; } +/** + * struct mbm_transaction - Memory transaction an MBM event can be configured with. + * @name: Name of memory transaction (read, write ...). + * @val: The bit (eg. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to + * represent the memory transaction within an event's configuration. + */ +struct mbm_transaction { + char name[32]; + u32 val; +}; + +/* Decoded values for each type of memory transaction. */ +static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = { + {"local_reads", READS_TO_LOCAL_MEM}, + {"remote_reads", READS_TO_REMOTE_MEM}, + {"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM}, + {"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM}, + {"local_reads_slow_memory", READS_TO_LOCAL_S_MEM}, + {"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM}, + {"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM}, +}; + +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + bool sep = false; + int ret = 0, i; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (mevt->evt_cfg & mbm_transactions[i].val) { + if (sep) + seq_putc(seq, ','); + seq_printf(seq, "%s", mbm_transactions[i].name); + sep = true; + } + } + seq_putc(seq, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + /* * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID * pair in the domain. @@ -1287,6 +1342,7 @@ int resctrl_mon_resource_init(void) RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 2e1d0a2703da..8f0c403e3fb5 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1923,6 +1923,12 @@ static struct rftype res_common_files[] = { .seq_show = mbm_local_bytes_config_show, .write = mbm_local_bytes_config_write, }, + { + .name = "event_filter", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = event_filter_show, + }, { .name = "mbm_assign_mode", .mode = 0444, @@ -2183,10 +2189,48 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } +static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn) +{ + struct kernfs_node *kn_subdir, *kn_subdir2; + struct mon_evt *mevt; + int ret; + + kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt); + if (IS_ERR(kn_subdir2)) { + ret = PTR_ERR(kn_subdir2); + goto out; + } + + ret = rdtgroup_kn_set_ugid(kn_subdir2); + if (ret) + goto out; + + ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG); + if (ret) + break; + } + +out: + return ret; +} + static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; + struct rdt_resource *r; int ret; kn_subdir = kernfs_create_dir(kn_info, name, @@ -2199,8 +2243,19 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name, return ret; ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); + if (ret) + return ret; + + if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) { + r = priv; + if (r->mon.mbm_cntr_assignable) { + ret = resctrl_mkdir_event_configs(r, kn_subdir); + if (ret) + return ret; + } + } + + kernfs_activate(kn_subdir); return ret; } diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index d98351663c2c..acfe07860b34 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,6 +34,9 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) +/* Number of memory transactions that an MBM event can be configured with */ +#define NUM_MBM_TRANSACTIONS 7 + /* Event IDs */ enum resctrl_event_id { /* Must match value of first event below */ -- cgit v1.2.3 From ac1df9bb0ba3ae94137fb494cd9efc598f65d826 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:25 -0500 Subject: fs/resctrl: Introduce mbm_assign_on_mkdir to enable assignments on mkdir The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor the bandwidth as long as it is assigned. Introduce a user-configurable option that determines if a counter will automatically be assigned to an RMID, event pair when its associated monitor group is created via mkdir. Accessible when "mbm_event" counter assignment mode is enabled. Suggested-by: Peter Newman Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 20 +++++++++++++ fs/resctrl/internal.h | 6 ++++ fs/resctrl/monitor.c | 53 +++++++++++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 7 +++++ include/linux/resctrl.h | 3 ++ 5 files changed, 89 insertions(+) (limited to 'include') diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 2e840ef26f68..1de815b3a07b 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -355,6 +355,26 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter local_reads,local_non_temporal_writes +"mbm_assign_on_mkdir": + Exists when "mbm_event" counter assignment mode is supported. Accessible + only when "mbm_event" counter assignment mode is enabled. + + Determines if a counter will automatically be assigned to an RMID, MBM event + pair when its associated monitor group is created via mkdir. Enabled by default + on boot, also when switched from "default" mode to "mbm_event" counter assignment + mode. Users can disable this capability by writing to the interface. + + "0": + Auto assignment is disabled. + "1": + Auto assignment is enabled. + + Example:: + + # echo 0 > /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + 0 + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 90d3e4ab335b..66c677c1b858 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -410,6 +410,12 @@ int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index ccb9726bba54..deca9535fbbb 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1027,6 +1027,57 @@ out_unlock: return ret; } +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s, + void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool value; + int ret; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + r->mon.mbm_assign_on_mkdir = value; + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + /* * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID * pair in the domain. @@ -1457,6 +1508,8 @@ int resctrl_mon_resource_init(void) resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); + resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | + RFTYPE_RES_CACHE); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index e90bc808fe53..c7ea42c2a3c2 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1808,6 +1808,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_last_cmd_status_show, .fflags = RFTYPE_TOP_INFO, }, + { + .name = "mbm_assign_on_mkdir", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_on_mkdir_show, + .write = resctrl_mbm_assign_on_mkdir_write, + }, { .name = "num_closids", .mode = 0444, diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 04152654827d..a7d92718b653 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -277,12 +277,15 @@ enum resctrl_schema_fmt { * monitoring events can be configured. * @num_mbm_cntrs: Number of assignable counters. * @mbm_cntr_assignable:Is system capable of supporting counter assignment? + * @mbm_assign_on_mkdir:True if counters should automatically be assigned to MBM + * events of monitor groups created via mkdir. */ struct resctrl_mon { int num_rmid; unsigned int mbm_cfg_mask; int num_mbm_cntrs; bool mbm_cntr_assignable; + bool mbm_assign_on_mkdir; }; /** -- cgit v1.2.3 From 3c17001b21b9f168c957ced9384abe969019b609 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:24 +0200 Subject: pidfs: validate extensible ioctls Validate extensible ioctls stricter than we do now. Reviewed-by: Aleksa Sarai Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/pidfs.c | 2 +- include/linux/fs.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/pidfs.c b/fs/pidfs.c index edc35522d75c..0a5083b9cce5 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd) * erronously mistook the file descriptor for a pidfd. * This is not perfect but will catch most cases. */ - return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); + return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0); } return false; diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..2f2edc53bf3c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -4023,4 +4023,18 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); +static inline bool extensible_ioctl_valid(unsigned int cmd_a, + unsigned int cmd_b, size_t min_size) +{ + if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b)) + return false; + if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b)) + return false; + if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b)) + return false; + if (_IOC_SIZE(cmd_a) < min_size) + return false; + return true; +} + #endif /* _LINUX_FS_H */ -- cgit v1.2.3 From be975448a45cd024e2b98598eefc0e164ad93f09 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Jul 2025 14:25:35 -0700 Subject: srcu: Document __srcu_read_{,un}lock_fast() implicit RCU readers This commit documents the implicit RCU readers that are implied by the this_cpu_inc() and atomic_long_inc() operations in __srcu_read_lock_fast() and __srcu_read_unlock_fast(). While in the area, fix the documentation of the memory pairing of atomic_long_inc() in __srcu_read_lock_fast(). [ paulmck: Apply Joel Fernandes feedback. ] Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: --- include/linux/srcutree.h | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 4d2fee4d3828..42098e0fa0b7 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -232,9 +232,27 @@ static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ss * srcu_read_unlock_fast(). * * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side - * critical sections either because they disables interrupts, because they - * are a single instruction, or because they are a read-modify-write atomic - * operation, depending on the whims of the architecture. + * critical sections either because they disables interrupts, because + * they are a single instruction, or because they are read-modify-write + * atomic operations, depending on the whims of the architecture. + * This matters because the SRCU-fast grace-period mechanism uses either + * synchronize_rcu() or synchronize_rcu_expedited(), that is, RCU, + * *not* SRCU, in order to eliminate the need for the read-side smp_mb() + * invocations that are used by srcu_read_lock() and srcu_read_unlock(). + * The __srcu_read_unlock_fast() function also relies on this same RCU + * (again, *not* SRCU) trick to eliminate the need for smp_mb(). + * + * The key point behind this RCU trick is that if any part of a given + * RCU reader precedes the beginning of a given RCU grace period, then + * the entirety of that RCU reader and everything preceding it happens + * before the end of that same RCU grace period. Similarly, if any part + * of a given RCU reader follows the end of a given RCU grace period, + * then the entirety of that RCU reader and everything following it + * happens after the beginning of that same RCU grace period. Therefore, + * the operations labeled Y in __srcu_read_lock_fast() and those labeled Z + * in __srcu_read_unlock_fast() are ordered against the corresponding SRCU + * read-side critical section from the viewpoint of the SRCU grace period. + * This is all the ordering that is required, hence no calls to smp_mb(). * * This means that __srcu_read_lock_fast() is not all that fast * on architectures that support NMIs but do not supply NMI-safe @@ -245,9 +263,9 @@ static inline struct srcu_ctr __percpu notrace *__srcu_read_lock_fast(struct src struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) - this_cpu_inc(scp->srcu_locks.counter); /* Y */ + this_cpu_inc(scp->srcu_locks.counter); // Y, and implicit RCU reader. else - atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); /* Z */ + atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); // Y, and implicit RCU reader. barrier(); /* Avoid leaking the critical section. */ return scp; } @@ -258,23 +276,17 @@ static inline struct srcu_ctr __percpu notrace *__srcu_read_lock_fast(struct src * different CPU than that which was incremented by the corresponding * srcu_read_lock_fast(), but it must be within the same task. * - * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side - * critical sections either because they disables interrupts, because they - * are a single instruction, or because they are a read-modify-write atomic - * operation, depending on the whims of the architecture. - * - * This means that __srcu_read_unlock_fast() is not all that fast - * on architectures that support NMIs but do not supply NMI-safe - * implementations of this_cpu_inc(). + * Please see the __srcu_read_lock_fast() function's header comment for + * information on implicit RCU readers and NMI safety. */ static inline void notrace __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) { barrier(); /* Avoid leaking the critical section. */ if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) - this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ + this_cpu_inc(scp->srcu_unlocks.counter); // Z, and implicit RCU reader. else - atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); /* Z */ + atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); // Z, and implicit RCU reader. } void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -- cgit v1.2.3 From 5e0ae59159e3a07391a35865bb79ff335473fa79 Mon Sep 17 00:00:00 2001 From: Angela Czubak Date: Mon, 18 Aug 2025 23:08:42 +0000 Subject: HID: add haptics page defines Introduce haptic usages as defined in HID Usage Tables specification. Add HID units for newton and gram. Signed-off-by: Angela Czubak Co-developed-by: Jonathan Denose Signed-off-by: Jonathan Denose Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/hid.h b/include/linux/hid.h index 2cc4f1e4ea96..10f113c758fe 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -156,6 +156,7 @@ struct hid_item { #define HID_UP_TELEPHONY 0x000b0000 #define HID_UP_CONSUMER 0x000c0000 #define HID_UP_DIGITIZER 0x000d0000 +#define HID_UP_HAPTIC 0x000e0000 #define HID_UP_PID 0x000f0000 #define HID_UP_BATTERY 0x00850000 #define HID_UP_CAMERA 0x00900000 @@ -316,6 +317,28 @@ struct hid_item { #define HID_DG_TOOLSERIALNUMBER 0x000d005b #define HID_DG_LATENCYMODE 0x000d0060 +#define HID_HP_SIMPLECONTROLLER 0x000e0001 +#define HID_HP_WAVEFORMLIST 0x000e0010 +#define HID_HP_DURATIONLIST 0x000e0011 +#define HID_HP_AUTOTRIGGER 0x000e0020 +#define HID_HP_MANUALTRIGGER 0x000e0021 +#define HID_HP_AUTOTRIGGERASSOCIATEDCONTROL 0x000e0022 +#define HID_HP_INTENSITY 0x000e0023 +#define HID_HP_REPEATCOUNT 0x000e0024 +#define HID_HP_RETRIGGERPERIOD 0x000e0025 +#define HID_HP_WAVEFORMVENDORPAGE 0x000e0026 +#define HID_HP_WAVEFORMVENDORID 0x000e0027 +#define HID_HP_WAVEFORMCUTOFFTIME 0x000e0028 +#define HID_HP_WAVEFORMNONE 0x000e1001 +#define HID_HP_WAVEFORMSTOP 0x000e1002 +#define HID_HP_WAVEFORMCLICK 0x000e1003 +#define HID_HP_WAVEFORMBUZZCONTINUOUS 0x000e1004 +#define HID_HP_WAVEFORMRUMBLECONTINUOUS 0x000e1005 +#define HID_HP_WAVEFORMPRESS 0x000e1006 +#define HID_HP_WAVEFORMRELEASE 0x000e1007 +#define HID_HP_VENDORWAVEFORMMIN 0x000e2001 +#define HID_HP_VENDORWAVEFORMMAX 0x000e2fff + #define HID_BAT_ABSOLUTESTATEOFCHARGE 0x00850065 #define HID_BAT_CHARGING 0x00850044 @@ -423,6 +446,12 @@ struct hid_item { #define HID_REPORT_PROTOCOL 1 #define HID_BOOT_PROTOCOL 0 +/* + * HID units + */ +#define HID_UNIT_GRAM 0x0101 +#define HID_UNIT_NEWTON 0xe111 + /* * This is the global environment of the parser. This information is * persistent for main-items. The global environment can be saved and -- cgit v1.2.3 From 08a72a220e960e7f153a810fb633638afd0b7563 Mon Sep 17 00:00:00 2001 From: Angela Czubak Date: Mon, 18 Aug 2025 23:08:43 +0000 Subject: Input: add FF_HAPTIC effect type FF_HAPTIC effect type can be used to trigger haptic feedback with HID simple haptic usages. Signed-off-by: Angela Czubak Co-developed-by: Jonathan Denose Signed-off-by: Jonathan Denose Acked-by: Dmitry Torokhov Signed-off-by: Benjamin Tissoires --- include/uapi/linux/input.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 127119c287cf..6aa703fcfcfb 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -429,6 +429,24 @@ struct ff_rumble_effect { __u16 weak_magnitude; }; +/** + * struct ff_haptic_effect + * @hid_usage: hid_usage according to Haptics page (WAVEFORM_CLICK, etc.) + * @vendor_id: the waveform vendor ID if hid_usage is in the vendor-defined range + * @vendor_waveform_page: the vendor waveform page if hid_usage is in the vendor-defined range + * @intensity: strength of the effect as percentage + * @repeat_count: number of times to retrigger effect + * @retrigger_period: time before effect is retriggered (in ms) + */ +struct ff_haptic_effect { + __u16 hid_usage; + __u16 vendor_id; + __u8 vendor_waveform_page; + __u16 intensity; + __u16 repeat_count; + __u16 retrigger_period; +}; + /** * struct ff_effect - defines force feedback effect * @type: type of the effect (FF_CONSTANT, FF_PERIODIC, FF_RAMP, FF_SPRING, @@ -465,6 +483,7 @@ struct ff_effect { struct ff_periodic_effect periodic; struct ff_condition_effect condition[2]; /* One for each axis */ struct ff_rumble_effect rumble; + struct ff_haptic_effect haptic; } u; }; @@ -472,6 +491,7 @@ struct ff_effect { * Force feedback effect types */ +#define FF_HAPTIC 0x4f #define FF_RUMBLE 0x50 #define FF_PERIODIC 0x51 #define FF_CONSTANT 0x52 @@ -481,7 +501,7 @@ struct ff_effect { #define FF_INERTIA 0x56 #define FF_RAMP 0x57 -#define FF_EFFECT_MIN FF_RUMBLE +#define FF_EFFECT_MIN FF_HAPTIC #define FF_EFFECT_MAX FF_RAMP /* -- cgit v1.2.3 From 7075ae4ac9db93a3e762f3c2793ad57dbbf8a120 Mon Sep 17 00:00:00 2001 From: Angela Czubak Date: Mon, 18 Aug 2025 23:08:44 +0000 Subject: Input: add INPUT_PROP_HAPTIC_TOUCHPAD INPUT_PROP_HAPTIC_TOUCHPAD property is to be set for a device with simple haptic capabilities. Signed-off-by: Angela Czubak Co-developed-by: Jonathan Denose Signed-off-by: Jonathan Denose Acked-by: Dmitry Torokhov Reviewed-by: Randy Dunlap Signed-off-by: Benjamin Tissoires --- Documentation/input/event-codes.rst | 14 ++++++++++++++ include/uapi/linux/input-event-codes.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst index b4557462edd7..1ead9bb8d9c6 100644 --- a/Documentation/input/event-codes.rst +++ b/Documentation/input/event-codes.rst @@ -400,6 +400,20 @@ can report through the rotational axes (absolute and/or relative rx, ry, rz). All other axes retain their meaning. A device must not mix regular directional axes and accelerometer axes on the same event node. +INPUT_PROP_HAPTIC_TOUCHPAD +-------------------------- + +The INPUT_PROP_HAPTIC_TOUCHPAD property indicates that device: +- supports simple haptic auto and manual triggering +- can differentiate between at least 5 fingers +- uses correct resolution for the X/Y (units and value) +- reports correct force per touch, and correct units for them (newtons or grams) +- follows the MT protocol type B + +Summing up, such devices follow the MS spec for input devices in +Win8 and Win8.1, and in addition support the Simple haptic controller HID table, +and report correct units for the pressure. + Guidelines ========== diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index ca5851e97fac..4a9fbf42aa9f 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -27,6 +27,7 @@ #define INPUT_PROP_TOPBUTTONPAD 0x04 /* softbuttons at top of pad */ #define INPUT_PROP_POINTING_STICK 0x05 /* is a pointing stick */ #define INPUT_PROP_ACCELEROMETER 0x06 /* has accelerometer */ +#define INPUT_PROP_HAPTIC_TOUCHPAD 0x07 /* is a haptic touchpad */ #define INPUT_PROP_MAX 0x1f #define INPUT_PROP_CNT (INPUT_PROP_MAX + 1) -- cgit v1.2.3 From 4e584ac737884ef0fd80f6836b917972fad86b17 Mon Sep 17 00:00:00 2001 From: Angela Czubak Date: Mon, 18 Aug 2025 23:08:50 +0000 Subject: Input: MT - add INPUT_MT_TOTAL_FORCE flags Add a flag to generate ABS_PRESSURE as sum of ABS_MT_PRESSURE across all slots. This flag should be set if one knows a device reports true force and would like to report total force to the userspace. Signed-off-by: Angela Czubak Co-developed-by: Jonathan Denose Signed-off-by: Jonathan Denose Acked-by: Dmitry Torokhov Signed-off-by: Benjamin Tissoires --- drivers/input/input-mt.c | 14 ++++++++++---- include/linux/input/mt.h | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/input/input-mt.c b/drivers/input/input-mt.c index 337006dd9dcf..09f518897d4a 100644 --- a/drivers/input/input-mt.c +++ b/drivers/input/input-mt.c @@ -198,6 +198,7 @@ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) struct input_mt *mt = dev->mt; struct input_mt_slot *oldest; int oldid, count, i; + int p, reported_p = 0; if (!mt) return; @@ -216,6 +217,13 @@ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) oldest = ps; oldid = id; } + if (test_bit(ABS_MT_PRESSURE, dev->absbit)) { + p = input_mt_get_value(ps, ABS_MT_PRESSURE); + if (mt->flags & INPUT_MT_TOTAL_FORCE) + reported_p += p; + else if (oldid == id) + reported_p = p; + } count++; } @@ -245,10 +253,8 @@ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) input_event(dev, EV_ABS, ABS_X, x); input_event(dev, EV_ABS, ABS_Y, y); - if (test_bit(ABS_MT_PRESSURE, dev->absbit)) { - int p = input_mt_get_value(oldest, ABS_MT_PRESSURE); - input_event(dev, EV_ABS, ABS_PRESSURE, p); - } + if (test_bit(ABS_MT_PRESSURE, dev->absbit)) + input_event(dev, EV_ABS, ABS_PRESSURE, reported_p); } else { if (test_bit(ABS_MT_PRESSURE, dev->absbit)) input_event(dev, EV_ABS, ABS_PRESSURE, 0); diff --git a/include/linux/input/mt.h b/include/linux/input/mt.h index 2cf89a538b18..d30286298a00 100644 --- a/include/linux/input/mt.h +++ b/include/linux/input/mt.h @@ -17,6 +17,7 @@ #define INPUT_MT_DROP_UNUSED 0x0004 /* drop contacts not seen in frame */ #define INPUT_MT_TRACK 0x0008 /* use in-kernel tracking */ #define INPUT_MT_SEMI_MT 0x0010 /* semi-mt device, finger count handled manually */ +#define INPUT_MT_TOTAL_FORCE 0x0020 /* calculate total force from slots pressure */ /** * struct input_mt_slot - represents the state of an input MT slot -- cgit v1.2.3 From 8aa1e3a6f0ffbcfdf3bd7d87feb9090f96c54bc4 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:40 -0700 Subject: firmware: qcom: tzmem: export shm_bridge create/delete Anyone with access to contiguous physical memory should be able to share memory with QTEE using shm_bridge. Tested-by: Neil Armstrong Tested-by: Harshal Dev Reviewed-by: Kuldeep Singh Signed-off-by: Amirreza Zarrabi Link: https://lore.kernel.org/r/20250911-qcom-tee-using-tee-ss-without-mem-obj-v12-1-17f07a942b8d@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom/qcom_tzmem.c | 63 ++++++++++++++++++++++++++------ include/linux/firmware/qcom/qcom_tzmem.h | 15 ++++++++ 2 files changed, 67 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/firmware/qcom/qcom_tzmem.c b/drivers/firmware/qcom/qcom_tzmem.c index ea0a35355657..186511ced924 100644 --- a/drivers/firmware/qcom/qcom_tzmem.c +++ b/drivers/firmware/qcom/qcom_tzmem.c @@ -109,7 +109,19 @@ notsupp: return 0; } -static int qcom_tzmem_init_area(struct qcom_tzmem_area *area) +/** + * qcom_tzmem_shm_bridge_create() - Create a SHM bridge. + * @paddr: Physical address of the memory to share. + * @size: Size of the memory to share. + * @handle: Handle to the SHM bridge. + * + * On platforms that support SHM bridge, this function creates a SHM bridge + * for the given memory region with QTEE. The handle returned by this function + * must be passed to qcom_tzmem_shm_bridge_delete() to free the SHM bridge. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcom_tzmem_shm_bridge_create(phys_addr_t paddr, size_t size, u64 *handle) { u64 pfn_and_ns_perm, ipfn_and_s_perm, size_and_flags; int ret; @@ -117,17 +129,49 @@ static int qcom_tzmem_init_area(struct qcom_tzmem_area *area) if (!qcom_tzmem_using_shm_bridge) return 0; - pfn_and_ns_perm = (u64)area->paddr | QCOM_SCM_PERM_RW; - ipfn_and_s_perm = (u64)area->paddr | QCOM_SCM_PERM_RW; - size_and_flags = area->size | (1 << QCOM_SHM_BRIDGE_NUM_VM_SHIFT); + pfn_and_ns_perm = paddr | QCOM_SCM_PERM_RW; + ipfn_and_s_perm = paddr | QCOM_SCM_PERM_RW; + size_and_flags = size | (1 << QCOM_SHM_BRIDGE_NUM_VM_SHIFT); + + ret = qcom_scm_shm_bridge_create(pfn_and_ns_perm, ipfn_and_s_perm, + size_and_flags, QCOM_SCM_VMID_HLOS, + handle); + if (ret) { + dev_err(qcom_tzmem_dev, + "SHM Bridge failed: ret %d paddr 0x%pa, size %zu\n", + ret, &paddr, size); + + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(qcom_tzmem_shm_bridge_create); + +/** + * qcom_tzmem_shm_bridge_delete() - Delete a SHM bridge. + * @handle: Handle to the SHM bridge. + * + * On platforms that support SHM bridge, this function deletes the SHM bridge + * for the given memory region. The handle must be the same as the one + * returned by qcom_tzmem_shm_bridge_create(). + */ +void qcom_tzmem_shm_bridge_delete(u64 handle) +{ + if (qcom_tzmem_using_shm_bridge) + qcom_scm_shm_bridge_delete(handle); +} +EXPORT_SYMBOL_GPL(qcom_tzmem_shm_bridge_delete); + +static int qcom_tzmem_init_area(struct qcom_tzmem_area *area) +{ + int ret; u64 *handle __free(kfree) = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) return -ENOMEM; - ret = qcom_scm_shm_bridge_create(pfn_and_ns_perm, ipfn_and_s_perm, - size_and_flags, QCOM_SCM_VMID_HLOS, - handle); + ret = qcom_tzmem_shm_bridge_create(area->paddr, area->size, handle); if (ret) return ret; @@ -140,10 +184,7 @@ static void qcom_tzmem_cleanup_area(struct qcom_tzmem_area *area) { u64 *handle = area->priv; - if (!qcom_tzmem_using_shm_bridge) - return; - - qcom_scm_shm_bridge_delete(*handle); + qcom_tzmem_shm_bridge_delete(*handle); kfree(handle); } diff --git a/include/linux/firmware/qcom/qcom_tzmem.h b/include/linux/firmware/qcom/qcom_tzmem.h index b83b63a0c049..48ac0e5454c7 100644 --- a/include/linux/firmware/qcom/qcom_tzmem.h +++ b/include/linux/firmware/qcom/qcom_tzmem.h @@ -53,4 +53,19 @@ DEFINE_FREE(qcom_tzmem, void *, if (_T) qcom_tzmem_free(_T)) phys_addr_t qcom_tzmem_to_phys(void *ptr); +#if IS_ENABLED(CONFIG_QCOM_TZMEM_MODE_SHMBRIDGE) +int qcom_tzmem_shm_bridge_create(phys_addr_t paddr, size_t size, u64 *handle); +void qcom_tzmem_shm_bridge_delete(u64 handle); +#else +static inline int qcom_tzmem_shm_bridge_create(phys_addr_t paddr, + size_t size, u64 *handle) +{ + return 0; +} + +static inline void qcom_tzmem_shm_bridge_delete(u64 handle) +{ +} +#endif + #endif /* __QCOM_TZMEM */ -- cgit v1.2.3 From 4b700098c0fc4a76c5c1e54465c8f35e13755294 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:41 -0700 Subject: firmware: qcom: scm: add support for object invocation Qualcomm TEE (QTEE) hosts Trusted Applications (TAs) and services in the secure world, accessed via objects. A QTEE client can invoke these objects to request services. Similarly, QTEE can request services from the nonsecure world using objects exported to the secure world. Add low-level primitives to facilitate the invocation of objects hosted in QTEE, as well as those hosted in the nonsecure world. If support for object invocation is available, the qcom_scm allocates a dedicated child platform device. The driver for this device communicates with QTEE using low-level primitives. Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Link: https://lore.kernel.org/r/20250911-qcom-tee-using-tee-ss-without-mem-obj-v12-2-17f07a942b8d@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom/qcom_scm.c | 119 +++++++++++++++++++++++++++++++++ drivers/firmware/qcom/qcom_scm.h | 7 ++ include/linux/firmware/qcom/qcom_scm.h | 6 ++ 3 files changed, 132 insertions(+) (limited to 'include') diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c index 26cd0458aacd..9b5a9a0f68cf 100644 --- a/drivers/firmware/qcom/qcom_scm.c +++ b/drivers/firmware/qcom/qcom_scm.c @@ -2093,6 +2093,122 @@ static int qcom_scm_qseecom_init(struct qcom_scm *scm) #endif /* CONFIG_QCOM_QSEECOM */ +/** + * qcom_scm_qtee_invoke_smc() - Invoke a QTEE object. + * @inbuf: start address of memory area used for inbound buffer. + * @inbuf_size: size of the memory area used for inbound buffer. + * @outbuf: start address of memory area used for outbound buffer. + * @outbuf_size: size of the memory area used for outbound buffer. + * @result: result of QTEE object invocation. + * @response_type: response type returned by QTEE. + * + * @response_type determines how the contents of @inbuf and @outbuf + * should be processed. + * + * Return: On success, return 0 or <0 on failure. + */ +int qcom_scm_qtee_invoke_smc(phys_addr_t inbuf, size_t inbuf_size, + phys_addr_t outbuf, size_t outbuf_size, + u64 *result, u64 *response_type) +{ + struct qcom_scm_desc desc = { + .svc = QCOM_SCM_SVC_SMCINVOKE, + .cmd = QCOM_SCM_SMCINVOKE_INVOKE, + .owner = ARM_SMCCC_OWNER_TRUSTED_OS, + .args[0] = inbuf, + .args[1] = inbuf_size, + .args[2] = outbuf, + .args[3] = outbuf_size, + .arginfo = QCOM_SCM_ARGS(4, QCOM_SCM_RW, QCOM_SCM_VAL, + QCOM_SCM_RW, QCOM_SCM_VAL), + }; + struct qcom_scm_res res; + int ret; + + ret = qcom_scm_call(__scm->dev, &desc, &res); + if (ret) + return ret; + + if (response_type) + *response_type = res.result[0]; + + if (result) + *result = res.result[1]; + + return 0; +} +EXPORT_SYMBOL(qcom_scm_qtee_invoke_smc); + +/** + * qcom_scm_qtee_callback_response() - Submit response for callback request. + * @buf: start address of memory area used for outbound buffer. + * @buf_size: size of the memory area used for outbound buffer. + * @result: Result of QTEE object invocation. + * @response_type: Response type returned by QTEE. + * + * @response_type determines how the contents of @buf should be processed. + * + * Return: On success, return 0 or <0 on failure. + */ +int qcom_scm_qtee_callback_response(phys_addr_t buf, size_t buf_size, + u64 *result, u64 *response_type) +{ + struct qcom_scm_desc desc = { + .svc = QCOM_SCM_SVC_SMCINVOKE, + .cmd = QCOM_SCM_SMCINVOKE_CB_RSP, + .owner = ARM_SMCCC_OWNER_TRUSTED_OS, + .args[0] = buf, + .args[1] = buf_size, + .arginfo = QCOM_SCM_ARGS(2, QCOM_SCM_RW, QCOM_SCM_VAL), + }; + struct qcom_scm_res res; + int ret; + + ret = qcom_scm_call(__scm->dev, &desc, &res); + if (ret) + return ret; + + if (response_type) + *response_type = res.result[0]; + + if (result) + *result = res.result[1]; + + return 0; +} +EXPORT_SYMBOL(qcom_scm_qtee_callback_response); + +static void qcom_scm_qtee_free(void *data) +{ + struct platform_device *qtee_dev = data; + + platform_device_unregister(qtee_dev); +} + +static void qcom_scm_qtee_init(struct qcom_scm *scm) +{ + struct platform_device *qtee_dev; + u64 result, response_type; + int ret; + + /* + * Probe for smcinvoke support. This will fail due to invalid buffers, + * but first, it checks whether the call is supported in QTEE syscall + * handler. If it is not supported, -EIO is returned. + */ + ret = qcom_scm_qtee_invoke_smc(0, 0, 0, 0, &result, &response_type); + if (ret == -EIO) + return; + + /* Setup QTEE interface device. */ + qtee_dev = platform_device_register_data(scm->dev, "qcomtee", + PLATFORM_DEVID_NONE, NULL, 0); + if (IS_ERR(qtee_dev)) + return; + + devm_add_action_or_reset(scm->dev, qcom_scm_qtee_free, qtee_dev); +} + /** * qcom_scm_is_available() - Checks if SCM is available */ @@ -2325,6 +2441,9 @@ static int qcom_scm_probe(struct platform_device *pdev) ret = qcom_scm_qseecom_init(scm); WARN(ret < 0, "failed to initialize qseecom: %d\n", ret); + /* Initialize the QTEE object interface. */ + qcom_scm_qtee_init(scm); + return 0; } diff --git a/drivers/firmware/qcom/qcom_scm.h b/drivers/firmware/qcom/qcom_scm.h index 0e8dd838099e..a56c8212cc0c 100644 --- a/drivers/firmware/qcom/qcom_scm.h +++ b/drivers/firmware/qcom/qcom_scm.h @@ -156,6 +156,13 @@ int qcom_scm_shm_bridge_enable(struct device *scm_dev); #define QCOM_SCM_SVC_GPU 0x28 #define QCOM_SCM_SVC_GPU_INIT_REGS 0x01 +/* ARM_SMCCC_OWNER_TRUSTED_OS calls */ + +#define QCOM_SCM_SVC_SMCINVOKE 0x06 +#define QCOM_SCM_SMCINVOKE_INVOKE_LEGACY 0x00 +#define QCOM_SCM_SMCINVOKE_CB_RSP 0x01 +#define QCOM_SCM_SMCINVOKE_INVOKE 0x02 + /* common error codes */ #define QCOM_SCM_V2_EBUSY -12 #define QCOM_SCM_ENOMEM -5 diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index 0f667bf1d4d9..a55ca771286b 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -175,4 +175,10 @@ static inline int qcom_scm_qseecom_app_send(u32 app_id, #endif /* CONFIG_QCOM_QSEECOM */ +int qcom_scm_qtee_invoke_smc(phys_addr_t inbuf, size_t inbuf_size, + phys_addr_t outbuf, size_t outbuf_size, + u64 *result, u64 *response_type); +int qcom_scm_qtee_callback_response(phys_addr_t buf, size_t buf_size, + u64 *result, u64 *response_type); + #endif -- cgit v1.2.3 From 54fd6bd42e7bd351802ff1d193a2e33e4bfb1836 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 8 Sep 2025 17:26:45 +0530 Subject: cdx: Split mcdi.h and reorganize headers Move bitfield.h from the CDX controller directory to include/linux/cdx to make them accessible to other drivers. As part of this refactoring, split mcdi.h into two headers: - mcdi.h: retains interface-level declarations - mcdid.h: contains internal definitions and macros This is in preparation for VersalNET EDAC driver that relies on it. Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Acked-by: Nikhil Agarwal Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com --- drivers/cdx/controller/bitfield.h | 90 ------------ drivers/cdx/controller/cdx_controller.c | 2 +- drivers/cdx/controller/cdx_rpmsg.c | 2 +- drivers/cdx/controller/mcdi.c | 5 +- drivers/cdx/controller/mcdi.h | 242 -------------------------------- drivers/cdx/controller/mcdi_functions.c | 1 - drivers/cdx/controller/mcdi_functions.h | 3 +- drivers/cdx/controller/mcdid.h | 63 +++++++++ include/linux/cdx/bitfield.h | 90 ++++++++++++ include/linux/cdx/mcdi.h | 192 +++++++++++++++++++++++++ 10 files changed, 352 insertions(+), 338 deletions(-) delete mode 100644 drivers/cdx/controller/bitfield.h delete mode 100644 drivers/cdx/controller/mcdi.h create mode 100644 drivers/cdx/controller/mcdid.h create mode 100644 include/linux/cdx/bitfield.h create mode 100644 include/linux/cdx/mcdi.h (limited to 'include') diff --git a/drivers/cdx/controller/bitfield.h b/drivers/cdx/controller/bitfield.h deleted file mode 100644 index 567f8ec47582..000000000000 --- a/drivers/cdx/controller/bitfield.h +++ /dev/null @@ -1,90 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright 2005-2006 Fen Systems Ltd. - * Copyright 2006-2013 Solarflare Communications Inc. - * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. - */ - -#ifndef CDX_BITFIELD_H -#define CDX_BITFIELD_H - -#include - -/* Lowest bit numbers and widths */ -#define CDX_DWORD_LBN 0 -#define CDX_DWORD_WIDTH 32 - -/* Specified attribute (e.g. LBN) of the specified field */ -#define CDX_VAL(field, attribute) field ## _ ## attribute -/* Low bit number of the specified field */ -#define CDX_LOW_BIT(field) CDX_VAL(field, LBN) -/* Bit width of the specified field */ -#define CDX_WIDTH(field) CDX_VAL(field, WIDTH) -/* High bit number of the specified field */ -#define CDX_HIGH_BIT(field) (CDX_LOW_BIT(field) + CDX_WIDTH(field) - 1) - -/* A doubleword (i.e. 4 byte) datatype - little-endian in HW */ -struct cdx_dword { - __le32 cdx_u32; -}; - -/* Value expanders for printk */ -#define CDX_DWORD_VAL(dword) \ - ((unsigned int)le32_to_cpu((dword).cdx_u32)) - -/* - * Extract bit field portion [low,high) from the 32-bit little-endian - * element which contains bits [min,max) - */ -#define CDX_DWORD_FIELD(dword, field) \ - (FIELD_GET(GENMASK(CDX_HIGH_BIT(field), CDX_LOW_BIT(field)), \ - le32_to_cpu((dword).cdx_u32))) - -/* - * Creates the portion of the named bit field that lies within the - * range [min,max). - */ -#define CDX_INSERT_FIELD(field, value) \ - (FIELD_PREP(GENMASK(CDX_HIGH_BIT(field), \ - CDX_LOW_BIT(field)), value)) - -/* - * Creates the portion of the named bit fields that lie within the - * range [min,max). - */ -#define CDX_INSERT_FIELDS(field1, value1, \ - field2, value2, \ - field3, value3, \ - field4, value4, \ - field5, value5, \ - field6, value6, \ - field7, value7) \ - (CDX_INSERT_FIELD(field1, (value1)) | \ - CDX_INSERT_FIELD(field2, (value2)) | \ - CDX_INSERT_FIELD(field3, (value3)) | \ - CDX_INSERT_FIELD(field4, (value4)) | \ - CDX_INSERT_FIELD(field5, (value5)) | \ - CDX_INSERT_FIELD(field6, (value6)) | \ - CDX_INSERT_FIELD(field7, (value7))) - -#define CDX_POPULATE_DWORD(dword, ...) \ - (dword).cdx_u32 = cpu_to_le32(CDX_INSERT_FIELDS(__VA_ARGS__)) - -/* Populate a dword field with various numbers of arguments */ -#define CDX_POPULATE_DWORD_7 CDX_POPULATE_DWORD -#define CDX_POPULATE_DWORD_6(dword, ...) \ - CDX_POPULATE_DWORD_7(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_POPULATE_DWORD_5(dword, ...) \ - CDX_POPULATE_DWORD_6(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_POPULATE_DWORD_4(dword, ...) \ - CDX_POPULATE_DWORD_5(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_POPULATE_DWORD_3(dword, ...) \ - CDX_POPULATE_DWORD_4(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_POPULATE_DWORD_2(dword, ...) \ - CDX_POPULATE_DWORD_3(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_POPULATE_DWORD_1(dword, ...) \ - CDX_POPULATE_DWORD_2(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_SET_DWORD(dword) \ - CDX_POPULATE_DWORD_1(dword, CDX_DWORD, 0xffffffff) - -#endif /* CDX_BITFIELD_H */ diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c index fca83141e3e6..3f8b9041babf 100644 --- a/drivers/cdx/controller/cdx_controller.c +++ b/drivers/cdx/controller/cdx_controller.c @@ -14,7 +14,7 @@ #include "cdx_controller.h" #include "../cdx.h" #include "mcdi_functions.h" -#include "mcdi.h" +#include "mcdid.h" static unsigned int cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) { diff --git a/drivers/cdx/controller/cdx_rpmsg.c b/drivers/cdx/controller/cdx_rpmsg.c index 04b578a0be17..d4f763323aac 100644 --- a/drivers/cdx/controller/cdx_rpmsg.c +++ b/drivers/cdx/controller/cdx_rpmsg.c @@ -15,7 +15,7 @@ #include "../cdx.h" #include "cdx_controller.h" #include "mcdi_functions.h" -#include "mcdi.h" +#include "mcdid.h" static struct rpmsg_device_id cdx_rpmsg_id_table[] = { { .name = "mcdi_ipc" }, diff --git a/drivers/cdx/controller/mcdi.c b/drivers/cdx/controller/mcdi.c index e760f8d347cc..90bf9f7c257b 100644 --- a/drivers/cdx/controller/mcdi.c +++ b/drivers/cdx/controller/mcdi.c @@ -23,9 +23,10 @@ #include #include #include +#include -#include "bitfield.h" -#include "mcdi.h" +#include +#include "mcdid.h" static void cdx_mcdi_cancel_cmd(struct cdx_mcdi *cdx, struct cdx_mcdi_cmd *cmd); static void cdx_mcdi_wait_for_cleanup(struct cdx_mcdi *cdx); diff --git a/drivers/cdx/controller/mcdi.h b/drivers/cdx/controller/mcdi.h deleted file mode 100644 index 54a65e9760ae..000000000000 --- a/drivers/cdx/controller/mcdi.h +++ /dev/null @@ -1,242 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright 2008-2013 Solarflare Communications Inc. - * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. - */ - -#ifndef CDX_MCDI_H -#define CDX_MCDI_H - -#include -#include -#include - -#include "bitfield.h" -#include "mc_cdx_pcol.h" - -#ifdef DEBUG -#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x) -#define CDX_WARN_ON_PARANOID(x) WARN_ON(x) -#else -#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0) -#define CDX_WARN_ON_PARANOID(x) do {} while (0) -#endif - -/** - * enum cdx_mcdi_mode - MCDI transaction mode - * @MCDI_MODE_EVENTS: wait for an mcdi response callback. - * @MCDI_MODE_FAIL: we think MCDI is dead, so fail-fast all calls - */ -enum cdx_mcdi_mode { - MCDI_MODE_EVENTS, - MCDI_MODE_FAIL, -}; - -#define MCDI_RPC_TIMEOUT (10 * HZ) -#define MCDI_RPC_LONG_TIMEOU (60 * HZ) -#define MCDI_RPC_POST_RST_TIME (10 * HZ) - -#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX) - -/** - * enum cdx_mcdi_cmd_state - State for an individual MCDI command - * @MCDI_STATE_QUEUED: Command not started and is waiting to run. - * @MCDI_STATE_RETRY: Command was submitted and MC rejected with no resources, - * as MC have too many outstanding commands. Command will be retried once - * another command returns. - * @MCDI_STATE_RUNNING: Command was accepted and is running. - * @MCDI_STATE_RUNNING_CANCELLED: Command is running but the issuer cancelled - * the command. - * @MCDI_STATE_FINISHED: Processing of this command has completed. - */ - -enum cdx_mcdi_cmd_state { - MCDI_STATE_QUEUED, - MCDI_STATE_RETRY, - MCDI_STATE_RUNNING, - MCDI_STATE_RUNNING_CANCELLED, - MCDI_STATE_FINISHED, -}; - -/** - * struct cdx_mcdi - CDX MCDI Firmware interface, to interact - * with CDX controller. - * @mcdi: MCDI interface - * @mcdi_ops: MCDI operations - * @r5_rproc : R5 Remoteproc device handle - * @rpdev: RPMsg device - * @ept: RPMsg endpoint - * @work: Post probe work - */ -struct cdx_mcdi { - /* MCDI interface */ - struct cdx_mcdi_data *mcdi; - const struct cdx_mcdi_ops *mcdi_ops; - - struct rproc *r5_rproc; - struct rpmsg_device *rpdev; - struct rpmsg_endpoint *ept; - struct work_struct work; -}; - -struct cdx_mcdi_ops { - void (*mcdi_request)(struct cdx_mcdi *cdx, - const struct cdx_dword *hdr, size_t hdr_len, - const struct cdx_dword *sdu, size_t sdu_len); - unsigned int (*mcdi_rpc_timeout)(struct cdx_mcdi *cdx, unsigned int cmd); -}; - -typedef void cdx_mcdi_async_completer(struct cdx_mcdi *cdx, - unsigned long cookie, int rc, - struct cdx_dword *outbuf, - size_t outlen_actual); - -/** - * struct cdx_mcdi_cmd - An outstanding MCDI command - * @ref: Reference count. There will be one reference if the command is - * in the mcdi_iface cmd_list, another if it's on a cleanup list, - * and a third if it's queued in the work queue. - * @list: The data for this entry in mcdi->cmd_list - * @cleanup_list: The data for this entry in a cleanup list - * @work: The work item for this command, queued in mcdi->workqueue - * @mcdi: The mcdi_iface for this command - * @state: The state of this command - * @inlen: inbuf length - * @inbuf: Input buffer - * @quiet: Whether to silence errors - * @reboot_seen: Whether a reboot has been seen during this command, - * to prevent duplicates - * @seq: Sequence number - * @started: Jiffies this command was started at - * @cookie: Context for completion function - * @completer: Completion function - * @handle: Command handle - * @cmd: Command number - * @rc: Return code - * @outlen: Length of output buffer - * @outbuf: Output buffer - */ -struct cdx_mcdi_cmd { - struct kref ref; - struct list_head list; - struct list_head cleanup_list; - struct work_struct work; - struct cdx_mcdi_iface *mcdi; - enum cdx_mcdi_cmd_state state; - size_t inlen; - const struct cdx_dword *inbuf; - bool quiet; - bool reboot_seen; - u8 seq; - unsigned long started; - unsigned long cookie; - cdx_mcdi_async_completer *completer; - unsigned int handle; - unsigned int cmd; - int rc; - size_t outlen; - struct cdx_dword *outbuf; - /* followed by inbuf data if necessary */ -}; - -/** - * struct cdx_mcdi_iface - MCDI protocol context - * @cdx: The associated NIC - * @iface_lock: Serialise access to this structure - * @outstanding_cleanups: Count of cleanups - * @cmd_list: List of outstanding and running commands - * @workqueue: Workqueue used for delayed processing - * @cmd_complete_wq: Waitqueue for command completion - * @db_held_by: Command the MC doorbell is in use by - * @seq_held_by: Command each sequence number is in use by - * @prev_handle: The last used command handle - * @mode: Poll for mcdi completion, or wait for an mcdi_event - * @prev_seq: The last used sequence number - * @new_epoch: Indicates start of day or start of MC reboot recovery - */ -struct cdx_mcdi_iface { - struct cdx_mcdi *cdx; - /* Serialise access */ - struct mutex iface_lock; - unsigned int outstanding_cleanups; - struct list_head cmd_list; - struct workqueue_struct *workqueue; - wait_queue_head_t cmd_complete_wq; - struct cdx_mcdi_cmd *db_held_by; - struct cdx_mcdi_cmd *seq_held_by[16]; - unsigned int prev_handle; - enum cdx_mcdi_mode mode; - u8 prev_seq; - bool new_epoch; -}; - -/** - * struct cdx_mcdi_data - extra state for NICs that implement MCDI - * @iface: Interface/protocol state - * @fn_flags: Flags for this function, as returned by %MC_CMD_DRV_ATTACH. - */ -struct cdx_mcdi_data { - struct cdx_mcdi_iface iface; - u32 fn_flags; -}; - -static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx) -{ - return cdx->mcdi ? &cdx->mcdi->iface : NULL; -} - -int cdx_mcdi_init(struct cdx_mcdi *cdx); -void cdx_mcdi_finish(struct cdx_mcdi *cdx); - -void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len); -int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, - const struct cdx_dword *inbuf, size_t inlen, - struct cdx_dword *outbuf, size_t outlen, size_t *outlen_actual); -int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd, - const struct cdx_dword *inbuf, size_t inlen, - cdx_mcdi_async_completer *complete, - unsigned long cookie); -int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, - unsigned int timeout_jiffies); - -/* - * We expect that 16- and 32-bit fields in MCDI requests and responses - * are appropriately aligned, but 64-bit fields are only - * 32-bit-aligned. - */ -#define MCDI_DECLARE_BUF(_name, _len) struct cdx_dword _name[DIV_ROUND_UP(_len, 4)] = {{0}} -#define _MCDI_PTR(_buf, _offset) \ - ((u8 *)(_buf) + (_offset)) -#define MCDI_PTR(_buf, _field) \ - _MCDI_PTR(_buf, MC_CMD_ ## _field ## _OFST) -#define _MCDI_CHECK_ALIGN(_ofst, _align) \ - ((void)BUILD_BUG_ON_ZERO((_ofst) & ((_align) - 1)), \ - (_ofst)) -#define _MCDI_DWORD(_buf, _field) \ - ((_buf) + (_MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, 4) >> 2)) - -#define MCDI_BYTE(_buf, _field) \ - ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \ - *MCDI_PTR(_buf, _field)) -#define MCDI_WORD(_buf, _field) \ - ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \ - le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field))) -#define MCDI_SET_DWORD(_buf, _field, _value) \ - CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), CDX_DWORD, _value) -#define MCDI_DWORD(_buf, _field) \ - CDX_DWORD_FIELD(*_MCDI_DWORD(_buf, _field), CDX_DWORD) -#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \ - CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \ - MC_CMD_ ## _name1, _value1) -#define MCDI_SET_QWORD(_buf, _field, _value) \ - do { \ - CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \ - CDX_DWORD, (u32)(_value)); \ - CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \ - CDX_DWORD, (u64)(_value) >> 32); \ - } while (0) -#define MCDI_QWORD(_buf, _field) \ - (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \ - (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32) - -#endif /* CDX_MCDI_H */ diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c index 885c69e6ebe5..8ae2d99be81e 100644 --- a/drivers/cdx/controller/mcdi_functions.c +++ b/drivers/cdx/controller/mcdi_functions.c @@ -5,7 +5,6 @@ #include -#include "mcdi.h" #include "mcdi_functions.h" int cdx_mcdi_get_num_buses(struct cdx_mcdi *cdx) diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h index b9942affdc6b..57fd1bae706b 100644 --- a/drivers/cdx/controller/mcdi_functions.h +++ b/drivers/cdx/controller/mcdi_functions.h @@ -8,7 +8,8 @@ #ifndef CDX_MCDI_FUNCTIONS_H #define CDX_MCDI_FUNCTIONS_H -#include "mcdi.h" +#include +#include "mcdid.h" #include "../cdx.h" /** diff --git a/drivers/cdx/controller/mcdid.h b/drivers/cdx/controller/mcdid.h new file mode 100644 index 000000000000..7fc29f099265 --- /dev/null +++ b/drivers/cdx/controller/mcdid.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2008-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2025, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_MCDID_H +#define CDX_MCDID_H + +#include +#include +#include + +#include "mc_cdx_pcol.h" + +#ifdef DEBUG +#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x) +#define CDX_WARN_ON_PARANOID(x) WARN_ON(x) +#else +#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0) +#define CDX_WARN_ON_PARANOID(x) do {} while (0) +#endif + +#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX) + +static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx) +{ + return cdx->mcdi ? &cdx->mcdi->iface : NULL; +} + +int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd, + const struct cdx_dword *inbuf, size_t inlen, + cdx_mcdi_async_completer *complete, + unsigned long cookie); +int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, + unsigned int timeout_jiffies); + +/* + * We expect that 16- and 32-bit fields in MCDI requests and responses + * are appropriately aligned, but 64-bit fields are only + * 32-bit-aligned. + */ +#define MCDI_BYTE(_buf, _field) \ + ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \ + *MCDI_PTR(_buf, _field)) +#define MCDI_WORD(_buf, _field) \ + ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \ + le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field))) +#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \ + CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \ + MC_CMD_ ## _name1, _value1) +#define MCDI_SET_QWORD(_buf, _field, _value) \ + do { \ + CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \ + CDX_DWORD, (u32)(_value)); \ + CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \ + CDX_DWORD, (u64)(_value) >> 32); \ + } while (0) +#define MCDI_QWORD(_buf, _field) \ + (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \ + (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32) + +#endif /* CDX_MCDID_H */ diff --git a/include/linux/cdx/bitfield.h b/include/linux/cdx/bitfield.h new file mode 100644 index 000000000000..567f8ec47582 --- /dev/null +++ b/include/linux/cdx/bitfield.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2005-2006 Fen Systems Ltd. + * Copyright 2006-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_BITFIELD_H +#define CDX_BITFIELD_H + +#include + +/* Lowest bit numbers and widths */ +#define CDX_DWORD_LBN 0 +#define CDX_DWORD_WIDTH 32 + +/* Specified attribute (e.g. LBN) of the specified field */ +#define CDX_VAL(field, attribute) field ## _ ## attribute +/* Low bit number of the specified field */ +#define CDX_LOW_BIT(field) CDX_VAL(field, LBN) +/* Bit width of the specified field */ +#define CDX_WIDTH(field) CDX_VAL(field, WIDTH) +/* High bit number of the specified field */ +#define CDX_HIGH_BIT(field) (CDX_LOW_BIT(field) + CDX_WIDTH(field) - 1) + +/* A doubleword (i.e. 4 byte) datatype - little-endian in HW */ +struct cdx_dword { + __le32 cdx_u32; +}; + +/* Value expanders for printk */ +#define CDX_DWORD_VAL(dword) \ + ((unsigned int)le32_to_cpu((dword).cdx_u32)) + +/* + * Extract bit field portion [low,high) from the 32-bit little-endian + * element which contains bits [min,max) + */ +#define CDX_DWORD_FIELD(dword, field) \ + (FIELD_GET(GENMASK(CDX_HIGH_BIT(field), CDX_LOW_BIT(field)), \ + le32_to_cpu((dword).cdx_u32))) + +/* + * Creates the portion of the named bit field that lies within the + * range [min,max). + */ +#define CDX_INSERT_FIELD(field, value) \ + (FIELD_PREP(GENMASK(CDX_HIGH_BIT(field), \ + CDX_LOW_BIT(field)), value)) + +/* + * Creates the portion of the named bit fields that lie within the + * range [min,max). + */ +#define CDX_INSERT_FIELDS(field1, value1, \ + field2, value2, \ + field3, value3, \ + field4, value4, \ + field5, value5, \ + field6, value6, \ + field7, value7) \ + (CDX_INSERT_FIELD(field1, (value1)) | \ + CDX_INSERT_FIELD(field2, (value2)) | \ + CDX_INSERT_FIELD(field3, (value3)) | \ + CDX_INSERT_FIELD(field4, (value4)) | \ + CDX_INSERT_FIELD(field5, (value5)) | \ + CDX_INSERT_FIELD(field6, (value6)) | \ + CDX_INSERT_FIELD(field7, (value7))) + +#define CDX_POPULATE_DWORD(dword, ...) \ + (dword).cdx_u32 = cpu_to_le32(CDX_INSERT_FIELDS(__VA_ARGS__)) + +/* Populate a dword field with various numbers of arguments */ +#define CDX_POPULATE_DWORD_7 CDX_POPULATE_DWORD +#define CDX_POPULATE_DWORD_6(dword, ...) \ + CDX_POPULATE_DWORD_7(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_5(dword, ...) \ + CDX_POPULATE_DWORD_6(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_4(dword, ...) \ + CDX_POPULATE_DWORD_5(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_3(dword, ...) \ + CDX_POPULATE_DWORD_4(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_2(dword, ...) \ + CDX_POPULATE_DWORD_3(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_1(dword, ...) \ + CDX_POPULATE_DWORD_2(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_SET_DWORD(dword) \ + CDX_POPULATE_DWORD_1(dword, CDX_DWORD, 0xffffffff) + +#endif /* CDX_BITFIELD_H */ diff --git a/include/linux/cdx/mcdi.h b/include/linux/cdx/mcdi.h new file mode 100644 index 000000000000..46e3f63b062a --- /dev/null +++ b/include/linux/cdx/mcdi.h @@ -0,0 +1,192 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2008-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_MCDI_H +#define CDX_MCDI_H + +#include +#include +#include + +#include "linux/cdx/bitfield.h" + +/** + * enum cdx_mcdi_mode - MCDI transaction mode + * @MCDI_MODE_EVENTS: wait for an mcdi response callback. + * @MCDI_MODE_FAIL: we think MCDI is dead, so fail-fast all calls + */ +enum cdx_mcdi_mode { + MCDI_MODE_EVENTS, + MCDI_MODE_FAIL, +}; + +#define MCDI_RPC_TIMEOUT (10 * HZ) +#define MCDI_RPC_LONG_TIMEOU (60 * HZ) +#define MCDI_RPC_POST_RST_TIME (10 * HZ) + +/** + * enum cdx_mcdi_cmd_state - State for an individual MCDI command + * @MCDI_STATE_QUEUED: Command not started and is waiting to run. + * @MCDI_STATE_RETRY: Command was submitted and MC rejected with no resources, + * as MC have too many outstanding commands. Command will be retried once + * another command returns. + * @MCDI_STATE_RUNNING: Command was accepted and is running. + * @MCDI_STATE_RUNNING_CANCELLED: Command is running but the issuer cancelled + * the command. + * @MCDI_STATE_FINISHED: Processing of this command has completed. + */ + +enum cdx_mcdi_cmd_state { + MCDI_STATE_QUEUED, + MCDI_STATE_RETRY, + MCDI_STATE_RUNNING, + MCDI_STATE_RUNNING_CANCELLED, + MCDI_STATE_FINISHED, +}; + +/** + * struct cdx_mcdi - CDX MCDI Firmware interface, to interact + * with CDX controller. + * @mcdi: MCDI interface + * @mcdi_ops: MCDI operations + * @r5_rproc : R5 Remoteproc device handle + * @rpdev: RPMsg device + * @ept: RPMsg endpoint + * @work: Post probe work + */ +struct cdx_mcdi { + /* MCDI interface */ + struct cdx_mcdi_data *mcdi; + const struct cdx_mcdi_ops *mcdi_ops; + + struct rproc *r5_rproc; + struct rpmsg_device *rpdev; + struct rpmsg_endpoint *ept; + struct work_struct work; +}; + +struct cdx_mcdi_ops { + void (*mcdi_request)(struct cdx_mcdi *cdx, + const struct cdx_dword *hdr, size_t hdr_len, + const struct cdx_dword *sdu, size_t sdu_len); + unsigned int (*mcdi_rpc_timeout)(struct cdx_mcdi *cdx, unsigned int cmd); +}; + +typedef void cdx_mcdi_async_completer(struct cdx_mcdi *cdx, + unsigned long cookie, int rc, + struct cdx_dword *outbuf, + size_t outlen_actual); + +/** + * struct cdx_mcdi_cmd - An outstanding MCDI command + * @ref: Reference count. There will be one reference if the command is + * in the mcdi_iface cmd_list, another if it's on a cleanup list, + * and a third if it's queued in the work queue. + * @list: The data for this entry in mcdi->cmd_list + * @cleanup_list: The data for this entry in a cleanup list + * @work: The work item for this command, queued in mcdi->workqueue + * @mcdi: The mcdi_iface for this command + * @state: The state of this command + * @inlen: inbuf length + * @inbuf: Input buffer + * @quiet: Whether to silence errors + * @reboot_seen: Whether a reboot has been seen during this command, + * to prevent duplicates + * @seq: Sequence number + * @started: Jiffies this command was started at + * @cookie: Context for completion function + * @completer: Completion function + * @handle: Command handle + * @cmd: Command number + * @rc: Return code + * @outlen: Length of output buffer + * @outbuf: Output buffer + */ +struct cdx_mcdi_cmd { + struct kref ref; + struct list_head list; + struct list_head cleanup_list; + struct work_struct work; + struct cdx_mcdi_iface *mcdi; + enum cdx_mcdi_cmd_state state; + size_t inlen; + const struct cdx_dword *inbuf; + bool quiet; + bool reboot_seen; + u8 seq; + unsigned long started; + unsigned long cookie; + cdx_mcdi_async_completer *completer; + unsigned int handle; + unsigned int cmd; + int rc; + size_t outlen; + struct cdx_dword *outbuf; + /* followed by inbuf data if necessary */ +}; + +/** + * struct cdx_mcdi_iface - MCDI protocol context + * @cdx: The associated NIC + * @iface_lock: Serialise access to this structure + * @outstanding_cleanups: Count of cleanups + * @cmd_list: List of outstanding and running commands + * @workqueue: Workqueue used for delayed processing + * @cmd_complete_wq: Waitqueue for command completion + * @db_held_by: Command the MC doorbell is in use by + * @seq_held_by: Command each sequence number is in use by + * @prev_handle: The last used command handle + * @mode: Poll for mcdi completion, or wait for an mcdi_event + * @prev_seq: The last used sequence number + * @new_epoch: Indicates start of day or start of MC reboot recovery + */ +struct cdx_mcdi_iface { + struct cdx_mcdi *cdx; + /* Serialise access */ + struct mutex iface_lock; + unsigned int outstanding_cleanups; + struct list_head cmd_list; + struct workqueue_struct *workqueue; + wait_queue_head_t cmd_complete_wq; + struct cdx_mcdi_cmd *db_held_by; + struct cdx_mcdi_cmd *seq_held_by[16]; + unsigned int prev_handle; + enum cdx_mcdi_mode mode; + u8 prev_seq; + bool new_epoch; +}; + +/** + * struct cdx_mcdi_data - extra state for NICs that implement MCDI + * @iface: Interface/protocol state + * @fn_flags: Flags for this function, as returned by %MC_CMD_DRV_ATTACH. + */ +struct cdx_mcdi_data { + struct cdx_mcdi_iface iface; + u32 fn_flags; +}; + +/* + * We expect that 16- and 32-bit fields in MCDI requests and responses + * are appropriately aligned, but 64-bit fields are only + * 32-bit-aligned. + */ +#define MCDI_DECLARE_BUF(_name, _len) struct cdx_dword _name[DIV_ROUND_UP(_len, 4)] = {{0}} +#define _MCDI_PTR(_buf, _offset) \ + ((u8 *)(_buf) + (_offset)) +#define MCDI_PTR(_buf, _field) \ + _MCDI_PTR(_buf, MC_CMD_ ## _field ## _OFST) +#define _MCDI_CHECK_ALIGN(_ofst, _align) \ + ((void)BUILD_BUG_ON_ZERO((_ofst) & ((_align) - 1)), \ + (_ofst)) +#define _MCDI_DWORD(_buf, _field) \ + ((_buf) + (_MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, 4) >> 2)) + +#define MCDI_SET_DWORD(_buf, _field, _value) \ + CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), CDX_DWORD, _value) +#define MCDI_DWORD(_buf, _field) \ + CDX_DWORD_FIELD(*_MCDI_DWORD(_buf, _field), CDX_DWORD) +#endif /* CDX_MCDI_H */ -- cgit v1.2.3 From f99b3917789d83ea89b24b722d784956f8289f45 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 15 Sep 2025 14:57:29 +0200 Subject: fs: rename generic_delete_inode() and generic_drop_inode() generic_delete_inode() is rather misleading for what the routine is doing. inode_just_drop() should be much clearer. The new naming is inconsistent with generic_drop_inode(), so rename that one as well with inode_ as the suffix. No functional changes. Signed-off-by: Mateusz Guzik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 4 ++-- Documentation/filesystems/vfs.rst | 4 ++-- block/bdev.c | 2 +- drivers/dax/super.c | 2 +- drivers/misc/ibmasm/ibmasmfs.c | 2 +- drivers/usb/gadget/function/f_fs.c | 2 +- drivers/usb/gadget/legacy/inode.c | 2 +- fs/9p/vfs_super.c | 2 +- fs/afs/inode.c | 4 ++-- fs/btrfs/inode.c | 2 +- fs/ceph/super.c | 2 +- fs/configfs/mount.c | 2 +- fs/efivarfs/super.c | 2 +- fs/ext4/super.c | 2 +- fs/f2fs/super.c | 2 +- fs/fuse/inode.c | 2 +- fs/gfs2/super.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/inode.c | 6 +++--- fs/kernfs/mount.c | 2 +- fs/nfs/inode.c | 2 +- fs/ocfs2/dlmfs/dlmfs.c | 2 +- fs/orangefs/super.c | 2 +- fs/overlayfs/super.c | 2 +- fs/pidfs.c | 2 +- fs/proc/inode.c | 2 +- fs/pstore/inode.c | 2 +- fs/ramfs/inode.c | 2 +- fs/smb/client/cifsfs.c | 2 +- fs/ubifs/super.c | 2 +- fs/xfs/xfs_super.c | 2 +- include/linux/fs.h | 4 ++-- kernel/bpf/inode.c | 2 +- mm/shmem.c | 2 +- 34 files changed, 40 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 85f590254f07..b5db45c0094c 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -340,8 +340,8 @@ of those. Caller makes sure async writeback cannot be running for the inode whil ->drop_inode() returns int now; it's called on final iput() with inode->i_lock held and it returns true if filesystems wants the inode to be -dropped. As before, generic_drop_inode() is still the default and it's been -updated appropriately. generic_delete_inode() is also alive and it consists +dropped. As before, inode_generic_drop() is still the default and it's been +updated appropriately. inode_just_drop() is also alive and it consists simply of return 1. Note that all actual eviction work is done by caller after ->drop_inode() returns. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 486a91633474..7a314eee6305 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -327,11 +327,11 @@ or bottom half). inode->i_lock spinlock held. This method should be either NULL (normal UNIX filesystem - semantics) or "generic_delete_inode" (for filesystems that do + semantics) or "inode_just_drop" (for filesystems that do not want to cache inodes - causing "delete_inode" to always be called regardless of the value of i_nlink) - The "generic_delete_inode()" behavior is equivalent to the old + The "inode_just_drop()" behavior is equivalent to the old practice of using "force_delete" in the put_inode() case, but does not have the races that the "force_delete()" approach had. diff --git a/block/bdev.c b/block/bdev.c index b77ddd12dc06..810707cca970 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -412,7 +412,7 @@ static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = bdev_evict_inode, }; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 54c480e874cb..d7714d8afb0f 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -388,7 +388,7 @@ static const struct super_operations dax_sops = { .alloc_inode = dax_alloc_inode, .destroy_inode = dax_destroy_inode, .free_inode = dax_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static int dax_init_fs_context(struct fs_context *fc) diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index c44de892a61e..5372ed2a363e 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -94,7 +94,7 @@ static int ibmasmfs_init_fs_context(struct fs_context *fc) static const struct super_operations ibmasmfs_s_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct file_operations *ibmasmfs_dir_ops = &simple_dir_operations; diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 08a251df20c4..5246fa6af3d6 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1891,7 +1891,7 @@ static struct dentry *ffs_sb_create_file(struct super_block *sb, /* Super block */ static const struct super_operations ffs_sb_operations = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; struct ffs_sb_fill_data { diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index b51e132b0cd2..13c3da49348c 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -2011,7 +2011,7 @@ gadgetfs_create_file (struct super_block *sb, char const *name, static const struct super_operations gadget_fs_operations = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static int diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 795c6388744c..1581ebac5bb4 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) - return generic_drop_inode(inode); + return inode_generic_drop(inode); /* * in case of non cached mode always drop the * inode because we want the inode attribute diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e9538e91f848..e1cb17b85791 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode) _enter(""); if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) - return generic_delete_inode(inode); + return inode_just_drop(inode); else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b77dd22b8cdb..2081654dea5c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7953,7 +7953,7 @@ int btrfs_drop_inode(struct inode *inode) if (btrfs_root_refs(&root->root_item) == 0) return 1; else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } static void init_once(void *foo) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c3eb651862c5..70dc9467f6a0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 740f18b60c9d..456c4a2efb53 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode) static const struct super_operations configfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .free_inode = configfs_free_inode, }; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index c4a139911356..6f131910be9d 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb); static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .alloc_inode = efivarfs_alloc_inode, .free_inode = efivarfs_free_inode, .show_options = efivarfs_show_options, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7d39da7e733..848e059d58cf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) static int ext4_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e16c4e2830c2..63cf73409da6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1768,7 +1768,7 @@ static int f2fs_drop_inode(struct inode *inode) trace_f2fs_drop_inode(inode, 0); return 0; } - ret = generic_drop_inode(inode); + ret = inode_generic_drop(inode); if (!ret) ret = fscrypt_drop_inode(inode); trace_f2fs_drop_inode(inode, ret); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ecb869e895ab..63390575b6fd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1209,7 +1209,7 @@ static const struct super_operations fuse_super_operations = { .free_inode = fuse_free_inode, .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b42e2110084b..644b2d1e7276 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(SDF_EVICTING, &sdp->sd_flags)) return 1; - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /** diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 01e516175bcd..1e1acf5775ab 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations hostfs_sbops = { .alloc_inode = hostfs_alloc_inode, .free_inode = hostfs_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = hostfs_evict_inode, .statfs = hostfs_statfs, .show_options = hostfs_show_options, diff --git a/fs/inode.c b/fs/inode.c index 7b81d4a101b8..a66c02123bca 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1838,11 +1838,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, EXPORT_SYMBOL(insert_inode_locked4); -int generic_delete_inode(struct inode *inode) +int inode_just_drop(struct inode *inode) { return 1; } -EXPORT_SYMBOL(generic_delete_inode); +EXPORT_SYMBOL(inode_just_drop); /* * Called when we're dropping the last reference @@ -1866,7 +1866,7 @@ static void iput_final(struct inode *inode) if (op->drop_inode) drop = op->drop_inode(inode); else - drop = generic_drop_inode(inode); + drop = inode_generic_drop(inode); if (!drop && !(inode->i_state & I_DONTCACHE) && diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e384a69fbece..76eaf64b9d9e 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) const struct super_operations kernfs_sops = { .statfs = kernfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae423..cbd8a7f9c617 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid) int nfs_drop_inode(struct inode *inode) { - return NFS_STALE(inode) || generic_drop_inode(inode); + return NFS_STALE(inode) || inode_generic_drop(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 5130ec44e5e1..807e2b758a5c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = { .alloc_inode = dlmfs_alloc_inode, .free_inode = dlmfs_free_inode, .evict_inode = dlmfs_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct inode_operations dlmfs_file_inode_operations = { diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index f3da840758e7..b46100a4f529 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = { .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .write_inode = orangefs_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .statfs = orangefs_statfs, .show_options = orangefs_show_options, }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index df85a76597e9..bd3d7ba8fb95 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -280,7 +280,7 @@ static const struct super_operations ovl_super_operations = { .alloc_inode = ovl_alloc_inode, .free_inode = ovl_free_inode, .destroy_inode = ovl_destroy_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = ovl_put_super, .sync_fs = ovl_sync_fs, .statfs = ovl_statfs, diff --git a/fs/pidfs.c b/fs/pidfs.c index edc35522d75c..3865272178a8 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode) } static const struct super_operations pidfs_sops = { - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 129490151be1..d9b7ef122343 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1a2e1185426c..b4e55c90f8dc 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc) static const struct super_operations pstore_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pstore_evict_inode, .show_options = pstore_show_options, }; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index f8874c3b8c1e..41f9995da7ca 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations ramfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = ramfs_show_options, }; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 3bd85ab2deb1..b4075385a649 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode) /* no serverino => unconditional eviction */ return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || - generic_drop_inode(inode); + inode_generic_drop(inode); } static const struct super_operations cifs_super_ops = { diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b2068608..733fd1e5a9a2 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) static int ubifs_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a77..a05ff68748dc 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -778,7 +778,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode); + return inode_generic_drop(inode); } STATIC void diff --git a/include/linux/fs.h b/include/linux/fs.h index 4daf9b30a641..724b9af67f35 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3312,8 +3312,8 @@ extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); -extern int generic_delete_inode(struct inode *inode); -static inline int generic_drop_inode(struct inode *inode) +extern int inode_just_drop(struct inode *inode); +static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392..6d021d18afa6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode) const struct super_operations bpf_super_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = bpf_show_options, .free_inode = bpf_free_inode, }; diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b..932727247c64 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5341,7 +5341,7 @@ static const struct super_operations shmem_ops = { .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, -- cgit v1.2.3 From 8b0d03129b6165bbf8c9494897489c6da6fadd58 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 8 Sep 2025 17:26:46 +0530 Subject: cdx: Export Symbols for MCDI RPC and Initialization The cdx_mcdi_init(), cdx_mcdi_process_cmd(), and cdx_mcdi_rpc() functions are needed by the VersalNET EDAC module that interact with the MCDI (Management Controller Direct Interface) framework. These functions facilitate communication between different hardware components by enabling command execution and status management. Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Acked-by: Nikhil Agarwal Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com --- drivers/cdx/controller/mcdi.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/cdx/mcdi.h | 7 +++++++ 2 files changed, 45 insertions(+) (limited to 'include') diff --git a/drivers/cdx/controller/mcdi.c b/drivers/cdx/controller/mcdi.c index 90bf9f7c257b..2e82ffc18d89 100644 --- a/drivers/cdx/controller/mcdi.c +++ b/drivers/cdx/controller/mcdi.c @@ -100,6 +100,19 @@ static unsigned long cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd return cdx->mcdi_ops->mcdi_rpc_timeout(cdx, cmd); } +/** + * cdx_mcdi_init - Initialize MCDI (Management Controller Driver Interface) state + * @cdx: Handle to the CDX MCDI structure + * + * This function allocates and initializes internal MCDI structures and resources + * for the CDX device, including the workqueue, locking primitives, and command + * tracking mechanisms. It sets the initial operating mode and prepares the device + * for MCDI operations. + * + * Return: + * * 0 - on success + * * -ENOMEM - if memory allocation or workqueue creation fails + */ int cdx_mcdi_init(struct cdx_mcdi *cdx) { struct cdx_mcdi_iface *mcdi; @@ -129,7 +142,16 @@ fail2: fail: return rc; } +EXPORT_SYMBOL_GPL(cdx_mcdi_init); +/** + * cdx_mcdi_finish - Cleanup MCDI (Management Controller Driver Interface) state + * @cdx: Handle to the CDX MCDI structure + * + * This function is responsible for cleaning up the MCDI (Management Controller Driver Interface) + * resources associated with a cdx_mcdi structure. Also destroys the mcdi workqueue. + * + */ void cdx_mcdi_finish(struct cdx_mcdi *cdx) { struct cdx_mcdi_iface *mcdi; @@ -144,6 +166,7 @@ void cdx_mcdi_finish(struct cdx_mcdi *cdx) kfree(cdx->mcdi); cdx->mcdi = NULL; } +EXPORT_SYMBOL_GPL(cdx_mcdi_finish); static bool cdx_mcdi_flushed(struct cdx_mcdi_iface *mcdi, bool ignore_cleanups) { @@ -554,6 +577,19 @@ static void cdx_mcdi_start_or_queue(struct cdx_mcdi_iface *mcdi, cdx_mcdi_cmd_start_or_queue(mcdi, cmd); } +/** + * cdx_mcdi_process_cmd - Process an incoming MCDI response + * @cdx: Handle to the CDX MCDI structure + * @outbuf: Pointer to the response buffer received from the management controller + * @len: Length of the response buffer in bytes + * + * This function handles a response from the management controller. It locates the + * corresponding command using the sequence number embedded in the header, + * completes the command if it is still pending, and initiates any necessary cleanup. + * + * The function assumes that the response buffer is well-formed and at least one + * dword in size. + */ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len) { struct cdx_mcdi_iface *mcdi; @@ -591,6 +627,7 @@ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int le cdx_mcdi_process_cleanup_list(mcdi->cdx, &cleanup_list); } +EXPORT_SYMBOL_GPL(cdx_mcdi_process_cmd); static void cdx_mcdi_cmd_work(struct work_struct *context) { @@ -758,6 +795,7 @@ int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, return cdx_mcdi_rpc_sync(cdx, cmd, inbuf, inlen, outbuf, outlen, outlen_actual, false); } +EXPORT_SYMBOL_GPL(cdx_mcdi_rpc); /** * cdx_mcdi_rpc_async - Schedule an MCDI command to run asynchronously diff --git a/include/linux/cdx/mcdi.h b/include/linux/cdx/mcdi.h index 46e3f63b062a..74075305cba4 100644 --- a/include/linux/cdx/mcdi.h +++ b/include/linux/cdx/mcdi.h @@ -169,6 +169,13 @@ struct cdx_mcdi_data { u32 fn_flags; }; +void cdx_mcdi_finish(struct cdx_mcdi *cdx); +int cdx_mcdi_init(struct cdx_mcdi *cdx); +void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len); +int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, + const struct cdx_dword *inbuf, size_t inlen, + struct cdx_dword *outbuf, size_t outlen, size_t *outlen_actual); + /* * We expect that 16- and 32-bit fields in MCDI requests and responses * are appropriately aligned, but 64-bit fields are only -- cgit v1.2.3 From d5fe2fec6c40dda03df8cc9b4a97de0b7e39f984 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 8 Sep 2025 17:26:49 +0530 Subject: EDAC: Add a driver for the AMD Versal NET DDR controller Add a driver for the AMD Versal NET DDR memory controller which supports single bit error correction, double bit error detection and other system errors from various IP subsystems (e.g., RPU, NOCs, HNICX, PL). The driver listens for notifications from the NMC (Network management controller) using RPMsg (Remote Processor Messaging). The channel used for communicating to RPMsg is named "error_edac". Upon receipt of a notification, the driver sends a RAS event trace. [ bp: - Fixup title - Rewrite commit message - Fixup Kconfig text - Zap unused defines and align them - Simplify rpmsg_cb() considerably - Drop silly double-brackets in conditionals - Use proper void * type in mcdi_request() - Do not clear chinfo in rpmsg_probe() unnecessarily - Fix indentation - Do a proper err unwind path in init_versalnet() - Redo the error unwind path in mc_probe() properly - Fix the ordering in mc_remove() ] Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com Link: https://lore.kernel.org/r/20250703173105.GLaGa-WQCESDNsqygm@fat_crate.local --- MAINTAINERS | 7 + drivers/edac/Kconfig | 8 + drivers/edac/Makefile | 1 + drivers/edac/versalnet_edac.c | 958 ++++++++++++++++++++++++++++++++++++++ include/linux/cdx/edac_cdx_pcol.h | 28 ++ 5 files changed, 1002 insertions(+) create mode 100644 drivers/edac/versalnet_edac.c create mode 100644 include/linux/cdx/edac_cdx_pcol.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index ac20587a3e98..23254e89a1f8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27631,6 +27631,13 @@ S: Maintained F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml F: drivers/edac/versal_edac.c +XILINX VERSALNET EDAC DRIVER +M: Shubhrajyoti Datta +S: Maintained +F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml +F: drivers/edac/versalnet_edac.c +F: include/linux/cdx/edac_cdx_pcol.h + XILINX WATCHDOG DRIVER M: Srinivas Neeli R: Shubhrajyoti Datta diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index b824472208c4..39352b9b7a7e 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -584,4 +584,12 @@ config EDAC_CORTEX_A72 The detected and reported errors are from reading CPU/L2 memory error syndrome registers. +config EDAC_VERSALNET + tristate "AMD VersalNET DDR Controller" + depends on CDX_CONTROLLER && ARCH_ZYNQMP + help + Support for single bit error correction, double bit error detection + and other system errors from various IP subsystems like RPU, NOCs, + HNICX, PL on the AMD Versal NET DDR memory controller. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index c1736f264320..1c14796410a3 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -88,4 +88,5 @@ obj-$(CONFIG_EDAC_NPCM) += npcm_edac.o obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o obj-$(CONFIG_EDAC_VERSAL) += versal_edac.o obj-$(CONFIG_EDAC_LOONGSON) += loongson_edac.o +obj-$(CONFIG_EDAC_VERSALNET) += versalnet_edac.o obj-$(CONFIG_EDAC_CORTEX_A72) += a72_edac.o diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c new file mode 100644 index 000000000000..66714fffa591 --- /dev/null +++ b/drivers/edac/versalnet_edac.c @@ -0,0 +1,958 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Versal NET memory controller driver + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "edac_module.h" + +/* Granularity of reported error in bytes */ +#define MC5_ERR_GRAIN 1 +#define MC_GET_DDR_CONFIG_IN_LEN 4 + +#define MC5_IRQ_CE_MASK GENMASK(18, 15) +#define MC5_IRQ_UE_MASK GENMASK(14, 11) + +#define MC5_RANK_1_MASK GENMASK(11, 6) +#define MASK_24 GENMASK(29, 24) +#define MASK_0 GENMASK(5, 0) + +#define MC5_LRANK_1_MASK GENMASK(11, 6) +#define MC5_LRANK_2_MASK GENMASK(17, 12) +#define MC5_BANK1_MASK GENMASK(11, 6) +#define MC5_GRP_0_MASK GENMASK(17, 12) +#define MC5_GRP_1_MASK GENMASK(23, 18) + +#define MC5_REGHI_ROW 7 +#define MC5_EACHBIT 1 +#define MC5_ERR_TYPE_CE 0 +#define MC5_ERR_TYPE_UE 1 +#define MC5_HIGH_MEM_EN BIT(20) +#define MC5_MEM_MASK GENMASK(19, 0) +#define MC5_X16_BASE 256 +#define MC5_X16_ECC 32 +#define MC5_X16_SIZE (MC5_X16_BASE + MC5_X16_ECC) +#define MC5_X32_SIZE 576 +#define MC5_HIMEM_BASE (256 * SZ_1M) +#define MC5_ILC_HIMEM_EN BIT(28) +#define MC5_ILC_MEM GENMASK(27, 0) +#define MC5_INTERLEAVE_SEL GENMASK(3, 0) +#define MC5_BUS_WIDTH_MASK GENMASK(19, 18) +#define MC5_NUM_CHANS_MASK BIT(17) +#define MC5_RANK_MASK GENMASK(15, 14) + +#define ERROR_LEVEL 2 +#define ERROR_ID 3 +#define TOTAL_ERR_LENGTH 5 +#define MSG_ERR_OFFSET 8 +#define MSG_ERR_LENGTH 9 +#define ERROR_DATA 10 +#define MCDI_RESPONSE 0xFF + +#define REG_MAX 152 +#define ADEC_MAX 152 +#define NUM_CONTROLLERS 8 +#define REGS_PER_CONTROLLER 19 +#define ADEC_NUM 19 +#define BUFFER_SZ 80 + +#define XDDR5_BUS_WIDTH_64 0 +#define XDDR5_BUS_WIDTH_32 1 +#define XDDR5_BUS_WIDTH_16 2 + +/** + * struct ecc_error_info - ECC error log information. + * @burstpos: Burst position. + * @lrank: Logical Rank number. + * @rank: Rank number. + * @group: Group number. + * @bank: Bank number. + * @col: Column number. + * @row: Row number. + * @rowhi: Row number higher bits. + * @i: Combined ECC error vector containing encoded values of burst position, + * rank, bank, column, and row information. + */ +union ecc_error_info { + struct { + u32 burstpos:3; + u32 lrank:4; + u32 rank:2; + u32 group:3; + u32 bank:2; + u32 col:11; + u32 row:7; + u32 rowhi; + }; + u64 i; +} __packed; + +/* Row and column bit positions in the address decoder (ADEC) registers. */ +union row_col_mapping { + struct { + u32 row0:6; + u32 row1:6; + u32 row2:6; + u32 row3:6; + u32 row4:6; + u32 reserved:2; + }; + struct { + u32 col1:6; + u32 col2:6; + u32 col3:6; + u32 col4:6; + u32 col5:6; + u32 reservedcol:2; + }; + u32 i; +} __packed; + +/** + * struct ecc_status - ECC status information to report. + * @ceinfo: Correctable errors. + * @ueinfo: Uncorrected errors. + * @channel: Channel number. + * @error_type: Error type. + */ +struct ecc_status { + union ecc_error_info ceinfo[2]; + union ecc_error_info ueinfo[2]; + u8 channel; + u8 error_type; +}; + +/** + * struct mc_priv - DDR memory controller private instance data. + * @message: Buffer for framing the event specific info. + * @stat: ECC status information. + * @error_id: The error id. + * @error_level: The error level. + * @dwidth: Width of data bus excluding ECC bits. + * @part_len: The support of the message received. + * @regs: The registers sent on the rpmsg. + * @adec: Address decode registers. + * @mci: Memory controller interface. + * @ept: rpmsg endpoint. + * @mcdi: The mcdi handle. + */ +struct mc_priv { + char message[256]; + struct ecc_status stat; + u32 error_id; + u32 error_level; + u32 dwidth; + u32 part_len; + u32 regs[REG_MAX]; + u32 adec[ADEC_MAX]; + struct mem_ctl_info *mci[NUM_CONTROLLERS]; + struct rpmsg_endpoint *ept; + struct cdx_mcdi *mcdi; +}; + +/* + * Address decoder (ADEC) registers to match the order in which the register + * information is received from the firmware. + */ +enum adec_info { + CONF = 0, + ADEC0, + ADEC1, + ADEC2, + ADEC3, + ADEC4, + ADEC5, + ADEC6, + ADEC7, + ADEC8, + ADEC9, + ADEC10, + ADEC11, + ADEC12, + ADEC13, + ADEC14, + ADEC15, + ADEC16, + ADECILC, +}; + +enum reg_info { + ISR = 0, + IMR, + ECCR0_ERR_STATUS, + ECCR0_ADDR_LO, + ECCR0_ADDR_HI, + ECCR0_DATA_LO, + ECCR0_DATA_HI, + ECCR0_PAR, + ECCR1_ERR_STATUS, + ECCR1_ADDR_LO, + ECCR1_ADDR_HI, + ECCR1_DATA_LO, + ECCR1_DATA_HI, + ECCR1_PAR, + XMPU_ERR, + XMPU_ERR_ADDR_L0, + XMPU_ERR_ADDR_HI, + XMPU_ERR_AXI_ID, + ADEC_CHK_ERR_LOG, +}; + +static bool get_ddr_info(u32 *error_data, struct mc_priv *priv) +{ + u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr; + struct ecc_status *p; + + isr = error_data[ISR]; + + if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK))) + return false; + + eccr0_val = error_data[ECCR0_ERR_STATUS]; + eccr1_val = error_data[ECCR1_ERR_STATUS]; + + if (!eccr0_val && !eccr1_val) + return false; + + p = &priv->stat; + + if (!eccr0_val) + p->channel = 1; + else + p->channel = 0; + + reglo = error_data[ECCR0_ADDR_LO]; + reghi = error_data[ECCR0_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[0].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[0].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR0_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + reglo = error_data[ECCR1_ADDR_LO]; + reghi = error_data[ECCR1_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[1].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[1].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR1_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + return true; +} + +/** + * convert_to_physical - Convert @error_data to a physical address. + * @priv: DDR memory controller private instance data. + * @pinf: ECC error info structure. + * @controller: Controller number of the MC5 + * @error_data: the DDRMC5 ADEC address decoder register data + * + * Return: physical address of the DDR memory. + */ +static unsigned long convert_to_physical(struct mc_priv *priv, + union ecc_error_info pinf, + int controller, int *error_data) +{ + u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset; + u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base; + unsigned long err_addr = 0, addr; + union row_col_mapping cols; + union row_col_mapping rows; + u32 col_bit_0; + + row = pinf.rowhi << MC5_REGHI_ROW | pinf.row; + offset = controller * ADEC_NUM; + + reg = error_data[ADEC6]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC7]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC8]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + + reg = error_data[ADEC9]; + rows.i = reg; + + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + + col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]); + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << col_bit_0; + + cols.i = error_data[ADEC10]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + cols.i = error_data[ADEC11]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + reg = error_data[ADEC12]; + err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0); + pinf.bank >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg); + pinf.bank >>= MC5_EACHBIT; + + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.group >>= MC5_EACHBIT; + + reg = error_data[ADEC4]; + err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0); + pinf.rank >>= MC5_EACHBIT; + err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg); + pinf.rank >>= MC5_EACHBIT; + + reg = error_data[ADEC5]; + err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.lrank >>= MC5_EACHBIT; + + high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE; + interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL; + + high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK; + low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK; + reg = priv->adec[ADEC14 + offset]; + ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN); + ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M; + if (ilc_himem_en) + ilc_base_ctrl_add = ilcmem_base - high_mem_offset; + else + ilc_base_ctrl_add = ilcmem_base - low_mem_offset; + + if (priv->dwidth == DEV_X16) { + blk = err_addr / MC5_X16_SIZE; + rsh_req_addr = (blk << 8) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } else { + blk = err_addr / MC5_X32_SIZE; + rsh_req_addr = (blk << 9) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } + + if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base) + addr = err_addr - high_mem_offset; + else + addr = err_addr - low_mem_offset; + + return addr; +} + +/** + * handle_error - Handle errors. + * @priv: DDR memory controller private instance data. + * @stat: ECC status structure. + * @ctl_num: Controller number of the MC5 + * @error_data: the MC5 ADEC address decoder register data + * + * Handles ECC correctable and uncorrectable errors. + */ +static void handle_error(struct mc_priv *priv, struct ecc_status *stat, + int ctl_num, int *error_data) +{ + union ecc_error_info pinf; + struct mem_ctl_info *mci; + unsigned long pa; + phys_addr_t pfn; + int err; + + if (WARN_ON_ONCE(ctl_num > NUM_CONTROLLERS)) + return; + + mci = priv->mci[ctl_num]; + + if (stat->error_type == MC5_ERR_TYPE_CE) { + pinf = stat->ceinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s Controller %d Addr at %lx\n", + "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + } + + if (stat->error_type == MC5_ERR_TYPE_UE) { + pinf = stat->ueinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s controller %d Addr at %lx\n", + "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + pa = convert_to_physical(priv, pinf, ctl_num, error_data); + pfn = PHYS_PFN(pa); + + if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) { + err = memory_failure(pfn, MF_ACTION_REQUIRED); + if (err) + edac_dbg(2, "memory_failure() error: %d", err); + else + edac_dbg(2, "Poison page at PA 0x%lx\n", pa); + } + } +} + +static void mc_init(struct mem_ctl_info *mci, struct device *dev) +{ + struct mc_priv *priv = mci->pvt_info; + struct csrow_info *csi; + struct dimm_info *dimm; + u32 row; + int ch; + + /* Initialize controller capabilities and configuration */ + mci->mtype_cap = MEM_FLAG_DDR5; + mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; + mci->scrub_cap = SCRUB_HW_SRC; + mci->scrub_mode = SCRUB_NONE; + + mci->edac_cap = EDAC_FLAG_SECDED; + mci->ctl_name = "VersalNET DDR5"; + mci->dev_name = dev_name(dev); + mci->mod_name = "versalnet_edac"; + + edac_op_state = EDAC_OPSTATE_INT; + + for (row = 0; row < mci->nr_csrows; row++) { + csi = mci->csrows[row]; + for (ch = 0; ch < csi->nr_channels; ch++) { + dimm = csi->channels[ch]->dimm; + dimm->edac_mode = EDAC_SECDED; + dimm->mtype = MEM_DDR5; + dimm->grain = MC5_ERR_GRAIN; + dimm->dtype = priv->dwidth; + } + } +} + +#define to_mci(k) container_of(k, struct mem_ctl_info, dev) + +static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) +{ + return MCDI_RPC_TIMEOUT; +} + +static void mcdi_request(struct cdx_mcdi *cdx, + const struct cdx_dword *hdr, size_t hdr_len, + const struct cdx_dword *sdu, size_t sdu_len) +{ + void *send_buf; + int ret; + + send_buf = kzalloc(hdr_len + sdu_len, GFP_KERNEL); + if (!send_buf) + return; + + memcpy(send_buf, hdr, hdr_len); + memcpy(send_buf + hdr_len, sdu, sdu_len); + + ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len); + if (ret) + dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret); + + kfree(send_buf); +} + +static const struct cdx_mcdi_ops mcdi_ops = { + .mcdi_rpc_timeout = mcdi_rpc_timeout, + .mcdi_request = mcdi_request, +}; + +static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi) +{ + size_t outlen; + int ret; + + MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN); + MCDI_DECLARE_BUF(outbuf, BUFFER_SZ); + + MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index); + + ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), &outlen); + if (!ret) + memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG), + (ADEC_NUM * 4)); +} + +static int setup_mcdi(struct mc_priv *mc_priv) +{ + struct cdx_mcdi *amd_mcdi; + int ret, i; + + amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL); + if (!amd_mcdi) + return -ENOMEM; + + amd_mcdi->mcdi_ops = &mcdi_ops; + ret = cdx_mcdi_init(amd_mcdi); + if (ret) { + kfree(amd_mcdi); + return ret; + } + + amd_mcdi->ept = mc_priv->ept; + mc_priv->mcdi = amd_mcdi; + + for (i = 0; i < NUM_CONTROLLERS; i++) + get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi); + + return 0; +} + +static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2, + 0xb8, 0xb4, 0x45, 0x56, 0x2e, + 0x8c, 0x5b, 0xec); + +static int rpmsg_cb(struct rpmsg_device *rpdev, void *data, + int len, void *priv, u32 src) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + const guid_t *sec_type = &guid_null; + u32 length, offset, error_id; + u32 *result = (u32 *)data; + struct ecc_status *p; + int i, j, k, sec_sev; + const char *err_str; + u32 *adec_data; + + if (*(u8 *)data == MCDI_RESPONSE) { + cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len); + return 0; + } + + sec_sev = result[ERROR_LEVEL]; + error_id = result[ERROR_ID]; + length = result[MSG_ERR_LENGTH]; + offset = result[MSG_ERR_OFFSET]; + + if (result[TOTAL_ERR_LENGTH] > length) { + if (!mc_priv->part_len) + mc_priv->part_len = length; + else + mc_priv->part_len += length; + /* + * The data can come in 2 stretches. Construct the regs from 2 + * messages the offset indicates the offset from which the data is to + * be taken + */ + for (i = 0 ; i < length; i++) { + k = offset + i; + j = ERROR_DATA + i; + mc_priv->regs[k] = result[j]; + } + if (mc_priv->part_len < result[TOTAL_ERR_LENGTH]) + return 0; + mc_priv->part_len = 0; + } + + mc_priv->error_id = error_id; + mc_priv->error_level = result[ERROR_LEVEL]; + + switch (error_id) { + case 5: err_str = "General Software Non-Correctable error"; break; + case 6: err_str = "CFU error"; break; + case 7: err_str = "CFRAME error"; break; + case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break; + case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break; + case 15: err_str = "MMCM error"; break; + case 16: err_str = "HNICX Correctable error"; break; + case 17: err_str = "HNICX Non-Correctable error"; break; + + case 18: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_CE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + case 19: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_UE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + + case 21: err_str = "GT Non-Correctable error"; break; + case 22: err_str = "PL Sysmon Correctable error"; break; + case 23: err_str = "PL Sysmon Non-Correctable error"; break; + case 111: err_str = "LPX unexpected dfx activation error"; break; + case 114: err_str = "INT_LPD Non-Correctable error"; break; + case 116: err_str = "INT_OCM Non-Correctable error"; break; + case 117: err_str = "INT_FPD Correctable error"; break; + case 118: err_str = "INT_FPD Non-Correctable error"; break; + case 120: err_str = "INT_IOU Non-Correctable error"; break; + case 123: err_str = "err_int_irq from APU GIC Distributor"; break; + case 124: err_str = "fault_int_irq from APU GIC Distribute"; break; + case 132 ... 139: err_str = "FPX SPLITTER error"; break; + case 140: err_str = "APU Cluster 0 error"; break; + case 141: err_str = "APU Cluster 1 error"; break; + case 142: err_str = "APU Cluster 2 error"; break; + case 143: err_str = "APU Cluster 3 error"; break; + case 145: err_str = "WWDT1 LPX error"; break; + case 147: err_str = "IPI error"; break; + case 152 ... 153: err_str = "AFIFS error"; break; + case 154 ... 155: err_str = "LPX glitch error"; break; + case 185 ... 186: err_str = "FPX AFIFS error"; break; + case 195 ... 199: err_str = "AFIFM error"; break; + case 108: err_str = "PSM Correctable error"; break; + case 59: err_str = "PMC correctable error"; break; + case 60: err_str = "PMC Un correctable error"; break; + case 43 ... 47: err_str = "PMC Sysmon error"; break; + case 163 ... 184: err_str = "RPU error"; break; + case 148: err_str = "OCM0 correctable error"; break; + case 149: err_str = "OCM1 correctable error"; break; + case 150: err_str = "OCM0 Un-correctable error"; break; + case 151: err_str = "OCM1 Un-correctable error"; break; + case 189: err_str = "PSX_CMN_3 PD block consolidated error"; break; + case 191: err_str = "FPD_INT_WRAP PD block consolidated error"; break; + case 232: err_str = "CRAM Un-Correctable error"; break; + default: err_str = "VERSAL_EDAC_ERR_ID: %d"; break; + } + + snprintf(mc_priv->message, + sizeof(mc_priv->message), + "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str); + + /* Convert to bytes */ + length = result[TOTAL_ERR_LENGTH] * 4; + log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message, + sec_sev, (void *)&result[ERROR_DATA], length); + + return 0; +} + +static struct rpmsg_device_id amd_rpmsg_id_table[] = { + { .name = "error_ipc" }, + { }, +}; +MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table); + +static int rpmsg_probe(struct rpmsg_device *rpdev) +{ + struct rpmsg_channel_info chinfo; + struct mc_priv *pg; + + pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data; + chinfo.src = RPMSG_ADDR_ANY; + chinfo.dst = rpdev->dst; + strscpy(chinfo.name, amd_rpmsg_id_table[0].name, + strlen(amd_rpmsg_id_table[0].name)); + + pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo); + if (!pg->ept) + return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n", + chinfo.name); + + dev_set_drvdata(&rpdev->dev, pg); + + return 0; +} + +static void rpmsg_remove(struct rpmsg_device *rpdev) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + + rpmsg_destroy_ept(mc_priv->ept); + dev_set_drvdata(&rpdev->dev, NULL); +} + +static struct rpmsg_driver amd_rpmsg_driver = { + .drv.name = KBUILD_MODNAME, + .probe = rpmsg_probe, + .remove = rpmsg_remove, + .callback = rpmsg_cb, + .id_table = amd_rpmsg_id_table, +}; + +static void versal_edac_release(struct device *dev) +{ + kfree(dev); +} + +static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev) +{ + u32 num_chans, rank, dwidth, config; + struct edac_mc_layer layers[2]; + struct mem_ctl_info *mci; + struct device *dev; + enum dev_type dt; + char *name; + int rc, i; + + for (i = 0; i < NUM_CONTROLLERS; i++) { + config = priv->adec[CONF + i * ADEC_NUM]; + num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config); + rank = 1 << FIELD_GET(MC5_RANK_MASK, config); + dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config); + + switch (dwidth) { + case XDDR5_BUS_WIDTH_16: + dt = DEV_X16; + break; + case XDDR5_BUS_WIDTH_32: + dt = DEV_X32; + break; + case XDDR5_BUS_WIDTH_64: + dt = DEV_X64; + break; + default: + dt = DEV_UNKNOWN; + } + + if (dt == DEV_UNKNOWN) + continue; + + /* Find the first enabled device and register that one. */ + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = rank; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = num_chans; + layers[1].is_virt_csrow = false; + + rc = -ENOMEM; + mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers, + sizeof(struct mc_priv)); + if (!mci) { + edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i); + goto err_alloc; + } + + priv->mci[i] = mci; + priv->dwidth = dt; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev->release = versal_edac_release; + name = kmalloc(32, GFP_KERNEL); + sprintf(name, "versal-net-ddrmc5-edac-%d", i); + dev->init_name = name; + rc = device_register(dev); + if (rc) + goto err_alloc; + + mci->pdev = dev; + + platform_set_drvdata(pdev, priv); + + mc_init(mci, dev); + rc = edac_mc_add_mc(mci); + if (rc) { + edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i); + goto err_alloc; + } + } + return 0; + +err_alloc: + while (i--) { + mci = priv->mci[i]; + if (!mci) + continue; + + if (mci->pdev) { + device_unregister(mci->pdev); + edac_mc_del_mc(mci->pdev); + } + + edac_mc_free(mci); + } + + return rc; +} + +static void remove_versalnet(struct mc_priv *priv) +{ + struct mem_ctl_info *mci; + int i; + + for (i = 0; i < NUM_CONTROLLERS; i++) { + device_unregister(priv->mci[i]->pdev); + mci = edac_mc_del_mc(priv->mci[i]->pdev); + if (!mci) + return; + + edac_mc_free(mci); + } +} + +static int mc_probe(struct platform_device *pdev) +{ + struct device_node *r5_core_node; + struct mc_priv *priv; + struct rproc *rp; + int rc; + + r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0); + if (!r5_core_node) { + dev_err(&pdev->dev, "amd,rproc: invalid phandle\n"); + return -EINVAL; + } + + rp = rproc_get_by_phandle(r5_core_node->phandle); + if (!rp) + return -EPROBE_DEFER; + + rc = rproc_boot(rp); + if (rc) { + dev_err(&pdev->dev, "Failed to attach to remote processor\n"); + goto err_rproc_boot; + } + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + goto err_alloc; + + amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv; + + rc = register_rpmsg_driver(&amd_rpmsg_driver); + if (rc) { + edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc); + goto err_alloc; + } + + rc = setup_mcdi(priv); + if (rc) + goto err_unreg; + + priv->mcdi->r5_rproc = rp; + + rc = init_versalnet(priv, pdev); + if (rc) + goto err_init; + + return 0; + +err_init: + cdx_mcdi_finish(priv->mcdi); + +err_unreg: + unregister_rpmsg_driver(&amd_rpmsg_driver); + +err_alloc: + rproc_shutdown(rp); + +err_rproc_boot: + rproc_put(rp); + + return rc; +} + +static void mc_remove(struct platform_device *pdev) +{ + struct mc_priv *priv = platform_get_drvdata(pdev); + + unregister_rpmsg_driver(&amd_rpmsg_driver); + remove_versalnet(priv); + rproc_shutdown(priv->mcdi->r5_rproc); + cdx_mcdi_finish(priv->mcdi); +} + +static const struct of_device_id amd_edac_match[] = { + { .compatible = "xlnx,versal-net-ddrmc5", }, + {} +}; +MODULE_DEVICE_TABLE(of, amd_edac_match); + +static struct platform_driver amd_ddr_edac_mc_driver = { + .driver = { + .name = "versal-net-edac", + .of_match_table = amd_edac_match, + }, + .probe = mc_probe, + .remove = mc_remove, +}; + +module_platform_driver(amd_ddr_edac_mc_driver); + +MODULE_AUTHOR("AMD Inc"); +MODULE_DESCRIPTION("Versal NET EDAC driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/cdx/edac_cdx_pcol.h b/include/linux/cdx/edac_cdx_pcol.h new file mode 100644 index 000000000000..749db33bb482 --- /dev/null +++ b/include/linux/cdx/edac_cdx_pcol.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Driver for AMD network controllers and boards + * + * Copyright (C) 2021, Xilinx, Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef MC_CDX_PCOL_H +#define MC_CDX_PCOL_H +#include + +#define MC_CMD_EDAC_GET_DDR_CONFIG_OUT_WORD_LENGTH_LEN 4 +/* Number of registers for the DDR controller */ +#define MC_CMD_GET_DDR_CONFIG_OFST 4 +#define MC_CMD_GET_DDR_CONFIG_LEN 4 + +/***********************************/ +/* MC_CMD_EDAC_GET_DDR_CONFIG + * Provides detailed configuration for the DDR controller of the given index. + */ +#define MC_CMD_EDAC_GET_DDR_CONFIG 0x3 + +/* MC_CMD_EDAC_GET_DDR_CONFIG_IN msgrequest */ +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_OFST 0 +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_LEN 4 + +#endif /* MC_CDX_PCOL_H */ -- cgit v1.2.3 From 1b3aa3900782707ec2f4cc1651bc82c628f25d2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Sep 2025 17:45:36 -0600 Subject: io_uring/uring_cmd: correct signature for io_uring_mshot_cmd_post_cqe() The !CONFIG_IO_URING signature is wrong, fix that up. The non stub signature got updated for the io_br_sel changes that happened before this patch went in, but the stub one did not. Fixes: 620a50c92700 ("io_uring: uring_cmd: add multishot support") Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 1350af846ddd..c8185f54fde9 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -126,7 +126,7 @@ io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group, return (struct io_br_sel) { .val = -EOPNOTSUPP }; } static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, - ssize_t ret, unsigned int issue_flags) + struct io_br_sel *sel, unsigned int issue_flags) { return true; } -- cgit v1.2.3 From 0cbaf65c91db0e40a577e8919979dac1963cfcc0 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:43 -0700 Subject: tee: add close_context to TEE driver operation The tee_context can be used to manage TEE user resources, including those allocated by the driver for the TEE on behalf of the user. The release() callback is invoked only when all resources, such as tee_shm, are released and there are no references to the tee_context. When a user closes the device file, the driver should notify the TEE to release any resources it may hold and drop the context references. To achieve this, a close_context() callback is introduced to initiate resource release in the TEE driver when the device file is closed. Relocate teedev_ctx_get, teedev_ctx_put, tee_device_get, and tee_device_get functions to tee_core.h to make them accessible outside the TEE subsystem. Reviewed-by: Sumit Garg Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 7 +++++++ drivers/tee/tee_private.h | 6 ------ include/linux/tee_core.h | 50 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 68f556898836..5a7fce5b6007 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -80,6 +80,7 @@ void teedev_ctx_get(struct tee_context *ctx) kref_get(&ctx->refcount); } +EXPORT_SYMBOL_GPL(teedev_ctx_get); static void teedev_ctx_release(struct kref *ref) { @@ -97,11 +98,15 @@ void teedev_ctx_put(struct tee_context *ctx) kref_put(&ctx->refcount, teedev_ctx_release); } +EXPORT_SYMBOL_GPL(teedev_ctx_put); void teedev_close_context(struct tee_context *ctx) { struct tee_device *teedev = ctx->teedev; + if (teedev->desc->ops->close_context) + teedev->desc->ops->close_context(ctx); + teedev_ctx_put(ctx); tee_device_put(teedev); } @@ -1112,6 +1117,7 @@ void tee_device_put(struct tee_device *teedev) } mutex_unlock(&teedev->mutex); } +EXPORT_SYMBOL_GPL(tee_device_put); bool tee_device_get(struct tee_device *teedev) { @@ -1124,6 +1130,7 @@ bool tee_device_get(struct tee_device *teedev) mutex_unlock(&teedev->mutex); return true; } +EXPORT_SYMBOL_GPL(tee_device_get); /** * tee_device_unregister() - Removes a TEE device diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h index a9b5e4a6a8f7..6bde688bfcb1 100644 --- a/drivers/tee/tee_private.h +++ b/drivers/tee/tee_private.h @@ -23,12 +23,6 @@ struct tee_shm_dmabuf_ref { int tee_shm_get_fd(struct tee_shm *shm); -bool tee_device_get(struct tee_device *teedev); -void tee_device_put(struct tee_device *teedev); - -void teedev_ctx_get(struct tee_context *ctx); -void teedev_ctx_put(struct tee_context *ctx); - struct tee_shm *tee_shm_alloc_user_buf(struct tee_context *ctx, size_t size); struct tee_shm *tee_shm_register_user_buf(struct tee_context *ctx, unsigned long addr, size_t length); diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index 7b0c1da2ca6c..456a940d4710 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -76,8 +76,9 @@ struct tee_device { /** * struct tee_driver_ops - driver operations vtable * @get_version: returns version of driver - * @open: called when the device file is opened - * @release: release this open file + * @open: called for a context when the device file is opened + * @close_context: called when the device file is closed + * @release: called to release the context * @open_session: open a new session * @close_session: close a session * @system_session: declare session as a system session @@ -87,11 +88,17 @@ struct tee_device { * @supp_send: called for supplicant to send a response * @shm_register: register shared memory buffer in TEE * @shm_unregister: unregister shared memory buffer in TEE + * + * The context given to @open might last longer than the device file if it is + * tied to other resources in the TEE driver. @close_context is called when the + * client closes the device file, even if there are existing references to the + * context. The TEE driver can use @close_context to start cleaning up. */ struct tee_driver_ops { void (*get_version)(struct tee_device *teedev, struct tee_ioctl_version_data *vers); int (*open)(struct tee_context *ctx); + void (*close_context)(struct tee_context *ctx); void (*release)(struct tee_context *ctx); int (*open_session)(struct tee_context *ctx, struct tee_ioctl_open_session_arg *arg, @@ -200,6 +207,24 @@ int tee_device_register_dma_heap(struct tee_device *teedev, struct tee_protmem_pool *pool); void tee_device_put_all_dma_heaps(struct tee_device *teedev); +/** + * tee_device_get() - Increment the user count for a tee_device + * @teedev: Pointer to the tee_device + * + * If tee_device_unregister() has been called and the final user of @teedev + * has already released the device, this function will fail to prevent new users + * from accessing the device during the unregistration process. + * + * Returns: true if @teedev remains valid, otherwise false + */ +bool tee_device_get(struct tee_device *teedev); + +/** + * tee_device_put() - Decrease the user count for a tee_device + * @teedev: pointer to the tee_device + */ +void tee_device_put(struct tee_device *teedev); + /** * tee_device_set_dev_groups() - Set device attribute groups * @teedev: Device to register @@ -374,4 +399,25 @@ struct tee_context *teedev_open(struct tee_device *teedev); */ void teedev_close_context(struct tee_context *ctx); +/** + * teedev_ctx_get() - Increment the reference count of a context + * @ctx: Pointer to the context + * + * This function increases the refcount of the context, which is tied to + * resources shared by the same tee_device. During the unregistration process, + * the context may remain valid even after tee_device_unregister() has returned. + * + * Users should ensure that the context's refcount is properly decreased before + * calling tee_device_put(), typically within the context's release() function. + * Alternatively, users can call tee_device_get() and teedev_ctx_get() together + * and release them simultaneously (see shm_alloc_helper()). + */ +void teedev_ctx_get(struct tee_context *ctx); + +/** + * teedev_ctx_put() - Decrease reference count on a context + * @ctx: pointer to the context + */ +void teedev_ctx_put(struct tee_context *ctx); + #endif /*__TEE_CORE_H*/ -- cgit v1.2.3 From 54a53e95a908a4cc770f0530c49f04c89e7b18dc Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:44 -0700 Subject: tee: add TEE_IOCTL_PARAM_ATTR_TYPE_UBUF For drivers that can transfer data to the TEE without using shared memory from client, it is necessary to receive the user address directly, bypassing any processing by the TEE subsystem. Introduce TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT/OUTPUT/INOUT to represent userspace buffers. Reviewed-by: Sumit Garg Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 33 +++++++++++++++++++++++++++++++++ include/linux/tee_drv.h | 6 ++++++ include/uapi/linux/tee.h | 22 ++++++++++++++++------ 3 files changed, 55 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 5a7fce5b6007..529738565ebd 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -494,6 +494,17 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, params[n].u.value.b = ip.b; params[n].u.value.c = ip.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + params[n].u.ubuf.uaddr = u64_to_user_ptr(ip.a); + params[n].u.ubuf.size = ip.b; + + if (!access_ok(params[n].u.ubuf.uaddr, + params[n].u.ubuf.size)) + return -EFAULT; + + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -527,6 +538,11 @@ static int params_to_user(struct tee_ioctl_param __user *uparams, put_user(p->u.value.c, &up->c)) return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + if (put_user((u64)p->u.ubuf.size, &up->b)) + return -EFAULT; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: if (put_user((u64)p->u.memref.size, &up->b)) @@ -727,6 +743,13 @@ static int params_to_supp(struct tee_context *ctx, ip.b = p->u.value.b; ip.c = p->u.value.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + ip.a = (__force unsigned long)p->u.ubuf.uaddr; + ip.b = p->u.ubuf.size; + ip.c = 0; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -829,6 +852,16 @@ static int params_from_supp(struct tee_param *params, size_t num_params, p->u.value.b = ip.b; p->u.value.c = ip.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + p->u.ubuf.uaddr = u64_to_user_ptr(ip.a); + p->u.ubuf.size = ip.b; + + if (!access_ok(params[n].u.ubuf.uaddr, + params[n].u.ubuf.size)) + return -EFAULT; + + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: /* diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 824f1251de60..7915e8869cbd 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -82,6 +82,11 @@ struct tee_param_memref { struct tee_shm *shm; }; +struct tee_param_ubuf { + void __user *uaddr; + size_t size; +}; + struct tee_param_value { u64 a; u64 b; @@ -92,6 +97,7 @@ struct tee_param { u64 attr; union { struct tee_param_memref memref; + struct tee_param_ubuf ubuf; struct tee_param_value value; } u; }; diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index d843cf980d98..0e3b735dcfca 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -151,6 +151,13 @@ struct tee_ioctl_buf_data { #define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT 6 #define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT 7 /* input and output */ +/* + * These defines userspace buffer parameters. + */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT 8 +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT 9 +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT 10 /* input and output */ + /* * Mask for the type part of the attribute, leaves room for more types */ @@ -186,14 +193,17 @@ struct tee_ioctl_buf_data { /** * struct tee_ioctl_param - parameter * @attr: attributes - * @a: if a memref, offset into the shared memory object, else a value parameter - * @b: if a memref, size of the buffer, else a value parameter + * @a: if a memref, offset into the shared memory object, + * else if a ubuf, address of the user buffer, + * else a value parameter + * @b: if a memref or ubuf, size of the buffer, else a value parameter * @c: if a memref, shared memory identifier, else a value parameter * - * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref or value is used in - * the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value and - * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref. TEE_PARAM_ATTR_TYPE_NONE - * indicates that none of the members are used. + * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref, ubuf, or value is + * used in the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value, + * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, and TEE_PARAM_ATTR_TYPE_UBUF_* + * indicates ubuf. TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members + * are used. * * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an * identifier representing the shared memory object. A memref can reference -- cgit v1.2.3 From d5b8b0fa1775d8b59c3fc9e4aa2baa715d08f3ee Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:45 -0700 Subject: tee: add TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF The TEE subsystem allows session-based access to trusted services, requiring a session to be established to receive a service. This is not suitable for an environment that represents services as objects. An object supports various operations that a client can invoke, potentially generating a result or a new object that can be invoked independently of the original object. Add TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT/OUTPUT/INOUT to represent an object. Objects may reside in either TEE or userspace. To invoke an object in TEE, introduce a new ioctl. Use the existing SUPPL_RECV and SUPPL_SEND to invoke an object in userspace. Reviewed-by: Sumit Garg Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/tee_core.h | 4 +++ include/linux/tee_drv.h | 6 ++++ include/uapi/linux/tee.h | 41 +++++++++++++++++++---- 4 files changed, 130 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 529738565ebd..71b1bd69f067 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -487,6 +487,7 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, switch (ip.attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) { case TEE_IOCTL_PARAM_ATTR_TYPE_NONE: case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: break; case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT: @@ -505,6 +506,11 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + params[n].u.objref.id = ip.a; + params[n].u.objref.flags = ip.b; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -543,6 +549,12 @@ static int params_to_user(struct tee_ioctl_param __user *uparams, if (put_user((u64)p->u.ubuf.size, &up->b)) return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + if (put_user(p->u.objref.id, &up->a) || + put_user(p->u.objref.flags, &up->b)) + return -EFAULT; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: if (put_user((u64)p->u.memref.size, &up->b)) @@ -695,6 +707,66 @@ out: return rc; } +static int tee_ioctl_object_invoke(struct tee_context *ctx, + struct tee_ioctl_buf_data __user *ubuf) +{ + int rc; + size_t n; + struct tee_ioctl_buf_data buf; + struct tee_ioctl_object_invoke_arg __user *uarg; + struct tee_ioctl_object_invoke_arg arg; + struct tee_ioctl_param __user *uparams = NULL; + struct tee_param *params = NULL; + + if (!ctx->teedev->desc->ops->object_invoke_func) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + + if (buf.buf_len > TEE_MAX_ARG_SIZE || + buf.buf_len < sizeof(struct tee_ioctl_object_invoke_arg)) + return -EINVAL; + + uarg = u64_to_user_ptr(buf.buf_ptr); + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (sizeof(arg) + TEE_IOCTL_PARAM_SIZE(arg.num_params) != buf.buf_len) + return -EINVAL; + + if (arg.num_params) { + params = kcalloc(arg.num_params, sizeof(struct tee_param), + GFP_KERNEL); + if (!params) + return -ENOMEM; + uparams = uarg->params; + rc = params_from_user(ctx, params, arg.num_params, uparams); + if (rc) + goto out; + } + + rc = ctx->teedev->desc->ops->object_invoke_func(ctx, &arg, params); + if (rc) + goto out; + + if (put_user(arg.ret, &uarg->ret)) { + rc = -EFAULT; + goto out; + } + rc = params_to_user(uparams, arg.num_params, params); +out: + if (params) { + /* Decrease ref count for all valid shared memory pointers */ + for (n = 0; n < arg.num_params; n++) + if (tee_param_is_memref(params + n) && + params[n].u.memref.shm) + tee_shm_put(params[n].u.memref.shm); + kfree(params); + } + return rc; +} + static int tee_ioctl_cancel(struct tee_context *ctx, struct tee_ioctl_cancel_arg __user *uarg) { @@ -750,6 +822,12 @@ static int params_to_supp(struct tee_context *ctx, ip.b = p->u.ubuf.size; ip.c = 0; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + ip.a = p->u.objref.id; + ip.b = p->u.objref.flags; + ip.c = 0; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -862,6 +940,11 @@ static int params_from_supp(struct tee_param *params, size_t num_params, return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + p->u.objref.id = ip.a; + p->u.objref.flags = ip.b; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: /* @@ -944,6 +1027,8 @@ static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return tee_ioctl_open_session(ctx, uarg); case TEE_IOC_INVOKE: return tee_ioctl_invoke(ctx, uarg); + case TEE_IOC_OBJECT_INVOKE: + return tee_ioctl_object_invoke(ctx, uarg); case TEE_IOC_CANCEL: return tee_ioctl_cancel(ctx, uarg); case TEE_IOC_CLOSE_SESSION: diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index 456a940d4710..1f3e5dad6d0d 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -83,6 +83,7 @@ struct tee_device { * @close_session: close a session * @system_session: declare session as a system session * @invoke_func: invoke a trusted function + * @object_invoke_func: invoke a TEE object * @cancel_req: request cancel of an ongoing invoke or open * @supp_recv: called for supplicant to get a command * @supp_send: called for supplicant to send a response @@ -108,6 +109,9 @@ struct tee_driver_ops { int (*invoke_func)(struct tee_context *ctx, struct tee_ioctl_invoke_arg *arg, struct tee_param *param); + int (*object_invoke_func)(struct tee_context *ctx, + struct tee_ioctl_object_invoke_arg *arg, + struct tee_param *param); int (*cancel_req)(struct tee_context *ctx, u32 cancel_id, u32 session); int (*supp_recv)(struct tee_context *ctx, u32 *func, u32 *num_params, struct tee_param *param); diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 7915e8869cbd..88a6f9697c89 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -87,6 +87,11 @@ struct tee_param_ubuf { size_t size; }; +struct tee_param_objref { + u64 id; + u64 flags; +}; + struct tee_param_value { u64 a; u64 b; @@ -97,6 +102,7 @@ struct tee_param { u64 attr; union { struct tee_param_memref memref; + struct tee_param_objref objref; struct tee_param_ubuf ubuf; struct tee_param_value value; } u; diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 0e3b735dcfca..9abb0f299549 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -48,8 +48,10 @@ #define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ #define TEE_GEN_CAP_REG_MEM (1 << 2)/* Supports registering shared memory */ #define TEE_GEN_CAP_MEMREF_NULL (1 << 3)/* NULL MemRef support */ +#define TEE_GEN_CAP_OBJREF (1 << 4)/* Supports generic object reference */ -#define TEE_MEMREF_NULL (__u64)(-1) /* NULL MemRef Buffer */ +#define TEE_MEMREF_NULL ((__u64)(-1)) /* NULL MemRef Buffer */ +#define TEE_OBJREF_NULL ((__u64)(-1)) /* NULL ObjRef Object */ /* * TEE Implementation ID @@ -158,6 +160,13 @@ struct tee_ioctl_buf_data { #define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT 9 #define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT 10 /* input and output */ +/* + * These defines object reference parameters. + */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT 11 +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT 12 +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT 13 + /* * Mask for the type part of the attribute, leaves room for more types */ @@ -195,15 +204,16 @@ struct tee_ioctl_buf_data { * @attr: attributes * @a: if a memref, offset into the shared memory object, * else if a ubuf, address of the user buffer, - * else a value parameter - * @b: if a memref or ubuf, size of the buffer, else a value parameter + * else if an objref, object identifier, else a value parameter + * @b: if a memref or ubuf, size of the buffer, + * else if objref, flags for the object, else a value parameter * @c: if a memref, shared memory identifier, else a value parameter * * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref, ubuf, or value is * used in the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value, - * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, and TEE_PARAM_ATTR_TYPE_UBUF_* - * indicates ubuf. TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members - * are used. + * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, TEE_PARAM_ATTR_TYPE_UBUF_* + * indicates ubuf, and TEE_PARAM_ATTR_TYPE_OBJREF_* indicates objref. + * TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members are used. * * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an * identifier representing the shared memory object. A memref can reference @@ -442,4 +452,23 @@ struct tee_ioctl_shm_register_fd_data { * munmap(): unmaps previously shared memory */ +/** + * struct tee_ioctl_invoke_func_arg - Invokes an object in a Trusted Application + * @id: [in] Object id + * @op: [in] Object operation, specific to the object + * @ret: [out] return value + * @num_params: [in] number of parameters following this struct + */ +struct tee_ioctl_object_invoke_arg { + __u64 id; + __u32 op; + __u32 ret; + __u32 num_params; + /* num_params tells the actual number of element in params */ + struct tee_ioctl_param params[]; +}; + +#define TEE_IOC_OBJECT_INVOKE _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 10, \ + struct tee_ioctl_buf_data) + #endif /*__TEE_H*/ -- cgit v1.2.3 From bd5139306886a9626a7d794940376806eccb9547 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:46 -0700 Subject: tee: increase TEE_MAX_ARG_SIZE to 4096 Increase TEE_MAX_ARG_SIZE to accommodate worst-case scenarios where additional buffer space is required to pass all arguments to TEE. This change is necessary for upcoming support for Qualcomm TEE, which requires a larger buffer for argument marshaling. Reviewed-by: Sumit Garg Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- include/uapi/linux/tee.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 9abb0f299549..a5466b503bfe 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -42,7 +42,7 @@ #define TEE_IOC_MAGIC 0xa4 #define TEE_IOC_BASE 0 -#define TEE_MAX_ARG_SIZE 1024 +#define TEE_MAX_ARG_SIZE 4096 #define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */ #define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ -- cgit v1.2.3 From d6e290837e50f73f88f31f19bd8a7213d92e6e46 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:47 -0700 Subject: tee: add Qualcomm TEE driver Introduce qcomtee_object, which represents an object in both QTEE and the kernel. QTEE clients can invoke an instance of qcomtee_object to access QTEE services. If this invocation produces a new object in QTEE, an instance of qcomtee_object will be returned. Similarly, QTEE can request services from by issuing a callback request, which invokes an instance of qcomtee_object. Implement initial support for exporting qcomtee_object to userspace and QTEE, enabling the invocation of objects hosted in QTEE and userspace through the TEE subsystem. Tested-by: Neil Armstrong Tested-by: Harshal Dev Acked-by: Sumit Garg Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- MAINTAINERS | 6 + drivers/tee/Kconfig | 1 + drivers/tee/Makefile | 1 + drivers/tee/qcomtee/Kconfig | 12 + drivers/tee/qcomtee/Makefile | 7 + drivers/tee/qcomtee/async.c | 182 +++++++ drivers/tee/qcomtee/call.c | 813 +++++++++++++++++++++++++++++++ drivers/tee/qcomtee/core.c | 906 +++++++++++++++++++++++++++++++++++ drivers/tee/qcomtee/qcomtee.h | 143 ++++++ drivers/tee/qcomtee/qcomtee_msg.h | 304 ++++++++++++ drivers/tee/qcomtee/qcomtee_object.h | 316 ++++++++++++ drivers/tee/qcomtee/shm.c | 153 ++++++ drivers/tee/qcomtee/user_obj.c | 692 ++++++++++++++++++++++++++ include/uapi/linux/tee.h | 1 + 14 files changed, 3537 insertions(+) create mode 100644 drivers/tee/qcomtee/Kconfig create mode 100644 drivers/tee/qcomtee/Makefile create mode 100644 drivers/tee/qcomtee/async.c create mode 100644 drivers/tee/qcomtee/call.c create mode 100644 drivers/tee/qcomtee/core.c create mode 100644 drivers/tee/qcomtee/qcomtee.h create mode 100644 drivers/tee/qcomtee/qcomtee_msg.h create mode 100644 drivers/tee/qcomtee/qcomtee_object.h create mode 100644 drivers/tee/qcomtee/shm.c create mode 100644 drivers/tee/qcomtee/user_obj.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index daf520a13bdf..bde449308736 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20856,6 +20856,12 @@ F: Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst F: drivers/net/ethernet/qualcomm/rmnet/ F: include/linux/if_rmnet.h +QUALCOMM TEE (QCOMTEE) DRIVER +M: Amirreza Zarrabi +L: linux-arm-msm@vger.kernel.org +S: Maintained +F: drivers/tee/qcomtee/ + QUALCOMM TRUST ZONE MEMORY ALLOCATOR M: Bartosz Golaszewski L: linux-arm-msm@vger.kernel.org diff --git a/drivers/tee/Kconfig b/drivers/tee/Kconfig index 90600607a9d8..35d207ef6910 100644 --- a/drivers/tee/Kconfig +++ b/drivers/tee/Kconfig @@ -21,5 +21,6 @@ config TEE_DMABUF_HEAPS source "drivers/tee/optee/Kconfig" source "drivers/tee/amdtee/Kconfig" source "drivers/tee/tstee/Kconfig" +source "drivers/tee/qcomtee/Kconfig" endif diff --git a/drivers/tee/Makefile b/drivers/tee/Makefile index 949a6a79fb06..3239b91dee96 100644 --- a/drivers/tee/Makefile +++ b/drivers/tee/Makefile @@ -7,3 +7,4 @@ tee-objs += tee_shm_pool.o obj-$(CONFIG_OPTEE) += optee/ obj-$(CONFIG_AMDTEE) += amdtee/ obj-$(CONFIG_ARM_TSTEE) += tstee/ +obj-$(CONFIG_QCOMTEE) += qcomtee/ diff --git a/drivers/tee/qcomtee/Kconfig b/drivers/tee/qcomtee/Kconfig new file mode 100644 index 000000000000..927686abceb1 --- /dev/null +++ b/drivers/tee/qcomtee/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Qualcomm Trusted Execution Environment Configuration +config QCOMTEE + tristate "Qualcomm TEE Support" + depends on !CPU_BIG_ENDIAN + select QCOM_SCM + select QCOM_TZMEM_MODE_SHMBRIDGE + help + This option enables the Qualcomm Trusted Execution Environment (QTEE) + driver. It provides an API to access services offered by QTEE and + its loaded Trusted Applications (TAs). Additionally, it facilitates + the export of userspace services provided by supplicants to QTEE. diff --git a/drivers/tee/qcomtee/Makefile b/drivers/tee/qcomtee/Makefile new file mode 100644 index 000000000000..600af2b8f1c1 --- /dev/null +++ b/drivers/tee/qcomtee/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_QCOMTEE) += qcomtee.o +qcomtee-objs += async.o +qcomtee-objs += call.o +qcomtee-objs += core.o +qcomtee-objs += shm.o +qcomtee-objs += user_obj.o diff --git a/drivers/tee/qcomtee/async.c b/drivers/tee/qcomtee/async.c new file mode 100644 index 000000000000..31bff4309e67 --- /dev/null +++ b/drivers/tee/qcomtee/async.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "qcomtee.h" + +#define QCOMTEE_ASYNC_VERSION_1_0 0x00010000U /* Maj: 0x0001, Min: 0x0000. */ +#define QCOMTEE_ASYNC_VERSION_1_1 0x00010001U /* Maj: 0x0001, Min: 0x0001. */ +#define QCOMTEE_ASYNC_VERSION_1_2 0x00010002U /* Maj: 0x0001, Min: 0x0002. */ +#define QCOMTEE_ASYNC_VERSION_CURRENT QCOMTEE_ASYNC_VERSION_1_2 + +#define QCOMTEE_ASYNC_VERSION_MAJOR(n) upper_16_bits(n) +#define QCOMTEE_ASYNC_VERSION_MINOR(n) lower_16_bits(n) + +#define QCOMTEE_ASYNC_VERSION_CURRENT_MAJOR \ + QCOMTEE_ASYNC_VERSION_MAJOR(QCOMTEE_ASYNC_VERSION_CURRENT) +#define QCOMTEE_ASYNC_VERSION_CURRENT_MINOR \ + QCOMTEE_ASYNC_VERSION_MINOR(QCOMTEE_ASYNC_VERSION_CURRENT) + +/** + * struct qcomtee_async_msg_hdr - Asynchronous message header format. + * @version: current async protocol version of the remote endpoint. + * @op: async operation. + * + * @version specifies the endpoint's (QTEE or driver) supported async protocol. + * For example, if QTEE sets @version to %QCOMTEE_ASYNC_VERSION_1_1, QTEE + * handles operations supported in %QCOMTEE_ASYNC_VERSION_1_1 or + * %QCOMTEE_ASYNC_VERSION_1_0. @op determines the message format. + */ +struct qcomtee_async_msg_hdr { + u32 version; + u32 op; +}; + +/* Size of an empty async message. */ +#define QCOMTEE_ASYNC_MSG_ZERO sizeof(struct qcomtee_async_msg_hdr) + +/** + * struct qcomtee_async_release_msg - Release asynchronous message. + * @hdr: message header as &struct qcomtee_async_msg_hdr. + * @counts: number of objects in @object_ids. + * @object_ids: array of object IDs that should be released. + * + * Available in Maj = 0x0001, Min >= 0x0000. + */ +struct qcomtee_async_release_msg { + struct qcomtee_async_msg_hdr hdr; + u32 counts; + u32 object_ids[] __counted_by(counts); +}; + +/** + * qcomtee_get_async_buffer() - Get the start of the asynchronous message. + * @oic: context used for the current invocation. + * @async_buffer: return buffer to extract from or fill in async messages. + * + * If @oic is used for direct object invocation, the whole outbound buffer + * is available for the async message. If @oic is used for a callback request, + * the tail of the outbound buffer (after the callback request message) is + * available for the async message. + * + * The start of the async buffer is aligned, see qcomtee_msg_offset_align(). + */ +static void qcomtee_get_async_buffer(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_buffer *async_buffer) +{ + struct qcomtee_msg_callback *msg; + unsigned int offset; + int i; + + if (!(oic->flags & QCOMTEE_OIC_FLAG_BUSY)) { + /* The outbound buffer is empty. Using the whole buffer. */ + offset = 0; + } else { + msg = (struct qcomtee_msg_callback *)oic->out_msg.addr; + + /* Start offset in a message for buffer arguments. */ + offset = qcomtee_msg_buffer_args(struct qcomtee_msg_callback, + qcomtee_msg_args(msg)); + + /* Add size of IB arguments. */ + qcomtee_msg_for_each_input_buffer(i, msg) + offset += qcomtee_msg_offset_align(msg->args[i].b.size); + + /* Add size of OB arguments. */ + qcomtee_msg_for_each_output_buffer(i, msg) + offset += qcomtee_msg_offset_align(msg->args[i].b.size); + } + + async_buffer->addr = oic->out_msg.addr + offset; + async_buffer->size = oic->out_msg.size - offset; +} + +/** + * async_release() - Process QTEE async release requests. + * @oic: context used for the current invocation. + * @msg: async message for object release. + * @size: size of the async buffer available. + * + * Return: Size of the outbound buffer used when processing @msg. + */ +static size_t async_release(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_async_msg_hdr *async_msg, + size_t size) +{ + struct qcomtee_async_release_msg *msg; + struct qcomtee_object *object; + int i; + + msg = (struct qcomtee_async_release_msg *)async_msg; + + for (i = 0; i < msg->counts; i++) { + object = qcomtee_idx_erase(oic, msg->object_ids[i]); + qcomtee_object_put(object); + } + + return struct_size(msg, object_ids, msg->counts); +} + +/** + * qcomtee_fetch_async_reqs() - Fetch and process asynchronous messages. + * @oic: context used for the current invocation. + * + * Calls handlers to process the requested operations in the async message. + * Currently, only supports async release requests. + */ +void qcomtee_fetch_async_reqs(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_async_msg_hdr *async_msg; + struct qcomtee_buffer async_buffer; + size_t consumed, used = 0; + u16 major_ver; + + qcomtee_get_async_buffer(oic, &async_buffer); + + while (async_buffer.size - used > QCOMTEE_ASYNC_MSG_ZERO) { + async_msg = (struct qcomtee_async_msg_hdr *)(async_buffer.addr + + used); + /* + * QTEE assumes that the unused space of the async buffer is + * zeroed; so if version is zero, the buffer is unused. + */ + if (async_msg->version == 0) + goto out; + + major_ver = QCOMTEE_ASYNC_VERSION_MAJOR(async_msg->version); + /* Major version mismatch is a compatibility break. */ + if (major_ver != QCOMTEE_ASYNC_VERSION_CURRENT_MAJOR) { + pr_err("Async message version mismatch (%u != %u)\n", + major_ver, QCOMTEE_ASYNC_VERSION_CURRENT_MAJOR); + + goto out; + } + + switch (async_msg->op) { + case QCOMTEE_MSG_OBJECT_OP_RELEASE: + consumed = async_release(oic, async_msg, + async_buffer.size - used); + break; + default: + pr_err("Unsupported async message %u\n", async_msg->op); + goto out; + } + + /* Supported operation but unable to parse the message. */ + if (!consumed) { + pr_err("Unable to parse async message for op %u\n", + async_msg->op); + goto out; + } + + /* Next async message. */ + used += qcomtee_msg_offset_align(consumed); + } + +out: + /* Reset the async buffer so async requests do not loop to QTEE. */ + memzero_explicit(async_buffer.addr, async_buffer.size); +} diff --git a/drivers/tee/qcomtee/call.c b/drivers/tee/qcomtee/call.c new file mode 100644 index 000000000000..33daa4d7033d --- /dev/null +++ b/drivers/tee/qcomtee/call.c @@ -0,0 +1,813 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +#include "qcomtee.h" + +static int find_qtee_object(struct qcomtee_object **object, unsigned long id, + struct qcomtee_context_data *ctxdata) +{ + int err = 0; + + guard(rcu)(); + /* Object release is RCU protected. */ + *object = idr_find(&ctxdata->qtee_objects_idr, id); + if (!qcomtee_object_get(*object)) + err = -EINVAL; + + return err; +} + +static void del_qtee_object(unsigned long id, + struct qcomtee_context_data *ctxdata) +{ + struct qcomtee_object *object; + + scoped_guard(mutex, &ctxdata->qtee_lock) + object = idr_remove(&ctxdata->qtee_objects_idr, id); + + qcomtee_object_put(object); +} + +/** + * qcomtee_context_add_qtee_object() - Add a QTEE object to the context. + * @param: TEE parameter representing @object. + * @object: QTEE object. + * @ctx: context to add the object. + * + * It assumes @object is %QCOMTEE_OBJECT_TYPE_TEE and the caller has already + * issued qcomtee_object_get() for @object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_context_add_qtee_object(struct tee_param *param, + struct qcomtee_object *object, + struct tee_context *ctx) +{ + int ret; + struct qcomtee_context_data *ctxdata = ctx->data; + + scoped_guard(mutex, &ctxdata->qtee_lock) + ret = idr_alloc(&ctxdata->qtee_objects_idr, object, 0, 0, + GFP_KERNEL); + if (ret < 0) + return ret; + + param->u.objref.id = ret; + /* QTEE Object: QCOMTEE_OBJREF_FLAG_TEE set. */ + param->u.objref.flags = QCOMTEE_OBJREF_FLAG_TEE; + + return 0; +} + +/* Retrieve the QTEE object added with qcomtee_context_add_qtee_object(). */ +int qcomtee_context_find_qtee_object(struct qcomtee_object **object, + struct tee_param *param, + struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + + return find_qtee_object(object, param->u.objref.id, ctxdata); +} + +/** + * qcomtee_context_del_qtee_object() - Delete a QTEE object from the context. + * @param: TEE parameter representing @object. + * @ctx: context for deleting the object. + * + * The @param has been initialized by qcomtee_context_add_qtee_object(). + */ +void qcomtee_context_del_qtee_object(struct tee_param *param, + struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + /* 'qtee_objects_idr' stores QTEE objects only. */ + if (param->u.objref.flags & QCOMTEE_OBJREF_FLAG_TEE) + del_qtee_object(param->u.objref.id, ctxdata); +} + +/** + * qcomtee_objref_to_arg() - Convert OBJREF parameter to QTEE argument. + * @arg: QTEE argument. + * @param: TEE parameter. + * @ctx: context in which the conversion should happen. + * + * It assumes @param is an OBJREF. + * It does not set @arg.type; the caller should initialize it to a correct + * &enum qcomtee_arg_type value. It gets the object's refcount in @arg; + * the caller should manage to put it afterward. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_objref_to_arg(struct qcomtee_arg *arg, struct tee_param *param, + struct tee_context *ctx) +{ + int err = -EINVAL; + + arg->o = NULL_QCOMTEE_OBJECT; + /* param is a NULL object: */ + if (param->u.objref.id == TEE_OBJREF_NULL) + return 0; + + /* param is a callback object: */ + if (param->u.objref.flags & QCOMTEE_OBJREF_FLAG_USER) + err = qcomtee_user_param_to_object(&arg->o, param, ctx); + /* param is a QTEE object: */ + else if (param->u.objref.flags & QCOMTEE_OBJREF_FLAG_TEE) + err = qcomtee_context_find_qtee_object(&arg->o, param, ctx); + + /* + * For callback objects, call qcomtee_object_get() to keep a temporary + * copy for the driver, as these objects are released asynchronously + * and may disappear even before returning from QTEE. + * + * - For direct object invocations, the matching put is called in + * qcomtee_object_invoke() when parsing the QTEE response. + * - For callback responses, put is called in qcomtee_user_object_notify() + * after QTEE has received its copies. + */ + + if (!err && (typeof_qcomtee_object(arg->o) == QCOMTEE_OBJECT_TYPE_CB)) + qcomtee_object_get(arg->o); + + return err; +} + +/** + * qcomtee_objref_from_arg() - Convert QTEE argument to OBJREF param. + * @param: TEE parameter. + * @arg: QTEE argument. + * @ctx: context in which the conversion should happen. + * + * It assumes @arg is of %QCOMTEE_ARG_TYPE_IO or %QCOMTEE_ARG_TYPE_OO. + * It does not set @param.attr; the caller should initialize it to a + * correct type. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_objref_from_arg(struct tee_param *param, struct qcomtee_arg *arg, + struct tee_context *ctx) +{ + struct qcomtee_object *object = arg->o; + + switch (typeof_qcomtee_object(object)) { + case QCOMTEE_OBJECT_TYPE_NULL: + param->u.objref.id = TEE_OBJREF_NULL; + + return 0; + case QCOMTEE_OBJECT_TYPE_CB: + /* object is a callback object: */ + if (is_qcomtee_user_object(object)) + return qcomtee_user_param_from_object(param, object, + ctx); + + break; + case QCOMTEE_OBJECT_TYPE_TEE: + return qcomtee_context_add_qtee_object(param, object, ctx); + + case QCOMTEE_OBJECT_TYPE_ROOT: + default: + break; + } + + return -EINVAL; +} + +/** + * qcomtee_params_to_args() - Convert TEE parameters to QTEE arguments. + * @u: QTEE arguments. + * @params: TEE parameters. + * @num_params: number of elements in the parameter array. + * @ctx: context in which the conversion should happen. + * + * It assumes @u has at least @num_params + 1 entries and has been initialized + * with %QCOMTEE_ARG_TYPE_INV as &struct qcomtee_arg.type. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_params_to_args(struct qcomtee_arg *u, + struct tee_param *params, int num_params, + struct tee_context *ctx) +{ + int i; + + for (i = 0; i < num_params; i++) { + switch (params[i].attr) { + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + u[i].flags = QCOMTEE_ARG_FLAGS_UADDR; + u[i].b.uaddr = params[i].u.ubuf.uaddr; + u[i].b.size = params[i].u.ubuf.size; + + if (params[i].attr == + TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT) + u[i].type = QCOMTEE_ARG_TYPE_IB; + else /* TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT */ + u[i].type = QCOMTEE_ARG_TYPE_OB; + + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + u[i].type = QCOMTEE_ARG_TYPE_IO; + if (qcomtee_objref_to_arg(&u[i], ¶ms[i], ctx)) + goto out_failed; + + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + u[i].type = QCOMTEE_ARG_TYPE_OO; + u[i].o = NULL_QCOMTEE_OBJECT; + break; + default: + goto out_failed; + } + } + + return 0; + +out_failed: + /* Undo qcomtee_objref_to_arg(). */ + for (i--; i >= 0; i--) { + if (u[i].type != QCOMTEE_ARG_TYPE_IO) + continue; + + qcomtee_user_object_set_notify(u[i].o, false); + /* See docs for qcomtee_objref_to_arg() for double put. */ + if (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB) + qcomtee_object_put(u[i].o); + + qcomtee_object_put(u[i].o); + } + + return -EINVAL; +} + +/** + * qcomtee_params_from_args() - Convert QTEE arguments to TEE parameters. + * @params: TEE parameters. + * @u: QTEE arguments. + * @num_params: number of elements in the parameter array. + * @ctx: context in which the conversion should happen. + * + * @u should have already been initialized by qcomtee_params_to_args(). + * This also represents the end of a QTEE invocation that started with + * qcomtee_params_to_args() by releasing %QCOMTEE_ARG_TYPE_IO objects. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_params_from_args(struct tee_param *params, + struct qcomtee_arg *u, int num_params, + struct tee_context *ctx) +{ + int i, np; + + qcomtee_arg_for_each(np, u) { + switch (u[np].type) { + case QCOMTEE_ARG_TYPE_OB: + /* TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT */ + params[np].u.ubuf.size = u[np].b.size; + + break; + case QCOMTEE_ARG_TYPE_IO: + /* IEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT */ + qcomtee_object_put(u[np].o); + + break; + case QCOMTEE_ARG_TYPE_OO: + /* TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT */ + if (qcomtee_objref_from_arg(¶ms[np], &u[np], ctx)) + goto out_failed; + + break; + case QCOMTEE_ARG_TYPE_IB: + default: + break; + } + } + + return 0; + +out_failed: + /* Undo qcomtee_objref_from_arg(). */ + for (i = 0; i < np; i++) { + if (params[i].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT) + qcomtee_context_del_qtee_object(¶ms[i], ctx); + } + + /* Release any IO and OO objects not processed. */ + for (; u[i].type && i < num_params; i++) { + if (u[i].type == QCOMTEE_ARG_TYPE_OO || + u[i].type == QCOMTEE_ARG_TYPE_IO) + qcomtee_object_put(u[i].o); + } + + return -EINVAL; +} + +/* TEE Device Ops. */ + +static int qcomtee_params_check(struct tee_param *params, int num_params) +{ + int io = 0, oo = 0, ib = 0, ob = 0; + int i; + + /* QTEE can accept 64 arguments. */ + if (num_params > QCOMTEE_ARGS_MAX) + return -EINVAL; + + /* Supported parameter types. */ + for (i = 0; i < num_params; i++) { + switch (params[i].attr) { + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + ib++; + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + ob++; + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + io++; + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + oo++; + break; + default: + return -EINVAL; + } + } + + /* QTEE can accept 16 arguments of each supported types. */ + if (io > QCOMTEE_ARGS_PER_TYPE || oo > QCOMTEE_ARGS_PER_TYPE || + ib > QCOMTEE_ARGS_PER_TYPE || ob > QCOMTEE_ARGS_PER_TYPE) + return -EINVAL; + + return 0; +} + +/* Check if an operation on ROOT_QCOMTEE_OBJECT from userspace is permitted. */ +static int qcomtee_root_object_check(u32 op, struct tee_param *params, + int num_params) +{ + /* Some privileged operations recognized by QTEE. */ + if (op == QCOMTEE_ROOT_OP_NOTIFY_DOMAIN_CHANGE || + op == QCOMTEE_ROOT_OP_ADCI_ACCEPT || + op == QCOMTEE_ROOT_OP_ADCI_SHUTDOWN) + return -EINVAL; + + /* + * QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS is to register with QTEE + * by passing a credential object as input OBJREF. TEE_OBJREF_NULL as a + * credential object represents a privileged client for QTEE and + * is used by the kernel only. + */ + if (op == QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS && num_params == 2) { + if (params[0].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT && + params[1].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT) { + if (params[0].u.objref.id == TEE_OBJREF_NULL) + return -EINVAL; + } + } + + return 0; +} + +/** + * qcomtee_object_invoke() - Invoke a QTEE object. + * @ctx: TEE context. + * @arg: ioctl arguments. + * @params: parameters for the object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_object_invoke(struct tee_context *ctx, + struct tee_ioctl_object_invoke_arg *arg, + struct tee_param *params) +{ + struct qcomtee_object_invoke_ctx *oic __free(kfree) = NULL; + struct qcomtee_context_data *ctxdata = ctx->data; + struct qcomtee_arg *u __free(kfree) = NULL; + struct qcomtee_object *object; + int i, ret, result; + + if (qcomtee_params_check(params, arg->num_params)) + return -EINVAL; + + /* First, handle reserved operations: */ + if (arg->op == QCOMTEE_MSG_OBJECT_OP_RELEASE) { + del_qtee_object(arg->id, ctxdata); + + return 0; + } + + /* Otherwise, invoke a QTEE object: */ + oic = qcomtee_object_invoke_ctx_alloc(ctx); + if (!oic) + return -ENOMEM; + + /* +1 for ending QCOMTEE_ARG_TYPE_INV. */ + u = kcalloc(arg->num_params + 1, sizeof(*u), GFP_KERNEL); + if (!u) + return -ENOMEM; + + /* Get an object to invoke. */ + if (arg->id == TEE_OBJREF_NULL) { + /* Use ROOT if TEE_OBJREF_NULL is invoked. */ + if (qcomtee_root_object_check(arg->op, params, arg->num_params)) + return -EINVAL; + + object = ROOT_QCOMTEE_OBJECT; + } else if (find_qtee_object(&object, arg->id, ctxdata)) { + return -EINVAL; + } + + ret = qcomtee_params_to_args(u, params, arg->num_params, ctx); + if (ret) + goto out; + + ret = qcomtee_object_do_invoke(oic, object, arg->op, u, &result); + if (ret) { + qcomtee_arg_for_each_input_object(i, u) { + qcomtee_user_object_set_notify(u[i].o, false); + qcomtee_object_put(u[i].o); + } + + goto out; + } + + /* Prase QTEE response and put driver's object copies: */ + + if (!result) { + /* Assume service is UNAVAIL if unable to process the result. */ + if (qcomtee_params_from_args(params, u, arg->num_params, ctx)) + result = QCOMTEE_MSG_ERROR_UNAVAIL; + } else { + /* + * qcomtee_params_to_args() gets a copy of IO for the driver to + * make sure they do not get released while in the middle of + * invocation. On success (!result), qcomtee_params_from_args() + * puts them; Otherwise, put them here. + */ + qcomtee_arg_for_each_input_object(i, u) + qcomtee_object_put(u[i].o); + } + + arg->ret = result; +out: + qcomtee_object_put(object); + + return ret; +} + +/** + * qcomtee_supp_recv() - Wait for a request for the supplicant. + * @ctx: TEE context. + * @op: requested operation on the object. + * @num_params: number of elements in the parameter array. + * @params: parameters for @op. + * + * The first parameter is a meta %TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT. + * On input, it provides a user buffer. This buffer is used for parameters of + * type %TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT in qcomtee_cb_params_from_args(). + * On output, the object ID and request ID are stored in the meta parameter. + * + * @num_params is updated to the number of parameters that actually exist + * in @params on return. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_supp_recv(struct tee_context *ctx, u32 *op, u32 *num_params, + struct tee_param *params) +{ + struct qcomtee_user_object_request_data data; + void __user *uaddr; + size_t ubuf_size; + int i, ret; + + if (!*num_params) + return -EINVAL; + + /* First parameter should be an INOUT + meta parameter. */ + if (params->attr != + (TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT | TEE_IOCTL_PARAM_ATTR_META)) + return -EINVAL; + + /* Other parameters are none. */ + for (i = 1; i < *num_params; i++) + if (params[i].attr) + return -EINVAL; + + if (!IS_ALIGNED(params->u.value.a, 8)) + return -EINVAL; + + /* User buffer and size from meta parameter. */ + uaddr = u64_to_user_ptr(params->u.value.a); + ubuf_size = params->u.value.b; + /* Process TEE parameters. +/-1 to ignore the meta parameter. */ + ret = qcomtee_user_object_select(ctx, params + 1, *num_params - 1, + uaddr, ubuf_size, &data); + if (ret) + return ret; + + params->u.value.a = data.object_id; + params->u.value.b = data.id; + params->u.value.c = 0; + *op = data.op; + *num_params = data.np + 1; + + return 0; +} + +/** + * qcomtee_supp_send() - Submit a response for a request. + * @ctx: TEE context. + * @errno: return value for the request. + * @num_params: number of elements in the parameter array. + * @params: returned parameters. + * + * The first parameter is a meta %TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT. + * It specifies the request ID this response belongs to. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_supp_send(struct tee_context *ctx, u32 errno, u32 num_params, + struct tee_param *params) +{ + int req_id; + + if (!num_params) + return -EINVAL; + + /* First parameter should be an OUTPUT + meta parameter. */ + if (params->attr != (TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT | + TEE_IOCTL_PARAM_ATTR_META)) + return -EINVAL; + + req_id = params->u.value.a; + /* Process TEE parameters. +/-1 to ignore the meta parameter. */ + return qcomtee_user_object_submit(ctx, params + 1, num_params - 1, + req_id, errno); +} + +static int qcomtee_open(struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata __free(kfree) = NULL; + + ctxdata = kzalloc(sizeof(*ctxdata), GFP_KERNEL); + if (!ctxdata) + return -ENOMEM; + + /* + * In the QTEE driver, the same context is used to refcount resources + * shared by QTEE. For example, teedev_ctx_get() is called for any + * instance of callback objects (see qcomtee_user_param_to_object()). + * + * Maintain a copy of teedev for QTEE as it serves as a direct user of + * this context. The teedev will be released in the context's release(). + * + * tee_device_unregister() will remain blocked until all contexts + * are released. This includes contexts owned by the user, which are + * closed by teedev_close_context(), as well as those owned by QTEE + * closed by teedev_ctx_put() in object's release(). + */ + if (!tee_device_get(ctx->teedev)) + return -EINVAL; + + idr_init(&ctxdata->qtee_objects_idr); + mutex_init(&ctxdata->qtee_lock); + idr_init(&ctxdata->reqs_idr); + INIT_LIST_HEAD(&ctxdata->reqs_list); + mutex_init(&ctxdata->reqs_lock); + init_completion(&ctxdata->req_c); + + ctx->data = no_free_ptr(ctxdata); + + return 0; +} + +/* Gets called when the user closes the device */ +static void qcomtee_close_context(struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + struct qcomtee_object *object; + int id; + + /* Process QUEUED or PROCESSING requests. */ + qcomtee_requests_destroy(ctxdata); + /* Release QTEE objects. */ + idr_for_each_entry(&ctxdata->qtee_objects_idr, object, id) + qcomtee_object_put(object); +} + +/* Gets called when the final reference to the context goes away. */ +static void qcomtee_release(struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + + idr_destroy(&ctxdata->qtee_objects_idr); + idr_destroy(&ctxdata->reqs_idr); + kfree(ctxdata); + + /* There is nothing shared in this context with QTEE. */ + tee_device_put(ctx->teedev); +} + +static void qcomtee_get_version(struct tee_device *teedev, + struct tee_ioctl_version_data *vers) +{ + struct tee_ioctl_version_data v = { + .impl_id = TEE_IMPL_ID_QTEE, + .gen_caps = TEE_GEN_CAP_OBJREF, + }; + + *vers = v; +} + +/** + * qcomtee_get_qtee_feature_list() - Query QTEE features versions. + * @ctx: TEE context. + * @id: ID of the feature to query. + * @version: version of the feature. + * + * Used to query the verion of features supported by QTEE. + */ +static void qcomtee_get_qtee_feature_list(struct tee_context *ctx, u32 id, + u32 *version) +{ + struct qcomtee_object_invoke_ctx *oic __free(kfree); + struct qcomtee_object *client_env, *service; + struct qcomtee_arg u[3] = { 0 }; + int result; + + oic = qcomtee_object_invoke_ctx_alloc(ctx); + if (!oic) + return; + + client_env = qcomtee_object_get_client_env(oic); + if (client_env == NULL_QCOMTEE_OBJECT) + return; + + /* Get ''FeatureVersions Service'' object. */ + service = qcomtee_object_get_service(oic, client_env, + QCOMTEE_FEATURE_VER_UID); + if (service == NULL_QCOMTEE_OBJECT) + goto out_failed; + + /* IB: Feature to query. */ + u[0].b.addr = &id; + u[0].b.size = sizeof(id); + u[0].type = QCOMTEE_ARG_TYPE_IB; + + /* OB: Version returned. */ + u[1].b.addr = version; + u[1].b.size = sizeof(*version); + u[1].type = QCOMTEE_ARG_TYPE_OB; + + qcomtee_object_do_invoke(oic, service, QCOMTEE_FEATURE_VER_OP_GET, u, + &result); + +out_failed: + qcomtee_object_put(service); + qcomtee_object_put(client_env); +} + +static const struct tee_driver_ops qcomtee_ops = { + .get_version = qcomtee_get_version, + .open = qcomtee_open, + .close_context = qcomtee_close_context, + .release = qcomtee_release, + .object_invoke_func = qcomtee_object_invoke, + .supp_recv = qcomtee_supp_recv, + .supp_send = qcomtee_supp_send, +}; + +static const struct tee_desc qcomtee_desc = { + .name = "qcomtee", + .ops = &qcomtee_ops, + .owner = THIS_MODULE, +}; + +static int qcomtee_probe(struct platform_device *pdev) +{ + struct workqueue_struct *async_wq; + struct tee_device *teedev; + struct tee_shm_pool *pool; + struct tee_context *ctx; + struct qcomtee *qcomtee; + int err; + + qcomtee = kzalloc(sizeof(*qcomtee), GFP_KERNEL); + if (!qcomtee) + return -ENOMEM; + + pool = qcomtee_shm_pool_alloc(); + if (IS_ERR(pool)) { + err = PTR_ERR(pool); + + goto err_free_qcomtee; + } + + teedev = tee_device_alloc(&qcomtee_desc, NULL, pool, qcomtee); + if (IS_ERR(teedev)) { + err = PTR_ERR(teedev); + + goto err_pool_destroy; + } + + qcomtee->teedev = teedev; + qcomtee->pool = pool; + err = tee_device_register(qcomtee->teedev); + if (err) + goto err_unreg_teedev; + + platform_set_drvdata(pdev, qcomtee); + /* Start async wq. */ + async_wq = alloc_ordered_workqueue("qcomtee_wq", 0); + if (!async_wq) { + err = -ENOMEM; + + goto err_unreg_teedev; + } + + qcomtee->wq = async_wq; + /* Driver context used for async operations of teedev. */ + ctx = teedev_open(qcomtee->teedev); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + + goto err_dest_wq; + } + + qcomtee->ctx = ctx; + /* Init Object table. */ + qcomtee->xa_last_id = 0; + xa_init_flags(&qcomtee->xa_local_objects, XA_FLAGS_ALLOC); + /* Get QTEE verion. */ + qcomtee_get_qtee_feature_list(qcomtee->ctx, + QCOMTEE_FEATURE_VER_OP_GET_QTEE_ID, + &qcomtee->qtee_version); + + pr_info("QTEE version %u.%u.%u\n", + QTEE_VERSION_GET_MAJOR(qcomtee->qtee_version), + QTEE_VERSION_GET_MINOR(qcomtee->qtee_version), + QTEE_VERSION_GET_PATCH(qcomtee->qtee_version)); + + return 0; + +err_dest_wq: + destroy_workqueue(qcomtee->wq); +err_unreg_teedev: + tee_device_unregister(qcomtee->teedev); +err_pool_destroy: + tee_shm_pool_free(pool); +err_free_qcomtee: + kfree(qcomtee); + + return err; +} + +/** + * qcomtee_remove() - Device Removal Routine. + * @pdev: platform device information struct. + * + * It is called by the platform subsystem to alert the driver that it should + * release the device. + * + * QTEE does not provide an API to inform it about a callback object going away. + * However, when releasing QTEE objects, any callback object sent to QTEE + * previously would be released by QTEE as part of the object release. + */ +static void qcomtee_remove(struct platform_device *pdev) +{ + struct qcomtee *qcomtee = platform_get_drvdata(pdev); + + teedev_close_context(qcomtee->ctx); + /* Wait for RELEASE operations to be processed for QTEE objects. */ + tee_device_unregister(qcomtee->teedev); + destroy_workqueue(qcomtee->wq); + tee_shm_pool_free(qcomtee->pool); + kfree(qcomtee); +} + +static const struct platform_device_id qcomtee_ids[] = { { "qcomtee", 0 }, {} }; +MODULE_DEVICE_TABLE(platform, qcomtee_ids); + +static struct platform_driver qcomtee_platform_driver = { + .probe = qcomtee_probe, + .remove = qcomtee_remove, + .driver = { + .name = "qcomtee", + }, + .id_table = qcomtee_ids, +}; + +module_platform_driver(qcomtee_platform_driver); + +MODULE_AUTHOR("Qualcomm"); +MODULE_DESCRIPTION("QTEE driver"); +MODULE_VERSION("1.0"); +MODULE_LICENSE("GPL"); diff --git a/drivers/tee/qcomtee/core.c b/drivers/tee/qcomtee/core.c new file mode 100644 index 000000000000..b6931ed6f200 --- /dev/null +++ b/drivers/tee/qcomtee/core.c @@ -0,0 +1,906 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "qcomtee.h" + +/* QTEE root object. */ +struct qcomtee_object qcomtee_object_root = { + .name = "root", + .object_type = QCOMTEE_OBJECT_TYPE_ROOT, + .info.qtee_id = QCOMTEE_MSG_OBJECT_ROOT, +}; + +/* Next argument of type @type after index @i. */ +int qcomtee_next_arg_type(struct qcomtee_arg *u, int i, + enum qcomtee_arg_type type) +{ + while (u[i].type != QCOMTEE_ARG_TYPE_INV && u[i].type != type) + i++; + return i; +} + +/* + * QTEE expects IDs with the QCOMTEE_MSG_OBJECT_NS_BIT set for objects + * of the QCOMTEE_OBJECT_TYPE_CB type. + */ +#define QCOMTEE_OBJECT_ID_START (QCOMTEE_MSG_OBJECT_NS_BIT + 1) +#define QCOMTEE_OBJECT_ID_END (U32_MAX) + +#define QCOMTEE_OBJECT_SET(p, type, ...) \ + __QCOMTEE_OBJECT_SET(p, type, ##__VA_ARGS__, 0UL) +#define __QCOMTEE_OBJECT_SET(p, type, optr, ...) \ + do { \ + (p)->object_type = (type); \ + (p)->info.qtee_id = (unsigned long)(optr); \ + } while (0) + +static struct qcomtee_object * +qcomtee_qtee_object_alloc(struct qcomtee_object_invoke_ctx *oic, + unsigned int object_id) +{ + struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev); + struct qcomtee_object *object; + + object = kzalloc(sizeof(*object), GFP_KERNEL); + if (!object) + return NULL_QCOMTEE_OBJECT; + + /* If failed, "no-name". */ + object->name = kasprintf(GFP_KERNEL, "qcomtee-%u", object_id); + QCOMTEE_OBJECT_SET(object, QCOMTEE_OBJECT_TYPE_TEE, object_id); + kref_init(&object->refcount); + /* A QTEE object requires a context for async operations. */ + object->info.qcomtee_async_ctx = qcomtee->ctx; + teedev_ctx_get(object->info.qcomtee_async_ctx); + + return object; +} + +static void qcomtee_qtee_object_free(struct qcomtee_object *object) +{ + /* See qcomtee_qtee_object_alloc(). */ + teedev_ctx_put(object->info.qcomtee_async_ctx); + + kfree(object->name); + kfree(object); +} + +static void qcomtee_do_release_qtee_object(struct work_struct *work) +{ + struct qcomtee_object *object; + struct qcomtee *qcomtee; + int ret, result; + + /* RELEASE does not require any argument. */ + struct qcomtee_arg args[] = { { .type = QCOMTEE_ARG_TYPE_INV } }; + + object = container_of(work, struct qcomtee_object, work); + qcomtee = tee_get_drvdata(object->info.qcomtee_async_ctx->teedev); + /* Get the TEE context used for asynchronous operations. */ + qcomtee->oic.ctx = object->info.qcomtee_async_ctx; + + ret = qcomtee_object_do_invoke_internal(&qcomtee->oic, object, + QCOMTEE_MSG_OBJECT_OP_RELEASE, + args, &result); + + /* Is it safe to retry the release? */ + if (ret && ret != -ENODEV) { + queue_work(qcomtee->wq, &object->work); + } else { + if (ret || result) + pr_err("%s release failed, ret = %d (%x)\n", + qcomtee_object_name(object), ret, result); + qcomtee_qtee_object_free(object); + } +} + +static void qcomtee_release_qtee_object(struct qcomtee_object *object) +{ + struct qcomtee *qcomtee = + tee_get_drvdata(object->info.qcomtee_async_ctx->teedev); + + INIT_WORK(&object->work, qcomtee_do_release_qtee_object); + queue_work(qcomtee->wq, &object->work); +} + +static void qcomtee_object_release(struct kref *refcount) +{ + struct qcomtee_object *object; + const char *name; + + object = container_of(refcount, struct qcomtee_object, refcount); + + /* + * qcomtee_object_get() is called in a RCU read lock. synchronize_rcu() + * to avoid releasing the object while it is being accessed in + * qcomtee_object_get(). + */ + synchronize_rcu(); + + switch (typeof_qcomtee_object(object)) { + case QCOMTEE_OBJECT_TYPE_TEE: + qcomtee_release_qtee_object(object); + + break; + case QCOMTEE_OBJECT_TYPE_CB: + name = object->name; + + if (object->ops->release) + object->ops->release(object); + + kfree_const(name); + + break; + case QCOMTEE_OBJECT_TYPE_ROOT: + case QCOMTEE_OBJECT_TYPE_NULL: + default: + break; + } +} + +/** + * qcomtee_object_get() - Increase the object's reference count. + * @object: object to increase the reference count. + * + * Context: The caller should hold RCU read lock. + */ +int qcomtee_object_get(struct qcomtee_object *object) +{ + if (object != NULL_QCOMTEE_OBJECT && object != ROOT_QCOMTEE_OBJECT) + return kref_get_unless_zero(&object->refcount); + + return 0; +} + +/** + * qcomtee_object_put() - Decrease the object's reference count. + * @object: object to decrease the reference count. + */ +void qcomtee_object_put(struct qcomtee_object *object) +{ + if (object != NULL_QCOMTEE_OBJECT && object != ROOT_QCOMTEE_OBJECT) + kref_put(&object->refcount, qcomtee_object_release); +} + +static int qcomtee_idx_alloc(struct qcomtee_object_invoke_ctx *oic, u32 *idx, + struct qcomtee_object *object) +{ + struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev); + + /* Every ID allocated here has QCOMTEE_MSG_OBJECT_NS_BIT set. */ + return xa_alloc_cyclic(&qcomtee->xa_local_objects, idx, object, + XA_LIMIT(QCOMTEE_OBJECT_ID_START, + QCOMTEE_OBJECT_ID_END), + &qcomtee->xa_last_id, GFP_KERNEL); +} + +struct qcomtee_object *qcomtee_idx_erase(struct qcomtee_object_invoke_ctx *oic, + u32 idx) +{ + struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev); + + if (idx < QCOMTEE_OBJECT_ID_START || idx > QCOMTEE_OBJECT_ID_END) + return NULL_QCOMTEE_OBJECT; + + return xa_erase(&qcomtee->xa_local_objects, idx); +} + +/** + * qcomtee_object_id_get() - Get an ID for an object to send to QTEE. + * @oic: context to use for the invocation. + * @object: object to assign an ID. + * @object_id: object ID. + * + * Called on the path to QTEE to construct the message; see + * qcomtee_prepare_msg() and qcomtee_update_msg(). + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_object_id_get(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, + unsigned int *object_id) +{ + u32 idx; + + switch (typeof_qcomtee_object(object)) { + case QCOMTEE_OBJECT_TYPE_CB: + if (qcomtee_idx_alloc(oic, &idx, object) < 0) + return -ENOSPC; + + *object_id = idx; + + break; + case QCOMTEE_OBJECT_TYPE_ROOT: + case QCOMTEE_OBJECT_TYPE_TEE: + *object_id = object->info.qtee_id; + + break; + case QCOMTEE_OBJECT_TYPE_NULL: + *object_id = QCOMTEE_MSG_OBJECT_NULL; + + break; + } + + return 0; +} + +/* Release object ID assigned in qcomtee_object_id_get. */ +static void qcomtee_object_id_put(struct qcomtee_object_invoke_ctx *oic, + unsigned int object_id) +{ + qcomtee_idx_erase(oic, object_id); +} + +/** + * qcomtee_local_object_get() - Get the object referenced by the ID. + * @oic: context to use for the invocation. + * @object_id: object ID. + * + * It is called on the path from QTEE. + * It is called on behalf of QTEE to obtain an instance of an object + * for a given ID. It increases the object's reference count on success. + * + * Return: On error, returns %NULL_QCOMTEE_OBJECT. + * On success, returns the object. + */ +static struct qcomtee_object * +qcomtee_local_object_get(struct qcomtee_object_invoke_ctx *oic, + unsigned int object_id) +{ + struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev); + struct qcomtee_object *object; + + guard(rcu)(); + object = xa_load(&qcomtee->xa_local_objects, object_id); + /* It already checks for %NULL_QCOMTEE_OBJECT. */ + qcomtee_object_get(object); + + return object; +} + +/** + * qcomtee_object_user_init() - Initialize an object for the user. + * @object: object to initialize. + * @ot: type of object as &enum qcomtee_object_type. + * @ops: instance of callbacks. + * @fmt: name assigned to the object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_user_init(struct qcomtee_object *object, + enum qcomtee_object_type ot, + struct qcomtee_object_operations *ops, + const char *fmt, ...) +{ + va_list ap; + int ret; + + kref_init(&object->refcount); + QCOMTEE_OBJECT_SET(object, QCOMTEE_OBJECT_TYPE_NULL); + + va_start(ap, fmt); + switch (ot) { + case QCOMTEE_OBJECT_TYPE_NULL: + ret = 0; + + break; + case QCOMTEE_OBJECT_TYPE_CB: + object->ops = ops; + if (!object->ops->dispatch) + return -EINVAL; + + /* If failed, "no-name". */ + object->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + QCOMTEE_OBJECT_SET(object, QCOMTEE_OBJECT_TYPE_CB); + + ret = 0; + break; + case QCOMTEE_OBJECT_TYPE_ROOT: + case QCOMTEE_OBJECT_TYPE_TEE: + default: + ret = -EINVAL; + } + va_end(ap); + + return ret; +} + +/** + * qcomtee_object_type() - Returns the type of object represented by an ID. + * @object_id: object ID for the object. + * + * Similar to typeof_qcomtee_object(), but instead of receiving an object as + * an argument, it receives an object ID. It is used internally on the return + * path from QTEE. + * + * Return: Returns the type of object referenced by @object_id. + */ +static enum qcomtee_object_type qcomtee_object_type(unsigned int object_id) +{ + if (object_id == QCOMTEE_MSG_OBJECT_NULL) + return QCOMTEE_OBJECT_TYPE_NULL; + + if (object_id & QCOMTEE_MSG_OBJECT_NS_BIT) + return QCOMTEE_OBJECT_TYPE_CB; + + return QCOMTEE_OBJECT_TYPE_TEE; +} + +/** + * qcomtee_object_qtee_init() - Initialize an object for QTEE. + * @oic: context to use for the invocation. + * @object: object returned. + * @object_id: object ID received from QTEE. + * + * Return: On failure, returns < 0 and sets @object to %NULL_QCOMTEE_OBJECT. + * On success, returns 0 + */ +static int qcomtee_object_qtee_init(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object **object, + unsigned int object_id) +{ + int ret = 0; + + switch (qcomtee_object_type(object_id)) { + case QCOMTEE_OBJECT_TYPE_NULL: + *object = NULL_QCOMTEE_OBJECT; + + break; + case QCOMTEE_OBJECT_TYPE_CB: + *object = qcomtee_local_object_get(oic, object_id); + if (*object == NULL_QCOMTEE_OBJECT) + ret = -EINVAL; + + break; + + default: /* QCOMTEE_OBJECT_TYPE_TEE */ + *object = qcomtee_qtee_object_alloc(oic, object_id); + if (*object == NULL_QCOMTEE_OBJECT) + ret = -ENOMEM; + + break; + } + + return ret; +} + +/* + * ''Marshaling API'' + * qcomtee_prepare_msg - Prepare the inbound buffer for sending to QTEE + * qcomtee_update_args - Parse the QTEE response in the inbound buffer + * qcomtee_prepare_args - Parse the QTEE request from the outbound buffer + * qcomtee_update_msg - Update the outbound buffer with the response for QTEE + */ + +static int qcomtee_prepare_msg(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u) +{ + struct qcomtee_msg_object_invoke *msg; + unsigned int object_id; + int i, ib, ob, io, oo; + size_t offset; + + /* Use the input message buffer in 'oic'. */ + msg = oic->in_msg.addr; + + /* Start offset in a message for buffer arguments. */ + offset = qcomtee_msg_buffer_args(struct qcomtee_msg_object_invoke, + qcomtee_args_len(u)); + + /* Get the ID of the object being invoked. */ + if (qcomtee_object_id_get(oic, object, &object_id)) + return -ENOSPC; + + ib = 0; + qcomtee_arg_for_each_input_buffer(i, u) { + void *msgptr; /* Address of buffer payload: */ + /* Overflow already checked in qcomtee_msg_buffers_alloc(). */ + msg->args[ib].b.offset = offset; + msg->args[ib].b.size = u[i].b.size; + + msgptr = qcomtee_msg_offset_to_ptr(msg, offset); + /* Userspace client or kernel client!? */ + if (!(u[i].flags & QCOMTEE_ARG_FLAGS_UADDR)) + memcpy(msgptr, u[i].b.addr, u[i].b.size); + else if (copy_from_user(msgptr, u[i].b.uaddr, u[i].b.size)) + return -EINVAL; + + offset += qcomtee_msg_offset_align(u[i].b.size); + ib++; + } + + ob = ib; + qcomtee_arg_for_each_output_buffer(i, u) { + /* Overflow already checked in qcomtee_msg_buffers_alloc(). */ + msg->args[ob].b.offset = offset; + msg->args[ob].b.size = u[i].b.size; + + offset += qcomtee_msg_offset_align(u[i].b.size); + ob++; + } + + io = ob; + qcomtee_arg_for_each_input_object(i, u) { + if (qcomtee_object_id_get(oic, u[i].o, &msg->args[io].o)) { + qcomtee_object_id_put(oic, object_id); + for (io--; io >= ob; io--) + qcomtee_object_id_put(oic, msg->args[io].o); + + return -ENOSPC; + } + + io++; + } + + oo = io; + qcomtee_arg_for_each_output_object(i, u) + oo++; + + /* Set object, operation, and argument counts. */ + qcomtee_msg_init(msg, object_id, op, ib, ob, io, oo); + + return 0; +} + +/** + * qcomtee_update_args() - Parse the QTEE response in the inbound buffer. + * @u: array of arguments for the invocation. + * @oic: context to use for the invocation. + * + * @u must be the same as the one used in qcomtee_prepare_msg() when + * initializing the inbound buffer. + * + * On failure, it continues processing the QTEE message. The caller should + * do the necessary cleanup, including calling qcomtee_object_put() + * on the output objects. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_update_args(struct qcomtee_arg *u, + struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_msg_object_invoke *msg; + int i, ib, ob, io, oo; + int ret = 0; + + /* Use the input message buffer in 'oic'. */ + msg = oic->in_msg.addr; + + ib = 0; + qcomtee_arg_for_each_input_buffer(i, u) + ib++; + + ob = ib; + qcomtee_arg_for_each_output_buffer(i, u) { + void *msgptr; /* Address of buffer payload: */ + /* QTEE can override the size to a smaller value. */ + u[i].b.size = msg->args[ob].b.size; + + msgptr = qcomtee_msg_offset_to_ptr(msg, msg->args[ob].b.offset); + /* Userspace client or kernel client!? */ + if (!(u[i].flags & QCOMTEE_ARG_FLAGS_UADDR)) + memcpy(u[i].b.addr, msgptr, u[i].b.size); + else if (copy_to_user(u[i].b.uaddr, msgptr, u[i].b.size)) + ret = -EINVAL; + + ob++; + } + + io = ob; + qcomtee_arg_for_each_input_object(i, u) + io++; + + oo = io; + qcomtee_arg_for_each_output_object(i, u) { + if (qcomtee_object_qtee_init(oic, &u[i].o, msg->args[oo].o)) + ret = -EINVAL; + + oo++; + } + + return ret; +} + +/** + * qcomtee_prepare_args() - Parse the QTEE request from the outbound buffer. + * @oic: context to use for the invocation. + * + * It initializes &qcomtee_object_invoke_ctx->u based on the QTEE request in + * the outbound buffer. It sets %QCOMTEE_ARG_TYPE_INV at the end of the array. + * + * On failure, it continues processing the QTEE message. The caller should + * do the necessary cleanup, including calling qcomtee_object_put() + * on the input objects. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_prepare_args(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_msg_callback *msg; + int i, ret = 0; + + /* Use the output message buffer in 'oic'. */ + msg = oic->out_msg.addr; + + qcomtee_msg_for_each_input_buffer(i, msg) { + oic->u[i].b.addr = + qcomtee_msg_offset_to_ptr(msg, msg->args[i].b.offset); + oic->u[i].b.size = msg->args[i].b.size; + oic->u[i].type = QCOMTEE_ARG_TYPE_IB; + } + + qcomtee_msg_for_each_output_buffer(i, msg) { + oic->u[i].b.addr = + qcomtee_msg_offset_to_ptr(msg, msg->args[i].b.offset); + oic->u[i].b.size = msg->args[i].b.size; + oic->u[i].type = QCOMTEE_ARG_TYPE_OB; + } + + qcomtee_msg_for_each_input_object(i, msg) { + if (qcomtee_object_qtee_init(oic, &oic->u[i].o, msg->args[i].o)) + ret = -EINVAL; + + oic->u[i].type = QCOMTEE_ARG_TYPE_IO; + } + + qcomtee_msg_for_each_output_object(i, msg) + oic->u[i].type = QCOMTEE_ARG_TYPE_OO; + + /* End of Arguments. */ + oic->u[i].type = QCOMTEE_ARG_TYPE_INV; + + return ret; +} + +static int qcomtee_update_msg(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_msg_callback *msg; + int i, ib, ob, io, oo; + + /* Use the output message buffer in 'oic'. */ + msg = oic->out_msg.addr; + + ib = 0; + qcomtee_arg_for_each_input_buffer(i, oic->u) + ib++; + + ob = ib; + qcomtee_arg_for_each_output_buffer(i, oic->u) { + /* Only reduce size; never increase it. */ + if (msg->args[ob].b.size < oic->u[i].b.size) + return -EINVAL; + + msg->args[ob].b.size = oic->u[i].b.size; + ob++; + } + + io = ob; + qcomtee_arg_for_each_input_object(i, oic->u) + io++; + + oo = io; + qcomtee_arg_for_each_output_object(i, oic->u) { + if (qcomtee_object_id_get(oic, oic->u[i].o, &msg->args[oo].o)) { + for (oo--; oo >= io; oo--) + qcomtee_object_id_put(oic, msg->args[oo].o); + + return -ENOSPC; + } + + oo++; + } + + return 0; +} + +/* Invoke a callback object. */ +static void qcomtee_cb_object_invoke(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_msg_callback *msg) +{ + int i, errno; + u32 op; + + /* Get the object being invoked. */ + unsigned int object_id = msg->cxt; + struct qcomtee_object *object; + + /* QTEE cannot invoke a NULL object or objects it hosts. */ + if (qcomtee_object_type(object_id) == QCOMTEE_OBJECT_TYPE_NULL || + qcomtee_object_type(object_id) == QCOMTEE_OBJECT_TYPE_TEE) { + errno = -EINVAL; + goto out; + } + + object = qcomtee_local_object_get(oic, object_id); + if (object == NULL_QCOMTEE_OBJECT) { + errno = -EINVAL; + goto out; + } + + oic->object = object; + + /* Filter bits used by transport. */ + op = msg->op & QCOMTEE_MSG_OBJECT_OP_MASK; + + switch (op) { + case QCOMTEE_MSG_OBJECT_OP_RELEASE: + qcomtee_object_id_put(oic, object_id); + qcomtee_object_put(object); + errno = 0; + + break; + case QCOMTEE_MSG_OBJECT_OP_RETAIN: + qcomtee_object_get(object); + errno = 0; + + break; + default: + errno = qcomtee_prepare_args(oic); + if (errno) { + /* Release any object that arrived as input. */ + qcomtee_arg_for_each_input_buffer(i, oic->u) + qcomtee_object_put(oic->u[i].o); + + break; + } + + errno = object->ops->dispatch(oic, object, op, oic->u); + if (!errno) { + /* On success, notify at the appropriate time. */ + oic->flags |= QCOMTEE_OIC_FLAG_NOTIFY; + } + } + +out: + + oic->errno = errno; +} + +static int +qcomtee_object_invoke_ctx_invoke(struct qcomtee_object_invoke_ctx *oic, + int *result, u64 *res_type) +{ + phys_addr_t out_msg_paddr; + phys_addr_t in_msg_paddr; + int ret; + u64 res; + + tee_shm_get_pa(oic->out_shm, 0, &out_msg_paddr); + tee_shm_get_pa(oic->in_shm, 0, &in_msg_paddr); + if (!(oic->flags & QCOMTEE_OIC_FLAG_BUSY)) + ret = qcom_scm_qtee_invoke_smc(in_msg_paddr, oic->in_msg.size, + out_msg_paddr, oic->out_msg.size, + &res, res_type); + else + ret = qcom_scm_qtee_callback_response(out_msg_paddr, + oic->out_msg.size, + &res, res_type); + + if (ret) + pr_err("QTEE returned with %d.\n", ret); + else + *result = (int)res; + + return ret; +} + +/** + * qcomtee_qtee_objects_put() - Put the callback objects in the argument array. + * @u: array of arguments. + * + * When qcomtee_object_do_invoke_internal() is successfully invoked, + * QTEE takes ownership of the callback objects. If the invocation fails, + * qcomtee_object_do_invoke_internal() calls qcomtee_qtee_objects_put() + * to mimic the release of callback objects by QTEE. + */ +static void qcomtee_qtee_objects_put(struct qcomtee_arg *u) +{ + int i; + + qcomtee_arg_for_each_input_object(i, u) { + if (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB) + qcomtee_object_put(u[i].o); + } +} + +/** + * qcomtee_object_do_invoke_internal() - Submit an invocation for an object. + * @oic: context to use for the current invocation. + * @object: object being invoked. + * @op: requested operation on the object. + * @u: array of arguments for the current invocation. + * @result: result returned from QTEE. + * + * The caller is responsible for keeping track of the refcount for each + * object, including @object. On return, the caller loses ownership of all + * input objects of type %QCOMTEE_OBJECT_TYPE_CB. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_do_invoke_internal(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u, int *result) +{ + struct qcomtee_msg_callback *cb_msg; + struct qcomtee_object *qto; + int i, ret, errno; + u64 res_type; + + /* Allocate inbound and outbound buffers. */ + ret = qcomtee_msg_buffers_alloc(oic, u); + if (ret) { + qcomtee_qtee_objects_put(u); + + return ret; + } + + ret = qcomtee_prepare_msg(oic, object, op, u); + if (ret) { + qcomtee_qtee_objects_put(u); + + goto out; + } + + /* Use input message buffer in 'oic'. */ + cb_msg = oic->out_msg.addr; + + while (1) { + if (oic->flags & QCOMTEE_OIC_FLAG_BUSY) { + errno = oic->errno; + if (!errno) + errno = qcomtee_update_msg(oic); + qcomtee_msg_set_result(cb_msg, errno); + } + + /* Invoke the remote object. */ + ret = qcomtee_object_invoke_ctx_invoke(oic, result, &res_type); + /* Return form callback objects result submission: */ + if (oic->flags & QCOMTEE_OIC_FLAG_BUSY) { + qto = oic->object; + if (qto) { + if (oic->flags & QCOMTEE_OIC_FLAG_NOTIFY) { + if (qto->ops->notify) + qto->ops->notify(oic, qto, + errno || ret); + } + + /* Get is in qcomtee_cb_object_invoke(). */ + qcomtee_object_put(qto); + } + + oic->object = NULL_QCOMTEE_OBJECT; + oic->flags &= ~(QCOMTEE_OIC_FLAG_BUSY | + QCOMTEE_OIC_FLAG_NOTIFY); + } + + if (ret) { + /* + * Unable to finished the invocation. + * If QCOMTEE_OIC_FLAG_SHARED is not set, put + * QCOMTEE_OBJECT_TYPE_CB input objects. + */ + if (!(oic->flags & QCOMTEE_OIC_FLAG_SHARED)) + qcomtee_qtee_objects_put(u); + else + ret = -ENODEV; + + goto out; + + } else { + /* + * QTEE obtained ownership of QCOMTEE_OBJECT_TYPE_CB + * input objects in 'u'. On further failure, QTEE is + * responsible for releasing them. + */ + oic->flags |= QCOMTEE_OIC_FLAG_SHARED; + } + + /* Is it a callback request? */ + if (res_type != QCOMTEE_RESULT_INBOUND_REQ_NEEDED) { + /* + * Parse results. If failed, assume the service + * was unavailable (i.e. QCOMTEE_MSG_ERROR_UNAVAIL) + * and put output objects to initiate cleanup. + */ + if (!*result && qcomtee_update_args(u, oic)) { + *result = QCOMTEE_MSG_ERROR_UNAVAIL; + qcomtee_arg_for_each_output_object(i, u) + qcomtee_object_put(u[i].o); + } + + break; + + } else { + oic->flags |= QCOMTEE_OIC_FLAG_BUSY; + qcomtee_fetch_async_reqs(oic); + qcomtee_cb_object_invoke(oic, cb_msg); + } + } + + qcomtee_fetch_async_reqs(oic); +out: + qcomtee_msg_buffers_free(oic); + + return ret; +} + +int qcomtee_object_do_invoke(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u, int *result) +{ + /* User can not set bits used by transport. */ + if (op & ~QCOMTEE_MSG_OBJECT_OP_MASK) + return -EINVAL; + + /* User can only invoke QTEE hosted objects. */ + if (typeof_qcomtee_object(object) != QCOMTEE_OBJECT_TYPE_TEE && + typeof_qcomtee_object(object) != QCOMTEE_OBJECT_TYPE_ROOT) + return -EINVAL; + + /* User cannot directly issue these operations to QTEE. */ + if (op == QCOMTEE_MSG_OBJECT_OP_RELEASE || + op == QCOMTEE_MSG_OBJECT_OP_RETAIN) + return -EINVAL; + + return qcomtee_object_do_invoke_internal(oic, object, op, u, result); +} + +/** + * qcomtee_object_get_client_env() - Get a privileged client env. object. + * @oic: context to use for the current invocation. + * + * The caller should call qcomtee_object_put() on the returned object + * to release it. + * + * Return: On error, returns %NULL_QCOMTEE_OBJECT. + * On success, returns the object. + */ +struct qcomtee_object * +qcomtee_object_get_client_env(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_arg u[3] = { 0 }; + int ret, result; + + u[0].o = NULL_QCOMTEE_OBJECT; + u[0].type = QCOMTEE_ARG_TYPE_IO; + u[1].type = QCOMTEE_ARG_TYPE_OO; + ret = qcomtee_object_do_invoke(oic, ROOT_QCOMTEE_OBJECT, + QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS, u, + &result); + if (ret || result) + return NULL_QCOMTEE_OBJECT; + + return u[1].o; +} + +struct qcomtee_object * +qcomtee_object_get_service(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *client_env, u32 uid) +{ + struct qcomtee_arg u[3] = { 0 }; + int ret, result; + + u[0].b.addr = &uid; + u[0].b.size = sizeof(uid); + u[0].type = QCOMTEE_ARG_TYPE_IB; + u[1].type = QCOMTEE_ARG_TYPE_OO; + ret = qcomtee_object_do_invoke(oic, client_env, QCOMTEE_CLIENT_ENV_OPEN, + u, &result); + + if (ret || result) + return NULL_QCOMTEE_OBJECT; + + return u[1].o; +} diff --git a/drivers/tee/qcomtee/qcomtee.h b/drivers/tee/qcomtee/qcomtee.h new file mode 100644 index 000000000000..f34be992e68b --- /dev/null +++ b/drivers/tee/qcomtee/qcomtee.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef QCOMTEE_H +#define QCOMTEE_H + +#include +#include + +#include "qcomtee_msg.h" +#include "qcomtee_object.h" + +/* Flags relating to object reference. */ +#define QCOMTEE_OBJREF_FLAG_TEE BIT(0) +#define QCOMTEE_OBJREF_FLAG_USER BIT(1) + +/** + * struct qcomtee - Main service struct. + * @teedev: client device. + * @pool: shared memory pool. + * @ctx: driver private context. + * @oic: context to use for the current driver invocation. + * @wq: workqueue for QTEE async operations. + * @xa_local_objects: array of objects exported to QTEE. + * @xa_last_id: next ID to allocate. + * @qtee_version: QTEE version. + */ +struct qcomtee { + struct tee_device *teedev; + struct tee_shm_pool *pool; + struct tee_context *ctx; + struct qcomtee_object_invoke_ctx oic; + struct workqueue_struct *wq; + struct xarray xa_local_objects; + u32 xa_last_id; + u32 qtee_version; +}; + +void qcomtee_fetch_async_reqs(struct qcomtee_object_invoke_ctx *oic); +struct qcomtee_object *qcomtee_idx_erase(struct qcomtee_object_invoke_ctx *oic, + u32 idx); + +struct tee_shm_pool *qcomtee_shm_pool_alloc(void); +void qcomtee_msg_buffers_free(struct qcomtee_object_invoke_ctx *oic); +int qcomtee_msg_buffers_alloc(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_arg *u); + +/** + * qcomtee_object_do_invoke_internal() - Submit an invocation for an object. + * @oic: context to use for the current invocation. + * @object: object being invoked. + * @op: requested operation on the object. + * @u: array of arguments for the current invocation. + * @result: result returned from QTEE. + * + * The caller is responsible for keeping track of the refcount for each + * object, including @object. On return, the caller loses ownership of all + * input objects of type %QCOMTEE_OBJECT_TYPE_CB. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_do_invoke_internal(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u, int *result); + +/** + * struct qcomtee_context_data - Clients' or supplicants' context. + * @qtee_objects_idr: QTEE objects in this context. + * @qtee_lock: mutex for @qtee_objects_idr. + * @reqs_idr: requests in this context that hold ID. + * @reqs_list: FIFO for requests in PROCESSING or QUEUED state. + * @reqs_lock: mutex for @reqs_idr, @reqs_list and request states. + * @req_c: completion used when the supplicant is waiting for requests. + * @released: state of this context. + */ +struct qcomtee_context_data { + struct idr qtee_objects_idr; + /* Synchronize access to @qtee_objects_idr. */ + struct mutex qtee_lock; + + struct idr reqs_idr; + struct list_head reqs_list; + /* Synchronize access to @reqs_idr, @reqs_list and updating requests states. */ + struct mutex reqs_lock; + + struct completion req_c; + + bool released; +}; + +int qcomtee_context_add_qtee_object(struct tee_param *param, + struct qcomtee_object *object, + struct tee_context *ctx); +int qcomtee_context_find_qtee_object(struct qcomtee_object **object, + struct tee_param *param, + struct tee_context *ctx); +void qcomtee_context_del_qtee_object(struct tee_param *param, + struct tee_context *ctx); + +int qcomtee_objref_to_arg(struct qcomtee_arg *arg, struct tee_param *param, + struct tee_context *ctx); +int qcomtee_objref_from_arg(struct tee_param *param, struct qcomtee_arg *arg, + struct tee_context *ctx); + +/* OBJECTS: */ + +/* (1) User Object API. */ + +int is_qcomtee_user_object(struct qcomtee_object *object); +void qcomtee_user_object_set_notify(struct qcomtee_object *object, bool notify); +void qcomtee_requests_destroy(struct qcomtee_context_data *ctxdata); +int qcomtee_user_param_to_object(struct qcomtee_object **object, + struct tee_param *param, + struct tee_context *ctx); +int qcomtee_user_param_from_object(struct tee_param *param, + struct qcomtee_object *object, + struct tee_context *ctx); + +/** + * struct qcomtee_user_object_request_data - Data for user object request. + * @id: ID assigned to the request. + * @object_id: Object ID being invoked by QTEE. + * @op: Requested operation on object. + * @np: Number of parameters in the request. + */ +struct qcomtee_user_object_request_data { + int id; + u64 object_id; + u32 op; + int np; +}; + +int qcomtee_user_object_select(struct tee_context *ctx, + struct tee_param *params, int num_params, + void __user *uaddr, size_t size, + struct qcomtee_user_object_request_data *data); +int qcomtee_user_object_submit(struct tee_context *ctx, + struct tee_param *params, int num_params, + int req_id, int errno); + +#endif /* QCOMTEE_H */ diff --git a/drivers/tee/qcomtee/qcomtee_msg.h b/drivers/tee/qcomtee/qcomtee_msg.h new file mode 100644 index 000000000000..878f70178a5b --- /dev/null +++ b/drivers/tee/qcomtee/qcomtee_msg.h @@ -0,0 +1,304 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef QCOMTEE_MSG_H +#define QCOMTEE_MSG_H + +#include + +/** + * DOC: ''Qualcomm TEE'' (QTEE) Transport Message + * + * There are two buffers shared with QTEE: inbound and outbound buffers. + * The inbound buffer is used for direct object invocation, and the outbound + * buffer is used to make a request from QTEE to the kernel; i.e., a callback + * request. + * + * The unused tail of the outbound buffer is also used for sending and + * receiving asynchronous messages. An asynchronous message is independent of + * the current object invocation (i.e., contents of the inbound buffer) or + * callback request (i.e., the head of the outbound buffer); see + * qcomtee_get_async_buffer(). It is used by endpoints (QTEE or kernel) as an + * optimization to reduce the number of context switches between the secure and + * non-secure worlds. + * + * For instance, QTEE never sends an explicit callback request to release an + * object in the kernel. Instead, it sends asynchronous release messages in the + * outbound buffer when QTEE returns from the previous direct object invocation, + * or appends asynchronous release messages after the current callback request. + * + * QTEE supports two types of arguments in a message: buffer and object + * arguments. Depending on the direction of data flow, they could be input + * buffer (IO) to QTEE, output buffer (OB) from QTEE, input object (IO) to QTEE, + * or output object (OO) from QTEE. Object arguments hold object IDs. Buffer + * arguments hold (offset, size) pairs into the inbound or outbound buffers. + * + * QTEE holds an object table for objects it hosts and exposes to the kernel. + * An object ID is an index to the object table in QTEE. + * + * For the direct object invocation message format in the inbound buffer, see + * &struct qcomtee_msg_object_invoke. For the callback request message format + * in the outbound buffer, see &struct qcomtee_msg_callback. For the message + * format for asynchronous messages in the outbound buffer, see + * &struct qcomtee_async_msg_hdr. + */ + +/** + * define QCOMTEE_MSG_OBJECT_NS_BIT - Non-secure bit + * + * Object ID is a globally unique 32-bit number. IDs referencing objects + * in the kernel should have %QCOMTEE_MSG_OBJECT_NS_BIT set. + */ +#define QCOMTEE_MSG_OBJECT_NS_BIT BIT(31) + +/* Static object IDs recognized by QTEE. */ +#define QCOMTEE_MSG_OBJECT_NULL (0U) +#define QCOMTEE_MSG_OBJECT_ROOT (1U) + +/* Definitions from QTEE as part of the transport protocol. */ + +/* qcomtee_msg_arg is an argument as recognized by QTEE. */ +union qcomtee_msg_arg { + struct { + u32 offset; + u32 size; + } b; + u32 o; +}; + +/* BI and BO payloads in QTEE messages should be at 64-bit boundaries. */ +#define qcomtee_msg_offset_align(o) ALIGN((o), sizeof(u64)) + +/* Operations for objects are 32-bit. Transport uses the upper 16 bits. */ +#define QCOMTEE_MSG_OBJECT_OP_MASK GENMASK(15, 0) + +/* Reserved Operation IDs sent to QTEE: */ +/* QCOMTEE_MSG_OBJECT_OP_RELEASE - Reduces the refcount and releases the object. + * QCOMTEE_MSG_OBJECT_OP_RETAIN - Increases the refcount. + * + * These operation IDs are valid for all objects. + */ + +#define QCOMTEE_MSG_OBJECT_OP_RELEASE (QCOMTEE_MSG_OBJECT_OP_MASK - 0) +#define QCOMTEE_MSG_OBJECT_OP_RETAIN (QCOMTEE_MSG_OBJECT_OP_MASK - 1) + +/* Subset of operations supported by QTEE root object. */ + +#define QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS 5 +#define QCOMTEE_ROOT_OP_NOTIFY_DOMAIN_CHANGE 4 +#define QCOMTEE_ROOT_OP_ADCI_ACCEPT 8 +#define QCOMTEE_ROOT_OP_ADCI_SHUTDOWN 9 + +/* Subset of operations supported by client_env object. */ + +#define QCOMTEE_CLIENT_ENV_OPEN 0 + +/* List of available QTEE service UIDs and subset of operations. */ + +#define QCOMTEE_FEATURE_VER_UID 2033 +#define QCOMTEE_FEATURE_VER_OP_GET 0 +/* Get QTEE version number. */ +#define QCOMTEE_FEATURE_VER_OP_GET_QTEE_ID 10 +#define QTEE_VERSION_GET_MAJOR(x) (((x) >> 22) & 0xffU) +#define QTEE_VERSION_GET_MINOR(x) (((x) >> 12) & 0xffU) +#define QTEE_VERSION_GET_PATCH(x) ((x) >> 0 & 0xfffU) + +/* Response types as returned from qcomtee_object_invoke_ctx_invoke(). */ + +/* The message contains a callback request. */ +#define QCOMTEE_RESULT_INBOUND_REQ_NEEDED 3 + +/** + * struct qcomtee_msg_object_invoke - Direct object invocation message. + * @ctx: object ID hosted in QTEE. + * @op: operation for the object. + * @counts: number of different types of arguments in @args. + * @args: array of arguments. + * + * @counts consists of 4 * 4-bit fields. Bits 0 - 3 represent the number of + * input buffers, bits 4 - 7 represent the number of output buffers, + * bits 8 - 11 represent the number of input objects, and bits 12 - 15 + * represent the number of output objects. The remaining bits should be zero. + * + * 15 12 11 8 7 4 3 0 + * +----------------+----------------+----------------+----------------+ + * | #OO objects | #IO objects | #OB buffers | #IB buffers | + * +----------------+----------------+----------------+----------------+ + * + * The maximum number of arguments of each type is defined by + * %QCOMTEE_ARGS_PER_TYPE. + */ +struct qcomtee_msg_object_invoke { + u32 cxt; + u32 op; + u32 counts; + union qcomtee_msg_arg args[]; +}; + +/* Bit masks for the four 4-bit nibbles holding the counts. */ +#define QCOMTEE_MASK_IB GENMASK(3, 0) +#define QCOMTEE_MASK_OB GENMASK(7, 4) +#define QCOMTEE_MASK_IO GENMASK(11, 8) +#define QCOMTEE_MASK_OO GENMASK(15, 12) + +/** + * struct qcomtee_msg_callback - Callback request message. + * @result: result of operation @op on the object referenced by @cxt. + * @cxt: object ID hosted in the kernel. + * @op: operation for the object. + * @counts: number of different types of arguments in @args. + * @args: array of arguments. + * + * For details of @counts, see &qcomtee_msg_object_invoke.counts. + */ +struct qcomtee_msg_callback { + u32 result; + u32 cxt; + u32 op; + u32 counts; + union qcomtee_msg_arg args[]; +}; + +/* Offset in the message for the beginning of the buffer argument's contents. */ +#define qcomtee_msg_buffer_args(t, n) \ + qcomtee_msg_offset_align(struct_size_t(t, args, n)) +/* Pointer to the beginning of a buffer argument's content at an offset. */ +#define qcomtee_msg_offset_to_ptr(m, off) ((void *)&((char *)(m))[(off)]) + +/* Some helpers to manage msg.counts. */ + +static inline unsigned int qcomtee_msg_num_ib(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_IB, counts); +} + +static inline unsigned int qcomtee_msg_num_ob(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_OB, counts); +} + +static inline unsigned int qcomtee_msg_num_io(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_IO, counts); +} + +static inline unsigned int qcomtee_msg_num_oo(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_OO, counts); +} + +static inline unsigned int qcomtee_msg_idx_ib(u32 counts) +{ + return 0; +} + +static inline unsigned int qcomtee_msg_idx_ob(u32 counts) +{ + return qcomtee_msg_num_ib(counts); +} + +static inline unsigned int qcomtee_msg_idx_io(u32 counts) +{ + return qcomtee_msg_idx_ob(counts) + qcomtee_msg_num_ob(counts); +} + +static inline unsigned int qcomtee_msg_idx_oo(u32 counts) +{ + return qcomtee_msg_idx_io(counts) + qcomtee_msg_num_io(counts); +} + +#define qcomtee_msg_for_each(i, first, num) \ + for ((i) = (first); (i) < (first) + (num); (i)++) + +#define qcomtee_msg_for_each_input_buffer(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_ib((m)->counts), \ + qcomtee_msg_num_ib((m)->counts)) + +#define qcomtee_msg_for_each_output_buffer(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_ob((m)->counts), \ + qcomtee_msg_num_ob((m)->counts)) + +#define qcomtee_msg_for_each_input_object(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_io((m)->counts), \ + qcomtee_msg_num_io((m)->counts)) + +#define qcomtee_msg_for_each_output_object(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_oo((m)->counts), \ + qcomtee_msg_num_oo((m)->counts)) + +/* Sum of arguments in a message. */ +#define qcomtee_msg_args(m) \ + (qcomtee_msg_idx_oo((m)->counts) + qcomtee_msg_num_oo((m)->counts)) + +static inline void qcomtee_msg_init(struct qcomtee_msg_object_invoke *msg, + u32 cxt, u32 op, int in_buffer, + int out_buffer, int in_object, + int out_object) +{ + u32 counts = 0; + + counts |= (in_buffer & 0xfU); + counts |= ((out_buffer - in_buffer) & 0xfU) << 4; + counts |= ((in_object - out_buffer) & 0xfU) << 8; + counts |= ((out_object - in_object) & 0xfU) << 12; + + msg->cxt = cxt; + msg->op = op; + msg->counts = counts; +} + +/* Generic error codes. */ +#define QCOMTEE_MSG_OK 0 /* non-specific success code. */ +#define QCOMTEE_MSG_ERROR 1 /* non-specific error. */ +#define QCOMTEE_MSG_ERROR_INVALID 2 /* unsupported/unrecognized request. */ +#define QCOMTEE_MSG_ERROR_SIZE_IN 3 /* supplied buffer/string too large. */ +#define QCOMTEE_MSG_ERROR_SIZE_OUT 4 /* supplied output buffer too small. */ +#define QCOMTEE_MSG_ERROR_USERBASE 10 /* start of user-defined error range. */ + +/* Transport layer error codes. */ +#define QCOMTEE_MSG_ERROR_DEFUNCT -90 /* object no longer exists. */ +#define QCOMTEE_MSG_ERROR_ABORT -91 /* calling thread must exit. */ +#define QCOMTEE_MSG_ERROR_BADOBJ -92 /* invalid object context. */ +#define QCOMTEE_MSG_ERROR_NOSLOTS -93 /* caller's object table full. */ +#define QCOMTEE_MSG_ERROR_MAXARGS -94 /* too many args. */ +#define QCOMTEE_MSG_ERROR_MAXDATA -95 /* buffers too large. */ +#define QCOMTEE_MSG_ERROR_UNAVAIL -96 /* the request could not be processed. */ +#define QCOMTEE_MSG_ERROR_KMEM -97 /* kernel out of memory. */ +#define QCOMTEE_MSG_ERROR_REMOTE -98 /* local method sent to remote object. */ +#define QCOMTEE_MSG_ERROR_BUSY -99 /* Object is busy. */ +#define QCOMTEE_MSG_ERROR_TIMEOUT -103 /* Call Back Object invocation timed out. */ + +static inline void qcomtee_msg_set_result(struct qcomtee_msg_callback *cb_msg, + int err) +{ + if (!err) { + cb_msg->result = QCOMTEE_MSG_OK; + } else if (err < 0) { + /* If err < 0, then it is a transport error. */ + switch (err) { + case -ENOMEM: + cb_msg->result = QCOMTEE_MSG_ERROR_KMEM; + break; + case -ENODEV: + cb_msg->result = QCOMTEE_MSG_ERROR_DEFUNCT; + break; + case -ENOSPC: + case -EBUSY: + cb_msg->result = QCOMTEE_MSG_ERROR_BUSY; + break; + case -EBADF: + case -EINVAL: + cb_msg->result = QCOMTEE_MSG_ERROR_UNAVAIL; + break; + default: + cb_msg->result = QCOMTEE_MSG_ERROR; + } + } else { + /* If err > 0, then it is user defined error, pass it as is. */ + cb_msg->result = err; + } +} + +#endif /* QCOMTEE_MSG_H */ diff --git a/drivers/tee/qcomtee/qcomtee_object.h b/drivers/tee/qcomtee/qcomtee_object.h new file mode 100644 index 000000000000..5221449be7db --- /dev/null +++ b/drivers/tee/qcomtee/qcomtee_object.h @@ -0,0 +1,316 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef QCOMTEE_OBJECT_H +#define QCOMTEE_OBJECT_H + +#include +#include +#include +#include + +struct qcomtee_object; + +/** + * DOC: Overview + * + * qcomtee_object provides object refcounting, ID allocation for objects hosted + * in the kernel, and necessary message marshaling for Qualcomm TEE (QTEE). + * + * To invoke an object in QTEE, the user calls qcomtee_object_do_invoke() + * while passing an instance of &struct qcomtee_object and the requested + * operation + arguments. + * + * After boot, QTEE provides a static object %ROOT_QCOMTEE_OBJECT (type of + * %QCOMTEE_OBJECT_TYPE_ROOT). The root object is invoked to pass the user's + * credentials and obtain other instances of &struct qcomtee_object (type of + * %QCOMTEE_OBJECT_TYPE_TEE) that represent services and TAs in QTEE; + * see &enum qcomtee_object_type. + * + * The objects received from QTEE are refcounted. So the owner of these objects + * can issue qcomtee_object_get() to increase the refcount and pass objects + * to other clients, or issue qcomtee_object_put() to decrease the refcount + * and release the resources in QTEE. + * + * The kernel can host services accessible to QTEE. A driver should embed + * an instance of &struct qcomtee_object in the struct it wants to export to + * QTEE (this is called a callback object). It issues qcomtee_object_user_init() + * to set the dispatch() operation for the callback object and set its type + * to %QCOMTEE_OBJECT_TYPE_CB. + * + * core.c holds an object table for callback objects. An object ID is assigned + * to each callback object, which is an index to the object table. QTEE uses + * these IDs to reference or invoke callback objects. + * + * If QTEE invokes a callback object in the kernel, the dispatch() operation is + * called in the context of the thread that originally called + * qcomtee_object_do_invoke(). + */ + +/** + * enum qcomtee_object_type - Object types. + * @QCOMTEE_OBJECT_TYPE_TEE: object hosted on QTEE. + * @QCOMTEE_OBJECT_TYPE_CB: object hosted on kernel. + * @QCOMTEE_OBJECT_TYPE_ROOT: 'primordial' object. + * @QCOMTEE_OBJECT_TYPE_NULL: NULL object. + * + * The primordial object is used for bootstrapping the IPC connection between + * the kernel and QTEE. It is invoked by the kernel when it wants to get a + * 'client env'. + */ +enum qcomtee_object_type { + QCOMTEE_OBJECT_TYPE_TEE, + QCOMTEE_OBJECT_TYPE_CB, + QCOMTEE_OBJECT_TYPE_ROOT, + QCOMTEE_OBJECT_TYPE_NULL, +}; + +/** + * enum qcomtee_arg_type - Type of QTEE argument. + * @QCOMTEE_ARG_TYPE_INV: invalid type. + * @QCOMTEE_ARG_TYPE_OB: output buffer (OB). + * @QCOMTEE_ARG_TYPE_OO: output object (OO). + * @QCOMTEE_ARG_TYPE_IB: input buffer (IB). + * @QCOMTEE_ARG_TYPE_IO: input object (IO). + * + * Use the invalid type to specify the end of the argument array. + */ +enum qcomtee_arg_type { + QCOMTEE_ARG_TYPE_INV = 0, + QCOMTEE_ARG_TYPE_OB, + QCOMTEE_ARG_TYPE_OO, + QCOMTEE_ARG_TYPE_IB, + QCOMTEE_ARG_TYPE_IO, + QCOMTEE_ARG_TYPE_NR, +}; + +/** + * define QCOMTEE_ARGS_PER_TYPE - Maximum arguments of a specific type. + * + * The QTEE transport protocol limits the maximum number of arguments of + * a specific type (i.e., IB, OB, IO, and OO). + */ +#define QCOMTEE_ARGS_PER_TYPE 16 + +/* Maximum arguments that can fit in a QTEE message, ignoring the type. */ +#define QCOMTEE_ARGS_MAX (QCOMTEE_ARGS_PER_TYPE * (QCOMTEE_ARG_TYPE_NR - 1)) + +struct qcomtee_buffer { + union { + void *addr; + void __user *uaddr; + }; + size_t size; +}; + +/** + * struct qcomtee_arg - Argument for QTEE object invocation. + * @type: type of argument as &enum qcomtee_arg_type. + * @flags: extra flags. + * @b: address and size if the type of argument is a buffer. + * @o: object instance if the type of argument is an object. + * + * &qcomtee_arg.flags only accepts %QCOMTEE_ARG_FLAGS_UADDR for now, which + * states that &qcomtee_arg.b contains a userspace address in uaddr. + */ +struct qcomtee_arg { + enum qcomtee_arg_type type; +/* 'b.uaddr' holds a __user address. */ +#define QCOMTEE_ARG_FLAGS_UADDR BIT(0) + unsigned int flags; + union { + struct qcomtee_buffer b; + struct qcomtee_object *o; + }; +}; + +static inline int qcomtee_args_len(struct qcomtee_arg *args) +{ + int i = 0; + + while (args[i].type != QCOMTEE_ARG_TYPE_INV) + i++; + return i; +} + +/* Context is busy (callback is in progress). */ +#define QCOMTEE_OIC_FLAG_BUSY BIT(1) +/* Context needs to notify the current object. */ +#define QCOMTEE_OIC_FLAG_NOTIFY BIT(2) +/* Context has shared state with QTEE. */ +#define QCOMTEE_OIC_FLAG_SHARED BIT(3) + +/** + * struct qcomtee_object_invoke_ctx - QTEE context for object invocation. + * @ctx: TEE context for this invocation. + * @flags: flags for the invocation context. + * @errno: error code for the invocation. + * @object: current object invoked in this callback context. + * @u: array of arguments for the current invocation (+1 for ending arg). + * @in_msg: inbound buffer shared with QTEE. + * @out_msg: outbound buffer shared with QTEE. + * @in_shm: TEE shm allocated for inbound buffer. + * @out_shm: TEE shm allocated for outbound buffer. + * @data: extra data attached to this context. + */ +struct qcomtee_object_invoke_ctx { + struct tee_context *ctx; + unsigned long flags; + int errno; + + struct qcomtee_object *object; + struct qcomtee_arg u[QCOMTEE_ARGS_MAX + 1]; + + struct qcomtee_buffer in_msg; + struct qcomtee_buffer out_msg; + struct tee_shm *in_shm; + struct tee_shm *out_shm; + + void *data; +}; + +static inline struct qcomtee_object_invoke_ctx * +qcomtee_object_invoke_ctx_alloc(struct tee_context *ctx) +{ + struct qcomtee_object_invoke_ctx *oic; + + oic = kzalloc(sizeof(*oic), GFP_KERNEL); + if (oic) + oic->ctx = ctx; + return oic; +} + +/** + * qcomtee_object_do_invoke() - Submit an invocation for an object. + * @oic: context to use for the current invocation. + * @object: object being invoked. + * @op: requested operation on the object. + * @u: array of arguments for the current invocation. + * @result: result returned from QTEE. + * + * The caller is responsible for keeping track of the refcount for each object, + * including @object. On return, the caller loses ownership of all input + * objects of type %QCOMTEE_OBJECT_TYPE_CB. + * + * @object can be of %QCOMTEE_OBJECT_TYPE_ROOT or %QCOMTEE_OBJECT_TYPE_TEE. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_do_invoke(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u, int *result); + +/** + * struct qcomtee_object_operations - Callback object operations. + * @release: release the object if QTEE is not using it. + * @dispatch: dispatch the operation requested by QTEE. + * @notify: report the status of any pending response submitted by @dispatch. + */ +struct qcomtee_object_operations { + void (*release)(struct qcomtee_object *object); + int (*dispatch)(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *args); + void (*notify)(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, int err); +}; + +/** + * struct qcomtee_object - QTEE or kernel object. + * @name: object name. + * @refcount: reference counter. + * @object_type: object type as &enum qcomtee_object_type. + * @info: extra information for the object. + * @ops: callback operations for objects of type %QCOMTEE_OBJECT_TYPE_CB. + * @work: work for async operations on the object. + * + * @work is used for releasing objects of %QCOMTEE_OBJECT_TYPE_TEE type. + */ +struct qcomtee_object { + const char *name; + struct kref refcount; + + enum qcomtee_object_type object_type; + struct object_info { + unsigned long qtee_id; + /* TEE context for QTEE object async requests. */ + struct tee_context *qcomtee_async_ctx; + } info; + + struct qcomtee_object_operations *ops; + struct work_struct work; +}; + +/* Static instances of qcomtee_object objects. */ +#define NULL_QCOMTEE_OBJECT ((struct qcomtee_object *)(0)) +extern struct qcomtee_object qcomtee_object_root; +#define ROOT_QCOMTEE_OBJECT (&qcomtee_object_root) + +static inline enum qcomtee_object_type +typeof_qcomtee_object(struct qcomtee_object *object) +{ + if (object == NULL_QCOMTEE_OBJECT) + return QCOMTEE_OBJECT_TYPE_NULL; + return object->object_type; +} + +static inline const char *qcomtee_object_name(struct qcomtee_object *object) +{ + if (object == NULL_QCOMTEE_OBJECT) + return "null"; + + if (!object->name) + return "no-name"; + return object->name; +} + +/** + * qcomtee_object_user_init() - Initialize an object for the user. + * @object: object to initialize. + * @ot: type of object as &enum qcomtee_object_type. + * @ops: instance of callbacks. + * @fmt: name assigned to the object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_user_init(struct qcomtee_object *object, + enum qcomtee_object_type ot, + struct qcomtee_object_operations *ops, + const char *fmt, ...) __printf(4, 5); + +/* Object release is RCU protected. */ +int qcomtee_object_get(struct qcomtee_object *object); +void qcomtee_object_put(struct qcomtee_object *object); + +#define qcomtee_arg_for_each(i, args) \ + for (i = 0; args[i].type != QCOMTEE_ARG_TYPE_INV; i++) + +/* Next argument of type @type after index @i. */ +int qcomtee_next_arg_type(struct qcomtee_arg *u, int i, + enum qcomtee_arg_type type); + +/* Iterate over argument of given type. */ +#define qcomtee_arg_for_each_type(i, args, at) \ + for (i = qcomtee_next_arg_type(args, 0, at); \ + args[i].type != QCOMTEE_ARG_TYPE_INV; \ + i = qcomtee_next_arg_type(args, i + 1, at)) + +#define qcomtee_arg_for_each_input_buffer(i, args) \ + qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_IB) +#define qcomtee_arg_for_each_output_buffer(i, args) \ + qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_OB) +#define qcomtee_arg_for_each_input_object(i, args) \ + qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_IO) +#define qcomtee_arg_for_each_output_object(i, args) \ + qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_OO) + +struct qcomtee_object * +qcomtee_object_get_client_env(struct qcomtee_object_invoke_ctx *oic); + +struct qcomtee_object * +qcomtee_object_get_service(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *client_env, u32 uid); + +#endif /* QCOMTEE_OBJECT_H */ diff --git a/drivers/tee/qcomtee/shm.c b/drivers/tee/qcomtee/shm.c new file mode 100644 index 000000000000..2aea76487372 --- /dev/null +++ b/drivers/tee/qcomtee/shm.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +#include "qcomtee.h" + +/** + * define MAX_OUTBOUND_BUFFER_SIZE - Maximum size of outbound buffers. + * + * The size of outbound buffer depends on QTEE callback requests. + */ +#define MAX_OUTBOUND_BUFFER_SIZE SZ_4K + +/** + * define MAX_INBOUND_BUFFER_SIZE - Maximum size of the inbound buffer. + * + * The size of the inbound buffer depends on the user's requests, + * specifically the number of IB and OB arguments. If an invocation + * requires a size larger than %MAX_INBOUND_BUFFER_SIZE, the user should + * consider using another form of shared memory with QTEE. + */ +#define MAX_INBOUND_BUFFER_SIZE SZ_4M + +/** + * qcomtee_msg_buffers_alloc() - Allocate inbound and outbound buffers. + * @oic: context to use for the current invocation. + * @u: array of arguments for the current invocation. + * + * It calculates the size of inbound and outbound buffers based on the + * arguments in @u. It allocates the buffers from the teedev pool. + * + * Return: On success, returns 0. On error, returns < 0. + */ +int qcomtee_msg_buffers_alloc(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_arg *u) +{ + struct tee_context *ctx = oic->ctx; + struct tee_shm *shm; + size_t size; + int i; + + /* Start offset in a message for buffer arguments. */ + size = qcomtee_msg_buffer_args(struct qcomtee_msg_object_invoke, + qcomtee_args_len(u)); + if (size > MAX_INBOUND_BUFFER_SIZE) + return -EINVAL; + + /* Add size of IB arguments. */ + qcomtee_arg_for_each_input_buffer(i, u) { + size = size_add(size, qcomtee_msg_offset_align(u[i].b.size)); + if (size > MAX_INBOUND_BUFFER_SIZE) + return -EINVAL; + } + + /* Add size of OB arguments. */ + qcomtee_arg_for_each_output_buffer(i, u) { + size = size_add(size, qcomtee_msg_offset_align(u[i].b.size)); + if (size > MAX_INBOUND_BUFFER_SIZE) + return -EINVAL; + } + + shm = tee_shm_alloc_priv_buf(ctx, size); + if (IS_ERR(shm)) + return PTR_ERR(shm); + + /* Allocate inbound buffer. */ + oic->in_shm = shm; + shm = tee_shm_alloc_priv_buf(ctx, MAX_OUTBOUND_BUFFER_SIZE); + if (IS_ERR(shm)) { + tee_shm_free(oic->in_shm); + + return PTR_ERR(shm); + } + /* Allocate outbound buffer. */ + oic->out_shm = shm; + + oic->in_msg.addr = tee_shm_get_va(oic->in_shm, 0); + oic->in_msg.size = tee_shm_get_size(oic->in_shm); + oic->out_msg.addr = tee_shm_get_va(oic->out_shm, 0); + oic->out_msg.size = tee_shm_get_size(oic->out_shm); + /* QTEE assume unused buffers are zeroed. */ + memzero_explicit(oic->in_msg.addr, oic->in_msg.size); + memzero_explicit(oic->out_msg.addr, oic->out_msg.size); + + return 0; +} + +void qcomtee_msg_buffers_free(struct qcomtee_object_invoke_ctx *oic) +{ + tee_shm_free(oic->in_shm); + tee_shm_free(oic->out_shm); +} + +/* Dynamic shared memory pool based on tee_dyn_shm_alloc_helper(). */ + +static int qcomtee_shm_register(struct tee_context *ctx, struct tee_shm *shm, + struct page **pages, size_t num_pages, + unsigned long start) +{ + return qcom_tzmem_shm_bridge_create(shm->paddr, shm->size, + &shm->sec_world_id); +} + +static int qcomtee_shm_unregister(struct tee_context *ctx, struct tee_shm *shm) +{ + qcom_tzmem_shm_bridge_delete(shm->sec_world_id); + + return 0; +} + +static int pool_op_alloc(struct tee_shm_pool *pool, struct tee_shm *shm, + size_t size, size_t align) +{ + if (!(shm->flags & TEE_SHM_PRIV)) + return -ENOMEM; + + return tee_dyn_shm_alloc_helper(shm, size, align, qcomtee_shm_register); +} + +static void pool_op_free(struct tee_shm_pool *pool, struct tee_shm *shm) +{ + tee_dyn_shm_free_helper(shm, qcomtee_shm_unregister); +} + +static void pool_op_destroy_pool(struct tee_shm_pool *pool) +{ + kfree(pool); +} + +static const struct tee_shm_pool_ops pool_ops = { + .alloc = pool_op_alloc, + .free = pool_op_free, + .destroy_pool = pool_op_destroy_pool, +}; + +struct tee_shm_pool *qcomtee_shm_pool_alloc(void) +{ + struct tee_shm_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->ops = &pool_ops; + + return pool; +} diff --git a/drivers/tee/qcomtee/user_obj.c b/drivers/tee/qcomtee/user_obj.c new file mode 100644 index 000000000000..0139905f2684 --- /dev/null +++ b/drivers/tee/qcomtee/user_obj.c @@ -0,0 +1,692 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +#include "qcomtee.h" + +/** + * DOC: User Objects aka Supplicants + * + * Any userspace process with access to the TEE device file can behave as a + * supplicant by creating a user object. Any TEE parameter of type OBJREF with + * %QCOMTEE_OBJREF_FLAG_USER flag set is considered a user object. + * + * A supplicant uses qcomtee_user_object_select() (i.e. TEE_IOC_SUPPL_RECV) to + * receive a QTEE user object request and qcomtee_user_object_submit() + * (i.e. TEE_IOC_SUPPL_SEND) to submit a response. QTEE expects to receive the + * response, including OB and OO in a specific order in the message; parameters + * submitted with qcomtee_user_object_submit() should maintain this order. + */ + +/** + * struct qcomtee_user_object - User object. + * @object: &struct qcomtee_object representing the user object. + * @ctx: context for which the user object is defined. + * @object_id: object ID in @ctx. + * @notify: notify on release. + * + * Any object managed in userspace is represented by this struct. + * If @notify is set, a notification message is sent back to userspace + * upon release. + */ +struct qcomtee_user_object { + struct qcomtee_object object; + struct tee_context *ctx; + u64 object_id; + bool notify; +}; + +#define to_qcomtee_user_object(o) \ + container_of((o), struct qcomtee_user_object, object) + +static struct qcomtee_object_operations qcomtee_user_object_ops; + +/* Is it a user object? */ +int is_qcomtee_user_object(struct qcomtee_object *object) +{ + return object != NULL_QCOMTEE_OBJECT && + typeof_qcomtee_object(object) == QCOMTEE_OBJECT_TYPE_CB && + object->ops == &qcomtee_user_object_ops; +} + +/* Set the user object's 'notify on release' flag. */ +void qcomtee_user_object_set_notify(struct qcomtee_object *object, bool notify) +{ + if (is_qcomtee_user_object(object)) + to_qcomtee_user_object(object)->notify = notify; +} + +/* Supplicant Requests: */ + +/** + * enum qcomtee_req_state - Current state of request. + * @QCOMTEE_REQ_QUEUED: Request is waiting for supplicant. + * @QCOMTEE_REQ_PROCESSING: Request has been picked by the supplicant. + * @QCOMTEE_REQ_PROCESSED: Response has been submitted for the request. + */ +enum qcomtee_req_state { + QCOMTEE_REQ_QUEUED = 1, + QCOMTEE_REQ_PROCESSING, + QCOMTEE_REQ_PROCESSED, +}; + +/* User requests sent to supplicants. */ +struct qcomtee_ureq { + enum qcomtee_req_state state; + + /* User Request: */ + int req_id; + u64 object_id; + u32 op; + struct qcomtee_arg *args; + int errno; + + struct list_head node; + struct completion c; /* Completion for whoever wait. */ +}; + +/* + * Placeholder for a PROCESSING request in qcomtee_context.reqs_idr. + * + * If the thread that calls qcomtee_object_invoke() dies and the supplicant + * is processing the request, replace the entry in qcomtee_context.reqs_idr + * with empty_ureq. This ensures that (1) the req_id remains busy and is not + * reused, and (2) the supplicant fails to submit the response and performs + * the necessary rollback. + */ +static struct qcomtee_ureq empty_ureq = { .state = QCOMTEE_REQ_PROCESSING }; + +/* Enqueue a user request for a context and assign a request ID. */ +static int ureq_enqueue(struct qcomtee_context_data *ctxdata, + struct qcomtee_ureq *ureq) +{ + int ret; + + guard(mutex)(&ctxdata->reqs_lock); + /* Supplicant is dying. */ + if (ctxdata->released) + return -ENODEV; + + /* Allocate an ID and queue the request. */ + ret = idr_alloc(&ctxdata->reqs_idr, ureq, 0, 0, GFP_KERNEL); + if (ret < 0) + return ret; + + ureq->req_id = ret; + ureq->state = QCOMTEE_REQ_QUEUED; + list_add_tail(&ureq->node, &ctxdata->reqs_list); + + return 0; +} + +/** + * ureq_dequeue() - Dequeue a user request from a context. + * @ctxdata: context data for a context to dequeue the request. + * @req_id: ID of the request to be dequeued. + * + * It dequeues a user request and releases its request ID. + * + * Context: The caller should hold &qcomtee_context_data->reqs_lock. + * Return: Returns the user request associated with this ID; otherwise, NULL. + */ +static struct qcomtee_ureq *ureq_dequeue(struct qcomtee_context_data *ctxdata, + int req_id) +{ + struct qcomtee_ureq *ureq; + + ureq = idr_remove(&ctxdata->reqs_idr, req_id); + if (ureq == &empty_ureq || !ureq) + return NULL; + + list_del(&ureq->node); + + return ureq; +} + +/** + * ureq_select() - Select the next request in a context. + * @ctxdata: context data for a context to pop a request. + * @ubuf_size: size of the available buffer for UBUF parameters. + * @num_params: number of entries for the TEE parameter array. + * + * It checks if @num_params is large enough to fit the next request arguments. + * It checks if @ubuf_size is large enough to fit IB buffer arguments. + * + * Context: The caller should hold &qcomtee_context_data->reqs_lock. + * Return: On success, returns a request; + * on failure, returns NULL and ERR_PTR. + */ +static struct qcomtee_ureq *ureq_select(struct qcomtee_context_data *ctxdata, + size_t ubuf_size, int num_params) +{ + struct qcomtee_ureq *req, *ureq = NULL; + struct qcomtee_arg *u; + int i; + + /* Find the a queued request. */ + list_for_each_entry(req, &ctxdata->reqs_list, node) { + if (req->state == QCOMTEE_REQ_QUEUED) { + ureq = req; + break; + } + } + + if (!ureq) + return NULL; + + u = ureq->args; + /* (1) Is there enough TEE parameters? */ + if (num_params < qcomtee_args_len(u)) + return ERR_PTR(-EINVAL); + /* (2) Is there enough space to pass input buffers? */ + qcomtee_arg_for_each_input_buffer(i, u) { + ubuf_size = size_sub(ubuf_size, u[i].b.size); + if (ubuf_size == SIZE_MAX) + return ERR_PTR(-EINVAL); + + ubuf_size = round_down(ubuf_size, 8); + } + + return ureq; +} + +/* Gets called when the user closes the device. */ +void qcomtee_requests_destroy(struct qcomtee_context_data *ctxdata) +{ + struct qcomtee_ureq *req, *ureq; + + guard(mutex)(&ctxdata->reqs_lock); + /* So ureq_enqueue() refuses new requests from QTEE. */ + ctxdata->released = true; + /* ureqs in reqs_list are in QUEUED or PROCESSING (!= empty_ureq) state. */ + list_for_each_entry_safe(ureq, req, &ctxdata->reqs_list, node) { + ureq_dequeue(ctxdata, ureq->req_id); + + if (ureq->op != QCOMTEE_MSG_OBJECT_OP_RELEASE) { + ureq->state = QCOMTEE_REQ_PROCESSED; + ureq->errno = -ENODEV; + + complete(&ureq->c); + } else { + kfree(ureq); + } + } +} + +/* User Object API. */ + +/* User object dispatcher. */ +static int qcomtee_user_object_dispatch(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *args) +{ + struct qcomtee_user_object *uo = to_qcomtee_user_object(object); + struct qcomtee_context_data *ctxdata = uo->ctx->data; + struct qcomtee_ureq *ureq __free(kfree) = NULL; + int errno; + + ureq = kzalloc(sizeof(*ureq), GFP_KERNEL); + if (!ureq) + return -ENOMEM; + + init_completion(&ureq->c); + ureq->object_id = uo->object_id; + ureq->op = op; + ureq->args = args; + + /* Queue the request. */ + if (ureq_enqueue(ctxdata, ureq)) + return -ENODEV; + /* Wakeup supplicant to process it. */ + complete(&ctxdata->req_c); + + /* + * Wait for the supplicant to process the request. Wait as KILLABLE + * in case the supplicant and invoke thread are both running from the + * same process, the supplicant crashes, or the shutdown sequence + * starts with supplicant dies first; otherwise, it stuck indefinitely. + * + * If the supplicant processes long-running requests, also use + * TASK_FREEZABLE to allow the device to safely suspend if needed. + */ + if (!wait_for_completion_state(&ureq->c, + TASK_KILLABLE | TASK_FREEZABLE)) { + errno = ureq->errno; + if (!errno) + oic->data = no_free_ptr(ureq); + } else { + enum qcomtee_req_state prev_state; + + errno = -ENODEV; + + scoped_guard(mutex, &ctxdata->reqs_lock) { + prev_state = ureq->state; + /* Replace with empty_ureq to keep req_id reserved. */ + if (prev_state == QCOMTEE_REQ_PROCESSING) { + list_del(&ureq->node); + idr_replace(&ctxdata->reqs_idr, + &empty_ureq, ureq->req_id); + + /* Remove as supplicant has never seen this request. */ + } else if (prev_state == QCOMTEE_REQ_QUEUED) { + ureq_dequeue(ctxdata, ureq->req_id); + } + } + + /* Supplicant did some work, do not discard it. */ + if (prev_state == QCOMTEE_REQ_PROCESSED) { + errno = ureq->errno; + if (!errno) + oic->data = no_free_ptr(ureq); + } + } + + return errno; +} + +/* Gets called after submitting the dispatcher response. */ +static void qcomtee_user_object_notify(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *unused_object, + int err) +{ + struct qcomtee_ureq *ureq = oic->data; + struct qcomtee_arg *u = ureq->args; + int i; + + /* + * If err, there was a transport issue, and QTEE did not receive the + * response for the dispatcher. Release the callback object created for + * QTEE, in addition to the copies of objects kept for the drivers. + */ + qcomtee_arg_for_each_output_object(i, u) { + if (err && + (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB)) + qcomtee_object_put(u[i].o); + qcomtee_object_put(u[i].o); + } + + kfree(ureq); +} + +static void qcomtee_user_object_release(struct qcomtee_object *object) +{ + struct qcomtee_user_object *uo = to_qcomtee_user_object(object); + struct qcomtee_context_data *ctxdata = uo->ctx->data; + struct qcomtee_ureq *ureq; + + /* RELEASE does not require any argument. */ + static struct qcomtee_arg args[] = { { .type = QCOMTEE_ARG_TYPE_INV } }; + + if (!uo->notify) + goto out_no_notify; + + ureq = kzalloc(sizeof(*ureq), GFP_KERNEL); + if (!ureq) + goto out_no_notify; + + /* QUEUE a release request: */ + ureq->object_id = uo->object_id; + ureq->op = QCOMTEE_MSG_OBJECT_OP_RELEASE; + ureq->args = args; + if (ureq_enqueue(ctxdata, ureq)) { + kfree(ureq); + /* Ignore the notification if it cannot be queued. */ + goto out_no_notify; + } + + complete(&ctxdata->req_c); + +out_no_notify: + teedev_ctx_put(uo->ctx); + kfree(uo); +} + +static struct qcomtee_object_operations qcomtee_user_object_ops = { + .release = qcomtee_user_object_release, + .notify = qcomtee_user_object_notify, + .dispatch = qcomtee_user_object_dispatch, +}; + +/** + * qcomtee_user_param_to_object() - OBJREF parameter to &struct qcomtee_object. + * @object: object returned. + * @param: TEE parameter. + * @ctx: context in which the conversion should happen. + * + * @param is an OBJREF with %QCOMTEE_OBJREF_FLAG_USER flags. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_user_param_to_object(struct qcomtee_object **object, + struct tee_param *param, + struct tee_context *ctx) +{ + struct qcomtee_user_object *user_object __free(kfree) = NULL; + int err; + + user_object = kzalloc(sizeof(*user_object), GFP_KERNEL); + if (!user_object) + return -ENOMEM; + + user_object->ctx = ctx; + user_object->object_id = param->u.objref.id; + /* By default, always notify userspace upon release. */ + user_object->notify = true; + err = qcomtee_object_user_init(&user_object->object, + QCOMTEE_OBJECT_TYPE_CB, + &qcomtee_user_object_ops, "uo-%llu", + param->u.objref.id); + if (err) + return err; + /* Matching teedev_ctx_put() is in qcomtee_user_object_release(). */ + teedev_ctx_get(ctx); + + *object = &no_free_ptr(user_object)->object; + + return 0; +} + +/* Reverse what qcomtee_user_param_to_object() does. */ +int qcomtee_user_param_from_object(struct tee_param *param, + struct qcomtee_object *object, + struct tee_context *ctx) +{ + struct qcomtee_user_object *uo; + + uo = to_qcomtee_user_object(object); + /* Ensure the object is in the same context as the caller. */ + if (uo->ctx != ctx) + return -EINVAL; + + param->u.objref.id = uo->object_id; + param->u.objref.flags = QCOMTEE_OBJREF_FLAG_USER; + + /* User objects are valid in userspace; do not keep a copy. */ + qcomtee_object_put(object); + + return 0; +} + +/** + * qcomtee_cb_params_from_args() - Convert QTEE arguments to TEE parameters. + * @params: TEE parameters. + * @u: QTEE arguments. + * @num_params: number of elements in the parameter array. + * @ubuf_addr: user buffer for arguments of type %QCOMTEE_ARG_TYPE_IB. + * @ubuf_size: size of the user buffer. + * @ctx: context in which the conversion should happen. + * + * It expects @params to have enough entries for @u. Entries in @params are of + * %TEE_IOCTL_PARAM_ATTR_TYPE_NONE. + * + * Return: On success, returns the number of input parameters; + * on failure, returns < 0. + */ +static int qcomtee_cb_params_from_args(struct tee_param *params, + struct qcomtee_arg *u, int num_params, + void __user *ubuf_addr, size_t ubuf_size, + struct tee_context *ctx) +{ + int i, np; + void __user *uaddr; + + qcomtee_arg_for_each(i, u) { + switch (u[i].type) { + case QCOMTEE_ARG_TYPE_IB: + params[i].attr = TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT; + + /* Underflow already checked in ureq_select(). */ + ubuf_size = round_down(ubuf_size - u[i].b.size, 8); + uaddr = (void __user *)(ubuf_addr + ubuf_size); + + params[i].u.ubuf.uaddr = uaddr; + params[i].u.ubuf.size = u[i].b.size; + if (copy_to_user(params[i].u.ubuf.uaddr, u[i].b.addr, + u[i].b.size)) + goto out_failed; + + break; + case QCOMTEE_ARG_TYPE_OB: + params[i].attr = TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT; + /* Let the user knows the maximum size QTEE expects. */ + params[i].u.ubuf.size = u[i].b.size; + + break; + case QCOMTEE_ARG_TYPE_IO: + params[i].attr = TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT; + if (qcomtee_objref_from_arg(¶ms[i], &u[i], ctx)) + goto out_failed; + + break; + case QCOMTEE_ARG_TYPE_OO: + params[i].attr = + TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT; + + break; + default: /* Never get here! */ + goto out_failed; + } + } + + return i; + +out_failed: + /* Undo qcomtee_objref_from_arg(). */ + for (np = i; np >= 0; np--) { + if (params[np].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT) + qcomtee_context_del_qtee_object(¶ms[np], ctx); + } + + /* Release any IO objects not processed. */ + for (; u[i].type; i++) { + if (u[i].type == QCOMTEE_ARG_TYPE_IO) + qcomtee_object_put(u[i].o); + } + + return -EINVAL; +} + +/** + * qcomtee_cb_params_to_args() - Convert TEE parameters to QTEE arguments. + * @u: QTEE arguments. + * @params: TEE parameters. + * @num_params: number of elements in the parameter array. + * @ctx: context in which the conversion should happen. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_cb_params_to_args(struct qcomtee_arg *u, + struct tee_param *params, int num_params, + struct tee_context *ctx) +{ + int i; + + qcomtee_arg_for_each(i, u) { + switch (u[i].type) { + case QCOMTEE_ARG_TYPE_IB: + if (params[i].attr != + TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT) + goto out_failed; + + break; + case QCOMTEE_ARG_TYPE_OB: + if (params[i].attr != + TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT) + goto out_failed; + + /* Client can not send more data than requested. */ + if (params[i].u.ubuf.size > u[i].b.size) + goto out_failed; + + if (copy_from_user(u[i].b.addr, params[i].u.ubuf.uaddr, + params[i].u.ubuf.size)) + goto out_failed; + + u[i].b.size = params[i].u.ubuf.size; + + break; + case QCOMTEE_ARG_TYPE_IO: + if (params[i].attr != + TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT) + goto out_failed; + + break; + case QCOMTEE_ARG_TYPE_OO: + if (params[i].attr != + TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT) + goto out_failed; + + if (qcomtee_objref_to_arg(&u[i], ¶ms[i], ctx)) + goto out_failed; + + break; + default: /* Never get here! */ + goto out_failed; + } + } + + return 0; + +out_failed: + /* Undo qcomtee_objref_to_arg(). */ + for (i--; i >= 0; i--) { + if (u[i].type != QCOMTEE_ARG_TYPE_OO) + continue; + + qcomtee_user_object_set_notify(u[i].o, false); + if (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB) + qcomtee_object_put(u[i].o); + + qcomtee_object_put(u[i].o); + } + + return -EINVAL; +} + +/** + * qcomtee_user_object_select() - Select a request for a user object. + * @ctx: context to look for a user object. + * @params: parameters for @op. + * @num_params: number of elements in the parameter array. + * @uaddr: user buffer for output UBUF parameters. + * @size: size of user buffer @uaddr. + * @data: information for the selected request. + * + * @params is filled along with @data for the selected request. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_user_object_select(struct tee_context *ctx, + struct tee_param *params, int num_params, + void __user *uaddr, size_t size, + struct qcomtee_user_object_request_data *data) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + struct qcomtee_ureq *ureq; + int ret; + + /* + * Hold the reqs_lock not only for ureq_select() and updating the ureq + * state to PROCESSING but for the entire duration of ureq access. + * This prevents qcomtee_user_object_dispatch() from freeing + * ureq while it is still in use, if client dies. + */ + + while (1) { + scoped_guard(mutex, &ctxdata->reqs_lock) { + ureq = ureq_select(ctxdata, size, num_params); + if (!ureq) + goto wait_for_request; + + if (IS_ERR(ureq)) + return PTR_ERR(ureq); + + /* Processing the request 'QUEUED -> PROCESSING'. */ + ureq->state = QCOMTEE_REQ_PROCESSING; + /* ''Prepare user request:'' */ + data->id = ureq->req_id; + data->object_id = ureq->object_id; + data->op = ureq->op; + ret = qcomtee_cb_params_from_args(params, ureq->args, + num_params, uaddr, + size, ctx); + if (ret >= 0) + goto done_request; + + /* Something is wrong with the request: */ + ureq_dequeue(ctxdata, data->id); + /* Send error to QTEE. */ + ureq->state = QCOMTEE_REQ_PROCESSED; + ureq->errno = ret; + + complete(&ureq->c); + } + + continue; +wait_for_request: + /* Wait for a new QUEUED request. */ + if (wait_for_completion_interruptible(&ctxdata->req_c)) + return -ERESTARTSYS; + } + +done_request: + /* No one is waiting for the response. */ + if (data->op == QCOMTEE_MSG_OBJECT_OP_RELEASE) { + scoped_guard(mutex, &ctxdata->reqs_lock) + ureq_dequeue(ctxdata, data->id); + kfree(ureq); + } + + data->np = ret; + + return 0; +} + +/** + * qcomtee_user_object_submit() - Submit a response for a user object. + * @ctx: context to look for a user object. + * @params: returned parameters. + * @num_params: number of elements in the parameter array. + * @req_id: request ID for the response. + * @errno: result of user object invocation. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_user_object_submit(struct tee_context *ctx, + struct tee_param *params, int num_params, + int req_id, int errno) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + struct qcomtee_ureq *ureq; + + /* See comments for reqs_lock in qcomtee_user_object_select(). */ + guard(mutex)(&ctxdata->reqs_lock); + + ureq = ureq_dequeue(ctxdata, req_id); + if (!ureq) + return -EINVAL; + + ureq->state = QCOMTEE_REQ_PROCESSED; + + if (!errno) + ureq->errno = qcomtee_cb_params_to_args(ureq->args, params, + num_params, ctx); + else + ureq->errno = errno; + /* Return errno if qcomtee_cb_params_to_args() failed; otherwise 0. */ + if (!errno && ureq->errno) + errno = ureq->errno; + else + errno = 0; + + /* Send result to QTEE. */ + complete(&ureq->c); + + return errno; +} diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index a5466b503bfe..386ad36f1a0a 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -59,6 +59,7 @@ #define TEE_IMPL_ID_OPTEE 1 #define TEE_IMPL_ID_AMDTEE 2 #define TEE_IMPL_ID_TSTEE 3 +#define TEE_IMPL_ID_QTEE 4 /* * OP-TEE specific capabilities -- cgit v1.2.3 From 2c895133950646f45e5cf3900b168c952c8dbee8 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 15 Sep 2025 03:26:17 +0000 Subject: bpf: Do not limit bpf_cgroup_from_id to current's namespace The bpf_cgroup_from_id kfunc relies on cgroup_get_from_id to obtain the cgroup corresponding to a given cgroup ID. This helper can be called in a lot of contexts where the current thread can be random. A recent example was its use in sched_ext's ops.tick(), to obtain the root cgroup pointer. Since the current task can be whatever random user space task preempted by the timer tick, this makes the behavior of the helper unreliable. Refactor out __cgroup_get_from_id as the non-namespace aware version of cgroup_get_from_id, and change bpf_cgroup_from_id to make use of it. There is no compatibility breakage here, since changing the namespace against which the lookup is being done to the root cgroup namespace only permits a wider set of lookups to succeed now. The cgroup IDs across namespaces are globally unique, and thus don't need to be retranslated. Reported-by: Dan Schatzberg Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250915032618.1551762-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/cgroup.h | 1 + kernel/bpf/helpers.c | 2 +- kernel/cgroup/cgroup.c | 24 ++++++++++++++++++++---- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..b08c8e62881c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -650,6 +650,7 @@ static inline void cgroup_kthread_ready(void) } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); +struct cgroup *__cgroup_get_from_id(u64 id); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c0c0764a2025..51229aba5318 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2546,7 +2546,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; - cgrp = cgroup_get_from_id(cgid); + cgrp = __cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..6c2d20ac697c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6343,15 +6343,15 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) } /* - * cgroup_get_from_id : get the cgroup associated with cgroup id + * __cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure - * Only cgroups within current task's cgroup NS are valid. + * There are no cgroup NS restrictions. */ -struct cgroup *cgroup_get_from_id(u64 id) +struct cgroup *__cgroup_get_from_id(u64 id) { struct kernfs_node *kn; - struct cgroup *cgrp, *root_cgrp; + struct cgroup *cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) @@ -6373,6 +6373,22 @@ struct cgroup *cgroup_get_from_id(u64 id) if (!cgrp) return ERR_PTR(-ENOENT); + return cgrp; +} + +/* + * cgroup_get_from_id : get the cgroup associated with cgroup id + * @id: cgroup id + * On success return the cgrp or ERR_PTR on failure + * Only cgroups within current task's cgroup NS are valid. + */ +struct cgroup *cgroup_get_from_id(u64 id) +{ + struct cgroup *cgrp, *root_cgrp; + + cgrp = __cgroup_get_from_id(id); + if (IS_ERR(cgrp)) + return cgrp; root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { -- cgit v1.2.3 From 61b2f7baa9779b12a7bf1b9800a3f2a2549a1315 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 11 Sep 2025 13:06:30 +0200 Subject: tcp: fast path functions later MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch will use tcp_ecn_mode_accecn(), TCP_ACCECN_CEP_INIT_OFFSET, TCP_ACCECN_CEP_ACE_MASK in __tcp_fast_path_on() to make new flag for AccECN. No functional changes. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911110642.87529-3-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 277914c4d067..e25340459ce4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -821,33 +821,6 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } -static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) -{ - /* mptcp hooks are only on the slow path */ - if (sk_is_mptcp((struct sock *)tp)) - return; - - tp->pred_flags = htonl((tp->tcp_header_len << 26) | - ntohl(TCP_FLAG_ACK) | - snd_wnd); -} - -static inline void tcp_fast_path_on(struct tcp_sock *tp) -{ - __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); -} - -static inline void tcp_fast_path_check(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && - tp->rcv_wnd && - atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && - !tp->urg_data) - tcp_fast_path_on(tp); -} - u32 tcp_delack_max(const struct sock *sk); /* Compute the actual rto_min value */ @@ -1807,6 +1780,33 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, return true; } +static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) +{ + /* mptcp hooks are only on the slow path */ + if (sk_is_mptcp((struct sock *)tp)) + return; + + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + ntohl(TCP_FLAG_ACK) | + snd_wnd); +} + +static inline void tcp_fast_path_on(struct tcp_sock *tp) +{ + __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); +} + +static inline void tcp_fast_path_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && + tp->rcv_wnd && + atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && + !tp->urg_data) + tcp_fast_path_on(tp); +} + bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time); -- cgit v1.2.3 From c3426ba2ed6942fe33c75bf17fc7513ba2c6ac64 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Thu, 11 Sep 2025 13:06:31 +0200 Subject: tcp: reorganize tcp_sock_write_txrx group for variables later Use the first 3-byte hole at the beginning of the tcp_sock_write_txrx group for 'noneagle'/'rate_app_limited' to fill in the existing hole in later patches. Therefore, the group size of tcp_sock_write_txrx is reduced from 92 + 4 to 91 + 4. In addition, the group size of tcp_sock_write_rx is changed to 96 to fit in the pahole outcome. Below are the trimmed pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ /* XXX 3 bytes hole, try to pack */ [...] struct tcp_options_received rx_opt; /* 2588 24 */ u8 nonagle:4; /* 2612: 0 1 */ u8 rate_app_limited:1; /* 2612: 4 1 */ /* XXX 3 bits hole, try to pack */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2613 0 */ /* XXX 3 bytes hole, try to pack */ __cacheline_group_begin__tcp_sock_write_rx[0] __attribute__((__aligned__(8))); /* 2616 0 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2712 0 */ [...] /* size: 3200, cachelines: 50, members: 161 */ } [AFTER THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* XXX 2 bytes hole, try to pack */ [...] struct tcp_options_received rx_opt; /* 2588 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */ /* XXX 4 bytes hole, try to pack */ __cacheline_group_begin__tcp_sock_write_rx[0] __attribute__((__aligned__(8))); /* 2616 0 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2712 0 */ [...] /* size: 3200, cachelines: 50, members: 161 */ } Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911110642.87529-4-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 4 ++-- net/ipv4/tcp.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 57e478bfaef2..d103cc0e7a35 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -285,6 +285,8 @@ struct tcp_sock { * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ + u8 nonagle : 4,/* Disable Nagle algorithm? */ + rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -303,8 +305,6 @@ struct tcp_sock { * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; - u8 nonagle : 4,/* Disable Nagle algorithm? */ - rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ __cacheline_group_end(tcp_sock_write_txrx); /* RX read-write hotpath cache lines */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7f9c671b1ee0..1f643faa8b93 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5145,7 +5145,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 91 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); @@ -5162,7 +5162,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96); } void __init tcp_init(void) -- cgit v1.2.3 From 30f5ca00624397d81c99515bdd43286ade93d7c8 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Thu, 11 Sep 2025 13:06:32 +0200 Subject: tcp: ecn functions in separated include file The following patches will modify ECN helpers and add AccECN herlpers, and this patch moves the existing ones into a separated include file. No functional changes. Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911110642.87529-5-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ecn.h | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 45 +------------------- net/ipv4/tcp_output.c | 56 +----------------------- 3 files changed, 118 insertions(+), 99 deletions(-) create mode 100644 include/net/tcp_ecn.h (limited to 'include') diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h new file mode 100644 index 000000000000..b3430557676b --- /dev/null +++ b/include/net/tcp_ecn.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _TCP_ECN_H +#define _TCP_ECN_H + +#include +#include + +#include +#include +#include +#include + +static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) +{ + if (tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_QUEUE_CWR; +} + +static inline void tcp_ecn_accept_cwr(struct sock *sk, + const struct sk_buff *skb) +{ + if (tcp_hdr(skb)->cwr) { + tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + + /* If the sender is telling us it has entered CWR, then its + * cwnd may be very low (even just 1 packet), so we should ACK + * immediately. + */ + if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + } +} + +static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) +{ + tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; +} + +static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); +} + +static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); +} + +static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) + return true; + return false; +} + +/* Packet ECN state for a SYN-ACK */ +static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; + if (tcp_ecn_disabled(tp)) + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; + else if (tcp_ca_needs_ecn(sk) || + tcp_bpf_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); +} + +/* Packet ECN state for a SYN. */ +static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; + + if (!use_ecn) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } + + tp->ecn_flags = 0; + + if (use_ecn) { + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); + + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } +} + +static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) +{ + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + /* tp->ecn_flags are cleared at a later point in time when + * SYN ACK is ultimatively being received. + */ + TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); +} + +static inline void +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) +{ + if (inet_rsk(req)->ecn_ok) + th->ece = 1; +} + +#endif /* _LINUX_TCP_ECN_H */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f1be65af1a77..b2793e749cfd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -339,31 +340,6 @@ static bool tcp_in_quickack_mode(struct sock *sk) (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } -static void tcp_ecn_queue_cwr(struct tcp_sock *tp) -{ - if (tcp_ecn_mode_rfc3168(tp)) - tp->ecn_flags |= TCP_ECN_QUEUE_CWR; -} - -static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) -{ - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - - /* If the sender is telling us it has entered CWR, then its - * cwnd may be very low (even just 1 packet), so we should ACK - * immediately. - */ - if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } -} - -static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) -{ - tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; -} - static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -399,25 +375,6 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) } } -static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) -{ - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} - -static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) -{ - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} - -static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) -{ - if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) - return true; - return false; -} - static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 54b8faa3ad95..be8ceefa5332 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include +#include #include #include @@ -319,61 +320,6 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } -/* Packet ECN state for a SYN-ACK */ -static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; - if (tcp_ecn_disabled(tp)) - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; - else if (tcp_ca_needs_ecn(sk) || - tcp_bpf_ca_needs_ecn(sk)) - INET_ECN_xmit(sk); -} - -/* Packet ECN state for a SYN. */ -static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; - - if (!use_ecn) { - const struct dst_entry *dst = __sk_dst_get(sk); - - if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) - use_ecn = true; - } - - tp->ecn_flags = 0; - - if (use_ecn) { - if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) - INET_ECN_xmit(sk); - - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); - } -} - -static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) -{ - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) - /* tp->ecn_flags are cleared at a later point in time when - * SYN ACK is ultimatively being received. - */ - TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); -} - -static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) -{ - if (inet_rsk(req)->ecn_ok) - th->ece = 1; -} - /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ -- cgit v1.2.3 From 07c446e35b89bc8774792f8036e595cffdf5b162 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:43 +0900 Subject: firewire: core: maintain phy packet receivers locally in cdev layer The list of receivers for phy packet is used only by cdev layer, while it is maintained as a member of fw_card structure. This commit maintains the list locally in cdev layer. Link: https://lore.kernel.org/r/20250915234747.915922-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 1 - drivers/firewire/core-cdev.c | 27 ++++++++++++++++++++------- include/linux/firewire.h | 2 -- 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index b5e01a711145..616abb836ef3 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -556,7 +556,6 @@ void fw_card_initialize(struct fw_card *card, kref_init(&card->kref); init_completion(&card->done); INIT_LIST_HEAD(&card->transaction_list); - INIT_LIST_HEAD(&card->phy_receiver_list); spin_lock_init(&card->lock); card->local_node = NULL; diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 1be8f5eb3e17..112b33099610 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -47,6 +47,9 @@ #define FW_CDEV_VERSION_AUTO_FLUSH_ISO_OVERFLOW 5 #define FW_CDEV_VERSION_EVENT_ASYNC_TSTAMP 6 +static DEFINE_SPINLOCK(phy_receiver_list_lock); +static LIST_HEAD(phy_receiver_list); + struct client { u32 version; struct fw_device *device; @@ -1669,15 +1672,16 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg) static int ioctl_receive_phy_packets(struct client *client, union ioctl_arg *arg) { struct fw_cdev_receive_phy_packets *a = &arg->receive_phy_packets; - struct fw_card *card = client->device->card; /* Access policy: Allow this ioctl only on local nodes' device files. */ if (!client->device->is_local) return -ENOSYS; - guard(spinlock_irq)(&card->lock); + // NOTE: This can be without irq when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. + scoped_guard(spinlock_irq, &phy_receiver_list_lock) + list_move_tail(&client->phy_receiver_link, &phy_receiver_list); - list_move_tail(&client->phy_receiver_link, &card->phy_receiver_list); client->phy_receiver_closure = a->closure; return 0; @@ -1687,10 +1691,17 @@ void fw_cdev_handle_phy_packet(struct fw_card *card, struct fw_packet *p) { struct client *client; - guard(spinlock_irqsave)(&card->lock); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. + guard(spinlock_irqsave)(&phy_receiver_list_lock); + + list_for_each_entry(client, &phy_receiver_list, phy_receiver_link) { + struct inbound_phy_packet_event *e; + + if (client->device->card != card) + continue; - list_for_each_entry(client, &card->phy_receiver_list, phy_receiver_link) { - struct inbound_phy_packet_event *e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC); + e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC); if (e == NULL) break; @@ -1857,7 +1868,9 @@ static int fw_device_op_release(struct inode *inode, struct file *file) struct client_resource *resource; unsigned long index; - scoped_guard(spinlock_irq, &client->device->card->lock) + // NOTE: This can be without irq when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. + scoped_guard(spinlock_irq, &phy_receiver_list_lock) list_del(&client->phy_receiver_link); scoped_guard(mutex, &client->device->client_list_mutex) diff --git a/include/linux/firewire.h b/include/linux/firewire.h index d38c6e538e5c..f3260aacf730 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -115,8 +115,6 @@ struct fw_card { int index; struct list_head link; - struct list_head phy_receiver_list; - struct delayed_work br_work; /* bus reset job */ bool br_short; -- cgit v1.2.3 From 7d138cb269dbd2fa9b0da89a9c10503d1cf269d5 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:44 +0900 Subject: firewire: core: use spin lock specific to topology map At present, the operation for read transaction to topology map register is not protected by any kind of lock primitives. This causes a potential problem to result in the mixed content of topology map. This commit adds and uses spin lock specific to topology map. Link: https://lore.kernel.org/r/20250915234747.915922-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-topology.c | 22 ++++++++++++++-------- drivers/firewire/core-transaction.c | 6 +++++- include/linux/firewire.h | 6 +++++- 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 17aaf14cab0b..c62cf93f3f65 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -435,20 +435,22 @@ static void update_tree(struct fw_card *card, struct fw_node *root) } } -static void update_topology_map(struct fw_card *card, - u32 *self_ids, int self_id_count) +static void update_topology_map(__be32 *buffer, size_t buffer_size, int root_node_id, + const u32 *self_ids, int self_id_count) { - int node_count = (card->root_node->node_id & 0x3f) + 1; - __be32 *map = card->topology_map; + __be32 *map = buffer; + int node_count = (root_node_id & 0x3f) + 1; + + memset(map, 0, buffer_size); *map++ = cpu_to_be32((self_id_count + 2) << 16); - *map++ = cpu_to_be32(be32_to_cpu(card->topology_map[1]) + 1); + *map++ = cpu_to_be32(be32_to_cpu(buffer[1]) + 1); *map++ = cpu_to_be32((node_count << 16) | self_id_count); while (self_id_count--) *map++ = cpu_to_be32p(self_ids++); - fw_compute_block_crc(card->topology_map); + fw_compute_block_crc(buffer); } void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, @@ -479,8 +481,6 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, local_node = build_tree(card, self_ids, self_id_count, generation); - update_topology_map(card, self_ids, self_id_count); - card->color++; if (local_node == NULL) { @@ -493,5 +493,11 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, update_tree(card, local_node); } } + + // Just used by transaction layer. + scoped_guard(spinlock, &card->topology_map.lock) { + update_topology_map(card->topology_map.buffer, sizeof(card->topology_map.buffer), + card->root_node->node_id, self_ids, self_id_count); + } } EXPORT_SYMBOL(fw_core_handle_bus_reset); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 623e1d9bd107..8edffafd21c1 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1196,7 +1196,11 @@ static void handle_topology_map(struct fw_card *card, struct fw_request *request } start = (offset - topology_map_region.start) / 4; - memcpy(payload, &card->topology_map[start], length); + + // NOTE: This can be without irqsave when we can guarantee that fw_send_request() for local + // destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->topology_map.lock) + memcpy(payload, &card->topology_map.buffer[start], length); fw_send_response(card, request, RCODE_COMPLETE); } diff --git a/include/linux/firewire.h b/include/linux/firewire.h index f3260aacf730..aeb71c39e57e 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -129,7 +129,11 @@ struct fw_card { bool broadcast_channel_allocated; u32 broadcast_channel; - __be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; + + struct { + __be32 buffer[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; + spinlock_t lock; + } topology_map; __be32 maint_utility_register; -- cgit v1.2.3 From 420bd7068cbfaea0a857472dd631dc48311e2a8f Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:45 +0900 Subject: firewire: core: use spin lock specific to transaction The list of instance for asynchronous transaction to wait for response subaction is maintained as a member of fw_card structure. The card-wide spinlock is used at present for any operation over the list, however it is not necessarily suited for the purpose. This commit adds and uses the spin lock specific to maintain the list. Link: https://lore.kernel.org/r/20250915234747.915922-5-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 12 +++++--- drivers/firewire/core-transaction.c | 59 ++++++++++++++++++++++--------------- include/linux/firewire.h | 10 +++++-- 3 files changed, 51 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 616abb836ef3..39f95007305f 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -544,8 +544,12 @@ void fw_card_initialize(struct fw_card *card, card->index = atomic_inc_return(&index); card->driver = driver; card->device = device; - card->current_tlabel = 0; - card->tlabel_mask = 0; + + card->transactions.current_tlabel = 0; + card->transactions.tlabel_mask = 0; + INIT_LIST_HEAD(&card->transactions.list); + spin_lock_init(&card->transactions.lock); + card->split_timeout_hi = DEFAULT_SPLIT_TIMEOUT / 8000; card->split_timeout_lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; card->split_timeout_cycles = DEFAULT_SPLIT_TIMEOUT; @@ -555,7 +559,7 @@ void fw_card_initialize(struct fw_card *card, kref_init(&card->kref); init_completion(&card->done); - INIT_LIST_HEAD(&card->transaction_list); + spin_lock_init(&card->lock); card->local_node = NULL; @@ -772,7 +776,7 @@ void fw_core_remove_card(struct fw_card *card) destroy_workqueue(card->isoc_wq); destroy_workqueue(card->async_wq); - WARN_ON(!list_empty(&card->transaction_list)); + WARN_ON(!list_empty(&card->transactions.list)); } EXPORT_SYMBOL(fw_core_remove_card); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 8edffafd21c1..5366d8a781ac 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -49,12 +49,14 @@ static int close_transaction(struct fw_transaction *transaction, struct fw_card { struct fw_transaction *t = NULL, *iter; - scoped_guard(spinlock_irqsave, &card->lock) { - list_for_each_entry(iter, &card->transaction_list, link) { + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) { + list_for_each_entry(iter, &card->transactions.list, link) { if (iter == transaction) { if (try_cancel_split_timeout(iter)) { list_del_init(&iter->link); - card->tlabel_mask &= ~(1ULL << iter->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << iter->tlabel); t = iter; } break; @@ -117,11 +119,11 @@ static void split_transaction_timeout_callback(struct timer_list *timer) struct fw_transaction *t = timer_container_of(t, timer, split_timeout_timer); struct fw_card *card = t->card; - scoped_guard(spinlock_irqsave, &card->lock) { + scoped_guard(spinlock_irqsave, &card->transactions.lock) { if (list_empty(&t->link)) return; list_del(&t->link); - card->tlabel_mask &= ~(1ULL << t->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << t->tlabel); } if (!t->with_tstamp) { @@ -259,18 +261,21 @@ static void fw_fill_request(struct fw_packet *packet, int tcode, int tlabel, } static int allocate_tlabel(struct fw_card *card) +__must_hold(&card->transactions_lock) { int tlabel; - tlabel = card->current_tlabel; - while (card->tlabel_mask & (1ULL << tlabel)) { + lockdep_assert_held(&card->transactions.lock); + + tlabel = card->transactions.current_tlabel; + while (card->transactions.tlabel_mask & (1ULL << tlabel)) { tlabel = (tlabel + 1) & 0x3f; - if (tlabel == card->current_tlabel) + if (tlabel == card->transactions.current_tlabel) return -EBUSY; } - card->current_tlabel = (tlabel + 1) & 0x3f; - card->tlabel_mask |= 1ULL << tlabel; + card->transactions.current_tlabel = (tlabel + 1) & 0x3f; + card->transactions.tlabel_mask |= 1ULL << tlabel; return tlabel; } @@ -331,7 +336,6 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode void *payload, size_t length, union fw_transaction_callback callback, bool with_tstamp, void *callback_data) { - unsigned long flags; int tlabel; /* @@ -339,11 +343,11 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode * the list while holding the card spinlock. */ - spin_lock_irqsave(&card->lock, flags); - - tlabel = allocate_tlabel(card); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) + tlabel = allocate_tlabel(card); if (tlabel < 0) { - spin_unlock_irqrestore(&card->lock, flags); if (!with_tstamp) { callback.without_tstamp(card, RCODE_SEND_ERROR, NULL, 0, callback_data); } else { @@ -368,15 +372,22 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode t->callback = callback; t->with_tstamp = with_tstamp; t->callback_data = callback_data; - - fw_fill_request(&t->packet, tcode, t->tlabel, destination_id, card->node_id, generation, - speed, offset, payload, length); t->packet.callback = transmit_complete_callback; - list_add_tail(&t->link, &card->transaction_list); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->lock) { + // The node_id field of fw_card can be updated when handling SelfIDComplete. + fw_fill_request(&t->packet, tcode, t->tlabel, destination_id, card->node_id, + generation, speed, offset, payload, length); + } - spin_unlock_irqrestore(&card->lock, flags); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) + list_add_tail(&t->link, &card->transactions.list); + // Safe with no lock, since the index field of fw_card is immutable once assigned. trace_async_request_outbound_initiate((uintptr_t)t, card->index, generation, speed, t->packet.header, payload, tcode_is_read_request(tcode) ? 0 : length / 4); @@ -1111,12 +1122,14 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *p) break; } - scoped_guard(spinlock_irqsave, &card->lock) { - list_for_each_entry(iter, &card->transaction_list, link) { + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) { + list_for_each_entry(iter, &card->transactions.list, link) { if (iter->node_id == source && iter->tlabel == tlabel) { if (try_cancel_split_timeout(iter)) { list_del_init(&iter->link); - card->tlabel_mask &= ~(1ULL << iter->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << iter->tlabel); t = iter; } break; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index aeb71c39e57e..8d6801cf2fca 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -88,11 +88,15 @@ struct fw_card { int node_id; int generation; - int current_tlabel; - u64 tlabel_mask; - struct list_head transaction_list; u64 reset_jiffies; + struct { + int current_tlabel; + u64 tlabel_mask; + struct list_head list; + spinlock_t lock; + } transactions; + u32 split_timeout_hi; u32 split_timeout_lo; unsigned int split_timeout_cycles; -- cgit v1.2.3 From b5725cfa4120a4d234ab112aad151d731531d093 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:46 +0900 Subject: firewire: core: use spin lock specific to timer for split transaction At present the parameters to compute timeout time for split transaction is protected by card-wide spin lock, while it is not necessarily convenient in a point to narrower critical section. This commit adds and uses another spin lock specific for the purpose. Link: https://lore.kernel.org/r/20250915234747.915922-6-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 10 +++--- drivers/firewire/core-transaction.c | 63 ++++++++++++++++++++++++------------- include/linux/firewire.h | 15 +++++---- 3 files changed, 57 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 39f95007305f..96495392a1f6 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -550,10 +550,12 @@ void fw_card_initialize(struct fw_card *card, INIT_LIST_HEAD(&card->transactions.list); spin_lock_init(&card->transactions.lock); - card->split_timeout_hi = DEFAULT_SPLIT_TIMEOUT / 8000; - card->split_timeout_lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; - card->split_timeout_cycles = DEFAULT_SPLIT_TIMEOUT; - card->split_timeout_jiffies = isoc_cycles_to_jiffies(DEFAULT_SPLIT_TIMEOUT); + card->split_timeout.hi = DEFAULT_SPLIT_TIMEOUT / 8000; + card->split_timeout.lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; + card->split_timeout.cycles = DEFAULT_SPLIT_TIMEOUT; + card->split_timeout.jiffies = isoc_cycles_to_jiffies(DEFAULT_SPLIT_TIMEOUT); + spin_lock_init(&card->split_timeout.lock); + card->color = 0; card->broadcast_channel = BROADCAST_CHANNEL_INITIAL; diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 5366d8a781ac..dd3656a0c1ff 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -137,14 +137,18 @@ static void split_transaction_timeout_callback(struct timer_list *timer) static void start_split_transaction_timeout(struct fw_transaction *t, struct fw_card *card) { - guard(spinlock_irqsave)(&card->lock); + unsigned long delta; if (list_empty(&t->link) || WARN_ON(t->is_split_transaction)) return; t->is_split_transaction = true; - mod_timer(&t->split_timeout_timer, - jiffies + card->split_timeout_jiffies); + + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) + delta = card->split_timeout.jiffies; + mod_timer(&t->split_timeout_timer, jiffies + delta); } static u32 compute_split_timeout_timestamp(struct fw_card *card, u32 request_timestamp); @@ -164,8 +168,12 @@ static void transmit_complete_callback(struct fw_packet *packet, break; case ACK_PENDING: { - t->split_timeout_cycle = - compute_split_timeout_timestamp(card, packet->timestamp) & 0xffff; + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + t->split_timeout_cycle = + compute_split_timeout_timestamp(card, packet->timestamp) & 0xffff; + } start_split_transaction_timeout(t, card); break; } @@ -790,11 +798,14 @@ EXPORT_SYMBOL(fw_fill_response); static u32 compute_split_timeout_timestamp(struct fw_card *card, u32 request_timestamp) +__must_hold(&card->split_timeout.lock) { unsigned int cycles; u32 timestamp; - cycles = card->split_timeout_cycles; + lockdep_assert_held(&card->split_timeout.lock); + + cycles = card->split_timeout.cycles; cycles += request_timestamp & 0x1fff; timestamp = request_timestamp & ~0x1fff; @@ -845,9 +856,12 @@ static struct fw_request *allocate_request(struct fw_card *card, return NULL; kref_init(&request->kref); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) + request->response.timestamp = compute_split_timeout_timestamp(card, p->timestamp); + request->response.speed = p->speed; - request->response.timestamp = - compute_split_timeout_timestamp(card, p->timestamp); request->response.generation = p->generation; request->response.ack = 0; request->response.callback = free_response_callback; @@ -1228,16 +1242,17 @@ static const struct fw_address_region registers_region = .end = CSR_REGISTER_BASE | CSR_CONFIG_ROM, }; static void update_split_timeout(struct fw_card *card) +__must_hold(&card->split_timeout.lock) { unsigned int cycles; - cycles = card->split_timeout_hi * 8000 + (card->split_timeout_lo >> 19); + cycles = card->split_timeout.hi * 8000 + (card->split_timeout.lo >> 19); /* minimum per IEEE 1394, maximum which doesn't overflow OHCI */ cycles = clamp(cycles, 800u, 3u * 8000u); - card->split_timeout_cycles = cycles; - card->split_timeout_jiffies = isoc_cycles_to_jiffies(cycles); + card->split_timeout.cycles = cycles; + card->split_timeout.jiffies = isoc_cycles_to_jiffies(cycles); } static void handle_registers(struct fw_card *card, struct fw_request *request, @@ -1287,12 +1302,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, case CSR_SPLIT_TIMEOUT_HI: if (tcode == TCODE_READ_QUADLET_REQUEST) { - *data = cpu_to_be32(card->split_timeout_hi); + *data = cpu_to_be32(card->split_timeout.hi); } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { - guard(spinlock_irqsave)(&card->lock); - - card->split_timeout_hi = be32_to_cpu(*data) & 7; - update_split_timeout(card); + // NOTE: This can be without irqsave when we can guarantee that + // __fw_send_request() for local destination never runs in any type of IRQ + // context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + card->split_timeout.hi = be32_to_cpu(*data) & 7; + update_split_timeout(card); + } } else { rcode = RCODE_TYPE_ERROR; } @@ -1300,12 +1318,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, case CSR_SPLIT_TIMEOUT_LO: if (tcode == TCODE_READ_QUADLET_REQUEST) { - *data = cpu_to_be32(card->split_timeout_lo); + *data = cpu_to_be32(card->split_timeout.lo); } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { - guard(spinlock_irqsave)(&card->lock); - - card->split_timeout_lo = be32_to_cpu(*data) & 0xfff80000; - update_split_timeout(card); + // NOTE: This can be without irqsave when we can guarantee that + // __fw_send_request() for local destination never runs in any type of IRQ + // context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + card->split_timeout.lo = be32_to_cpu(*data) & 0xfff80000; + update_split_timeout(card); + } } else { rcode = RCODE_TYPE_ERROR; } diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 8d6801cf2fca..6d208769d456 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -97,18 +97,21 @@ struct fw_card { spinlock_t lock; } transactions; - u32 split_timeout_hi; - u32 split_timeout_lo; - unsigned int split_timeout_cycles; - unsigned int split_timeout_jiffies; + struct { + u32 hi; + u32 lo; + unsigned int cycles; + unsigned int jiffies; + spinlock_t lock; + } split_timeout; unsigned long long guid; unsigned max_receive; int link_speed; int config_rom_generation; - spinlock_t lock; /* Take this lock when handling the lists in - * this struct. */ + spinlock_t lock; + struct fw_node *local_node; struct fw_node *root_node; struct fw_node *irm_node; -- cgit v1.2.3 From 8c01cc2382bc351672c8eb946c16f567cfffef60 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 10 Sep 2025 21:41:22 +0100 Subject: dt-bindings: net: pcs: renesas,rzn1-miic: Add RZ/T2H and RZ/N2H support Add device tree binding support for RZ/T2H and RZ/N2H SoCs to the existing RZ/N1 MIIC converter binding. These SoCs share similar MIIC functionality but have architectural differences that require schema updates. Add new compatible strings "renesas,r9a09g077-miic" for RZ/T2H and "renesas,r9a09g087-miic" for RZ/N2H, with the latter falling back to the RZ/T2H variant. The new SoCs require reset support with two reset lines for converter register reset and converter reset, which are not present on RZ/N1. Update port configurations to accommodate the different architectures. RZ/N1 supports 5 ports numbered 1-5 with complex input mappings covering indices 0-13, while RZ/T2H and RZ/N2H support 4 ports numbered 0-3 with simplified input mappings covering indices 0-8. Extend the switch port configuration property to support value 0 for the new SoCs. Add a new dt-bindings header file with media interface connection matrix constants that map GMAC, ESC, and ETHSW ports to numeric identifiers for use with RZ/T2H and RZ/N2H device trees. Update DT schema validation to ensure proper port numbering and input mappings per SoC variant. Signed-off-by: Lad Prabhakar Tested-by: Wolfram Sang Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250910204132.319975-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Jakub Kicinski --- .../bindings/net/pcs/renesas,rzn1-miic.yaml | 177 +++++++++++++++------ .../dt-bindings/net/renesas,r9a09g077-pcs-miic.h | 36 +++++ 2 files changed, 165 insertions(+), 48 deletions(-) create mode 100644 include/dt-bindings/net/renesas,r9a09g077-pcs-miic.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml b/Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml index 2d33bbab7163..3adbcf56d2be 100644 --- a/Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml +++ b/Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml @@ -4,14 +4,15 @@ $id: http://devicetree.org/schemas/net/pcs/renesas,rzn1-miic.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Renesas RZ/N1 MII converter +title: Renesas RZ/N1, RZ/N2H and RZ/T2H MII converter maintainers: - Clément Léger + - Lad Prabhakar description: | - This MII converter is present on the Renesas RZ/N1 SoC family. It is - responsible to do MII passthrough or convert it to RMII/RGMII. + This MII converter is present on the Renesas RZ/N1, RZ/N2H and RZ/T2H SoC + families. It is responsible to do MII passthrough or convert it to RMII/RGMII. properties: '#address-cells': @@ -21,10 +22,16 @@ properties: const: 0 compatible: - items: - - enum: - - renesas,r9a06g032-miic - - const: renesas,rzn1-miic + oneOf: + - items: + - enum: + - renesas,r9a06g032-miic + - const: renesas,rzn1-miic + - items: + - const: renesas,r9a09g077-miic # RZ/T2H + - items: + - const: renesas,r9a09g087-miic # RZ/N2H + - const: renesas,r9a09g077-miic reg: maxItems: 1 @@ -43,11 +50,22 @@ properties: - const: rmii_ref - const: hclk + resets: + items: + - description: Converter register reset + - description: Converter reset + + reset-names: + items: + - const: rst + - const: crst + renesas,miic-switch-portin: description: MII Switch PORTIN configuration. This value should use one of - the values defined in dt-bindings/net/pcs-rzn1-miic.h. + the values defined in dt-bindings/net/pcs-rzn1-miic.h for RZ/N1 SoC and + include/dt-bindings/net/renesas,r9a09g077-pcs-miic.h for RZ/N2H, RZ/T2H SoCs. $ref: /schemas/types.yaml#/definitions/uint32 - enum: [1, 2] + enum: [0, 1, 2] power-domains: maxItems: 1 @@ -60,11 +78,12 @@ patternProperties: properties: reg: description: MII Converter port number. - enum: [1, 2, 3, 4, 5] + enum: [0, 1, 2, 3, 4, 5] renesas,miic-input: description: Converter input port configuration. This value should use - one of the values defined in dt-bindings/net/pcs-rzn1-miic.h. + one of the values defined in dt-bindings/net/pcs-rzn1-miic.h for RZ/N1 SoC + and include/dt-bindings/net/renesas,r9a09g077-pcs-miic.h for RZ/N2H, RZ/T2H SoCs. $ref: /schemas/types.yaml#/definitions/uint32 required: @@ -73,47 +92,109 @@ patternProperties: additionalProperties: false - allOf: - - if: - properties: - reg: - const: 1 - then: - properties: - renesas,miic-input: - const: 0 - - if: +allOf: + - if: + properties: + compatible: + contains: + const: renesas,rzn1-miic + then: + properties: + renesas,miic-switch-portin: + enum: [1, 2] + resets: false + reset-names: false + patternProperties: + "^mii-conv@[0-5]$": properties: reg: - const: 2 - then: - properties: - renesas,miic-input: - enum: [1, 11] - - if: - properties: - reg: - const: 3 - then: - properties: - renesas,miic-input: - enum: [7, 10] - - if: + enum: [1, 2, 3, 4, 5] + allOf: + - if: + properties: + reg: + const: 1 + then: + properties: + renesas,miic-input: + const: 0 + - if: + properties: + reg: + const: 2 + then: + properties: + renesas,miic-input: + enum: [1, 11] + - if: + properties: + reg: + const: 3 + then: + properties: + renesas,miic-input: + enum: [7, 10] + - if: + properties: + reg: + const: 4 + then: + properties: + renesas,miic-input: + enum: [4, 6, 9, 13] + - if: + properties: + reg: + const: 5 + then: + properties: + renesas,miic-input: + enum: [3, 5, 8, 12] + else: + properties: + renesas,miic-switch-portin: + const: 0 + required: + - resets + - reset-names + patternProperties: + "^mii-conv@[0-5]$": properties: reg: - const: 4 - then: - properties: - renesas,miic-input: - enum: [4, 6, 9, 13] - - if: - properties: - reg: - const: 5 - then: - properties: - renesas,miic-input: - enum: [3, 5, 8, 12] + enum: [0, 1, 2, 3] + allOf: + - if: + properties: + reg: + const: 0 + then: + properties: + renesas,miic-input: + enum: [0, 3, 6] + - if: + properties: + reg: + const: 1 + then: + properties: + renesas,miic-input: + enum: [1, 4, 7] + - if: + properties: + reg: + const: 2 + then: + properties: + renesas,miic-input: + enum: [2, 5, 8] + - if: + properties: + reg: + const: 3 + then: + properties: + renesas,miic-input: + const: 1 required: - '#address-cells' diff --git a/include/dt-bindings/net/renesas,r9a09g077-pcs-miic.h b/include/dt-bindings/net/renesas,r9a09g077-pcs-miic.h new file mode 100644 index 000000000000..43a2b5743a63 --- /dev/null +++ b/include/dt-bindings/net/renesas,r9a09g077-pcs-miic.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (C) 2025 Renesas Electronics Corporation. + */ + +#ifndef _DT_BINDINGS_RENASAS_R9A09G077_PCS_MIIC_H +#define _DT_BINDINGS_RENASAS_R9A09G077_PCS_MIIC_H + +/* + * Media Interface Connection Matrix + * =========================================================== + * + * Selects the function of the Media interface of the MAC to be used + * + * SW_MODE[2:0] | Port 0 | Port 1 | Port 2 | Port 3 + * -------------|-------------|-------------|-------------|------------- + * 000b | ETHSW Port0 | ETHSW Port1 | ETHSW Port2 | GMAC1 + * 001b | ESC Port0 | ESC Port1 | GMAC2 | GMAC1 + * 010b | ESC Port0 | ESC Port1 | ETHSW Port2 | GMAC1 + * 011b | ESC Port0 | ESC Port1 | ESC Port2 | GMAC1 + * 100b | ETHSW Port0 | ESC Port1 | ESC Port2 | GMAC1 + * 101b | ETHSW Port0 | ESC Port1 | ETHSW Port2 | GMAC1 + * 110b | ETHSW Port0 | ETHSW Port1 | GMAC2 | GMAC1 + * 111b | GMAC0 | GMAC1 | GMAC2 | - + */ +#define ETHSS_GMAC0_PORT 0 +#define ETHSS_GMAC1_PORT 1 +#define ETHSS_GMAC2_PORT 2 +#define ETHSS_ESC_PORT0 3 +#define ETHSS_ESC_PORT1 4 +#define ETHSS_ESC_PORT2 5 +#define ETHSS_ETHSW_PORT0 6 +#define ETHSS_ETHSW_PORT1 7 +#define ETHSS_ETHSW_PORT2 8 + +#endif -- cgit v1.2.3 From f9fadf23c7f1a0df72ef50a873e1bd3bd4631ec1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Feb 2024 21:25:18 -0500 Subject: security_dentry_init_security(): constify qstr argument Nothing outside of fs/dcache.c has any business modifying dentry names; passing &dentry->d_name as an argument should have that argument declared as a const pointer. Acked-by: Casey Schaufler # smack part Acked-by: Paul Moore Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 4 ++-- security/security.c | 2 +- security/selinux/hooks.c | 2 +- security/smack/smack_lsm.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index fd11fffdd3c3..aa4d6ec9c98b 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -85,7 +85,7 @@ LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, struct lsm_context *cp) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, - struct qstr *name, const struct cred *old, struct cred *new) + const struct qstr *name, const struct cred *old, struct cred *new) #ifdef CONFIG_SECURITY_PATH LSM_HOOK(int, 0, path_unlink, const struct path *dir, struct dentry *dentry) diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b9717..3f694d3ebd70 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -391,7 +391,7 @@ int security_dentry_init_security(struct dentry *dentry, int mode, const char **xattr_name, struct lsm_context *lsmcxt); int security_dentry_create_files_as(struct dentry *dentry, int mode, - struct qstr *name, + const struct qstr *name, const struct cred *old, struct cred *new); int security_path_notify(const struct path *path, u64 mask, @@ -871,7 +871,7 @@ static inline int security_dentry_init_security(struct dentry *dentry, } static inline int security_dentry_create_files_as(struct dentry *dentry, - int mode, struct qstr *name, + int mode, const struct qstr *name, const struct cred *old, struct cred *new) { diff --git a/security/security.c b/security/security.c index ad163f06bf7a..db2d75be87cc 100644 --- a/security/security.c +++ b/security/security.c @@ -1775,7 +1775,7 @@ EXPORT_SYMBOL(security_dentry_init_security); * Return: Returns 0 on success, error on failure. */ int security_dentry_create_files_as(struct dentry *dentry, int mode, - struct qstr *name, + const struct qstr *name, const struct cred *old, struct cred *new) { return call_int_hook(dentry_create_files_as, dentry, mode, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..58ce49954206 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2901,7 +2901,7 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode, } static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, - struct qstr *name, + const struct qstr *name, const struct cred *old, struct cred *new) { diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fc340a6f0dde..5caa372ffbf3 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4908,7 +4908,7 @@ static int smack_inode_copy_up_xattr(struct dentry *src, const char *name) } static int smack_dentry_create_files_as(struct dentry *dentry, int mode, - struct qstr *name, + const struct qstr *name, const struct cred *old, struct cred *new) { -- cgit v1.2.3 From ca97d6c60b1d1dff519a7e3dd86708304e657365 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 11 Jul 2025 05:45:01 -0400 Subject: generic_ci_validate_strict_name(): constify name argument Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/fs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..6dcfc1c399ca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3719,7 +3719,8 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, * happens when a directory is casefolded and the filesystem is strict * about its encoding. */ -static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) +static inline bool generic_ci_validate_strict_name(struct inode *dir, + const struct qstr *name) { if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb)) return true; @@ -3734,7 +3735,8 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qst return !utf8_validate(dir->i_sb->s_encoding, name); } #else -static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) +static inline bool generic_ci_validate_strict_name(struct inode *dir, + const struct qstr *name) { return true; } -- cgit v1.2.3 From 180a9cc3fd6a020746fbd7f97b9b62295a325fd2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 9 Feb 2024 14:57:43 -0500 Subject: make it easier to catch those who try to modify ->d_name Turn d_name into an anon union of const struct qstr d_name with struct qstr __d_name. Very few places need to modify it (all in fs/dcache.c); those are switched to use of ->__d_name. Note that ->d_name can actually change under you unless you have the right locking environment; this const just prohibits accidentally doing stores without being easily spotted. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 26 +++++++++++++------------- include/linux/dcache.h | 5 ++++- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/dcache.c b/fs/dcache.c index 60046ae23d51..b4cd5e1321b3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1717,13 +1717,13 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_shortname.string; } - dentry->d_name.len = name->len; - dentry->d_name.hash = name->hash; + dentry->__d_name.len = name->len; + dentry->__d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ - smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ + smp_store_release(&dentry->__d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; lockref_init(&dentry->d_lockref); @@ -2743,15 +2743,15 @@ static void swap_names(struct dentry *dentry, struct dentry *target) /* * Both external: swap the pointers */ - swap(target->d_name.name, dentry->d_name.name); + swap(target->__d_name.name, dentry->__d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ - dentry->d_name.name = target->d_name.name; + dentry->__d_name.name = target->__d_name.name; target->d_shortname = dentry->d_shortname; - target->d_name.name = target->d_shortname.string; + target->__d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { @@ -2759,9 +2759,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target) * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ - target->d_name.name = dentry->d_name.name; + target->__d_name.name = dentry->__d_name.name; dentry->d_shortname = target->d_shortname; - dentry->d_name.name = dentry->d_shortname.string; + dentry->__d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. @@ -2771,7 +2771,7 @@ static void swap_names(struct dentry *dentry, struct dentry *target) target->d_shortname.words[i]); } } - swap(dentry->d_name.hash_len, target->d_name.hash_len); + swap(dentry->__d_name.hash_len, target->__d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) @@ -2781,11 +2781,11 @@ static void copy_name(struct dentry *dentry, struct dentry *target) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->count); - dentry->d_name = target->d_name; + dentry->__d_name = target->__d_name; } else { dentry->d_shortname = target->d_shortname; - dentry->d_name.name = dentry->d_shortname.string; - dentry->d_name.hash_len = target->d_name.hash_len; + dentry->__d_name.name = dentry->d_shortname.string; + dentry->__d_name.hash_len = target->__d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->count))) kfree_rcu(old_name, head); @@ -3133,7 +3133,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu", + dentry->__d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index cc3e1c1a3454..c83e02b94389 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -95,7 +95,10 @@ struct dentry { seqcount_spinlock_t d_seq; /* per dentry seqlock */ struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ - struct qstr d_name; + union { + struct qstr __d_name; /* for use ONLY in fs/dcache.c */ + const struct qstr d_name; + }; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ union shortname_store d_shortname; -- cgit v1.2.3 From 2293c57484ae64c9a3c847c8807db8c26a3a4d41 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:21 +0200 Subject: mptcp: pm: nl: announce deny-join-id0 flag During the connection establishment, a peer can tell the other one that it cannot establish new subflows to the initial IP address and port by setting the 'C' flag [1]. Doing so makes sense when the sender is behind a strict NAT, operating behind a legacy Layer 4 load balancer, or using anycast IP address for example. When this 'C' flag is set, the path-managers must then not try to establish new subflows to the other peer's initial IP address and port. The in-kernel PM has access to this info, but the userspace PM didn't. The RFC8684 [1] is strict about that: (...) therefore the receiver MUST NOT try to open any additional subflows toward this address and port. So it is important to tell the userspace about that as it is responsible for the respect of this flag. When a new connection is created and established, the Netlink events now contain the existing but not currently used 'flags' attribute. When MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 is set, it means no other subflows to the initial IP address and port -- info that are also part of the event -- can be established. Link: https://datatracker.ietf.org/doc/html/rfc8684#section-3.1-20.6 [1] Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Reported-by: Marek Majkowski Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/532 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-2-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 4 ++-- include/uapi/linux/mptcp.h | 2 ++ include/uapi/linux/mptcp_pm.h | 4 ++-- net/mptcp/pm_netlink.c | 7 +++++++ 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index d15335684ec3..d1b4829b580a 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 67d015df8893..5fd5b4cf75ca 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -31,6 +31,8 @@ #define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) +#define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) + #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 6ac84b2f636c..7359d34da446 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 50aaf259959a..ce7d42d3bd00 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -408,6 +408,7 @@ static int mptcp_event_created(struct sk_buff *skb, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); + u16 flags = 0; if (err) return err; @@ -415,6 +416,12 @@ static int mptcp_event_created(struct sk_buff *skb, if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; + if (READ_ONCE(msk->pm.remote_deny_join_id0)) + flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; + + if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } -- cgit v1.2.3 From dae575e669811b201114702d96f6854d5c8324b5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Sep 2025 23:16:35 -0400 Subject: backing_file_user_path(): constify struct path * Callers never use the resulting pointer to modify the struct path it points to (nor should they). Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/file_table.c | 2 +- include/linux/fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/file_table.c b/fs/file_table.c index 81c72576e548..85b53e39138d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -54,7 +54,7 @@ struct backing_file { #define backing_file(f) container_of(f, struct backing_file, file) -struct path *backing_file_user_path(const struct file *f) +const struct path *backing_file_user_path(const struct file *f) { return &backing_file(f)->user_path; } diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..3bcc878817be 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2879,7 +2879,7 @@ struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); -struct path *backing_file_user_path(const struct file *f); +const struct path *backing_file_user_path(const struct file *f); /* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file -- cgit v1.2.3 From 63dbfb077cdad21b356e17d4ce76650e67b83159 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 21:58:05 -0400 Subject: done_path_create(): constify path argument Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namei.c | 2 +- include/linux/namei.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/namei.c b/fs/namei.c index 869976213b0c..3eb0408e3400 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4170,7 +4170,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, } EXPORT_SYMBOL(kern_path_create); -void done_path_create(struct path *path, struct dentry *dentry) +void done_path_create(const struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); diff --git a/include/linux/namei.h b/include/linux/namei.h index 5d085428e471..75c0b665fbd4 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -60,7 +60,7 @@ extern int kern_path(const char *, unsigned, struct path *); extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); -extern void done_path_create(struct path *, struct dentry *); +extern void done_path_create(const struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); extern struct dentry *kern_path_locked_negative(const char *, struct path *); extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); -- cgit v1.2.3 From 2930afe2c9cb9aec329269e40c851bf56cdcc09c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 21:32:41 -0400 Subject: export_operations->open(): constify path argument for the method and its sole instance... Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/pidfs.c | 2 +- include/linux/exportfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/pidfs.c b/fs/pidfs.c index 108e7527f837..5af4fee288ea 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -847,7 +847,7 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static struct file *pidfs_export_open(struct path *path, unsigned int oflags) +static struct file *pidfs_export_open(const struct path *path, unsigned int oflags) { /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cfb0dd1ea49c..f43c83e0b8c5 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -270,7 +270,7 @@ struct export_operations { int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); - struct file * (*open)(struct path *path, unsigned int oflags); + struct file * (*open)(const struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ -- cgit v1.2.3 From 1f6df5847454dee8608f78ee0df7352472cb2447 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 18:45:02 -0400 Subject: drop_collected_paths(): constify arguments ... and use that to constify the pointers in callers Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 2 +- kernel/audit_tree.c | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index 62cffd3b6de2..346e577073bb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2334,9 +2334,9 @@ struct path *collect_paths(const struct path *path, return res; } -void drop_collected_paths(struct path *paths, struct path *prealloc) +void drop_collected_paths(const struct path *paths, struct path *prealloc) { - for (struct path *p = paths; p->mnt; p++) + for (const struct path *p = paths; p->mnt; p++) path_put(p); if (paths != prealloc) kfree(paths); diff --git a/include/linux/mount.h b/include/linux/mount.h index 5f9c053b0897..c09032463b36 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -105,7 +105,7 @@ extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct path *collect_paths(const struct path *, struct path *, unsigned); -extern void drop_collected_paths(struct path *, struct path *); +extern void drop_collected_paths(const struct path *, struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b0eae2a3c895..32007edf0e55 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -678,7 +678,7 @@ void audit_trim_trees(void) struct audit_tree *tree; struct path path; struct audit_node *node; - struct path *paths; + const struct path *paths; struct path array[16]; int err; @@ -701,7 +701,7 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; - for (struct path *p = paths; p->dentry; p++) { + for (const struct path *p = paths; p->dentry; p++) { struct inode *inode = p->dentry->d_inode; if (inode_to_key(inode) == chunk->key) { node->index &= ~(1U<<31); @@ -740,9 +740,9 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mounts(struct path *paths, struct audit_tree *tree) +static int tag_mounts(const struct path *paths, struct audit_tree *tree) { - for (struct path *p = paths; p->dentry; p++) { + for (const struct path *p = paths; p->dentry; p++) { int err = tag_chunk(p->dentry->d_inode, tree); if (err) return err; @@ -805,7 +805,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct audit_tree *seed = rule->tree, *tree; struct path path; struct path array[16]; - struct path *paths; + const struct path *paths; int err; rule->tree = NULL; @@ -877,7 +877,7 @@ int audit_tag_tree(char *old, char *new) int failed = 0; struct path path1, path2; struct path array[16]; - struct path *paths; + const struct path *paths; int err; err = kern_path(new, 0, &path2); -- cgit v1.2.3 From b42ffcd5069d5cfb777b8982a1c55c7e2f1d3998 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 19:34:37 -0400 Subject: collect_paths(): constify the return value callers have no business modifying the paths they get Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index 346e577073bb..cf8dbf0741f0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2300,7 +2300,7 @@ static inline bool extend_array(struct path **res, struct path **to_free, return p; } -struct path *collect_paths(const struct path *path, +const struct path *collect_paths(const struct path *path, struct path *prealloc, unsigned count) { struct mount *root = real_mount(path->mnt); @@ -2334,7 +2334,7 @@ struct path *collect_paths(const struct path *path, return res; } -void drop_collected_paths(const struct path *paths, struct path *prealloc) +void drop_collected_paths(const struct path *paths, const struct path *prealloc) { for (const struct path *p = paths; p->mnt; p++) path_put(p); diff --git a/include/linux/mount.h b/include/linux/mount.h index c09032463b36..18e4b97f8a98 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -104,8 +104,8 @@ extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); -extern struct path *collect_paths(const struct path *, struct path *, unsigned); -extern void drop_collected_paths(const struct path *, struct path *); +extern const struct path *collect_paths(const struct path *, struct path *, unsigned); +extern void drop_collected_paths(const struct path *, const struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); -- cgit v1.2.3 From 880fcc329e2473ba02ffbc446fcd403972ab1fca Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 9 Sep 2025 00:03:24 -0700 Subject: drivers/perf: riscv: Export PMU event info function The event mapping function can be used in event info function to find out the corresponding SBI PMU event encoding during the get_event_info function as well. Refactor and export it so that it can be invoked from kvm and internal driver. Signed-off-by: Atish Patra Reviewed-by: Anup Patel Acked-by: Paul Walmsley Link: https://lore.kernel.org/r/20250909-pmu_event_info-v6-5-d8f80cacb884@rivosinc.com Signed-off-by: Anup Patel --- drivers/perf/riscv_pmu_sbi.c | 122 ++++++++++++++++++++++------------------- include/linux/perf/riscv_pmu.h | 1 + 2 files changed, 68 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index a6c479f853e1..0392900d828e 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -100,6 +100,7 @@ static unsigned int riscv_pmu_irq; /* Cache the available counters in a bitmask */ static unsigned long cmask; +static int pmu_event_find_cache(u64 config); struct sbi_pmu_event_data { union { union { @@ -412,6 +413,71 @@ static bool pmu_sbi_ctr_is_fw(int cidx) return (info->type == SBI_PMU_CTR_TYPE_FW) ? true : false; } +int riscv_pmu_get_event_info(u32 type, u64 config, u64 *econfig) +{ + int ret = -ENOENT; + + switch (type) { + case PERF_TYPE_HARDWARE: + if (config >= PERF_COUNT_HW_MAX) + return -EINVAL; + ret = pmu_hw_event_map[config].event_idx; + break; + case PERF_TYPE_HW_CACHE: + ret = pmu_event_find_cache(config); + break; + case PERF_TYPE_RAW: + /* + * As per SBI v0.3 specification, + * -- the upper 16 bits must be unused for a hardware raw event. + * As per SBI v2.0 specification, + * -- the upper 8 bits must be unused for a hardware raw event. + * Bits 63:62 are used to distinguish between raw events + * 00 - Hardware raw event + * 10 - SBI firmware events + * 11 - Risc-V platform specific firmware event + */ + switch (config >> 62) { + case 0: + if (sbi_v3_available) { + /* Return error any bits [56-63] is set as it is not allowed by the spec */ + if (!(config & ~RISCV_PMU_RAW_EVENT_V2_MASK)) { + if (econfig) + *econfig = config & RISCV_PMU_RAW_EVENT_V2_MASK; + ret = RISCV_PMU_RAW_EVENT_V2_IDX; + } + /* Return error any bits [48-63] is set as it is not allowed by the spec */ + } else if (!(config & ~RISCV_PMU_RAW_EVENT_MASK)) { + if (econfig) + *econfig = config & RISCV_PMU_RAW_EVENT_MASK; + ret = RISCV_PMU_RAW_EVENT_IDX; + } + break; + case 2: + ret = (config & 0xFFFF) | (SBI_PMU_EVENT_TYPE_FW << 16); + break; + case 3: + /* + * For Risc-V platform specific firmware events + * Event code - 0xFFFF + * Event data - raw event encoding + */ + ret = SBI_PMU_EVENT_TYPE_FW << 16 | RISCV_PLAT_FW_EVENT; + if (econfig) + *econfig = config & RISCV_PMU_PLAT_FW_EVENT_MASK; + break; + default: + break; + } + break; + default: + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(riscv_pmu_get_event_info); + /* * Returns the counter width of a programmable counter and number of hardware * counters. As we don't support heterogeneous CPUs yet, it is okay to just @@ -577,7 +643,6 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) { u32 type = event->attr.type; u64 config = event->attr.config; - int ret = -ENOENT; /* * Ensure we are finished checking standard hardware events for @@ -585,60 +650,7 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) */ flush_work(&check_std_events_work); - switch (type) { - case PERF_TYPE_HARDWARE: - if (config >= PERF_COUNT_HW_MAX) - return -EINVAL; - ret = pmu_hw_event_map[event->attr.config].event_idx; - break; - case PERF_TYPE_HW_CACHE: - ret = pmu_event_find_cache(config); - break; - case PERF_TYPE_RAW: - /* - * As per SBI v0.3 specification, - * -- the upper 16 bits must be unused for a hardware raw event. - * As per SBI v2.0 specification, - * -- the upper 8 bits must be unused for a hardware raw event. - * Bits 63:62 are used to distinguish between raw events - * 00 - Hardware raw event - * 10 - SBI firmware events - * 11 - Risc-V platform specific firmware event - */ - - switch (config >> 62) { - case 0: - if (sbi_v3_available) { - if (!(config & ~RISCV_PMU_RAW_EVENT_V2_MASK)) { - *econfig = config & RISCV_PMU_RAW_EVENT_V2_MASK; - ret = RISCV_PMU_RAW_EVENT_V2_IDX; - } - } else if (!(config & ~RISCV_PMU_RAW_EVENT_MASK)) { - *econfig = config & RISCV_PMU_RAW_EVENT_MASK; - ret = RISCV_PMU_RAW_EVENT_IDX; - } - break; - case 2: - ret = (config & 0xFFFF) | (SBI_PMU_EVENT_TYPE_FW << 16); - break; - case 3: - /* - * For Risc-V platform specific firmware events - * Event code - 0xFFFF - * Event data - raw event encoding - */ - ret = SBI_PMU_EVENT_TYPE_FW << 16 | RISCV_PLAT_FW_EVENT; - *econfig = config & RISCV_PMU_PLAT_FW_EVENT_MASK; - break; - default: - break; - } - break; - default: - break; - } - - return ret; + return riscv_pmu_get_event_info(type, config, econfig); } static void pmu_sbi_snapshot_free(struct riscv_pmu *pmu) diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h index 701974639ff2..f82a28040594 100644 --- a/include/linux/perf/riscv_pmu.h +++ b/include/linux/perf/riscv_pmu.h @@ -89,6 +89,7 @@ static inline void riscv_pmu_legacy_skip_init(void) {}; struct riscv_pmu *riscv_pmu_alloc(void); #ifdef CONFIG_RISCV_PMU_SBI int riscv_pmu_get_hpm_info(u32 *hw_ctr_width, u32 *num_hw_ctr); +int riscv_pmu_get_event_info(u32 type, u64 config, u64 *econfig); #endif #endif /* CONFIG_RISCV_PMU */ -- cgit v1.2.3 From 94deac977fbd0246c971b4f1d17a6385f5e0b1a4 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 1 Sep 2025 10:52:04 +0000 Subject: fs: add an enum for number of life time hints Add WRITE_LIFE_HINT_NR into the rw_hint enum to define the number of values write life time hints can be set to. This is useful for e.g. file systems which may want to map these values to allocation groups. Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- include/linux/rw_hint.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h index 309ca72f2dfb..adcc43042c90 100644 --- a/include/linux/rw_hint.h +++ b/include/linux/rw_hint.h @@ -14,6 +14,7 @@ enum rw_hint { WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, + WRITE_LIFE_HINT_NR, } __packed; /* Sparse ignores __packed annotations on enums, hence the #ifndef below. */ -- cgit v1.2.3 From 51dad33ede63618a6b425c650f3042d85e646dac Mon Sep 17 00:00:00 2001 From: Ming Yu Date: Fri, 12 Sep 2025 17:19:46 +0800 Subject: mfd: Add core driver for Nuvoton NCT6694 The Nuvoton NCT6694 provides an USB interface to the host to access its features. Sub-devices can use the USB functions nct6694_read_msg() and nct6694_write_msg() to issue a command. They can also request interrupt that will be called when the USB device receives its interrupt pipe. Signed-off-by: Ming Yu Link: https://lore.kernel.org/r/20250912091952.1169369-2-a0282524688@gmail.com Signed-off-by: Lee Jones --- MAINTAINERS | 6 + drivers/mfd/Kconfig | 15 ++ drivers/mfd/Makefile | 2 + drivers/mfd/nct6694.c | 388 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/nct6694.h | 102 ++++++++++++ 5 files changed, 513 insertions(+) create mode 100644 drivers/mfd/nct6694.c create mode 100644 include/linux/mfd/nct6694.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa4..a8a05872d077 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18082,6 +18082,12 @@ F: drivers/nubus/ F: include/linux/nubus.h F: include/uapi/linux/nubus.h +NUVOTON NCT6694 MFD DRIVER +M: Ming Yu +S: Supported +F: drivers/mfd/nct6694.c +F: include/linux/mfd/nct6694.h + NUVOTON NCT7201 IIO DRIVER M: Eason Yang L: linux-iio@vger.kernel.org diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1..f3d157776e93 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1134,6 +1134,21 @@ config MFD_MENF21BMC This driver can also be built as a module. If so the module will be called menf21bmc. +config MFD_NCT6694 + tristate "Nuvoton NCT6694 support" + select MFD_CORE + depends on USB + help + This enables support for the Nuvoton USB device NCT6694, which shares + peripherals. + The Nuvoton NCT6694 is a peripheral expander with 16 GPIO chips, + 6 I2C controllers, 2 CANfd controllers, 2 Watchdog timers, ADC, + PWM, and RTC. + This driver provides core APIs to access the NCT6694 hardware + monitoring and control features. + Additional drivers must be enabled to utilize the specific + functionalities of the device. + config MFD_OCELOT tristate "Microsemi Ocelot External Control Support" depends on SPI_MASTER diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d..1e7738c02b2c 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -121,6 +121,8 @@ obj-$(CONFIG_MFD_MC13XXX) += mc13xxx-core.o obj-$(CONFIG_MFD_MC13XXX_SPI) += mc13xxx-spi.o obj-$(CONFIG_MFD_MC13XXX_I2C) += mc13xxx-i2c.o +obj-$(CONFIG_MFD_NCT6694) += nct6694.o + obj-$(CONFIG_MFD_CORE) += mfd-core.o ocelot-soc-objs := ocelot-core.o ocelot-spi.o diff --git a/drivers/mfd/nct6694.c b/drivers/mfd/nct6694.c new file mode 100644 index 000000000000..308b2fda3055 --- /dev/null +++ b/drivers/mfd/nct6694.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Nuvoton Technology Corp. + * + * Nuvoton NCT6694 core driver using USB interface to provide + * access to the NCT6694 hardware monitoring and control features. + * + * The NCT6694 is an integrated controller that provides GPIO, I2C, + * CAN, WDT, HWMON and RTC management. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell nct6694_devs[] = { + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + + MFD_CELL_NAME("nct6694-canfd"), + MFD_CELL_NAME("nct6694-canfd"), + + MFD_CELL_NAME("nct6694-wdt"), + MFD_CELL_NAME("nct6694-wdt"), + + MFD_CELL_NAME("nct6694-hwmon"), + + MFD_CELL_NAME("nct6694-rtc"), +}; + +static int nct6694_response_err_handling(struct nct6694 *nct6694, unsigned char err_status) +{ + switch (err_status) { + case NCT6694_NO_ERROR: + return 0; + case NCT6694_NOT_SUPPORT_ERROR: + dev_err(nct6694->dev, "Command is not supported!\n"); + break; + case NCT6694_NO_RESPONSE_ERROR: + dev_warn(nct6694->dev, "Command received no response!\n"); + break; + case NCT6694_TIMEOUT_ERROR: + dev_warn(nct6694->dev, "Command timed out!\n"); + break; + case NCT6694_PENDING: + dev_err(nct6694->dev, "Command is pending!\n"); + break; + default: + return -EINVAL; + } + + return -EIO; +} + +/** + * nct6694_read_msg() - Read message from NCT6694 device + * @nct6694: NCT6694 device pointer + * @cmd_hd: command header structure + * @buf: buffer to store the response data + * + * Sends a command to the NCT6694 device and reads the response. + * The command header is specified in @cmd_hd, and the response + * data is stored in @buf. + * + * Return: Negative value on error or 0 on success. + */ +int nct6694_read_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf) +{ + union nct6694_usb_msg *msg = nct6694->usb_msg; + struct usb_device *udev = nct6694->udev; + int tx_len, rx_len, ret; + + guard(mutex)(&nct6694->access_lock); + + memcpy(&msg->cmd_header, cmd_hd, sizeof(*cmd_hd)); + msg->cmd_header.hctrl = NCT6694_HCTRL_GET; + + /* Send command packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), &msg->cmd_header, + sizeof(*msg), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive response packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), &msg->response_header, + sizeof(*msg), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive data packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), buf, + le16_to_cpu(cmd_hd->len), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + if (rx_len != le16_to_cpu(cmd_hd->len)) { + dev_err(nct6694->dev, "Expected received length %d, but got %d\n", + le16_to_cpu(cmd_hd->len), rx_len); + return -EIO; + } + + return nct6694_response_err_handling(nct6694, msg->response_header.sts); +} +EXPORT_SYMBOL_GPL(nct6694_read_msg); + +/** + * nct6694_write_msg() - Write message to NCT6694 device + * @nct6694: NCT6694 device pointer + * @cmd_hd: command header structure + * @buf: buffer containing the data to be sent + * + * Sends a command to the NCT6694 device and writes the data + * from @buf. The command header is specified in @cmd_hd. + * + * Return: Negative value on error or 0 on success. + */ +int nct6694_write_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf) +{ + union nct6694_usb_msg *msg = nct6694->usb_msg; + struct usb_device *udev = nct6694->udev; + int tx_len, rx_len, ret; + + guard(mutex)(&nct6694->access_lock); + + memcpy(&msg->cmd_header, cmd_hd, sizeof(*cmd_hd)); + msg->cmd_header.hctrl = NCT6694_HCTRL_SET; + + /* Send command packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), &msg->cmd_header, + sizeof(*msg), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Send data packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), buf, + le16_to_cpu(cmd_hd->len), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive response packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), &msg->response_header, + sizeof(*msg), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive data packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), buf, + le16_to_cpu(cmd_hd->len), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + if (rx_len != le16_to_cpu(cmd_hd->len)) { + dev_err(nct6694->dev, "Expected transmitted length %d, but got %d\n", + le16_to_cpu(cmd_hd->len), rx_len); + return -EIO; + } + + return nct6694_response_err_handling(nct6694, msg->response_header.sts); +} +EXPORT_SYMBOL_GPL(nct6694_write_msg); + +static void usb_int_callback(struct urb *urb) +{ + struct nct6694 *nct6694 = urb->context; + __le32 *status_le = urb->transfer_buffer; + u32 int_status; + int ret; + + switch (urb->status) { + case 0: + break; + case -ECONNRESET: + case -ENOENT: + case -ESHUTDOWN: + return; + default: + goto resubmit; + } + + int_status = le32_to_cpu(*status_le); + + while (int_status) { + int irq = __ffs(int_status); + + generic_handle_irq_safe(irq_find_mapping(nct6694->domain, irq)); + int_status &= ~BIT(irq); + } + +resubmit: + ret = usb_submit_urb(urb, GFP_ATOMIC); + if (ret) + dev_warn(nct6694->dev, "Failed to resubmit urb, status %pe", ERR_PTR(ret)); +} + +static void nct6694_irq_enable(struct irq_data *data) +{ + struct nct6694 *nct6694 = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + guard(spinlock_irqsave)(&nct6694->irq_lock); + + nct6694->irq_enable |= BIT(hwirq); +} + +static void nct6694_irq_disable(struct irq_data *data) +{ + struct nct6694 *nct6694 = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + guard(spinlock_irqsave)(&nct6694->irq_lock); + + nct6694->irq_enable &= ~BIT(hwirq); +} + +static const struct irq_chip nct6694_irq_chip = { + .name = "nct6694-irq", + .flags = IRQCHIP_SKIP_SET_WAKE, + .irq_enable = nct6694_irq_enable, + .irq_disable = nct6694_irq_disable, +}; + +static int nct6694_irq_domain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) +{ + struct nct6694 *nct6694 = d->host_data; + + irq_set_chip_data(irq, nct6694); + irq_set_chip_and_handler(irq, &nct6694_irq_chip, handle_simple_irq); + + return 0; +} + +static void nct6694_irq_domain_unmap(struct irq_domain *d, unsigned int irq) +{ + irq_set_chip_and_handler(irq, NULL, NULL); + irq_set_chip_data(irq, NULL); +} + +static const struct irq_domain_ops nct6694_irq_domain_ops = { + .map = nct6694_irq_domain_map, + .unmap = nct6694_irq_domain_unmap, +}; + +static int nct6694_usb_probe(struct usb_interface *iface, + const struct usb_device_id *id) +{ + struct usb_device *udev = interface_to_usbdev(iface); + struct usb_endpoint_descriptor *int_endpoint; + struct usb_host_interface *interface; + struct device *dev = &iface->dev; + struct nct6694 *nct6694; + int ret; + + nct6694 = devm_kzalloc(dev, sizeof(*nct6694), GFP_KERNEL); + if (!nct6694) + return -ENOMEM; + + nct6694->usb_msg = devm_kzalloc(dev, sizeof(union nct6694_usb_msg), GFP_KERNEL); + if (!nct6694->usb_msg) + return -ENOMEM; + + nct6694->int_buffer = devm_kzalloc(dev, sizeof(*nct6694->int_buffer), GFP_KERNEL); + if (!nct6694->int_buffer) + return -ENOMEM; + + nct6694->int_in_urb = usb_alloc_urb(0, GFP_KERNEL); + if (!nct6694->int_in_urb) + return -ENOMEM; + + nct6694->domain = irq_domain_create_simple(NULL, NCT6694_NR_IRQS, 0, + &nct6694_irq_domain_ops, + nct6694); + if (!nct6694->domain) { + ret = -ENODEV; + goto err_urb; + } + + nct6694->dev = dev; + nct6694->udev = udev; + + ida_init(&nct6694->gpio_ida); + ida_init(&nct6694->i2c_ida); + ida_init(&nct6694->canfd_ida); + ida_init(&nct6694->wdt_ida); + + spin_lock_init(&nct6694->irq_lock); + + ret = devm_mutex_init(dev, &nct6694->access_lock); + if (ret) + goto err_ida; + + interface = iface->cur_altsetting; + + int_endpoint = &interface->endpoint[0].desc; + if (!usb_endpoint_is_int_in(int_endpoint)) { + ret = -ENODEV; + goto err_ida; + } + + usb_fill_int_urb(nct6694->int_in_urb, udev, usb_rcvintpipe(udev, NCT6694_INT_IN_EP), + nct6694->int_buffer, sizeof(*nct6694->int_buffer), usb_int_callback, + nct6694, int_endpoint->bInterval); + + ret = usb_submit_urb(nct6694->int_in_urb, GFP_KERNEL); + if (ret) + goto err_ida; + + usb_set_intfdata(iface, nct6694); + + ret = mfd_add_hotplug_devices(dev, nct6694_devs, ARRAY_SIZE(nct6694_devs)); + if (ret) + goto err_mfd; + + return 0; + +err_mfd: + usb_kill_urb(nct6694->int_in_urb); +err_ida: + ida_destroy(&nct6694->wdt_ida); + ida_destroy(&nct6694->canfd_ida); + ida_destroy(&nct6694->i2c_ida); + ida_destroy(&nct6694->gpio_ida); + irq_domain_remove(nct6694->domain); +err_urb: + usb_free_urb(nct6694->int_in_urb); + return ret; +} + +static void nct6694_usb_disconnect(struct usb_interface *iface) +{ + struct nct6694 *nct6694 = usb_get_intfdata(iface); + + mfd_remove_devices(nct6694->dev); + usb_kill_urb(nct6694->int_in_urb); + ida_destroy(&nct6694->wdt_ida); + ida_destroy(&nct6694->canfd_ida); + ida_destroy(&nct6694->i2c_ida); + ida_destroy(&nct6694->gpio_ida); + irq_domain_remove(nct6694->domain); + usb_free_urb(nct6694->int_in_urb); +} + +static const struct usb_device_id nct6694_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(NCT6694_VENDOR_ID, NCT6694_PRODUCT_ID, 0xFF, 0x00, 0x00) }, + { } +}; +MODULE_DEVICE_TABLE(usb, nct6694_ids); + +static struct usb_driver nct6694_usb_driver = { + .name = "nct6694", + .id_table = nct6694_ids, + .probe = nct6694_usb_probe, + .disconnect = nct6694_usb_disconnect, +}; +module_usb_driver(nct6694_usb_driver); + +MODULE_DESCRIPTION("Nuvoton NCT6694 core driver"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/nct6694.h b/include/linux/mfd/nct6694.h new file mode 100644 index 000000000000..6eb9be2cd4a0 --- /dev/null +++ b/include/linux/mfd/nct6694.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Nuvoton Technology Corp. + * + * Nuvoton NCT6694 USB transaction and data structure. + */ + +#ifndef __MFD_NCT6694_H +#define __MFD_NCT6694_H + +#define NCT6694_VENDOR_ID 0x0416 +#define NCT6694_PRODUCT_ID 0x200B +#define NCT6694_INT_IN_EP 0x81 +#define NCT6694_BULK_IN_EP 0x02 +#define NCT6694_BULK_OUT_EP 0x03 + +#define NCT6694_HCTRL_SET 0x40 +#define NCT6694_HCTRL_GET 0x80 + +#define NCT6694_URB_TIMEOUT 1000 + +enum nct6694_irq_id { + NCT6694_IRQ_GPIO0 = 0, + NCT6694_IRQ_GPIO1, + NCT6694_IRQ_GPIO2, + NCT6694_IRQ_GPIO3, + NCT6694_IRQ_GPIO4, + NCT6694_IRQ_GPIO5, + NCT6694_IRQ_GPIO6, + NCT6694_IRQ_GPIO7, + NCT6694_IRQ_GPIO8, + NCT6694_IRQ_GPIO9, + NCT6694_IRQ_GPIOA, + NCT6694_IRQ_GPIOB, + NCT6694_IRQ_GPIOC, + NCT6694_IRQ_GPIOD, + NCT6694_IRQ_GPIOE, + NCT6694_IRQ_GPIOF, + NCT6694_IRQ_CAN0, + NCT6694_IRQ_CAN1, + NCT6694_IRQ_RTC, + NCT6694_NR_IRQS, +}; + +enum nct6694_response_err_status { + NCT6694_NO_ERROR = 0, + NCT6694_FORMAT_ERROR, + NCT6694_RESERVED1, + NCT6694_RESERVED2, + NCT6694_NOT_SUPPORT_ERROR, + NCT6694_NO_RESPONSE_ERROR, + NCT6694_TIMEOUT_ERROR, + NCT6694_PENDING, +}; + +struct __packed nct6694_cmd_header { + u8 rsv1; + u8 mod; + union __packed { + __le16 offset; + struct __packed { + u8 cmd; + u8 sel; + }; + }; + u8 hctrl; + u8 rsv2; + __le16 len; +}; + +struct __packed nct6694_response_header { + u8 sequence_id; + u8 sts; + u8 reserved[4]; + __le16 len; +}; + +union __packed nct6694_usb_msg { + struct nct6694_cmd_header cmd_header; + struct nct6694_response_header response_header; +}; + +struct nct6694 { + struct device *dev; + struct ida gpio_ida; + struct ida i2c_ida; + struct ida canfd_ida; + struct ida wdt_ida; + struct irq_domain *domain; + struct mutex access_lock; + spinlock_t irq_lock; + struct urb *int_in_urb; + struct usb_device *udev; + union nct6694_usb_msg *usb_msg; + __le32 *int_buffer; + unsigned int irq_enable; +}; + +int nct6694_read_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf); +int nct6694_write_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf); + +#endif -- cgit v1.2.3 From a22ddeef55c4df847d9ac862b6192da774948fe1 Mon Sep 17 00:00:00 2001 From: Kamel Bouhara Date: Sun, 24 Aug 2025 13:57:21 +0200 Subject: mfd: Add max7360 support Add core driver to support MAX7360 i2c chip, multi function device with keypad, GPIO, PWM, GPO and rotary encoder submodules. Signed-off-by: Kamel Bouhara Co-developed-by: Mathieu Dubois-Briand Signed-off-by: Mathieu Dubois-Briand Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-2-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 14 ++++ drivers/mfd/Makefile | 1 + drivers/mfd/max7360.c | 171 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/max7360.h | 109 ++++++++++++++++++++++++++++ 4 files changed, 295 insertions(+) create mode 100644 drivers/mfd/max7360.c create mode 100644 include/linux/mfd/max7360.h (limited to 'include') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1..58b1c2900d59 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -2481,5 +2481,19 @@ config MFD_UPBOARD_FPGA To compile this driver as a module, choose M here: the module will be called upboard-fpga. +config MFD_MAX7360 + tristate "Maxim MAX7360 I2C IO Expander" + depends on I2C + select MFD_CORE + select REGMAP_I2C + select REGMAP_IRQ + help + Say yes here to add support for Maxim MAX7360 device, embedding + keypad, rotary encoder, PWM and GPIO features. + + This driver provides common support for accessing the device; + additional drivers must be enabled in order to use the functionality + of the device. + endmenu endif diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d..c81c6a8473e1 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -163,6 +163,7 @@ obj-$(CONFIG_MFD_DA9063) += da9063.o obj-$(CONFIG_MFD_DA9150) += da9150-core.o obj-$(CONFIG_MFD_MAX14577) += max14577.o +obj-$(CONFIG_MFD_MAX7360) += max7360.o obj-$(CONFIG_MFD_MAX77541) += max77541.o obj-$(CONFIG_MFD_MAX77620) += max77620.o obj-$(CONFIG_MFD_MAX77650) += max77650.o diff --git a/drivers/mfd/max7360.c b/drivers/mfd/max7360.c new file mode 100644 index 000000000000..5ee459c490ec --- /dev/null +++ b/drivers/mfd/max7360.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Maxim MAX7360 Core Driver + * + * Copyright 2025 Bootlin + * + * Authors: + * Kamel Bouhara + * Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell max7360_cells[] = { + { .name = "max7360-pinctrl" }, + { .name = "max7360-pwm" }, + { .name = "max7360-keypad" }, + { .name = "max7360-rotary" }, + { + .name = "max7360-gpo", + .of_compatible = "maxim,max7360-gpo", + }, + { + .name = "max7360-gpio", + .of_compatible = "maxim,max7360-gpio", + }, +}; + +static const struct regmap_range max7360_volatile_ranges[] = { + regmap_reg_range(MAX7360_REG_KEYFIFO, MAX7360_REG_KEYFIFO), + regmap_reg_range(MAX7360_REG_I2C_TIMEOUT, MAX7360_REG_RTR_CNT), +}; + +static const struct regmap_access_table max7360_volatile_table = { + .yes_ranges = max7360_volatile_ranges, + .n_yes_ranges = ARRAY_SIZE(max7360_volatile_ranges), +}; + +static const struct regmap_config max7360_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = MAX7360_REG_PWMCFG(MAX7360_PORT_PWM_COUNT - 1), + .volatile_table = &max7360_volatile_table, + .cache_type = REGCACHE_MAPLE, +}; + +static int max7360_mask_irqs(struct regmap *regmap) +{ + struct device *dev = regmap_get_device(regmap); + unsigned int val; + int ret; + + /* + * GPIO/PWM interrupts are not masked on reset: as the MAX7360 "INTI" + * interrupt line is shared between GPIOs and rotary encoder, this could + * result in repeated spurious interrupts on the rotary encoder driver + * if the GPIO driver is not loaded. Mask them now to avoid this + * situation. + */ + for (unsigned int i = 0; i < MAX7360_PORT_PWM_COUNT; i++) { + ret = regmap_write_bits(regmap, MAX7360_REG_PWMCFG(i), + MAX7360_PORT_CFG_INTERRUPT_MASK, + MAX7360_PORT_CFG_INTERRUPT_MASK); + if (ret) + return dev_err_probe(dev, ret, + "Failed to write MAX7360 port configuration\n"); + } + + /* Read GPIO in register, to ACK any pending IRQ. */ + ret = regmap_read(regmap, MAX7360_REG_GPIOIN, &val); + if (ret) + return dev_err_probe(dev, ret, "Failed to read GPIO values\n"); + + return 0; +} + +static int max7360_reset(struct regmap *regmap) +{ + struct device *dev = regmap_get_device(regmap); + int ret; + + ret = regmap_write(regmap, MAX7360_REG_GPIOCFG, MAX7360_GPIO_CFG_GPIO_RST); + if (ret) { + dev_err(dev, "Failed to reset GPIO configuration: %x\n", ret); + return ret; + } + + ret = regcache_drop_region(regmap, MAX7360_REG_GPIOCFG, MAX7360_REG_GPIO_LAST); + if (ret) { + dev_err(dev, "Failed to drop regmap cache: %x\n", ret); + return ret; + } + + ret = regmap_write(regmap, MAX7360_REG_SLEEP, 0); + if (ret) { + dev_err(dev, "Failed to reset autosleep configuration: %x\n", ret); + return ret; + } + + ret = regmap_write(regmap, MAX7360_REG_DEBOUNCE, 0); + if (ret) + dev_err(dev, "Failed to reset GPO port count: %x\n", ret); + + return ret; +} + +static int max7360_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct regmap *regmap; + int ret; + + regmap = devm_regmap_init_i2c(client, &max7360_regmap_config); + if (IS_ERR(regmap)) + return dev_err_probe(dev, PTR_ERR(regmap), "Failed to initialise regmap\n"); + + ret = max7360_reset(regmap); + if (ret) + return dev_err_probe(dev, ret, "Failed to reset device\n"); + + /* Get the device out of shutdown mode. */ + ret = regmap_write_bits(regmap, MAX7360_REG_GPIOCFG, + MAX7360_GPIO_CFG_GPIO_EN, + MAX7360_GPIO_CFG_GPIO_EN); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable GPIO and PWM module\n"); + + ret = max7360_mask_irqs(regmap); + if (ret) + return dev_err_probe(dev, ret, "Could not mask interrupts\n"); + + ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, + max7360_cells, ARRAY_SIZE(max7360_cells), + NULL, 0, NULL); + if (ret) + return dev_err_probe(dev, ret, "Failed to register child devices\n"); + + return 0; +} + +static const struct of_device_id max7360_dt_match[] = { + { .compatible = "maxim,max7360" }, + {} +}; +MODULE_DEVICE_TABLE(of, max7360_dt_match); + +static struct i2c_driver max7360_driver = { + .driver = { + .name = "max7360", + .of_match_table = max7360_dt_match, + }, + .probe = max7360_probe, +}; +module_i2c_driver(max7360_driver); + +MODULE_DESCRIPTION("Maxim MAX7360 I2C IO Expander core driver"); +MODULE_AUTHOR("Kamel Bouhara "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/max7360.h b/include/linux/mfd/max7360.h new file mode 100644 index 000000000000..44cf2bf651a2 --- /dev/null +++ b/include/linux/mfd/max7360.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __LINUX_MFD_MAX7360_H +#define __LINUX_MFD_MAX7360_H + +#include + +#define MAX7360_MAX_KEY_ROWS 8 +#define MAX7360_MAX_KEY_COLS 8 +#define MAX7360_MAX_KEY_NUM (MAX7360_MAX_KEY_ROWS * MAX7360_MAX_KEY_COLS) +#define MAX7360_ROW_SHIFT 3 + +#define MAX7360_MAX_GPIO 8 +#define MAX7360_MAX_GPO 6 +#define MAX7360_PORT_PWM_COUNT 8 +#define MAX7360_PORT_RTR_PIN (MAX7360_PORT_PWM_COUNT - 1) + +/* + * MAX7360 registers + */ +#define MAX7360_REG_KEYFIFO 0x00 +#define MAX7360_REG_CONFIG 0x01 +#define MAX7360_REG_DEBOUNCE 0x02 +#define MAX7360_REG_INTERRUPT 0x03 +#define MAX7360_REG_PORTS 0x04 +#define MAX7360_REG_KEYREP 0x05 +#define MAX7360_REG_SLEEP 0x06 + +/* + * MAX7360 GPIO registers + * + * All these registers are reset together when writing bit 3 of + * MAX7360_REG_GPIOCFG. + */ +#define MAX7360_REG_GPIOCFG 0x40 +#define MAX7360_REG_GPIOCTRL 0x41 +#define MAX7360_REG_GPIODEB 0x42 +#define MAX7360_REG_GPIOCURR 0x43 +#define MAX7360_REG_GPIOOUTM 0x44 +#define MAX7360_REG_PWMCOM 0x45 +#define MAX7360_REG_RTRCFG 0x46 +#define MAX7360_REG_I2C_TIMEOUT 0x48 +#define MAX7360_REG_GPIOIN 0x49 +#define MAX7360_REG_RTR_CNT 0x4A +#define MAX7360_REG_PWMBASE 0x50 +#define MAX7360_REG_PWMCFGBASE 0x58 + +#define MAX7360_REG_GPIO_LAST 0x5F + +#define MAX7360_REG_PWM(x) (MAX7360_REG_PWMBASE + (x)) +#define MAX7360_REG_PWMCFG(x) (MAX7360_REG_PWMCFGBASE + (x)) + +/* + * Configuration register bits + */ +#define MAX7360_FIFO_EMPTY 0x3F +#define MAX7360_FIFO_OVERFLOW 0x7F +#define MAX7360_FIFO_RELEASE BIT(6) +#define MAX7360_FIFO_COL GENMASK(5, 3) +#define MAX7360_FIFO_ROW GENMASK(2, 0) + +#define MAX7360_CFG_SLEEP BIT(7) +#define MAX7360_CFG_INTERRUPT BIT(5) +#define MAX7360_CFG_KEY_RELEASE BIT(3) +#define MAX7360_CFG_WAKEUP BIT(1) +#define MAX7360_CFG_TIMEOUT BIT(0) + +#define MAX7360_DEBOUNCE GENMASK(4, 0) +#define MAX7360_DEBOUNCE_MIN 9 +#define MAX7360_DEBOUNCE_MAX 40 +#define MAX7360_PORTS GENMASK(8, 5) + +#define MAX7360_INTERRUPT_TIME_MASK GENMASK(4, 0) +#define MAX7360_INTERRUPT_FIFO_MASK GENMASK(7, 5) + +#define MAX7360_PORT_CFG_INTERRUPT_MASK BIT(7) +#define MAX7360_PORT_CFG_INTERRUPT_EDGES BIT(6) +#define MAX7360_PORT_CFG_COMMON_PWM BIT(5) + +/* + * Autosleep register values + */ +#define MAX7360_AUTOSLEEP_8192MS 0x01 +#define MAX7360_AUTOSLEEP_4096MS 0x02 +#define MAX7360_AUTOSLEEP_2048MS 0x03 +#define MAX7360_AUTOSLEEP_1024MS 0x04 +#define MAX7360_AUTOSLEEP_512MS 0x05 +#define MAX7360_AUTOSLEEP_256MS 0x06 + +#define MAX7360_GPIO_CFG_RTR_EN BIT(7) +#define MAX7360_GPIO_CFG_GPIO_EN BIT(4) +#define MAX7360_GPIO_CFG_GPIO_RST BIT(3) + +#define MAX7360_ROT_DEBOUNCE GENMASK(3, 0) +#define MAX7360_ROT_DEBOUNCE_MIN 0 +#define MAX7360_ROT_DEBOUNCE_MAX 15 +#define MAX7360_ROT_INTCNT GENMASK(6, 4) +#define MAX7360_ROT_INTCNT_DLY BIT(7) + +#define MAX7360_INT_INTI 0 +#define MAX7360_INT_INTK 1 + +#define MAX7360_INT_GPIO 0 +#define MAX7360_INT_KEYPAD 1 +#define MAX7360_INT_ROTARY 2 + +#define MAX7360_NR_INTERNAL_IRQS 3 + +#endif -- cgit v1.2.3 From 553b75d4bfe9264f631d459fe9996744e0672b0e Mon Sep 17 00:00:00 2001 From: Mathieu Dubois-Briand Date: Sun, 24 Aug 2025 13:57:24 +0200 Subject: gpio: regmap: Allow to allocate regmap-irq device GPIO controller often have support for IRQ: allow to easily allocate both gpio-regmap and regmap-irq in one operation. Reviewed-by: Andy Shevchenko Acked-by: Bartosz Golaszewski Signed-off-by: Mathieu Dubois-Briand Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-5-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/gpio/gpio-regmap.c | 29 +++++++++++++++++++++++++++-- include/linux/gpio/regmap.h | 11 +++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c index e8a32dfebdcb..e1944931ee7c 100644 --- a/drivers/gpio/gpio-regmap.c +++ b/drivers/gpio/gpio-regmap.c @@ -32,6 +32,11 @@ struct gpio_regmap { unsigned int reg_dir_in_base; unsigned int reg_dir_out_base; +#ifdef CONFIG_REGMAP_IRQ + int regmap_irq_line; + struct regmap_irq_chip_data *irq_chip_data; +#endif + int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base, unsigned int offset, unsigned int *reg, unsigned int *mask); @@ -215,6 +220,7 @@ EXPORT_SYMBOL_GPL(gpio_regmap_get_drvdata); */ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config) { + struct irq_domain *irq_domain; struct gpio_regmap *gpio; struct gpio_chip *chip; int ret; @@ -295,8 +301,22 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config if (ret < 0) goto err_free_gpio; - if (config->irq_domain) { - ret = gpiochip_irqchip_add_domain(chip, config->irq_domain); +#ifdef CONFIG_REGMAP_IRQ + if (config->regmap_irq_chip) { + gpio->regmap_irq_line = config->regmap_irq_line; + ret = regmap_add_irq_chip_fwnode(dev_fwnode(config->parent), config->regmap, + config->regmap_irq_line, config->regmap_irq_flags, + 0, config->regmap_irq_chip, &gpio->irq_chip_data); + if (ret) + goto err_free_gpio; + + irq_domain = regmap_irq_get_domain(gpio->irq_chip_data); + } else +#endif + irq_domain = config->irq_domain; + + if (irq_domain) { + ret = gpiochip_irqchip_add_domain(chip, irq_domain); if (ret) goto err_remove_gpiochip; } @@ -317,6 +337,11 @@ EXPORT_SYMBOL_GPL(gpio_regmap_register); */ void gpio_regmap_unregister(struct gpio_regmap *gpio) { +#ifdef CONFIG_REGMAP_IRQ + if (gpio->irq_chip_data) + regmap_del_irq_chip(gpio->regmap_irq_line, gpio->irq_chip_data); +#endif + gpiochip_remove(&gpio->gpio_chip); kfree(gpio); } diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h index c722c67668c6..19b52ac03a5d 100644 --- a/include/linux/gpio/regmap.h +++ b/include/linux/gpio/regmap.h @@ -40,6 +40,11 @@ struct regmap; * @drvdata: (Optional) Pointer to driver specific data which is * not used by gpio-remap but is provided "as is" to the * driver callback(s). + * @regmap_irq_chip: (Optional) Pointer on an regmap_irq_chip structure. If + * set, a regmap-irq device will be created and the IRQ + * domain will be set accordingly. + * @regmap_irq_line (Optional) The IRQ the device uses to signal interrupts. + * @regmap_irq_flags (Optional) The IRQF_ flags to use for the interrupt. * * The ->reg_mask_xlate translates a given base address and GPIO offset to * register and mask pair. The base address is one of the given register @@ -78,6 +83,12 @@ struct gpio_regmap_config { int ngpio_per_reg; struct irq_domain *irq_domain; +#ifdef CONFIG_REGMAP_IRQ + struct regmap_irq_chip *regmap_irq_chip; + int regmap_irq_line; + unsigned long regmap_irq_flags; +#endif + int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base, unsigned int offset, unsigned int *reg, unsigned int *mask); -- cgit v1.2.3 From 0627b71fa5508ab605b6e9fd74baed40805cfdda Mon Sep 17 00:00:00 2001 From: Mathieu Dubois-Briand Date: Sun, 24 Aug 2025 13:57:25 +0200 Subject: gpio: regmap: Allow to provide init_valid_mask callback Allows to populate the gpio_regmap_config structure with init_valid_mask() callback to set on the final gpio_chip structure. Reviewed-by: Michael Walle Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Reviewed-by: Bartosz Golaszewski Signed-off-by: Mathieu Dubois-Briand Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-6-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/gpio/gpio-regmap.c | 1 + include/linux/gpio/regmap.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c index e1944931ee7c..d9d23853e032 100644 --- a/drivers/gpio/gpio-regmap.c +++ b/drivers/gpio/gpio-regmap.c @@ -261,6 +261,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config chip->names = config->names; chip->label = config->label ?: dev_name(config->parent); chip->can_sleep = regmap_might_sleep(config->regmap); + chip->init_valid_mask = config->init_valid_mask; chip->request = gpiochip_generic_request; chip->free = gpiochip_generic_free; diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h index 19b52ac03a5d..622a2939ebe0 100644 --- a/include/linux/gpio/regmap.h +++ b/include/linux/gpio/regmap.h @@ -6,6 +6,7 @@ struct device; struct fwnode_handle; struct gpio_regmap; +struct gpio_chip; struct irq_domain; struct regmap; @@ -40,6 +41,8 @@ struct regmap; * @drvdata: (Optional) Pointer to driver specific data which is * not used by gpio-remap but is provided "as is" to the * driver callback(s). + * @init_valid_mask: (Optional) Routine to initialize @valid_mask, to be used + * if not all GPIOs are valid. * @regmap_irq_chip: (Optional) Pointer on an regmap_irq_chip structure. If * set, a regmap-irq device will be created and the IRQ * domain will be set accordingly. @@ -93,6 +96,10 @@ struct gpio_regmap_config { unsigned int offset, unsigned int *reg, unsigned int *mask); + int (*init_valid_mask)(struct gpio_chip *gc, + unsigned long *valid_mask, + unsigned int ngpios); + void *drvdata; }; -- cgit v1.2.3 From 747436750bc0ef73be32391bd5d0d7dcd185da7f Mon Sep 17 00:00:00 2001 From: Ryan Wanner Date: Wed, 10 Sep 2025 09:20:38 -0700 Subject: ARM: at91: pm: Remove 2.5V regulator Remove 2.5V regulator since enabling and disabling this regulator is no longer supported. Signed-off-by: Ryan Wanner Link: https://lore.kernel.org/r/a6785a40648b315a07152bca261a42bbf0f356af.1757519351.git.Ryan.Wanner@microchip.com Signed-off-by: Nicolas Ferre --- arch/arm/mach-at91/pm_suspend.S | 29 ----------------------------- include/soc/at91/sama7-sfrbu.h | 7 ------- 2 files changed, 36 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-at91/pm_suspend.S b/arch/arm/mach-at91/pm_suspend.S index aad53ec9e957..2e639f9ed648 100644 --- a/arch/arm/mach-at91/pm_suspend.S +++ b/arch/arm/mach-at91/pm_suspend.S @@ -87,29 +87,6 @@ tmp3 .req r6 .endm -/** - * Set state for 2.5V low power regulator - * @ena: 0 - disable regulator - * 1 - enable regulator - * - * Side effects: overwrites r7, r8, r9, r10 - */ - .macro at91_2_5V_reg_set_low_power ena -#ifdef CONFIG_SOC_SAMA7 - ldr r7, .sfrbu - mov r8, #\ena - ldr r9, [r7, #AT91_SFRBU_25LDOCR] - orr r9, r9, #AT91_SFRBU_25LDOCR_LP - cmp r8, #1 - beq lp_done_\ena - bic r9, r9, #AT91_SFRBU_25LDOCR_LP -lp_done_\ena: - ldr r10, =AT91_SFRBU_25LDOCR_LDOANAKEY - orr r9, r9, r10 - str r9, [r7, #AT91_SFRBU_25LDOCR] -#endif - .endm - .macro at91_backup_set_lpm reg #ifdef CONFIG_SOC_SAMA7 orr \reg, \reg, #0x200000 @@ -1023,9 +1000,6 @@ save_mck: at91_plla_disable - /* Enable low power mode for 2.5V regulator. */ - at91_2_5V_reg_set_low_power 1 - ldr tmp3, .pm_mode cmp tmp3, #AT91_PM_ULP1 beq ulp1_mode @@ -1038,9 +1012,6 @@ ulp1_mode: b ulp_exit ulp_exit: - /* Disable low power mode for 2.5V regulator. */ - at91_2_5V_reg_set_low_power 0 - ldr pmc, .pmc_base at91_plla_enable diff --git a/include/soc/at91/sama7-sfrbu.h b/include/soc/at91/sama7-sfrbu.h index 76b740810d34..8cee48d1ae2c 100644 --- a/include/soc/at91/sama7-sfrbu.h +++ b/include/soc/at91/sama7-sfrbu.h @@ -18,13 +18,6 @@ #define AT91_SFRBU_PSWBU_SOFTSWITCH (1 << 1) /* Power switch BU source selection */ #define AT91_SFRBU_PSWBU_CTRL (1 << 0) /* Power switch BU control */ -#define AT91_SFRBU_25LDOCR (0x0C) /* SFRBU 2.5V LDO Control Register */ -#define AT91_SFRBU_25LDOCR_LDOANAKEY (0x3B6E18 << 8) /* Specific value mandatory to allow writing of other register bits. */ -#define AT91_SFRBU_25LDOCR_STATE (1 << 3) /* LDOANA Switch On/Off Control */ -#define AT91_SFRBU_25LDOCR_LP (1 << 2) /* LDOANA Low-Power Mode Control */ -#define AT91_SFRBU_PD_VALUE_MSK (0x3) -#define AT91_SFRBU_25LDOCR_PD_VALUE(v) ((v) & AT91_SFRBU_PD_VALUE_MSK) /* LDOANA Pull-down value */ - #define AT91_FRBU_DDRPWR (0x10) /* SFRBU DDR Power Control Register */ #define AT91_FRBU_DDRPWR_STATE (1 << 0) /* DDR Power Mode State */ -- cgit v1.2.3 From f8d9e56aeb87ce82ce8636cd176cc59b69aa0e41 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 22 Aug 2025 13:56:27 +0300 Subject: i3c: master: Add helpers for DMA mapping and bounce buffer handling Some I3C controllers such as MIPI I3C HCI may pad the last DWORD (32-bit) with stale data from the RX FIFO in DMA transfers if the receive length is not DWORD aligned and when the device DMA is IOMMU mapped. In such a case, a properly sized bounce buffer is required in order to avoid possible data corruption. In a review discussion, proposal was to have a common helpers in I3C core for DMA mapping and bounce buffer handling. Drivers may use the helper i3c_master_dma_map_single() to map a buffer for a DMA transfer. It internally allocates a bounce buffer if buffer is not DMA'able or when the driver requires it for a transfer. Helper i3c_master_dma_unmap_single() does the needed cleanups and data copying from the bounce buffer. Signed-off-by: Jarkko Nikula Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250822105630.2820009-2-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/i3c/master.h | 26 ++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'include') diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 2ef898a8fd80..033d06cabca2 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1727,6 +1728,79 @@ int i3c_master_do_daa(struct i3c_master_controller *master) } EXPORT_SYMBOL_GPL(i3c_master_do_daa); +/** + * i3c_master_dma_map_single() - Map buffer for single DMA transfer + * @dev: device object of a device doing DMA + * @buf: destination/source buffer for DMA + * @len: length of transfer + * @force_bounce: true, force to use a bounce buffer, + * false, function will auto check is a bounce buffer required + * @dir: DMA direction + * + * Map buffer for a DMA transfer and allocate a bounce buffer if required. + * + * Return: I3C DMA transfer descriptor or NULL in case of error. + */ +struct i3c_dma *i3c_master_dma_map_single(struct device *dev, void *buf, + size_t len, bool force_bounce, enum dma_data_direction dir) +{ + struct i3c_dma *dma_xfer __free(kfree) = NULL; + void *bounce __free(kfree) = NULL; + void *dma_buf = buf; + + dma_xfer = kzalloc(sizeof(*dma_xfer), GFP_KERNEL); + if (!dma_xfer) + return NULL; + + dma_xfer->dev = dev; + dma_xfer->buf = buf; + dma_xfer->dir = dir; + dma_xfer->len = len; + dma_xfer->map_len = len; + + if (is_vmalloc_addr(buf)) + force_bounce = true; + + if (force_bounce) { + dma_xfer->map_len = ALIGN(len, cache_line_size()); + if (dir == DMA_FROM_DEVICE) + bounce = kzalloc(dma_xfer->map_len, GFP_KERNEL); + else + bounce = kmemdup(buf, dma_xfer->map_len, GFP_KERNEL); + if (!bounce) + return NULL; + dma_buf = bounce; + } + + dma_xfer->addr = dma_map_single(dev, dma_buf, dma_xfer->map_len, dir); + if (dma_mapping_error(dev, dma_xfer->addr)) + return NULL; + + dma_xfer->bounce_buf = no_free_ptr(bounce); + return no_free_ptr(dma_xfer); +} +EXPORT_SYMBOL_GPL(i3c_master_dma_map_single); + +/** + * i3c_master_dma_unmap_single() - Unmap buffer after DMA + * @dma_xfer: DMA transfer and mapping descriptor + * + * Unmap buffer and cleanup DMA transfer descriptor. + */ +void i3c_master_dma_unmap_single(struct i3c_dma *dma_xfer) +{ + dma_unmap_single(dma_xfer->dev, dma_xfer->addr, + dma_xfer->map_len, dma_xfer->dir); + if (dma_xfer->bounce_buf) { + if (dma_xfer->dir == DMA_FROM_DEVICE) + memcpy(dma_xfer->buf, dma_xfer->bounce_buf, + dma_xfer->len); + kfree(dma_xfer->bounce_buf); + } + kfree(dma_xfer); +} +EXPORT_SYMBOL_GPL(i3c_master_dma_unmap_single); + /** * i3c_master_set_info() - set master device information * @master: master used to send frames on the bus diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 043f5c7ff398..c52a82dd79a6 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -558,6 +558,26 @@ struct i3c_master_controller { #define i3c_bus_for_each_i3cdev(bus, dev) \ list_for_each_entry(dev, &(bus)->devs.i3c, common.node) +/** + * struct i3c_dma - DMA transfer and mapping descriptor + * @dev: device object of a device doing DMA + * @buf: destination/source buffer for DMA + * @len: length of transfer + * @map_len: length of DMA mapping + * @addr: mapped DMA address for a Host Controller Driver + * @dir: DMA direction + * @bounce_buf: an allocated bounce buffer if transfer needs it or NULL + */ +struct i3c_dma { + struct device *dev; + void *buf; + size_t len; + size_t map_len; + dma_addr_t addr; + enum dma_data_direction dir; + void *bounce_buf; +}; + int i3c_master_do_i2c_xfers(struct i3c_master_controller *master, const struct i2c_msg *xfers, int nxfers); @@ -575,6 +595,12 @@ int i3c_master_get_free_addr(struct i3c_master_controller *master, int i3c_master_add_i3c_dev_locked(struct i3c_master_controller *master, u8 addr); int i3c_master_do_daa(struct i3c_master_controller *master); +struct i3c_dma *i3c_master_dma_map_single(struct device *dev, void *ptr, + size_t len, bool force_bounce, + enum dma_data_direction dir); +void i3c_master_dma_unmap_single(struct i3c_dma *dma_xfer); +DEFINE_FREE(i3c_master_dma_unmap_single, void *, + if (_T) i3c_master_dma_unmap_single(_T)) int i3c_master_set_info(struct i3c_master_controller *master, const struct i3c_device_info *info); -- cgit v1.2.3 From 3baeae36039afc233d4a42d6ff4aa7019892619f Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 29 Aug 2025 16:10:57 +0300 Subject: PCI: Use pci_release_resource() instead of release_resource() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A few places in setup-bus.c call release_resource() directly and end up duplicating functionality from pci_release_resource() such as parent check, logging, and clearing the resource. Worse yet, the way the resource is cleared is inconsistent between different sites. Convert release_resource() calls into pci_release_resource() to remove code duplication. This will also make the resource start, end, and flags behavior consistent, i.e., start address is cleared, and only IORESOURCE_UNSET is asserted for the resource. While at it, eliminate the unnecessary initialization of idx variable in pci_bridge_release_resources(). Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20250829131113.36754-9-ilpo.jarvinen@linux.intel.com --- drivers/pci/setup-bus.c | 46 ++++++++++++++-------------------------------- drivers/pci/setup-res.c | 11 ++++++++--- include/linux/pci.h | 2 +- 3 files changed, 23 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 6bdc1af887da..b62465665abc 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -473,8 +473,6 @@ static void __assign_resources_sorted(struct list_head *head, struct pci_dev_resource *dev_res, *tmp_res, *dev_res2; struct resource *res; struct pci_dev *dev; - const char *res_name; - int idx; unsigned long fail_type; resource_size_t add_align, align; @@ -582,14 +580,7 @@ assign: res = dev_res->res; dev = dev_res->dev; - if (!res->parent) - continue; - - idx = pci_resource_num(dev, res); - res_name = pci_resource_name(dev, idx); - pci_dbg(dev, "%s %pR: releasing\n", res_name, res); - - release_resource(res); + pci_release_resource(dev, pci_resource_num(dev, res)); restore_dev_resource(dev_res); } /* Restore start/end/flags from saved list */ @@ -1732,7 +1723,7 @@ static void pci_bridge_release_resources(struct pci_bus *bus, struct resource *r; unsigned int old_flags; struct resource *b_res; - int idx = 1; + int idx, ret; b_res = &dev->resource[PCI_BRIDGE_RESOURCES]; @@ -1766,21 +1757,18 @@ static void pci_bridge_release_resources(struct pci_bus *bus, /* If there are children, release them all */ release_child_resources(r); - if (!release_resource(r)) { - type = old_flags = r->flags & PCI_RES_TYPE_MASK; - pci_info(dev, "resource %d %pR released\n", - PCI_BRIDGE_RESOURCES + idx, r); - /* Keep the old size */ - resource_set_range(r, 0, resource_size(r)); - r->flags = 0; - /* Avoiding touch the one without PREF */ - if (type & IORESOURCE_PREFETCH) - type = IORESOURCE_PREFETCH; - __pci_setup_bridge(bus, type); - /* For next child res under same bridge */ - r->flags = old_flags; - } + type = old_flags = r->flags & PCI_RES_TYPE_MASK; + ret = pci_release_resource(dev, PCI_BRIDGE_RESOURCES + idx); + if (ret) + return; + + /* Avoiding touch the one without PREF */ + if (type & IORESOURCE_PREFETCH) + type = IORESOURCE_PREFETCH; + __pci_setup_bridge(bus, type); + /* For next child res under same bridge */ + r->flags = old_flags; } enum release_type { @@ -2425,7 +2413,6 @@ int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type) for (i = PCI_BRIDGE_RESOURCES; i < PCI_BRIDGE_RESOURCE_END; i++) { struct resource *res = &bridge->resource[i]; - const char *res_name = pci_resource_name(bridge, i); if ((res->flags ^ type) & PCI_RES_TYPE_MASK) continue; @@ -2438,12 +2425,7 @@ int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type) if (ret) goto cleanup; - pci_info(bridge, "%s %pR: releasing\n", res_name, res); - - if (res->parent) - release_resource(res); - res->start = 0; - res->end = 0; + pci_release_resource(bridge, i); break; } if (i == PCI_BRIDGE_RESOURCE_END) diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index d2b3ed51e880..0468c058b598 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -406,20 +406,25 @@ int pci_reassign_resource(struct pci_dev *dev, int resno, return 0; } -void pci_release_resource(struct pci_dev *dev, int resno) +int pci_release_resource(struct pci_dev *dev, int resno) { struct resource *res = pci_resource_n(dev, resno); const char *res_name = pci_resource_name(dev, resno); + int ret; if (!res->parent) - return; + return 0; pci_info(dev, "%s %pR: releasing\n", res_name, res); - release_resource(res); + ret = release_resource(res); + if (ret) + return ret; res->end = resource_size(res) - 1; res->start = 0; res->flags |= IORESOURCE_UNSET; + + return 0; } EXPORT_SYMBOL(pci_release_resource); diff --git a/include/linux/pci.h b/include/linux/pci.h index 59876de13860..275df4058767 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1417,7 +1417,7 @@ void pci_reset_secondary_bus(struct pci_dev *dev); void pcibios_reset_secondary_bus(struct pci_dev *dev); void pci_update_resource(struct pci_dev *dev, int resno); int __must_check pci_assign_resource(struct pci_dev *dev, int i); -void pci_release_resource(struct pci_dev *dev, int resno); +int pci_release_resource(struct pci_dev *dev, int resno); static inline int pci_rebar_bytes_to_size(u64 bytes) { bytes = roundup_pow_of_two(bytes); -- cgit v1.2.3 From 4292a1e45fd464551efac7b2b52fd3606e956c28 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 29 Aug 2025 16:11:09 +0300 Subject: PCI: Refactor distributing available memory to use loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pci_bus_distribute_available_resources() and pci_bridge_distribute_available_resources() retain bridge window resources and related data needed for distributing the available window in independent variables for io, memory, and prefetchable memory windows. The code is essentially the same for all of them and therefore repeated three times with different variable names. Refactor pci_bus_distribute_available_resources() to take an array. This is complicated slightly by the function taking advantage of passing the struct as value, which cannot be done for arrays in C. Therefore, copy the data into a local array in the stack in the first loop. Variable names are (hopefully) improved slightly as well. Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20250829131113.36754-21-ilpo.jarvinen@linux.intel.com --- drivers/pci/setup-bus.c | 162 +++++++++++++++++++++--------------------------- include/linux/pci.h | 3 +- 2 files changed, 74 insertions(+), 91 deletions(-) (limited to 'include') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 720159bca54d..3bc329b1b923 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -2059,15 +2059,16 @@ static void remove_dev_resource(struct resource *avail, struct pci_dev *dev, avail->start = min(avail->start + tmp, avail->end + 1); } -static void remove_dev_resources(struct pci_dev *dev, struct resource *io, - struct resource *mmio, - struct resource *mmio_pref) +static void remove_dev_resources(struct pci_dev *dev, + struct resource available[PCI_P2P_BRIDGE_RESOURCE_NUM]) { + struct resource *mmio_pref = &available[PCI_BUS_BRIDGE_PREF_MEM_WINDOW]; struct resource *res; pci_dev_for_each_resource(dev, res) { if (resource_type(res) == IORESOURCE_IO) { - remove_dev_resource(io, dev, res); + remove_dev_resource(&available[PCI_BUS_BRIDGE_IO_WINDOW], + dev, res); } else if (resource_type(res) == IORESOURCE_MEM) { /* @@ -2081,10 +2082,13 @@ static void remove_dev_resources(struct pci_dev *dev, struct resource *io, */ if ((res->flags & IORESOURCE_PREFETCH) && ((res->flags & IORESOURCE_MEM_64) == - (mmio_pref->flags & IORESOURCE_MEM_64))) - remove_dev_resource(mmio_pref, dev, res); - else - remove_dev_resource(mmio, dev, res); + (mmio_pref->flags & IORESOURCE_MEM_64))) { + remove_dev_resource(&available[PCI_BUS_BRIDGE_PREF_MEM_WINDOW], + dev, res); + } else { + remove_dev_resource(&available[PCI_BUS_BRIDGE_MEM_WINDOW], + dev, res); + } } } } @@ -2099,45 +2103,39 @@ static void remove_dev_resources(struct pci_dev *dev, struct resource *io, * shared with the bridges. */ static void pci_bus_distribute_available_resources(struct pci_bus *bus, - struct list_head *add_list, - struct resource io, - struct resource mmio, - struct resource mmio_pref) + struct list_head *add_list, + struct resource available_in[PCI_P2P_BRIDGE_RESOURCE_NUM]) { + struct resource available[PCI_P2P_BRIDGE_RESOURCE_NUM]; unsigned int normal_bridges = 0, hotplug_bridges = 0; - struct resource *io_res, *mmio_res, *mmio_pref_res; struct pci_dev *dev, *bridge = bus->self; - resource_size_t io_per_b, mmio_per_b, mmio_pref_per_b, align; + resource_size_t per_bridge[PCI_P2P_BRIDGE_RESOURCE_NUM]; + resource_size_t align; + int i; - io_res = &bridge->resource[PCI_BRIDGE_IO_WINDOW]; - mmio_res = &bridge->resource[PCI_BRIDGE_MEM_WINDOW]; - mmio_pref_res = &bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; + for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++) { + struct resource *res = pci_bus_resource_n(bus, i); - /* - * The alignment of this bridge is yet to be considered, hence it must - * be done now before extending its bridge window. - */ - align = pci_resource_alignment(bridge, io_res); - if (!io_res->parent && align) - io.start = min(ALIGN(io.start, align), io.end + 1); - - align = pci_resource_alignment(bridge, mmio_res); - if (!mmio_res->parent && align) - mmio.start = min(ALIGN(mmio.start, align), mmio.end + 1); + available[i] = available_in[i]; - align = pci_resource_alignment(bridge, mmio_pref_res); - if (!mmio_pref_res->parent && align) - mmio_pref.start = min(ALIGN(mmio_pref.start, align), - mmio_pref.end + 1); + /* + * The alignment of this bridge is yet to be considered, + * hence it must be done now before extending its bridge + * window. + */ + align = pci_resource_alignment(bridge, res); + if (!res->parent && align) + available[i].start = min(ALIGN(available[i].start, align), + available[i].end + 1); - /* - * Now that we have adjusted for alignment, update the bridge window - * resources to fill as much remaining resource space as possible. - */ - adjust_bridge_window(bridge, io_res, add_list, resource_size(&io)); - adjust_bridge_window(bridge, mmio_res, add_list, resource_size(&mmio)); - adjust_bridge_window(bridge, mmio_pref_res, add_list, - resource_size(&mmio_pref)); + /* + * Now that we have adjusted for alignment, update the + * bridge window resources to fill as much remaining + * resource space as possible. + */ + adjust_bridge_window(bridge, res, add_list, + resource_size(&available[i])); + } /* * Calculate how many hotplug bridges and normal bridges there @@ -2161,7 +2159,7 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, */ list_for_each_entry(dev, &bus->devices, bus_list) { if (!dev->is_virtfn) - remove_dev_resources(dev, &io, &mmio, &mmio_pref); + remove_dev_resources(dev, available); } /* @@ -2173,16 +2171,9 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, * split between non-hotplug bridges. This is to allow possible * hotplug bridges below them to get the extra space as well. */ - if (hotplug_bridges) { - io_per_b = div64_ul(resource_size(&io), hotplug_bridges); - mmio_per_b = div64_ul(resource_size(&mmio), hotplug_bridges); - mmio_pref_per_b = div64_ul(resource_size(&mmio_pref), - hotplug_bridges); - } else { - io_per_b = div64_ul(resource_size(&io), normal_bridges); - mmio_per_b = div64_ul(resource_size(&mmio), normal_bridges); - mmio_pref_per_b = div64_ul(resource_size(&mmio_pref), - normal_bridges); + for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++) { + per_bridge[i] = div64_ul(resource_size(&available[i]), + hotplug_bridges ?: normal_bridges); } for_each_pci_bridge(dev, bus) { @@ -2195,49 +2186,41 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, if (hotplug_bridges && !dev->is_hotplug_bridge) continue; - res = &dev->resource[PCI_BRIDGE_IO_WINDOW]; + for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++) { + res = pci_bus_resource_n(bus, i); - /* - * Make sure the split resource space is properly aligned - * for bridge windows (align it down to avoid going above - * what is available). - */ - align = pci_resource_alignment(dev, res); - resource_set_size(&io, ALIGN_DOWN_IF_NONZERO(io_per_b, align)); - - /* - * The x_per_b holds the extra resource space that can be - * added for each bridge but there is the minimal already - * reserved as well so adjust x.start down accordingly to - * cover the whole space. - */ - io.start -= resource_size(res); - - res = &dev->resource[PCI_BRIDGE_MEM_WINDOW]; - align = pci_resource_alignment(dev, res); - resource_set_size(&mmio, - ALIGN_DOWN_IF_NONZERO(mmio_per_b,align)); - mmio.start -= resource_size(res); + /* + * Make sure the split resource space is properly + * aligned for bridge windows (align it down to + * avoid going above what is available). + */ + align = pci_resource_alignment(dev, res); + resource_set_size(&available[i], + ALIGN_DOWN_IF_NONZERO(per_bridge[i], + align)); - res = &dev->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; - align = pci_resource_alignment(dev, res); - resource_set_size(&mmio_pref, - ALIGN_DOWN_IF_NONZERO(mmio_pref_per_b, align)); - mmio_pref.start -= resource_size(res); + /* + * The per_bridge holds the extra resource space + * that can be added for each bridge but there is + * the minimal already reserved as well so adjust + * x.start down accordingly to cover the whole + * space. + */ + available[i].start -= resource_size(res); + } - pci_bus_distribute_available_resources(b, add_list, io, mmio, - mmio_pref); + pci_bus_distribute_available_resources(b, add_list, available); - io.start += io.end + 1; - mmio.start += mmio.end + 1; - mmio_pref.start += mmio_pref.end + 1; + for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++) + available[i].start += available[i].end + 1; } } static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, struct list_head *add_list) { - struct resource available_io, available_mmio, available_mmio_pref; + struct resource *res, available[PCI_P2P_BRIDGE_RESOURCE_NUM]; + unsigned int i; if (!bridge->is_hotplug_bridge) return; @@ -2245,14 +2228,13 @@ static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, pci_dbg(bridge, "distributing available resources\n"); /* Take the initial extra resources from the hotplug port */ - available_io = bridge->resource[PCI_BRIDGE_IO_WINDOW]; - available_mmio = bridge->resource[PCI_BRIDGE_MEM_WINDOW]; - available_mmio_pref = bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; + for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++) { + res = pci_resource_n(bridge, PCI_BRIDGE_RESOURCES + i); + available[i] = *res; + } pci_bus_distribute_available_resources(bridge->subordinate, - add_list, available_io, - available_mmio, - available_mmio_pref); + add_list, available); } static bool pci_bridge_resources_not_assigned(struct pci_dev *dev) diff --git a/include/linux/pci.h b/include/linux/pci.h index 275df4058767..723e9cede69d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -119,7 +119,8 @@ enum { #define PCI_CB_BRIDGE_MEM_1_WINDOW (PCI_BRIDGE_RESOURCES + 3) /* Total number of bridge resources for P2P and CardBus */ -#define PCI_BRIDGE_RESOURCE_NUM 4 +#define PCI_P2P_BRIDGE_RESOURCE_NUM 3 +#define PCI_BRIDGE_RESOURCE_NUM 4 /* Resources assigned to buses behind the bridge */ PCI_BRIDGE_RESOURCES, -- cgit v1.2.3 From 705d2ac7b2044f1ca05ba6033183151a04dbff4d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 16 Sep 2025 15:28:02 +0100 Subject: io_uring/zcrx: allow synchronous buffer return Returning buffers via a ring is performant and convenient, but it becomes a problem when/if the user misconfigured the ring size and it becomes full. Add a synchronous way to return buffers back to the page pool via a new register opcode. It's supposed to be a reliable slow path for refilling. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 ++++++++ io_uring/register.c | 3 ++ io_uring/zcrx.c | 68 +++++++++++++++++++++++++++++++++++++++++++ io_uring/zcrx.h | 7 +++++ 4 files changed, 90 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1ce17c535944..a0cc1cc0dd01 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -689,6 +689,9 @@ enum io_uring_register_op { /* query various aspects of io_uring, see linux/io_uring/query.h */ IORING_REGISTER_QUERY = 35, + /* return zcrx buffers back into circulation */ + IORING_REGISTER_ZCRX_REFILL = 36, + /* this goes last */ IORING_REGISTER_LAST, @@ -1070,6 +1073,15 @@ struct io_uring_zcrx_ifq_reg { __u64 __resv[3]; }; +struct io_uring_zcrx_sync_refill { + __u32 zcrx_id; + /* the number of entries to return */ + __u32 nr_entries; + /* pointer to an array of struct io_uring_zcrx_rqe */ + __u64 rqes; + __u64 __resv[2]; +}; + #ifdef __cplusplus } #endif diff --git a/io_uring/register.c b/io_uring/register.c index 96e9cac12823..43f04c47522c 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -833,6 +833,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_QUERY: ret = io_query(ctx, arg, nr_args); break; + case IORING_REGISTER_ZCRX_REFILL: + ret = io_zcrx_return_bufs(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 81d4aa75a69f..07a114f9a542 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -927,6 +927,74 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { .uninstall = io_pp_uninstall, }; +#define IO_ZCRX_MAX_SYS_REFILL_BUFS (1 << 16) +#define IO_ZCRX_SYS_REFILL_BATCH 32 + +static void io_return_buffers(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_rqe *rqes, unsigned nr) +{ + int i; + + for (i = 0; i < nr; i++) { + struct net_iov *niov; + netmem_ref netmem; + + if (!io_parse_rqe(&rqes[i], ifq, &niov)) + continue; + + scoped_guard(spinlock_bh, &ifq->rq_lock) { + if (!io_zcrx_put_niov_uref(niov)) + continue; + } + + netmem = net_iov_to_netmem(niov); + if (!page_pool_unref_and_test(netmem)) + continue; + io_zcrx_return_niov(niov); + } +} + +int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg) +{ + struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH]; + struct io_uring_zcrx_rqe __user *user_rqes; + struct io_uring_zcrx_sync_refill zr; + struct io_zcrx_ifq *ifq; + unsigned nr, i; + + if (nr_arg) + return -EINVAL; + if (copy_from_user(&zr, arg, sizeof(zr))) + return -EFAULT; + if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS) + return -EINVAL; + if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv))) + return -EINVAL; + + ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id); + if (!ifq) + return -EINVAL; + nr = zr.nr_entries; + user_rqes = u64_to_user_ptr(zr.rqes); + + for (i = 0; i < nr;) { + unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH); + size_t size = batch * sizeof(rqes[0]); + + if (copy_from_user(rqes, user_rqes + i, size)) + return i ? i : -EFAULT; + io_return_buffers(ifq, rqes, batch); + + i += batch; + + if (fatal_signal_pending(current)) + return i; + cond_resched(); + } + return nr; +} + static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, struct io_zcrx_ifq *ifq, int off, int len) { diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index a48871b5adad..33ef61503092 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -63,6 +63,8 @@ struct io_zcrx_ifq { }; #if defined(CONFIG_IO_URING_ZCRX) +int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg); int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); @@ -95,6 +97,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct { return NULL; } +static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg) +{ + return -EOPNOTSUPP; +} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); -- cgit v1.2.3 From 5020d05b3476f3561377a4ab076d42fda00e3607 Mon Sep 17 00:00:00 2001 From: Huisong Li Date: Thu, 11 Sep 2025 19:24:06 +0800 Subject: ACPI: processor: Remove unused empty stubs of some functions Empty stubs are defined in processor.h for some functions provided by the ACPI processor idle driver, but those functions are only used in the main ACPI processor driver which requires the ACPI processor idle driver to be present (selecting CONFIG_ACPI_PROCESSOR causes CONFIG_ACPI_PROCESSOR_IDLE to be selected too automatically). This means that the empty stubs in question are not really necessary and if both CONFIG_ACPI_PROCESSOR and CONFIG_ACPI_PROCESSOR_IDLE are unset, the compiler complains that they are defined, but not used. Drop them to get rid of the compiler warning. Signed-off-by: Huisong Li Link: https://patch.msgid.link/20250911112408.1668431-2-lihuisong@huawei.com [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- include/acpi/processor.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include') diff --git a/include/acpi/processor.h b/include/acpi/processor.h index ff864c1cee3a..2976a6d0c54f 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -425,26 +425,6 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); void acpi_processor_register_idle_driver(void); void acpi_processor_unregister_idle_driver(void); -#else -static inline int acpi_processor_power_init(struct acpi_processor *pr) -{ - return -ENODEV; -} - -static inline int acpi_processor_power_exit(struct acpi_processor *pr) -{ - return -ENODEV; -} - -static inline int acpi_processor_power_state_has_changed(struct acpi_processor *pr) -{ - return -ENODEV; -} - -static inline int acpi_processor_hotplug(struct acpi_processor *pr) -{ - return -ENODEV; -} #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ /* in processor_thermal.c */ -- cgit v1.2.3 From dadb3ebcf395ebee3626d88ac7e5e234f15bae2c Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Sun, 14 Sep 2025 15:44:26 +0200 Subject: workqueue: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently if a user enqueue a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistentcy cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This patch adds a new WQ_PERCPU flag to explicitly request the use of the per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 ++-- kernel/workqueue.c | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b6834b7aee4b..71a9900c03c7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -410,7 +410,7 @@ enum wq_flags { __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ /* BH wq only allows the following flags */ - __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, + __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI | WQ_PERCPU, }; enum wq_consts { @@ -570,7 +570,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ - alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 90db8cf015c2..45320e27a16c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7828,22 +7828,22 @@ void __init workqueue_init_early(void) ordered_wq_attrs[i] = attrs; } - system_wq = alloc_workqueue("events", 0, 0); - system_percpu_wq = alloc_workqueue("events", 0, 0); - system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); - system_long_wq = alloc_workqueue("events_long", 0, 0); + system_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_highpri_wq = alloc_workqueue("events_highpri", + WQ_HIGHPRI | WQ_PERCPU, 0); + system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", - WQ_FREEZABLE, 0); + WQ_FREEZABLE | WQ_PERCPU, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", - WQ_POWER_EFFICIENT, 0); + WQ_POWER_EFFICIENT | WQ_PERCPU, 0); system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", - WQ_FREEZABLE | WQ_POWER_EFFICIENT, - 0); - system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0); + WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0); + system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", - WQ_BH | WQ_HIGHPRI, 0); + WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || -- cgit v1.2.3 From 6b4be64fd9fec16418f365c2d8e47a7566e9eba5 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 15 Sep 2025 15:24:32 +0300 Subject: net/mlx5e: Harden uplink netdev access against device unbind The function mlx5_uplink_netdev_get() gets the uplink netdevice pointer from mdev->mlx5e_res.uplink_netdev. However, the netdevice can be removed and its pointer cleared when unbound from the mlx5_core.eth driver. This results in a NULL pointer, causing a kernel panic. BUG: unable to handle page fault for address: 0000000000001300 at RIP: 0010:mlx5e_vport_rep_load+0x22a/0x270 [mlx5_core] Call Trace: mlx5_esw_offloads_rep_load+0x68/0xe0 [mlx5_core] esw_offloads_enable+0x593/0x910 [mlx5_core] mlx5_eswitch_enable_locked+0x341/0x420 [mlx5_core] mlx5_devlink_eswitch_mode_set+0x17e/0x3a0 [mlx5_core] devlink_nl_eswitch_set_doit+0x60/0xd0 genl_family_rcv_msg_doit+0xe0/0x130 genl_rcv_msg+0x183/0x290 netlink_rcv_skb+0x4b/0xf0 genl_rcv+0x24/0x40 netlink_unicast+0x255/0x380 netlink_sendmsg+0x1f3/0x420 __sock_sendmsg+0x38/0x60 __sys_sendto+0x119/0x180 do_syscall_64+0x53/0x1d0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Ensure the pointer is valid before use by checking it for NULL. If it is valid, immediately call netdev_hold() to take a reference, and preventing the netdevice from being freed while it is in use. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Reviewed-by: Jiri Pirko Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757939074-617281-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 27 ++++++++++++++++++---- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h | 15 +++++++++++- include/linux/mlx5/driver.h | 1 + 4 files changed, 38 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 63a7a788fb0d..cd0242eb008c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1506,12 +1506,21 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = { static int mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { - struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct net_device *netdev; + struct mlx5e_priv *priv; + int err; + + netdev = mlx5_uplink_netdev_get(dev); + if (!netdev) + return 0; + priv = netdev_priv(netdev); rpriv->netdev = priv->netdev; - return mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, - rpriv); + err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, + rpriv); + mlx5_uplink_netdev_put(dev, netdev); + return err; } static void @@ -1638,8 +1647,16 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) { struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); struct net_device *netdev = rpriv->netdev; - struct mlx5e_priv *priv = netdev_priv(netdev); - void *ppriv = priv->ppriv; + struct mlx5e_priv *priv; + void *ppriv; + + if (!netdev) { + ppriv = rpriv; + goto free_ppriv; + } + + priv = netdev_priv(netdev); + ppriv = priv->ppriv; if (rep->vport == MLX5_VPORT_UPLINK) { mlx5e_vport_uplink_rep_unload(rpriv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 8b4977650183..5f2d6c35f1ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1515,6 +1515,7 @@ static u32 mlx5_esw_qos_lag_link_speed_get_locked(struct mlx5_core_dev *mdev) speed = lksettings.base.speed; out: + mlx5_uplink_netdev_put(mdev, slave); return speed; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index b111ccd03b02..74ea5da58b7e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -47,7 +47,20 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data); static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) { - return mdev->mlx5e_res.uplink_netdev; + struct mlx5e_resources *mlx5e_res = &mdev->mlx5e_res; + struct net_device *netdev; + + mutex_lock(&mlx5e_res->uplink_netdev_lock); + netdev = mlx5e_res->uplink_netdev; + netdev_hold(netdev, &mlx5e_res->tracker, GFP_KERNEL); + mutex_unlock(&mlx5e_res->uplink_netdev_lock); + return netdev; +} + +static inline void mlx5_uplink_netdev_put(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + netdev_put(netdev, &mdev->mlx5e_res.tracker); } struct mlx5_sd; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c5fbfb85749..10fe492e1fed 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -663,6 +663,7 @@ struct mlx5e_resources { bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; + netdevice_tracker tracker; struct mutex uplink_netdev_lock; struct mlx5_crypto_dek_priv *dek_priv; }; -- cgit v1.2.3 From fb1f4568346153d2f80fdb4ffcfa0cf4fb257d3c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Sep 2025 12:06:07 -0700 Subject: scsi: ufs: core: Disable timestamp functionality if not supported Some Kioxia UFS 4 devices do not support the qTimestamp attribute. Set the UFS_DEVICE_QUIRK_NO_TIMESTAMP_SUPPORT for these devices such that no error messages appear in the kernel log about failures to set the qTimestamp attribute. Signed-off-by: Bart Van Assche Reviewed-by: Avri Altman Tested-by: Nitin Rawat # on SM8650-QRD Reviewed-by: Nitin Rawat Reviewed-by: Peter Wang Reviewed-by: Manivannan Sadhasivam Message-ID: <20250909190614.3531435-1-bvanassche@acm.org> Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 6 +++++- include/ufs/ufs_quirks.h | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index e2157128e3bf..cfc149f8238e 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -316,6 +316,9 @@ static const struct ufs_dev_quirk ufs_fixups[] = { { .wmanufacturerid = UFS_VENDOR_TOSHIBA, .model = "THGLF2G9D8KBADG", .quirk = UFS_DEVICE_QUIRK_PA_TACTIVATE }, + { .wmanufacturerid = UFS_VENDOR_TOSHIBA, + .model = "THGJFJT1E45BATP", + .quirk = UFS_DEVICE_QUIRK_NO_TIMESTAMP_SUPPORT }, {} }; @@ -8778,7 +8781,8 @@ static void ufshcd_set_timestamp_attr(struct ufs_hba *hba) struct ufs_dev_info *dev_info = &hba->dev_info; struct utp_upiu_query_v4_0 *upiu_data; - if (dev_info->wspecversion < 0x400) + if (dev_info->wspecversion < 0x400 || + hba->dev_quirks & UFS_DEVICE_QUIRK_NO_TIMESTAMP_SUPPORT) return; ufshcd_dev_man_lock(hba); diff --git a/include/ufs/ufs_quirks.h b/include/ufs/ufs_quirks.h index f52de5ed1b3b..83563247c36c 100644 --- a/include/ufs/ufs_quirks.h +++ b/include/ufs/ufs_quirks.h @@ -113,4 +113,7 @@ struct ufs_dev_quirk { */ #define UFS_DEVICE_QUIRK_PA_HIBER8TIME (1 << 12) +/* Some UFS 4 devices do not support the qTimestamp attribute */ +#define UFS_DEVICE_QUIRK_NO_TIMESTAMP_SUPPORT (1 << 13) + #endif /* UFS_QUIRKS_H_ */ -- cgit v1.2.3 From ea6bb47fd6a4c5a332f9349c39bf7462e3e7a35b Mon Sep 17 00:00:00 2001 From: Alan Borzeszkowski Date: Wed, 27 Aug 2025 13:56:46 +0200 Subject: thunderbolt: Update thunderbolt.h header file Make Thunderbolt header file compliant with current kernel-doc standards. No functional changes. Signed-off-by: Alan Borzeszkowski Signed-off-by: Mika Westerberg --- include/linux/thunderbolt.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 75247486616b..0ba112175bb3 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -213,7 +213,7 @@ enum tb_link_width { * queried first * @service_ids: Used to generate IDs for the services * @in_hopids: Input HopIDs for DMA tunneling - * @out_hopids; Output HopIDs for DMA tunneling + * @out_hopids: Output HopIDs for DMA tunneling * @local_property_block: Local block of properties * @local_property_block_gen: Generation of @local_property_block * @local_property_block_len: Length of the @local_property_block in dwords @@ -356,7 +356,7 @@ int tb_xdomain_request(struct tb_xdomain *xd, const void *request, unsigned int timeout_msec); /** - * tb_protocol_handler - Protocol specific handler + * struct tb_protocol_handler - Protocol specific handler * @uuid: XDomain messages with this UUID are dispatched to this handler * @callback: Callback called with the XDomain message. Returning %1 * here tells the XDomain core that the message was handled @@ -437,7 +437,7 @@ static inline struct tb_service *tb_to_service(struct device *dev) } /** - * tb_service_driver - Thunderbolt service driver + * struct tb_service_driver - Thunderbolt service driver * @driver: Driver structure * @probe: Called when the driver is probed * @remove: Called when the driver is removed (optional) @@ -519,6 +519,7 @@ struct tb_nhi { * @head: Head of the ring (write next descriptor here) * @tail: Tail of the ring (complete next descriptor here) * @descriptors: Allocated descriptors for this ring + * @descriptors_dma: DMA address of descriptors for this ring * @queue: Queue holding frames to be transferred over this ring * @in_flight: Queue holding frames that are currently in flight * @work: Interrupt work structure @@ -571,12 +572,12 @@ typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled); /** * enum ring_desc_flags - Flags for DMA ring descriptor - * %RING_DESC_ISOCH: Enable isonchronous DMA (Tx only) - * %RING_DESC_CRC_ERROR: In frame mode CRC check failed for the frame (Rx only) - * %RING_DESC_COMPLETED: Descriptor completed (set by NHI) - * %RING_DESC_POSTED: Always set this - * %RING_DESC_BUFFER_OVERRUN: RX buffer overrun - * %RING_DESC_INTERRUPT: Request an interrupt on completion + * @RING_DESC_ISOCH: Enable isonchronous DMA (Tx only) + * @RING_DESC_CRC_ERROR: In frame mode CRC check failed for the frame (Rx only) + * @RING_DESC_COMPLETED: Descriptor completed (set by NHI) + * @RING_DESC_POSTED: Always set this + * @RING_DESC_BUFFER_OVERRUN: RX buffer overrun + * @RING_DESC_INTERRUPT: Request an interrupt on completion */ enum ring_desc_flags { RING_DESC_ISOCH = 0x1, @@ -636,7 +637,7 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); * If ring_stop() is called after the packet has been enqueued * @frame->callback will be called with canceled set to true. * - * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise. + * Return: %-ESHUTDOWN if ring_stop() has been called, %0 otherwise. */ static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) { @@ -657,7 +658,7 @@ static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) * If ring_stop() is called after the packet has been enqueued @frame->callback * will be called with canceled set to true. * - * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise. + * Return: %-ESHUTDOWN if ring_stop has been called, %0 otherwise. */ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) { @@ -675,6 +676,8 @@ void tb_ring_poll_complete(struct tb_ring *ring); * * Use this function when you are mapping DMA for buffers that are * passed to the ring for sending/receiving. + * + * Return: Pointer to device used for DMA mapping. */ static inline struct device *tb_ring_dma_device(struct tb_ring *ring) { -- cgit v1.2.3 From 29589343488e116ac31f6f3cfa83e43949a2207a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Sep 2025 23:32:30 +0200 Subject: asm-generic: Provide generic TIF infrastructure Common TIF bits do not have to be defined by every architecture. They can be defined in a generic header. That allows adding generic TIF bits without chasing a gazillion of architecture headers, which is again a unjustified burden on anyone who works on generic infrastructure as it always needs a boat load of work to keep existing architecture code working when adding new stuff. While it is not as horrible as the ignorance of the generic entry infrastructure, it is a welcome mechanism to make architecture people rethink their approach of just leaching generic improvements into architecture code and thereby making it accumulatingly harder to maintain and improve generic code. It's about time that this changes. Provide the infrastructure and split the TIF space in half, 16 generic and 16 architecture specific bits. This could probably be extended by TIF_SINGLESTEP and BLOCKSTEP, but those are only used in architecture specific code. So leave them alone for now. Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Acked-by: Arnd Bergmann --- arch/Kconfig | 4 +++ include/asm-generic/thread_info_tif.h | 48 +++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 include/asm-generic/thread_info_tif.h (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index d1b4ffd6e085..c20df40a7220 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1730,6 +1730,10 @@ config ARCH_VMLINUX_NEEDS_RELOCS relocations preserved. This is used by some architectures to construct bespoke relocation tables for KASLR. +# Select if architecture uses the common generic TIF bits +config HAVE_GENERIC_TIF_BITS + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h new file mode 100644 index 000000000000..ee3793e9b1a4 --- /dev/null +++ b/include/asm-generic/thread_info_tif.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_THREAD_INFO_TIF_H_ +#define _ASM_GENERIC_THREAD_INFO_TIF_H_ + +#include + +/* Bits 16-31 are reserved for architecture specific purposes */ + +#define TIF_NOTIFY_RESUME 0 // callback before returning to user +#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) + +#define TIF_SIGPENDING 1 // signal pending +#define _TIF_SIGPENDING BIT(TIF_SIGPENDING) + +#define TIF_NOTIFY_SIGNAL 2 // signal notifications exist +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) + +#define TIF_MEMDIE 3 // is terminating due to OOM killer +#define _TIF_MEMDIE BIT(TIF_MEMDIE) + +#define TIF_NEED_RESCHED 4 // rescheduling necessary +#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) + +#ifdef HAVE_TIF_NEED_RESCHED_LAZY +# define TIF_NEED_RESCHED_LAZY 5 // Lazy rescheduling needed +# define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) +#endif + +#ifdef HAVE_TIF_POLLING_NRFLAG +# define TIF_POLLING_NRFLAG 6 // idle is polling for TIF_NEED_RESCHED +# define _TIF_POLLING_NRFLAG BIT(TIF_POLLING_NRFLAG) +#endif + +#define TIF_USER_RETURN_NOTIFY 7 // notify kernel of userspace return +#define _TIF_USER_RETURN_NOTIFY BIT(TIF_USER_RETURN_NOTIFY) + +#define TIF_UPROBE 8 // breakpointed or singlestepping +#define _TIF_UPROBE BIT(TIF_UPROBE) + +#define TIF_PATCH_PENDING 9 // pending live patching update +#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) + +#ifdef HAVE_TIF_RESTORE_SIGMASK +# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() */ +# define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) +#endif + +#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ -- cgit v1.2.3 From de2be98541dbe0de58d2dccf7fa19dfc9d9a8260 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Sep 2025 10:10:17 +0300 Subject: net/mlx5: Remove VLAN insertion fields from WQE Ether segment Now that the driver no longer uses VLAN TX insertion via the WQE Ethernet segment, the related fields and flags can be removed. Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757574619-604874-2-git-send-email-tariqt@nvidia.com Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky --- include/linux/mlx5/qp.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index fc7eeff99a8a..5546c7bd2c83 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -237,13 +237,11 @@ enum { }; enum { - MLX5_ETH_WQE_SVLAN = 1 << 0, MLX5_ETH_WQE_TRAILER_HDR_OUTER_IP_ASSOC = 1 << 26, MLX5_ETH_WQE_TRAILER_HDR_OUTER_L4_ASSOC = 1 << 27, MLX5_ETH_WQE_TRAILER_HDR_INNER_IP_ASSOC = 3 << 26, MLX5_ETH_WQE_TRAILER_HDR_INNER_L4_ASSOC = 1 << 28, MLX5_ETH_WQE_INSERT_TRAILER = 1 << 30, - MLX5_ETH_WQE_INSERT_VLAN = 1 << 15, }; enum { @@ -275,10 +273,6 @@ struct mlx5_wqe_eth_seg { DECLARE_FLEX_ARRAY(u8, data); }; } inline_hdr; - struct { - __be16 type; - __be16 vlan_tci; - } insert; __be32 trailer; }; }; -- cgit v1.2.3 From cce65f32443b61db2370a67d2e92d16b773fe8a4 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Sep 2025 10:10:18 +0300 Subject: net/mlx5: Refactor MACsec WQE metadata shifts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce MLX5_ETH_WQE_FT_META_SHIFT as a shared base offset for features that use the lower 8 bits of the WQE flow_table_metadata field, currently used for timestamping, IPsec, and MACsec. Define MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK so that fs_id occupies bits 2–5, making it clear that fs_id occupies bits in the metadata. Set MLX5_ETH_WQE_FT_META_MACSEC_MASK as the OR of the MACsec flag and MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK, corresponding to the original 0x3E mask. Update the fs_id macro to right-shift the MACsec flag by MLX5_ETH_WQE_FT_META_SHIFT and update the RoCE modify-header action to use it. Introduce the helper macro MLX5_MACSEC_TX_METADATA(fs_id) to compose the full shifted MACsec metadata value. These changes make it explicit exactly which metadata bits carry MACsec information, simplifying future feature exclusions when multiple features share the WQE flowtable metadata. In addition, drop the incorrect “RX flow steering” comment, since this applies to TX flow steering. Signed-off-by: Carolina Jubran Reviewed-by: Jianbo Liu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757574619-604874-3-git-send-email-tariqt@nvidia.com Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c | 12 +++++------- drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h | 15 +++++++++++++++ include/linux/mlx5/qp.h | 9 +++++++-- 4 files changed, 28 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 6ab02f3fc291..528b04d4de41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -1676,7 +1676,7 @@ void mlx5e_macsec_tx_build_eseg(struct mlx5e_macsec *macsec, if (!fs_id) return; - eseg->flow_table_metadata = cpu_to_be32(MLX5_ETH_WQE_FT_META_MACSEC | fs_id << 2); + eseg->flow_table_metadata = cpu_to_be32(MLX5_MACSEC_TX_METADATA(fs_id)); } void mlx5e_macsec_offload_handle_rx_skb(struct net_device *netdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c index 762d55ba9e51..9ec450603176 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c @@ -45,11 +45,7 @@ #define MLX5_SECTAG_HEADER_SIZE_WITHOUT_SCI 0x8 #define MLX5_SECTAG_HEADER_SIZE_WITH_SCI (MLX5_SECTAG_HEADER_SIZE_WITHOUT_SCI + MACSEC_SCI_LEN) -/* MACsec RX flow steering */ -#define MLX5_ETH_WQE_FT_META_MACSEC_MASK 0x3E - /* MACsec fs_id handling for steering */ -#define macsec_fs_set_tx_fs_id(fs_id) (MLX5_ETH_WQE_FT_META_MACSEC | (fs_id) << 2) #define macsec_fs_set_rx_fs_id(fs_id) ((fs_id) | BIT(30)) struct mlx5_sectag_header { @@ -597,7 +593,7 @@ static int macsec_fs_tx_setup_fte(struct mlx5_macsec_fs *macsec_fs, MLX5_SET(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_a, MLX5_ETH_WQE_FT_META_MACSEC_MASK); MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_a, - macsec_fs_set_tx_fs_id(id)); + MLX5_MACSEC_TX_METADATA(id)); *fs_id = id; flow_act->crypto.type = MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC; @@ -2219,8 +2215,10 @@ static int mlx5_macsec_fs_add_roce_rule_tx(struct mlx5_macsec_fs *macsec_fs, u32 MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_A); - MLX5_SET(set_action_in, action, data, macsec_fs_set_tx_fs_id(fs_id)); - MLX5_SET(set_action_in, action, offset, 0); + MLX5_SET(set_action_in, action, data, + mlx5_macsec_fs_set_tx_fs_id(fs_id)); + MLX5_SET(set_action_in, action, offset, + MLX5_ETH_WQE_FT_META_MACSEC_SHIFT); MLX5_SET(set_action_in, action, length, 32); modify_hdr = mlx5_modify_header_alloc(mdev, MLX5_FLOW_NAMESPACE_RDMA_TX_MACSEC, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h index 34b80c3ef6a5..15acaff43641 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h @@ -12,6 +12,21 @@ #define MLX5_MACSEC_METADATA_MARKER(metadata) ((((metadata) >> 30) & 0x3) == 0x1) #define MLX5_MACSEC_RX_METADAT_HANDLE(metadata) ((metadata) & MLX5_MACSEC_RX_FS_ID_MASK) +/* MACsec TX flow steering */ +#define MLX5_ETH_WQE_FT_META_MACSEC_MASK \ + (MLX5_ETH_WQE_FT_META_MACSEC | MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK) +#define MLX5_ETH_WQE_FT_META_MACSEC_SHIFT MLX5_ETH_WQE_FT_META_SHIFT + +/* MACsec fs_id handling for steering */ +#define mlx5_macsec_fs_set_tx_fs_id(fs_id) \ + (((MLX5_ETH_WQE_FT_META_MACSEC) >> MLX5_ETH_WQE_FT_META_MACSEC_SHIFT) \ + | ((fs_id) << 2)) + +#define MLX5_MACSEC_TX_METADATA(fs_id) \ + (mlx5_macsec_fs_set_tx_fs_id(fs_id) << \ + MLX5_ETH_WQE_FT_META_MACSEC_SHIFT) + +/* MACsec fs_id uses 4 bits, supports up to 16 interfaces */ #define MLX5_MACSEC_NUM_OF_SUPPORTED_INTERFACES 16 struct mlx5_macsec_fs; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 5546c7bd2c83..b21be7630575 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -251,9 +251,14 @@ enum { MLX5_ETH_WQE_SWP_OUTER_L4_UDP = 1 << 5, }; +/* Base shift for metadata bits used by timestamping, IPsec, and MACsec */ +#define MLX5_ETH_WQE_FT_META_SHIFT 0 + enum { - MLX5_ETH_WQE_FT_META_IPSEC = BIT(0), - MLX5_ETH_WQE_FT_META_MACSEC = BIT(1), + MLX5_ETH_WQE_FT_META_IPSEC = BIT(0) << MLX5_ETH_WQE_FT_META_SHIFT, + MLX5_ETH_WQE_FT_META_MACSEC = BIT(1) << MLX5_ETH_WQE_FT_META_SHIFT, + MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK = + GENMASK(5, 2) << MLX5_ETH_WQE_FT_META_SHIFT, }; struct mlx5_wqe_eth_seg { -- cgit v1.2.3 From 2ac207381c37eebc49559634ce5642119784bc7c Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Sep 2025 10:10:19 +0300 Subject: net/mlx5e: Prevent WQE metadata conflicts between timestamping and offloads Update the WQE metadata assignment to avoid overriding existing metadata when setting the sysport timestamp ID. Since timestamp IDs are limited to 256 values, they use only the lower 8 bits of the metadata field. To avoid conflicts, move IPsec and MACsec metadata ID to bits 8 and 9, and shift the MACsec fs_id accordingly. This ensures safe coexistence of timestamping and offload features that use the same metadata field. Signed-off-by: Carolina Jubran Reviewed-by: Jianbo Liu Reviewed-by: Patrisious Haddad Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757574619-604874-4-git-send-email-tariqt@nvidia.com Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c | 2 +- include/linux/mlx5/qp.h | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 319061d31602..6c55b67b7335 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -653,7 +653,7 @@ static void mlx5e_cqe_ts_id_eseg(struct mlx5e_ptpsq *ptpsq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) { if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) - eseg->flow_table_metadata = + eseg->flow_table_metadata |= cpu_to_be32(mlx5e_ptp_metadata_fifo_peek(&ptpsq->metadata_freelist)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c index 9ec450603176..e6be2f01daf4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c @@ -2219,7 +2219,7 @@ static int mlx5_macsec_fs_add_roce_rule_tx(struct mlx5_macsec_fs *macsec_fs, u32 mlx5_macsec_fs_set_tx_fs_id(fs_id)); MLX5_SET(set_action_in, action, offset, MLX5_ETH_WQE_FT_META_MACSEC_SHIFT); - MLX5_SET(set_action_in, action, length, 32); + MLX5_SET(set_action_in, action, length, 8); modify_hdr = mlx5_modify_header_alloc(mdev, MLX5_FLOW_NAMESPACE_RDMA_TX_MACSEC, 1, action); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index b21be7630575..d67aedc6ea68 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -251,8 +251,9 @@ enum { MLX5_ETH_WQE_SWP_OUTER_L4_UDP = 1 << 5, }; -/* Base shift for metadata bits used by timestamping, IPsec, and MACsec */ -#define MLX5_ETH_WQE_FT_META_SHIFT 0 +/* Metadata bits 0-7 are used by timestamping */ +/* Base shift for metadata bits used by IPsec and MACsec */ +#define MLX5_ETH_WQE_FT_META_SHIFT 8 enum { MLX5_ETH_WQE_FT_META_IPSEC = BIT(0) << MLX5_ETH_WQE_FT_META_SHIFT, -- cgit v1.2.3 From 0b1619c38600fc06c73b1f59c64af0b7df08fc2c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 15 Sep 2025 11:10:07 +0200 Subject: gpio: nomadik: fix the debugfs helper stub Commit ddeb66d2cb10 ("gpio: nomadik: don't print out global GPIO numbers in debugfs callbacks") failed to also update the stub of the debugfs helper for !CONFIG_DEBUG_FS. Fix the resulting build failure. Fixes: ddeb66d2cb10 ("gpio: nomadik: don't print out global GPIO numbers in debugfs callbacks") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509132232.12viPUPB-lkp@intel.com/ Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250915091007.28438-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/gpio-nomadik.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index 7ba53b499e16..592a774a53cd 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -268,8 +268,7 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, static inline void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, struct gpio_chip *chip, - unsigned int offset, - unsigned int gpio) + unsigned int offset) { } -- cgit v1.2.3 From 75d5546f60b36900051d75ee623fceccbeb6750c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 12 Sep 2025 18:58:51 +0200 Subject: HID: hidraw: tighten ioctl command parsing The handling for variable-length ioctl commands in hidraw_ioctl() is rather complex and the check for the data direction is incomplete. Simplify this code by factoring out the various ioctls grouped by dir and size, and using a switch() statement with the size masked out, to ensure the rest of the command is correctly matched. Fixes: 9188e79ec3fd ("HID: add phys and name ioctls to hidraw") Reported-by: Arnd Bergmann Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hidraw.c | 224 ++++++++++++++++++++++++-------------------- include/uapi/linux/hidraw.h | 2 + 2 files changed, 124 insertions(+), 102 deletions(-) (limited to 'include') diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index c887f48756f4..bbd6f23bce78 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -394,27 +394,15 @@ static int hidraw_revoke(struct hidraw_list *list) return 0; } -static long hidraw_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long hidraw_fixed_size_ioctl(struct file *file, struct hidraw *dev, unsigned int cmd, + void __user *arg) { - struct inode *inode = file_inode(file); - unsigned int minor = iminor(inode); - long ret = 0; - struct hidraw *dev; - struct hidraw_list *list = file->private_data; - void __user *user_arg = (void __user*) arg; - - down_read(&minors_rwsem); - dev = hidraw_table[minor]; - if (!dev || !dev->exist || hidraw_is_revoked(list)) { - ret = -ENODEV; - goto out; - } + struct hid_device *hid = dev->hid; switch (cmd) { case HIDIOCGRDESCSIZE: - if (put_user(dev->hid->rsize, (int __user *)arg)) - ret = -EFAULT; + if (put_user(hid->rsize, (int __user *)arg)) + return -EFAULT; break; case HIDIOCGRDESC: @@ -422,113 +410,145 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, __u32 len; if (get_user(len, (int __user *)arg)) - ret = -EFAULT; - else if (len > HID_MAX_DESCRIPTOR_SIZE - 1) - ret = -EINVAL; - else if (copy_to_user(user_arg + offsetof( - struct hidraw_report_descriptor, - value[0]), - dev->hid->rdesc, - min(dev->hid->rsize, len))) - ret = -EFAULT; + return -EFAULT; + + if (len > HID_MAX_DESCRIPTOR_SIZE - 1) + return -EINVAL; + + if (copy_to_user(arg + offsetof( + struct hidraw_report_descriptor, + value[0]), + hid->rdesc, + min(hid->rsize, len))) + return -EFAULT; + break; } case HIDIOCGRAWINFO: { struct hidraw_devinfo dinfo; - dinfo.bustype = dev->hid->bus; - dinfo.vendor = dev->hid->vendor; - dinfo.product = dev->hid->product; - if (copy_to_user(user_arg, &dinfo, sizeof(dinfo))) - ret = -EFAULT; + dinfo.bustype = hid->bus; + dinfo.vendor = hid->vendor; + dinfo.product = hid->product; + if (copy_to_user(arg, &dinfo, sizeof(dinfo))) + return -EFAULT; break; } case HIDIOCREVOKE: { - if (user_arg) - ret = -EINVAL; - else - ret = hidraw_revoke(list); - break; + struct hidraw_list *list = file->private_data; + + if (arg) + return -EINVAL; + + return hidraw_revoke(list); } default: - { - struct hid_device *hid = dev->hid; - if (_IOC_TYPE(cmd) != 'H') { - ret = -EINVAL; - break; - } + /* + * None of the above ioctls can return -EAGAIN, so + * use it as a marker that we need to check variable + * length ioctls. + */ + return -EAGAIN; + } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSFEATURE(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT); - break; - } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGFEATURE(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT); - break; - } + return 0; +} - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSINPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_send_report(file, user_arg, len, HID_INPUT_REPORT); - break; - } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGINPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_get_report(file, user_arg, len, HID_INPUT_REPORT); - break; - } +static long hidraw_rw_variable_size_ioctl(struct file *file, struct hidraw *dev, unsigned int cmd, + void __user *user_arg) +{ + int len = _IOC_SIZE(cmd); + + switch (cmd & ~IOCSIZE_MASK) { + case HIDIOCSFEATURE(0): + return hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT); + case HIDIOCGFEATURE(0): + return hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT); + case HIDIOCSINPUT(0): + return hidraw_send_report(file, user_arg, len, HID_INPUT_REPORT); + case HIDIOCGINPUT(0): + return hidraw_get_report(file, user_arg, len, HID_INPUT_REPORT); + case HIDIOCSOUTPUT(0): + return hidraw_send_report(file, user_arg, len, HID_OUTPUT_REPORT); + case HIDIOCGOUTPUT(0): + return hidraw_get_report(file, user_arg, len, HID_OUTPUT_REPORT); + } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSOUTPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_send_report(file, user_arg, len, HID_OUTPUT_REPORT); - break; - } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGOUTPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_get_report(file, user_arg, len, HID_OUTPUT_REPORT); - break; - } + return -EINVAL; +} - /* Begin Read-only ioctls. */ - if (_IOC_DIR(cmd) != _IOC_READ) { - ret = -EINVAL; - break; - } +static long hidraw_ro_variable_size_ioctl(struct file *file, struct hidraw *dev, unsigned int cmd, + void __user *user_arg) +{ + struct hid_device *hid = dev->hid; + int len = _IOC_SIZE(cmd); + int field_len; + + switch (cmd & ~IOCSIZE_MASK) { + case HIDIOCGRAWNAME(0): + field_len = strlen(hid->name) + 1; + if (len > field_len) + len = field_len; + return copy_to_user(user_arg, hid->name, len) ? -EFAULT : len; + case HIDIOCGRAWPHYS(0): + field_len = strlen(hid->phys) + 1; + if (len > field_len) + len = field_len; + return copy_to_user(user_arg, hid->phys, len) ? -EFAULT : len; + case HIDIOCGRAWUNIQ(0): + field_len = strlen(hid->uniq) + 1; + if (len > field_len) + len = field_len; + return copy_to_user(user_arg, hid->uniq, len) ? -EFAULT : len; + } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWNAME(0))) { - int len = strlen(hid->name) + 1; - if (len > _IOC_SIZE(cmd)) - len = _IOC_SIZE(cmd); - ret = copy_to_user(user_arg, hid->name, len) ? - -EFAULT : len; - break; - } + return -EINVAL; +} - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWPHYS(0))) { - int len = strlen(hid->phys) + 1; - if (len > _IOC_SIZE(cmd)) - len = _IOC_SIZE(cmd); - ret = copy_to_user(user_arg, hid->phys, len) ? - -EFAULT : len; - break; - } +static long hidraw_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + unsigned int minor = iminor(inode); + struct hidraw *dev; + struct hidraw_list *list = file->private_data; + void __user *user_arg = (void __user *)arg; + int ret; - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWUNIQ(0))) { - int len = strlen(hid->uniq) + 1; - if (len > _IOC_SIZE(cmd)) - len = _IOC_SIZE(cmd); - ret = copy_to_user(user_arg, hid->uniq, len) ? - -EFAULT : len; - break; - } - } + down_read(&minors_rwsem); + dev = hidraw_table[minor]; + if (!dev || !dev->exist || hidraw_is_revoked(list)) { + ret = -ENODEV; + goto out; + } + + if (_IOC_TYPE(cmd) != 'H') { + ret = -EINVAL; + goto out; + } + if (_IOC_NR(cmd) > HIDIOCTL_LAST || _IOC_NR(cmd) == 0) { ret = -ENOTTY; + goto out; } + + ret = hidraw_fixed_size_ioctl(file, dev, cmd, user_arg); + if (ret != -EAGAIN) + goto out; + + switch (_IOC_DIR(cmd)) { + case (_IOC_READ | _IOC_WRITE): + ret = hidraw_rw_variable_size_ioctl(file, dev, cmd, user_arg); + break; + case _IOC_READ: + ret = hidraw_ro_variable_size_ioctl(file, dev, cmd, user_arg); + break; + default: + /* Any other IOC_DIR is wrong */ + ret = -EINVAL; + } + out: up_read(&minors_rwsem); return ret; diff --git a/include/uapi/linux/hidraw.h b/include/uapi/linux/hidraw.h index d5ee269864e0..ebd701b3c18d 100644 --- a/include/uapi/linux/hidraw.h +++ b/include/uapi/linux/hidraw.h @@ -48,6 +48,8 @@ struct hidraw_devinfo { #define HIDIOCGOUTPUT(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x0C, len) #define HIDIOCREVOKE _IOW('H', 0x0D, int) /* Revoke device access */ +#define HIDIOCTL_LAST _IOC_NR(HIDIOCREVOKE) + #define HIDRAW_FIRST_MINOR 0 #define HIDRAW_MAX_DEVICES 64 /* number of reports to buffer */ -- cgit v1.2.3 From d1dd75c6500c74b91c5286fd3277710371d3e3ca Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Sat, 13 Sep 2025 16:12:54 +0000 Subject: HID: core: Change hid_driver to use a const char* for name name is never mutated by the core HID stack. Making name a const char* simplifies passing the string from Rust to C. Otherwise, it becomes difficult to pass a 'static lifetime CStr from Rust to a char*, rather than a const char*, due to lack of guarantee that the underlying data of the CStr will not be mutated by the C code. Signed-off-by: Rahul Rameshbabu Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hid.h b/include/linux/hid.h index 2cc4f1e4ea96..426b22ed42b4 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -816,7 +816,7 @@ struct hid_usage_id { * zero from them. */ struct hid_driver { - char *name; + const char *name; const struct hid_device_id *id_table; struct list_head dyn_list; -- cgit v1.2.3 From 648dbccc03a000cd64c2a9d86012d98053545e64 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 16 Sep 2025 21:29:49 +0000 Subject: crypto: ccp - Add AMD Seamless Firmware Servicing (SFS) driver AMD Seamless Firmware Servicing (SFS) is a secure method to allow non-persistent updates to running firmware and settings without requiring BIOS reflash and/or system reset. SFS does not address anything that runs on the x86 processors and it can be used to update ASP firmware, modules, register settings and update firmware for other microprocessors like TMPM, etc. SFS driver support adds ioctl support to communicate the SFS commands to the ASP/PSP by using the TEE mailbox interface. The Seamless Firmware Servicing (SFS) driver is added as a PSP sub-device. For detailed information, please look at the SFS specifications: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/58604.pdf Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Acked-by: Herbert Xu Link: https://lore.kernel.org/cover.1758057691.git.ashish.kalra@amd.com --- drivers/crypto/ccp/Makefile | 3 +- drivers/crypto/ccp/psp-dev.c | 20 +++ drivers/crypto/ccp/psp-dev.h | 8 +- drivers/crypto/ccp/sfs.c | 311 ++++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/sfs.h | 47 ++++++ include/linux/psp-platform-access.h | 2 + include/uapi/linux/psp-sfs.h | 87 ++++++++++ 7 files changed, 476 insertions(+), 2 deletions(-) create mode 100644 drivers/crypto/ccp/sfs.c create mode 100644 drivers/crypto/ccp/sfs.h create mode 100644 include/uapi/linux/psp-sfs.h (limited to 'include') diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 394484929dae..a9626b30044a 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -13,7 +13,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ tee-dev.o \ platform-access.o \ dbc.o \ - hsti.o + hsti.o \ + sfs.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 1c5a7189631e..9e21da0e298a 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -17,6 +17,7 @@ #include "psp-dev.h" #include "sev-dev.h" #include "tee-dev.h" +#include "sfs.h" #include "platform-access.h" #include "dbc.h" #include "hsti.h" @@ -182,6 +183,17 @@ static int psp_check_tee_support(struct psp_device *psp) return 0; } +static int psp_check_sfs_support(struct psp_device *psp) +{ + /* Check if device supports SFS feature */ + if (!psp->capability.sfs) { + dev_dbg(psp->dev, "psp does not support SFS\n"); + return -ENODEV; + } + + return 0; +} + static int psp_init(struct psp_device *psp) { int ret; @@ -198,6 +210,12 @@ static int psp_init(struct psp_device *psp) return ret; } + if (!psp_check_sfs_support(psp)) { + ret = sfs_dev_init(psp); + if (ret) + return ret; + } + if (psp->vdata->platform_access) { ret = platform_access_dev_init(psp); if (ret) @@ -302,6 +320,8 @@ void psp_dev_destroy(struct sp_device *sp) tee_dev_destroy(psp); + sfs_dev_destroy(psp); + dbc_dev_destroy(psp); platform_access_dev_destroy(psp); diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index e43ce87ede76..268c83f298cb 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -32,7 +32,8 @@ union psp_cap_register { unsigned int sev :1, tee :1, dbc_thru_ext :1, - rsvd1 :4, + sfs :1, + rsvd1 :3, security_reporting :1, fused_part :1, rsvd2 :1, @@ -68,6 +69,7 @@ struct psp_device { void *tee_data; void *platform_access_data; void *dbc_data; + void *sfs_data; union psp_cap_register capability; }; @@ -118,12 +120,16 @@ struct psp_ext_request { * @PSP_SUB_CMD_DBC_SET_UID: Set UID for DBC * @PSP_SUB_CMD_DBC_GET_PARAMETER: Get parameter from DBC * @PSP_SUB_CMD_DBC_SET_PARAMETER: Set parameter for DBC + * @PSP_SUB_CMD_SFS_GET_FW_VERS: Get firmware versions for ASP and other MP + * @PSP_SUB_CMD_SFS_UPDATE: Command to load, verify and execute SFS package */ enum psp_sub_cmd { PSP_SUB_CMD_DBC_GET_NONCE = PSP_DYNAMIC_BOOST_GET_NONCE, PSP_SUB_CMD_DBC_SET_UID = PSP_DYNAMIC_BOOST_SET_UID, PSP_SUB_CMD_DBC_GET_PARAMETER = PSP_DYNAMIC_BOOST_GET_PARAMETER, PSP_SUB_CMD_DBC_SET_PARAMETER = PSP_DYNAMIC_BOOST_SET_PARAMETER, + PSP_SUB_CMD_SFS_GET_FW_VERS = PSP_SFS_GET_FW_VERSIONS, + PSP_SUB_CMD_SFS_UPDATE = PSP_SFS_UPDATE, }; int psp_extended_mailbox_cmd(struct psp_device *psp, unsigned int timeout_msecs, diff --git a/drivers/crypto/ccp/sfs.c b/drivers/crypto/ccp/sfs.c new file mode 100644 index 000000000000..2f4beaafe7ec --- /dev/null +++ b/drivers/crypto/ccp/sfs.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure Processor Seamless Firmware Servicing support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#include + +#include "sfs.h" +#include "sev-dev.h" + +#define SFS_DEFAULT_TIMEOUT (10 * MSEC_PER_SEC) +#define SFS_MAX_PAYLOAD_SIZE (2 * 1024 * 1024) +#define SFS_NUM_2MB_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PMD_SIZE) +#define SFS_NUM_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PAGE_SIZE) + +static DEFINE_MUTEX(sfs_ioctl_mutex); + +static struct sfs_misc_dev *misc_dev; + +static int send_sfs_cmd(struct sfs_device *sfs_dev, int msg) +{ + int ret; + + sfs_dev->command_buf->hdr.status = 0; + sfs_dev->command_buf->hdr.sub_cmd_id = msg; + + ret = psp_extended_mailbox_cmd(sfs_dev->psp, + SFS_DEFAULT_TIMEOUT, + (struct psp_ext_request *)sfs_dev->command_buf); + if (ret == -EIO) { + dev_dbg(sfs_dev->dev, + "msg 0x%x failed with PSP error: 0x%x, extended status: 0x%x\n", + msg, sfs_dev->command_buf->hdr.status, + *(u32 *)sfs_dev->command_buf->buf); + } + + return ret; +} + +static int send_sfs_get_fw_versions(struct sfs_device *sfs_dev) +{ + /* + * SFS_GET_FW_VERSIONS command needs the output buffer to be + * initialized to 0xC7 in every byte. + */ + memset(sfs_dev->command_buf->sfs_buffer, 0xc7, PAGE_SIZE); + sfs_dev->command_buf->hdr.payload_size = 2 * PAGE_SIZE; + + return send_sfs_cmd(sfs_dev, PSP_SFS_GET_FW_VERSIONS); +} + +static int send_sfs_update_package(struct sfs_device *sfs_dev, const char *payload_name) +{ + char payload_path[PAYLOAD_NAME_SIZE + sizeof("amd/")]; + const struct firmware *firmware; + unsigned long package_size; + int ret; + + /* Sanitize userspace provided payload name */ + if (!strnchr(payload_name, PAYLOAD_NAME_SIZE, '\0')) + return -EINVAL; + + snprintf(payload_path, sizeof(payload_path), "amd/%s", payload_name); + + ret = firmware_request_nowarn(&firmware, payload_path, sfs_dev->dev); + if (ret < 0) { + dev_warn_ratelimited(sfs_dev->dev, "firmware request failed for %s (%d)\n", + payload_path, ret); + return -ENOENT; + } + + /* + * SFS Update Package command's input buffer contains TEE_EXT_CMD_BUFFER + * followed by the Update Package and it should be 64KB aligned. + */ + package_size = ALIGN(firmware->size + PAGE_SIZE, 0x10000U); + + /* + * SFS command buffer is a pre-allocated 2MB buffer, fail update package + * if SFS payload is larger than the pre-allocated command buffer. + */ + if (package_size > SFS_MAX_PAYLOAD_SIZE) { + dev_warn_ratelimited(sfs_dev->dev, + "SFS payload size %ld larger than maximum supported payload size of %u\n", + package_size, SFS_MAX_PAYLOAD_SIZE); + release_firmware(firmware); + return -E2BIG; + } + + /* + * Copy firmware data to a HV_Fixed memory region. + */ + memcpy(sfs_dev->command_buf->sfs_buffer, firmware->data, firmware->size); + sfs_dev->command_buf->hdr.payload_size = package_size; + + release_firmware(firmware); + + return send_sfs_cmd(sfs_dev, PSP_SFS_UPDATE); +} + +static long sfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct sfs_user_get_fw_versions __user *sfs_get_fw_versions; + struct sfs_user_update_package __user *sfs_update_package; + struct psp_device *psp_master = psp_get_master_device(); + char payload_name[PAYLOAD_NAME_SIZE]; + struct sfs_device *sfs_dev; + int ret = 0; + + if (!psp_master || !psp_master->sfs_data) + return -ENODEV; + + sfs_dev = psp_master->sfs_data; + + guard(mutex)(&sfs_ioctl_mutex); + + switch (cmd) { + case SFSIOCFWVERS: + dev_dbg(sfs_dev->dev, "in SFSIOCFWVERS\n"); + + sfs_get_fw_versions = (struct sfs_user_get_fw_versions __user *)arg; + + ret = send_sfs_get_fw_versions(sfs_dev); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_get_fw_versions->blob, sfs_dev->command_buf->sfs_buffer, + PAGE_SIZE)) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_get_fw_versions->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_get_fw_versions->sfs_extended_status))) + return -EFAULT; + break; + case SFSIOCUPDATEPKG: + dev_dbg(sfs_dev->dev, "in SFSIOCUPDATEPKG\n"); + + sfs_update_package = (struct sfs_user_update_package __user *)arg; + + if (copy_from_user(payload_name, sfs_update_package->payload_name, + PAYLOAD_NAME_SIZE)) + return -EFAULT; + + ret = send_sfs_update_package(sfs_dev, payload_name); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_update_package->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_update_package->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_update_package->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_update_package->sfs_extended_status))) + return -EFAULT; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static const struct file_operations sfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = sfs_ioctl, +}; + +static void sfs_exit(struct kref *ref) +{ + misc_deregister(&misc_dev->misc); + kfree(misc_dev); + misc_dev = NULL; +} + +void sfs_dev_destroy(struct psp_device *psp) +{ + struct sfs_device *sfs_dev = psp->sfs_data; + + if (!sfs_dev) + return; + + /* + * Change SFS command buffer back to the default "Write-Back" type. + */ + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + + snp_free_hv_fixed_pages(sfs_dev->page); + + if (sfs_dev->misc) + kref_put(&misc_dev->refcount, sfs_exit); + + psp->sfs_data = NULL; +} + +/* Based on sev_misc_init() */ +static int sfs_misc_init(struct sfs_device *sfs) +{ + struct device *dev = sfs->dev; + int ret; + + /* + * SFS feature support can be detected on multiple devices but the SFS + * FW commands must be issued on the master. During probe, we do not + * know the master hence we create /dev/sfs on the first device probe. + */ + if (!misc_dev) { + struct miscdevice *misc; + + misc_dev = kzalloc(sizeof(*misc_dev), GFP_KERNEL); + if (!misc_dev) + return -ENOMEM; + + misc = &misc_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = "sfs"; + misc->fops = &sfs_fops; + misc->mode = 0600; + + ret = misc_register(misc); + if (ret) + return ret; + + kref_init(&misc_dev->refcount); + } else { + kref_get(&misc_dev->refcount); + } + + sfs->misc = misc_dev; + dev_dbg(dev, "registered SFS device\n"); + + return 0; +} + +int sfs_dev_init(struct psp_device *psp) +{ + struct device *dev = psp->dev; + struct sfs_device *sfs_dev; + struct page *page; + int ret = -ENOMEM; + + sfs_dev = devm_kzalloc(dev, sizeof(*sfs_dev), GFP_KERNEL); + if (!sfs_dev) + return -ENOMEM; + + /* + * Pre-allocate 2MB command buffer for all SFS commands using + * SNP HV_Fixed page allocator which also transitions the + * SFS command buffer to HV_Fixed page state if SNP is enabled. + */ + page = snp_alloc_hv_fixed_pages(SFS_NUM_2MB_PAGES_CMDBUF); + if (!page) { + dev_dbg(dev, "Command Buffer HV-Fixed page allocation failed\n"); + goto cleanup_dev; + } + sfs_dev->page = page; + sfs_dev->command_buf = page_address(page); + + dev_dbg(dev, "Command buffer 0x%px to be marked as HV_Fixed\n", sfs_dev->command_buf); + + /* + * SFS command buffer must be mapped as non-cacheable. + */ + ret = set_memory_uc((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + if (ret) { + dev_dbg(dev, "Set memory uc failed\n"); + goto cleanup_cmd_buf; + } + + dev_dbg(dev, "Command buffer 0x%px marked uncacheable\n", sfs_dev->command_buf); + + psp->sfs_data = sfs_dev; + sfs_dev->dev = dev; + sfs_dev->psp = psp; + + ret = sfs_misc_init(sfs_dev); + if (ret) + goto cleanup_mem_attr; + + dev_notice(sfs_dev->dev, "SFS support is available\n"); + + return 0; + +cleanup_mem_attr: + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + +cleanup_cmd_buf: + snp_free_hv_fixed_pages(page); + +cleanup_dev: + psp->sfs_data = NULL; + devm_kfree(dev, sfs_dev); + + return ret; +} diff --git a/drivers/crypto/ccp/sfs.h b/drivers/crypto/ccp/sfs.h new file mode 100644 index 000000000000..97704c210efd --- /dev/null +++ b/drivers/crypto/ccp/sfs.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Platform Security Processor (PSP) Seamless Firmware (SFS) Support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#ifndef __SFS_H__ +#define __SFS_H__ + +#include + +#include +#include +#include +#include +#include + +#include "psp-dev.h" + +struct sfs_misc_dev { + struct kref refcount; + struct miscdevice misc; +}; + +struct sfs_command { + struct psp_ext_req_buffer_hdr hdr; + u8 buf[PAGE_SIZE - sizeof(struct psp_ext_req_buffer_hdr)]; + u8 sfs_buffer[]; +} __packed; + +struct sfs_device { + struct device *dev; + struct psp_device *psp; + + struct page *page; + struct sfs_command *command_buf; + + struct sfs_misc_dev *misc; +}; + +void sfs_dev_destroy(struct psp_device *psp); +int sfs_dev_init(struct psp_device *psp); + +#endif /* __SFS_H__ */ diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index 1504fb012c05..540abf7de048 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -7,6 +7,8 @@ enum psp_platform_access_msg { PSP_CMD_NONE = 0x0, + PSP_SFS_GET_FW_VERSIONS, + PSP_SFS_UPDATE, PSP_CMD_HSTI_QUERY = 0x14, PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, diff --git a/include/uapi/linux/psp-sfs.h b/include/uapi/linux/psp-sfs.h new file mode 100644 index 000000000000..94e51670383c --- /dev/null +++ b/include/uapi/linux/psp-sfs.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Userspace interface for AMD Seamless Firmware Servicing (SFS) + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#ifndef __PSP_SFS_USER_H__ +#define __PSP_SFS_USER_H__ + +#include + +/** + * SFS: AMD Seamless Firmware Support (SFS) interface + */ + +#define PAYLOAD_NAME_SIZE 64 +#define TEE_EXT_CMD_BUFFER_SIZE 4096 + +/** + * struct sfs_user_get_fw_versions - get current level of base firmware (output). + * @blob: current level of base firmware for ASP and patch levels (input/output). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_get_fw_versions { + __u8 blob[TEE_EXT_CMD_BUFFER_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * struct sfs_user_update_package - update SFS package (input). + * @payload_name: name of SFS package to load, verify and execute (input). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_update_package { + char payload_name[PAYLOAD_NAME_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * Seamless Firmware Support (SFS) IOC + * + * possible return codes for all SFS IOCTLs: + * 0: success + * -EINVAL: invalid input + * -E2BIG: excess data passed + * -EFAULT: failed to copy to/from userspace + * -EBUSY: mailbox in recovery or in use + * -ENODEV: driver not bound with PSP device + * -EACCES: request isn't authorized + * -EINVAL: invalid parameter + * -ETIMEDOUT: request timed out + * -EAGAIN: invalid request for state machine + * -ENOENT: not implemented + * -ENFILE: overflow + * -EPERM: invalid signature + * -EIO: PSP I/O error + */ +#define SFS_IOC_TYPE 'S' + +/** + * SFSIOCFWVERS - returns blob containing FW versions + * ASP provides the current level of Base Firmware for the ASP + * and the other microprocessors as well as current patch + * level(s). + */ +#define SFSIOCFWVERS _IOWR(SFS_IOC_TYPE, 0x1, struct sfs_user_get_fw_versions) + +/** + * SFSIOCUPDATEPKG - updates package/payload + * ASP loads, verifies and executes the SFS package. + * By default, the SFS package/payload is loaded from + * /lib/firmware/amd, but alternative firmware loading + * path can be specified using kernel parameter + * firmware_class.path or the firmware loading path + * can be customized using sysfs file: + * /sys/module/firmware_class/parameters/path. + */ +#define SFSIOCUPDATEPKG _IOWR(SFS_IOC_TYPE, 0x2, struct sfs_user_update_package) + +#endif /* __PSP_SFS_USER_H__ */ -- cgit v1.2.3 From 45fe729be9a6be326a1ca25af82d34de32ba2ce8 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 16 Sep 2025 10:16:20 +0800 Subject: usb: typec: Stub out typec_switch APIs when CONFIG_TYPEC=n Ease driver development by adding stubs for the typec_switch APIs when CONFIG_TYPEC=n. Copy the same method used for the typec_mux APIs to be consistent. Acked-by: Greg Kroah-Hartman Reviewed-by: Heikki Krogerus Signed-off-by: Stephen Boyd Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20250916021620.1303995-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_mux.h | 46 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/usb/typec_mux.h b/include/linux/usb/typec_mux.h index 2489a7857d8e..aa9ebb7e2fe0 100644 --- a/include/linux/usb/typec_mux.h +++ b/include/linux/usb/typec_mux.h @@ -3,6 +3,7 @@ #ifndef __USB_TYPEC_MUX #define __USB_TYPEC_MUX +#include #include #include @@ -24,16 +25,13 @@ struct typec_switch_desc { void *drvdata; }; +#if IS_ENABLED(CONFIG_TYPEC) + struct typec_switch *fwnode_typec_switch_get(struct fwnode_handle *fwnode); void typec_switch_put(struct typec_switch *sw); int typec_switch_set(struct typec_switch *sw, enum typec_orientation orientation); -static inline struct typec_switch *typec_switch_get(struct device *dev) -{ - return fwnode_typec_switch_get(dev_fwnode(dev)); -} - struct typec_switch_dev * typec_switch_register(struct device *parent, const struct typec_switch_desc *desc); @@ -42,6 +40,44 @@ void typec_switch_unregister(struct typec_switch_dev *sw); void typec_switch_set_drvdata(struct typec_switch_dev *sw, void *data); void *typec_switch_get_drvdata(struct typec_switch_dev *sw); +#else + +static inline struct typec_switch * +fwnode_typec_switch_get(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline void typec_switch_put(struct typec_switch *sw) {} + +static inline int typec_switch_set(struct typec_switch *sw, + enum typec_orientation orientation) +{ + return 0; +} + +static inline struct typec_switch_dev * +typec_switch_register(struct device *parent, + const struct typec_switch_desc *desc) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void typec_switch_unregister(struct typec_switch_dev *sw) {} + +static inline void typec_switch_set_drvdata(struct typec_switch_dev *sw, void *data) {} +static inline void *typec_switch_get_drvdata(struct typec_switch_dev *sw) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +#endif /* CONFIG_TYPEC */ + +static inline struct typec_switch *typec_switch_get(struct device *dev) +{ + return fwnode_typec_switch_get(dev_fwnode(dev)); +} + struct typec_mux_state { struct typec_altmode *alt; unsigned long mode; -- cgit v1.2.3 From bfb1d99d969fe3b892db30848aeebfa19d21f57f Mon Sep 17 00:00:00 2001 From: Kuen-Han Tsai Date: Tue, 16 Sep 2025 16:21:32 +0800 Subject: usb: gadget: Store endpoint pointer in usb_request Gadget function drivers often have goto-based error handling in their bind paths, which can be bug-prone. Refactoring these paths to use __free() scope-based cleanup is desirable, but currently blocked. The blocker is that usb_ep_free_request(ep, req) requires two parameters, while the __free() mechanism can only pass a pointer to the request itself. Store an endpoint pointer in the struct usb_request. The pointer is populated centrally in usb_ep_alloc_request() on every successful allocation, making the request object self-contained. Signed-off-by: Kuen-Han Tsai Link: https://lore.kernel.org/r/20250916-ready-v1-1-4997bf277548@google.com Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250916-ready-v1-1-4997bf277548@google.com --- drivers/usb/gadget/udc/core.c | 3 +++ include/linux/usb/gadget.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index d709e24c1fd4..e3d63b8fa0f4 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -194,6 +194,9 @@ struct usb_request *usb_ep_alloc_request(struct usb_ep *ep, req = ep->ops->alloc_request(ep, gfp_flags); + if (req) + req->ep = ep; + trace_usb_ep_alloc_request(ep, req, req ? 0 : -ENOMEM); return req; diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 0f28c5512fcb..0f2079476088 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -32,6 +32,7 @@ struct usb_ep; /** * struct usb_request - describes one i/o request + * @ep: The associated endpoint set by usb_ep_alloc_request(). * @buf: Buffer used for data. Always provide this; some controllers * only use PIO, or don't use DMA for some endpoints. * @dma: DMA address corresponding to 'buf'. If you don't set this @@ -98,6 +99,7 @@ struct usb_ep; */ struct usb_request { + struct usb_ep *ep; void *buf; unsigned length; dma_addr_t dma; -- cgit v1.2.3 From 201c53c687f2b55a7cc6d9f4000af4797860174b Mon Sep 17 00:00:00 2001 From: Kuen-Han Tsai Date: Tue, 16 Sep 2025 16:21:33 +0800 Subject: usb: gadget: Introduce free_usb_request helper Introduce the free_usb_request() function that frees both the request's buffer and the request itself. This function serves as the cleanup callback for DEFINE_FREE() to enable automatic, scope-based cleanup for usb_request pointers. Signed-off-by: Kuen-Han Tsai Link: https://lore.kernel.org/r/20250916-ready-v1-2-4997bf277548@google.com Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250916-ready-v1-2-4997bf277548@google.com --- include/linux/usb/gadget.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 0f2079476088..3aaf19e77558 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -15,6 +15,7 @@ #ifndef __LINUX_USB_GADGET_H #define __LINUX_USB_GADGET_H +#include #include #include #include @@ -293,6 +294,28 @@ static inline void usb_ep_fifo_flush(struct usb_ep *ep) /*-------------------------------------------------------------------------*/ +/** + * free_usb_request - frees a usb_request object and its buffer + * @req: the request being freed + * + * This helper function frees both the request's buffer and the request object + * itself by calling usb_ep_free_request(). Its signature is designed to be used + * with DEFINE_FREE() to enable automatic, scope-based cleanup for usb_request + * pointers. + */ +static inline void free_usb_request(struct usb_request *req) +{ + if (!req) + return; + + kfree(req->buf); + usb_ep_free_request(req->ep, req); +} + +DEFINE_FREE(free_usb_request, struct usb_request *, free_usb_request(_T)) + +/*-------------------------------------------------------------------------*/ + struct usb_dcd_config_params { __u8 bU1devExitLat; /* U1 Device exit Latency */ #define USB_DEFAULT_U1_DEV_EXIT_LAT 0x01 /* Less then 1 microsec */ -- cgit v1.2.3 From 5bcf9e1d0a5da750ff180e7385b0bd4041686516 Mon Sep 17 00:00:00 2001 From: Duje Mihanović Date: Sat, 13 Sep 2025 23:12:48 +0200 Subject: dt-bindings: clock: marvell,pxa1908: Add syscon compatible to apmu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add required syscon compatible and #power-domain-cells to the APMU controller. This is required for the SoC's power domain controller as the registers are shared. Device tree bindings for said power domains are also added. Reviewed-by: Rob Herring (Arm) Signed-off-by: Duje Mihanović Signed-off-by: Ulf Hansson --- .../devicetree/bindings/clock/marvell,pxa1908.yaml | 30 +++++++++++++++++----- MAINTAINERS | 1 + include/dt-bindings/power/marvell,pxa1908-power.h | 17 ++++++++++++ 3 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 include/dt-bindings/power/marvell,pxa1908-power.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml b/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml index 4e78933232b6..6f3a8578fe2a 100644 --- a/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml +++ b/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml @@ -19,11 +19,14 @@ description: | properties: compatible: - enum: - - marvell,pxa1908-apbc - - marvell,pxa1908-apbcp - - marvell,pxa1908-mpmu - - marvell,pxa1908-apmu + oneOf: + - enum: + - marvell,pxa1908-apbc + - marvell,pxa1908-apbcp + - marvell,pxa1908-mpmu + - items: + - const: marvell,pxa1908-apmu + - const: syscon reg: maxItems: 1 @@ -31,6 +34,9 @@ properties: '#clock-cells': const: 1 + '#power-domain-cells': + const: 1 + required: - compatible - reg @@ -38,11 +44,23 @@ required: additionalProperties: false +if: + not: + properties: + compatible: + contains: + const: marvell,pxa1908-apmu + +then: + properties: + '#power-domain-cells': false + examples: # APMU block: - | clock-controller@d4282800 { - compatible = "marvell,pxa1908-apmu"; + compatible = "marvell,pxa1908-apmu", "syscon"; reg = <0xd4282800 0x400>; #clock-cells = <1>; + #power-domain-cells = <1>; }; diff --git a/MAINTAINERS b/MAINTAINERS index daf520a13bdf..22aa7ce417b8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2872,6 +2872,7 @@ S: Maintained F: arch/arm64/boot/dts/marvell/mmp/ F: drivers/clk/mmp/clk-pxa1908*.c F: include/dt-bindings/clock/marvell,pxa1908.h +F: include/dt-bindings/power/marvell,pxa1908-power.h ARM/Mediatek RTC DRIVER M: Eddie Huang diff --git a/include/dt-bindings/power/marvell,pxa1908-power.h b/include/dt-bindings/power/marvell,pxa1908-power.h new file mode 100644 index 000000000000..19b088351af1 --- /dev/null +++ b/include/dt-bindings/power/marvell,pxa1908-power.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +/* + * Marvell PXA1908 power domains + * + * Copyright 2025, Duje Mihanović + */ + +#ifndef __DTS_MARVELL_PXA1908_POWER_H +#define __DTS_MARVELL_PXA1908_POWER_H + +#define PXA1908_POWER_DOMAIN_VPU 0 +#define PXA1908_POWER_DOMAIN_GPU 1 +#define PXA1908_POWER_DOMAIN_GPU2D 2 +#define PXA1908_POWER_DOMAIN_DSI 3 +#define PXA1908_POWER_DOMAIN_ISP 4 + +#endif -- cgit v1.2.3 From 413187f79062634575098653c40f95d439d3b157 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 17 Sep 2025 15:26:49 +0200 Subject: stddef: Remove token-pasting in TRAILING_OVERLAP() Currently, TRAILING_OVERLAP() token-pastes the FAM parameter into the name of internal pdding member `__offset_to_##FAM`. This forces FAM to be a single identifier, which prevents callers from using a FAM when it's a nested member. For instance, see the following scenario: | struct flex { | size_t count; | int data[]; | }; | struct foo { | int hdr_foo; | struct flex f; | }; | struct composite { | struct foo hdr; | int data[100]; | }; In this case, it'd be useful if TRAILING_OVERLAP() could be used in the following way: | struct composite { | TRAILING_OVERLAP(struct foo, hdr, f.data, | int data[100]; | ); | }; However, this is not current possible due to the token concatenation in `__offset_to_##FAM`, which fails when FAM contains a dot. So, remove token-pasting and use the fixed internal name `__offset_to_FAM` and, with this, expand the capabilities of TRAILING_OVERLAP(). :) Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/13b3e0a69aad837b4e32ca8269b9d91bf1fbe9ef.1758115257.git.gustavoars@kernel.org Signed-off-by: Kees Cook --- include/linux/stddef.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/stddef.h b/include/linux/stddef.h index dab49e2ec8c0..701099c67c24 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -108,7 +108,7 @@ enum { union { \ TYPE NAME; \ struct { \ - unsigned char __offset_to_##FAM[offsetof(TYPE, FAM)]; \ + unsigned char __offset_to_FAM[offsetof(TYPE, FAM)]; \ MEMBERS \ }; \ } -- cgit v1.2.3 From 2bbdcf02c3f3df6c85abd0a2fee0d7f0f4113e96 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 17 Sep 2025 15:28:07 +0200 Subject: stddef: Introduce __TRAILING_OVERLAP() Introduce underlying __TRAILING_OVERLAP() macro to let callers apply atributes to trailing overlapping members. For instance, the code below: | struct flex { | size_t count; | int data[]; | }; | struct { | struct flex f; | struct foo a; | struct boo b; | } __packed instance; can now be changed to the following, and preserve the __packed attribute: | __TRAILING_OVERLAP(struct flex, f, data, __packed, | struct foo a; | struct boo b; | ) instance; Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/f80c529b239ce11f0a51f714fe00ddf839e05f5e.1758115257.git.gustavoars@kernel.org Signed-off-by: Kees Cook --- include/linux/stddef.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/stddef.h b/include/linux/stddef.h index 701099c67c24..80b6bfb944f0 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -94,7 +94,8 @@ enum { __DECLARE_FLEX_ARRAY(TYPE, NAME) /** - * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members. + * __TRAILING_OVERLAP() - Overlap a flexible-array member with trailing + * members. * * Creates a union between a flexible-array member (FAM) in a struct and a set * of additional members that would otherwise follow it. @@ -102,15 +103,30 @@ enum { * @TYPE: Flexible structure type name, including "struct" keyword. * @NAME: Name for a variable to define. * @FAM: The flexible-array member within @TYPE + * @ATTRS: Any struct attributes (usually empty) * @MEMBERS: Trailing overlapping members. */ -#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \ +#define __TRAILING_OVERLAP(TYPE, NAME, FAM, ATTRS, MEMBERS) \ union { \ TYPE NAME; \ struct { \ unsigned char __offset_to_FAM[offsetof(TYPE, FAM)]; \ MEMBERS \ - }; \ + } ATTRS; \ } +/** + * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members. + * + * Creates a union between a flexible-array member (FAM) in a struct and a set + * of additional members that would otherwise follow it. + * + * @TYPE: Flexible structure type name, including "struct" keyword. + * @NAME: Name for a variable to define. + * @FAM: The flexible-array member within @TYPE + * @MEMBERS: Trailing overlapping members. + */ +#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \ + __TRAILING_OVERLAP(TYPE, NAME, FAM, /* no attrs */, MEMBERS) + #endif -- cgit v1.2.3 From 9664d5810e9bc919a9a661594e01eabc80befe8a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 17 Sep 2025 10:11:28 +0100 Subject: KVM: arm64: Don't access ICC_SRE_EL2 if GICv3 doesn't support v2 compatibility We currently access ICC_SRE_EL2 at each load/put on VHE, and on each entry/exit on nVHE. Both are quite onerous on NV, as this register always traps. We do this to make sure the EL1 guest doesn't flip between v2 and v3 behind our back. But all modern implementations have dropped v2, and this is just overhead. At the same time, the GICv5 spec has been fixed to allow access to ICC_SRE_EL2 in legacy mode. Use this opportunity to replace the GICv5 checks for v2 compat checks, with an ad-hoc static key. Co-developed-by: Sascha Bischoff Signed-off-by: Sascha Bischoff Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kernel/image-vars.h | 3 +++ arch/arm64/kvm/hyp/vgic-v3-sr.c | 25 +++++++++---------------- arch/arm64/kvm/vgic/vgic-v3.c | 8 ++++++++ include/kvm/arm_vgic.h | 1 + 4 files changed, 21 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 714b0b5ec5ac..5369763606e7 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -105,6 +105,9 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors); KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); +/* Static key indicating whether GICv3 has GICv2 compatibility */ +KVM_NVHE_ALIAS(vgic_v3_has_v2_compat); + /* Static key which is set if CNTVOFF_EL2 is unusable */ KVM_NVHE_ALIAS(broken_cntvoff_key); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index d81275790e69..acd909b7f225 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -295,12 +295,8 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } } - /* - * GICv5 BET0 FEAT_GCIE_LEGACY doesn't include ICC_SRE_EL2. This is due - * to be relaxed in a future spec release, at which point this in - * condition can be dropped. - */ - if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) { + /* Only disable SRE if the host implements the GICv2 interface */ + if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { /* * Prevent the guest from touching the ICC_SRE_EL1 system * register. Note that this may not have any effect, as @@ -329,19 +325,16 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); } - /* - * Can be dropped in the future when GICv5 spec is relaxed. See comment - * above. - */ - if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) { + /* Only restore SRE if the host implements the GICv2 interface */ + if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { val = read_gicreg(ICC_SRE_EL2); write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); - } - if (!cpu_if->vgic_sre) { - /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ - isb(); - write_gicreg(1, ICC_SRE_EL1); + if (!cpu_if->vgic_sre) { + /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ + isb(); + write_gicreg(1, ICC_SRE_EL1); + } } /* diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index b9ad7c42c5b0..f1c153106c56 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -588,6 +588,7 @@ int vgic_v3_map_resources(struct kvm *kvm) } DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); +DEFINE_STATIC_KEY_FALSE(vgic_v3_has_v2_compat); static int __init early_group0_trap_cfg(char *buf) { @@ -697,6 +698,13 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (kvm_vgic_global_state.vcpu_base == 0) kvm_info("disabling GICv2 emulation\n"); + /* + * Flip the static branch if the HW supports v2, even if we're + * not using it (such as in protected mode). + */ + if (has_v2) + static_branch_enable(&vgic_v3_has_v2_compat); + if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) { group0_trap = true; group1_trap = true; diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 404883c7af6e..9a6340d9c91e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -375,6 +375,7 @@ struct vgic_cpu { extern struct static_key_false vgic_v2_cpuif_trap; extern struct static_key_false vgic_v3_cpuif_trap; +extern struct static_key_false vgic_v3_has_v2_compat; int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr); void kvm_vgic_early_init(struct kvm *kvm); -- cgit v1.2.3 From 5c5db9efe323dd0b0d7917dbe5b9c0999c95e79e Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Thu, 28 Aug 2025 10:59:43 +0000 Subject: irqchip/gic-v5: Drop has_gcie_v3_compat from gic_kvm_info The presence of FEAT_GCIE_LEGACY is now handled as a CPU feature. Therefore, drop the check and flag from the GIC driver and gic_kvm_info as it is no longer required or used by KVM. Signed-off-by: Sascha Bischoff Acked-by: Thomas Gleixner Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v5.c | 7 ------- include/linux/irqchip/arm-vgic-info.h | 2 -- 2 files changed, 9 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 4bd224f359a7..41ef286c4d78 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -1062,16 +1062,9 @@ static void gicv5_set_cpuif_idbits(void) #ifdef CONFIG_KVM static struct gic_kvm_info gic_v5_kvm_info __initdata; -static bool __init gicv5_cpuif_has_gcie_legacy(void) -{ - u64 idr0 = read_sysreg_s(SYS_ICC_IDR0_EL1); - return !!FIELD_GET(ICC_IDR0_EL1_GCIE_LEGACY, idr0); -} - static void __init gic_of_setup_kvm_info(struct device_node *node) { gic_v5_kvm_info.type = GIC_V5; - gic_v5_kvm_info.has_gcie_v3_compat = gicv5_cpuif_has_gcie_legacy(); /* GIC Virtual CPU interface maintenance interrupt */ gic_v5_kvm_info.no_maint_irq_mask = false; diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h index ca1713fac6e3..a470a73a805a 100644 --- a/include/linux/irqchip/arm-vgic-info.h +++ b/include/linux/irqchip/arm-vgic-info.h @@ -36,8 +36,6 @@ struct gic_kvm_info { bool has_v4_1; /* Deactivation impared, subpar stuff */ bool no_hw_deactivation; - /* v3 compat support (GICv5 hosts, only) */ - bool has_gcie_v3_compat; }; #ifdef CONFIG_KVM -- cgit v1.2.3 From 652b08afba69d5d26fe91098eb832b1bcc0f91c2 Mon Sep 17 00:00:00 2001 From: Cristian Birsan Date: Thu, 21 Nov 2024 20:16:38 +0200 Subject: ARM: at91: remove default values for PMC_PLL_ACR Remove default values for PMC PLL Analog Control Register(ACR) as the values are specific for each SoC and PLL and load them from PLL characteristics structure Co-developed-by: Andrei Simion Signed-off-by: Andrei Simion Signed-off-by: Cristian Birsan [nicolas.ferre@microchip.com: fix pll acr write sequence, preserve val] Signed-off-by: Nicolas Ferre --- drivers/clk/at91/clk-sam9x60-pll.c | 7 ++----- include/linux/clk/at91_pmc.h | 2 -- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/clk/at91/clk-sam9x60-pll.c b/drivers/clk/at91/clk-sam9x60-pll.c index a035dc15454b..3dc75a394ce1 100644 --- a/drivers/clk/at91/clk-sam9x60-pll.c +++ b/drivers/clk/at91/clk-sam9x60-pll.c @@ -103,11 +103,8 @@ static int sam9x60_frac_pll_set(struct sam9x60_pll_core *core) (cmul == frac->mul && cfrac == frac->frac)) goto unlock; - /* Recommended value for PMC_PLL_ACR */ - if (core->characteristics->upll) - val = AT91_PMC_PLL_ACR_DEFAULT_UPLL; - else - val = AT91_PMC_PLL_ACR_DEFAULT_PLLA; + /* Load recommended value for PMC_PLL_ACR */ + val = core->characteristics->acr; regmap_write(regmap, AT91_PMC_PLL_ACR, val); regmap_write(regmap, AT91_PMC_PLL_CTRL1, diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h index 7af499bdbecb..d60ce9708ea2 100644 --- a/include/linux/clk/at91_pmc.h +++ b/include/linux/clk/at91_pmc.h @@ -47,8 +47,6 @@ #define AT91_PMC_PCSR 0x18 /* Peripheral Clock Status Register */ #define AT91_PMC_PLL_ACR 0x18 /* PLL Analog Control Register [for SAM9X60] */ -#define AT91_PMC_PLL_ACR_DEFAULT_UPLL UL(0x12020010) /* Default PLL ACR value for UPLL */ -#define AT91_PMC_PLL_ACR_DEFAULT_PLLA UL(0x00020010) /* Default PLL ACR value for PLLA */ #define AT91_PMC_PLL_ACR_UTMIVR (1 << 12) /* UPLL Voltage regulator Control */ #define AT91_PMC_PLL_ACR_UTMIBG (1 << 13) /* UPLL Bandgap Control */ -- cgit v1.2.3 From 4ca24d6abbca5df76c4b189dd94fb055613de297 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Sep 2025 11:08:14 -0500 Subject: lib/crypto: sha256: Add support for 2-way interleaved hashing Many arm64 and x86_64 CPUs can compute two SHA-256 hashes in nearly the same speed as one, if the instructions are interleaved. This is because SHA-256 is serialized block-by-block, and two interleaved hashes take much better advantage of the CPU's instruction-level parallelism. Meanwhile, a very common use case for SHA-256 hashing in the Linux kernel is dm-verity and fs-verity. Both use a Merkle tree that has a fixed block size, usually 4096 bytes with an empty or 32-byte salt prepended. Usually, many blocks need to be hashed at a time. This is an ideal scenario for 2-way interleaved hashing. To enable this optimization, add a new function sha256_finup_2x() to the SHA-256 library API. It computes the hash of two equal-length messages, starting from a common initial context. For now it always falls back to sequential processing. Later patches will wire up arm64 and x86_64 optimized implementations. Note that the interleaving factor could in principle be higher than 2x. However, that runs into many practical difficulties and CPU throughput limitations. Thus, both the implementations I'm adding are 2x. In the interest of using the simplest solution, the API matches that. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250915160819.140019-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha2.h | 28 ++++++++++++++++++++ lib/crypto/sha256.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 94 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index 15e461e568cc..e5dafb935cc8 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -375,6 +375,34 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); */ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); +/** + * sha256_finup_2x() - Compute two SHA-256 digests from a common initial + * context. On some CPUs, this is faster than sequentially + * computing each digest. + * @ctx: an optional initial context, which may have already processed data. If + * NULL, a default initial context is used (equivalent to sha256_init()). + * @data1: data for the first message + * @data2: data for the second message + * @len: the length of each of @data1 and @data2, in bytes + * @out1: (output) the first SHA-256 message digest + * @out2: (output) the second SHA-256 message digest + * + * Context: Any context. + */ +void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, + const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +/** + * sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real + * interleaved implementation, as opposed to a + * sequential fallback + * @return: true if optimized + * + * Context: Any context. + */ +bool sha256_finup_2x_is_optimized(void); + /** * struct hmac_sha256_key - Prepared key for HMAC-SHA256 * @key: private diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 8fa15165d23e..881b935418ce 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = { }, }; -static const struct sha256_block_state sha256_iv = { - .h = { - SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, - SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, +static const struct sha256_ctx initial_sha256_ctx = { + .ctx = { + .state = { + .h = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, + }, + }, + .bytecount = 0, }, }; +#define sha256_iv (initial_sha256_ctx.ctx.state) + static const u32 sha256_K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) } EXPORT_SYMBOL(sha256); -/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */ +/* + * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined) + * doesn't need either HMAC support or interleaved hashing support + */ #ifndef __DISABLE_EXPORTS + +#ifndef sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + return false; +} +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return false; +} +#endif + +/* Sequential fallback implementation of sha256_finup_2x() */ +static noinline_for_stack void sha256_finup_2x_sequential( + const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, + size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) +{ + struct __sha256_ctx mut_ctx; + + mut_ctx = *ctx; + __sha256_update(&mut_ctx, data1, len); + __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE); + + mut_ctx = *ctx; + __sha256_update(&mut_ctx, data2, len); + __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE); +} + +void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, + const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + if (ctx == NULL) + ctx = &initial_sha256_ctx; + + if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1, + out2))) + return; + sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2); +} +EXPORT_SYMBOL_GPL(sha256_finup_2x); + +bool sha256_finup_2x_is_optimized(void) +{ + return sha256_finup_2x_is_optimized_arch(); +} +EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized); + static void __hmac_sha256_preparekey(struct sha256_block_state *istate, struct sha256_block_state *ostate, const u8 *raw_key, size_t raw_key_len, -- cgit v1.2.3 From d4bf06592ad68ac4353a81c73e8e662cf88aa2cc Mon Sep 17 00:00:00 2001 From: Viken Dadhaniya Date: Thu, 11 Sep 2025 10:02:53 +0530 Subject: soc: qcom: geni-se: Add support to load QUP SE Firmware via Linux subsystem In Qualcomm SoCs, firmware loading for Serial Engines (SE) within the QUP hardware has traditionally been managed by TrustZone (TZ). This restriction poses a significant challenge for developers, as it limits their ability to enable various protocols on any of the SEs from the Linux side, reducing flexibility. Load the firmware to QUP SE based on the 'firmware-name' property specified in devicetree at bootup time. Co-developed-by: Mukesh Kumar Savaliya Signed-off-by: Mukesh Kumar Savaliya Signed-off-by: Viken Dadhaniya Link: https://lore.kernel.org/r/20250911043256.3523057-4-viken.dadhaniya@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/qcom-geni-se.c | 474 ++++++++++++++++++++++++++++++++++++++- include/linux/soc/qcom/geni-se.h | 4 + 2 files changed, 475 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/soc/qcom/qcom-geni-se.c b/drivers/soc/qcom/qcom-geni-se.c index e8ab2833815e..cd1779b6a91a 100644 --- a/drivers/soc/qcom/qcom-geni-se.c +++ b/drivers/soc/qcom/qcom-geni-se.c @@ -8,7 +8,9 @@ #define __DISABLE_TRACE_MMIO__ #include +#include #include +#include #include #include #include @@ -113,8 +115,80 @@ struct geni_se_desc { static const char * const icc_path_names[] = {"qup-core", "qup-config", "qup-memory"}; +static const char * const protocol_name[] = { "None", "SPI", "UART", "I2C", "I3C", "SPI SLAVE" }; + +/** + * struct se_fw_hdr - Serial Engine firmware configuration header + * + * This structure defines the SE firmware header, which together with the + * firmware payload is stored in individual ELF segments. + * + * @magic: Set to 'SEFW'. + * @version: Structure version number. + * @core_version: QUPV3 hardware version. + * @serial_protocol: Encoded in GENI_FW_REVISION. + * @fw_version: Firmware version, from GENI_FW_REVISION. + * @cfg_version: Configuration version, from GENI_INIT_CFG_REVISION. + * @fw_size_in_items: Number of 32-bit words in GENI_FW_RAM. + * @fw_offset: Byte offset to GENI_FW_RAM array. + * @cfg_size_in_items: Number of GENI_FW_CFG index/value pairs. + * @cfg_idx_offset: Byte offset to GENI_FW_CFG index array. + * @cfg_val_offset: Byte offset to GENI_FW_CFG values array. + */ +struct se_fw_hdr { + __le32 magic; + __le32 version; + __le32 core_version; + __le16 serial_protocol; + __le16 fw_version; + __le16 cfg_version; + __le16 fw_size_in_items; + __le16 fw_offset; + __le16 cfg_size_in_items; + __le16 cfg_idx_offset; + __le16 cfg_val_offset; +}; + +/*Magic numbers*/ +#define SE_MAGIC_NUM 0x57464553 + +#define MAX_GENI_CFG_RAMn_CNT 455 + +#define MI_PBT_NON_PAGED_SEGMENT 0x0 +#define MI_PBT_HASH_SEGMENT 0x2 +#define MI_PBT_NOTUSED_SEGMENT 0x3 +#define MI_PBT_SHARED_SEGMENT 0x4 + +#define MI_PBT_FLAG_PAGE_MODE BIT(20) +#define MI_PBT_FLAG_SEGMENT_TYPE GENMASK(26, 24) +#define MI_PBT_FLAG_ACCESS_TYPE GENMASK(23, 21) + +#define MI_PBT_PAGE_MODE_VALUE(x) FIELD_GET(MI_PBT_FLAG_PAGE_MODE, x) + +#define MI_PBT_SEGMENT_TYPE_VALUE(x) FIELD_GET(MI_PBT_FLAG_SEGMENT_TYPE, x) + +#define MI_PBT_ACCESS_TYPE_VALUE(x) FIELD_GET(MI_PBT_FLAG_ACCESS_TYPE, x) + +#define M_COMMON_GENI_M_IRQ_EN (GENMASK(6, 1) | \ + M_IO_DATA_DEASSERT_EN | \ + M_IO_DATA_ASSERT_EN | M_RX_FIFO_RD_ERR_EN | \ + M_RX_FIFO_WR_ERR_EN | M_TX_FIFO_RD_ERR_EN | \ + M_TX_FIFO_WR_ERR_EN) + /* Common QUPV3 registers */ #define QUPV3_HW_VER_REG 0x4 +#define QUPV3_SE_AHB_M_CFG 0x118 +#define QUPV3_COMMON_CFG 0x120 +#define QUPV3_COMMON_CGC_CTRL 0x21c + +/* QUPV3_COMMON_CFG fields */ +#define FAST_SWITCH_TO_HIGH_DISABLE BIT(0) + +/* QUPV3_SE_AHB_M_CFG fields */ +#define AHB_M_CLK_CGC_ON BIT(0) + +/* QUPV3_COMMON_CGC_CTRL fields */ +#define COMMON_CSR_SLV_CLK_CGC_ON BIT(0) /* Common SE registers */ #define SE_GENI_INIT_CFG_REVISION 0x0 @@ -122,11 +196,13 @@ static const char * const icc_path_names[] = {"qup-core", "qup-config", #define SE_GENI_CGC_CTRL 0x28 #define SE_GENI_CLK_CTRL_RO 0x60 #define SE_GENI_FW_S_REVISION_RO 0x6c +#define SE_GENI_CFG_REG0 0x100 #define SE_GENI_BYTE_GRAN 0x254 #define SE_GENI_TX_PACKING_CFG0 0x260 #define SE_GENI_TX_PACKING_CFG1 0x264 #define SE_GENI_RX_PACKING_CFG0 0x284 #define SE_GENI_RX_PACKING_CFG1 0x288 +#define SE_GENI_S_IRQ_ENABLE 0x644 #define SE_DMA_TX_PTR_L 0xc30 #define SE_DMA_TX_PTR_H 0xc34 #define SE_DMA_TX_ATTR 0xc38 @@ -148,6 +224,15 @@ static const char * const icc_path_names[] = {"qup-core", "qup-config", #define SE_GSI_EVENT_EN 0xe18 #define SE_IRQ_EN 0xe1c #define SE_DMA_GENERAL_CFG 0xe30 +#define SE_GENI_FW_REVISION 0x1000 +#define SE_GENI_S_FW_REVISION 0x1004 +#define SE_GENI_CFG_RAMN 0x1010 +#define SE_GENI_CLK_CTRL 0x2000 +#define SE_DMA_IF_EN 0x2004 +#define SE_FIFO_IF_DISABLE 0x2008 + +/* GENI_FW_REVISION_RO fields */ +#define FW_REV_VERSION_MSK GENMASK(7, 0) /* GENI_OUTPUT_CTRL fields */ #define DEFAULT_IO_OUTPUT_CTRL_MSK GENMASK(6, 0) @@ -186,6 +271,15 @@ static const char * const icc_path_names[] = {"qup-core", "qup-config", #define RX_DMA_IRQ_DELAY_MSK GENMASK(8, 6) #define RX_DMA_IRQ_DELAY_SHFT 6 +/* GENI_CLK_CTRL fields */ +#define SER_CLK_SEL BIT(0) + +/* GENI_DMA_IF_EN fields */ +#define DMA_IF_EN BIT(0) + +#define geni_setbits32(_addr, _v) writel(readl(_addr) | (_v), _addr) +#define geni_clrbits32(_addr, _v) writel(readl(_addr) & ~(_v), _addr) + /** * geni_se_get_qup_hw_version() - Read the QUP wrapper Hardware version * @se: Pointer to the corresponding serial engine. @@ -658,9 +752,12 @@ int geni_se_clk_freq_match(struct geni_se *se, unsigned long req_freq, } EXPORT_SYMBOL_GPL(geni_se_clk_freq_match); -#define GENI_SE_DMA_DONE_EN BIT(0) -#define GENI_SE_DMA_EOT_EN BIT(1) -#define GENI_SE_DMA_AHB_ERR_EN BIT(2) +#define GENI_SE_DMA_DONE_EN BIT(0) +#define GENI_SE_DMA_EOT_EN BIT(1) +#define GENI_SE_DMA_AHB_ERR_EN BIT(2) +#define GENI_SE_DMA_RESET_DONE_EN BIT(3) +#define GENI_SE_DMA_FLUSH_DONE BIT(4) + #define GENI_SE_DMA_EOT_BUF BIT(0) /** @@ -891,6 +988,377 @@ int geni_icc_disable(struct geni_se *se) } EXPORT_SYMBOL_GPL(geni_icc_disable); +/** + * geni_find_protocol_fw() - Locate and validate SE firmware for a protocol. + * @dev: Pointer to the device structure. + * @fw: Pointer to the firmware image. + * @protocol: Expected serial engine protocol type. + * + * Identifies the appropriate firmware image or configuration required for a + * specific communication protocol instance running on a Qualcomm GENI + * controller. + * + * Return: pointer to a valid 'struct se_fw_hdr' if found, or NULL otherwise. + */ +static struct se_fw_hdr *geni_find_protocol_fw(struct device *dev, const struct firmware *fw, + enum geni_se_protocol_type protocol) +{ + const struct elf32_hdr *ehdr; + const struct elf32_phdr *phdrs; + const struct elf32_phdr *phdr; + struct se_fw_hdr *sefw; + u32 fw_end, cfg_idx_end, cfg_val_end; + u16 fw_size; + int i; + + if (!fw || fw->size < sizeof(struct elf32_hdr)) + return NULL; + + ehdr = (const struct elf32_hdr *)fw->data; + phdrs = (const struct elf32_phdr *)(fw->data + ehdr->e_phoff); + + /* + * The firmware is expected to have at least two program headers (segments). + * One for metadata and the other for the actual protocol-specific firmware. + */ + if (ehdr->e_phnum < 2) { + dev_err(dev, "Invalid firmware: less than 2 program headers\n"); + return NULL; + } + + for (i = 0; i < ehdr->e_phnum; i++) { + phdr = &phdrs[i]; + + if (fw->size < phdr->p_offset + phdr->p_filesz) { + dev_err(dev, "Firmware size (%zu) < expected offset (%u) + size (%u)\n", + fw->size, phdr->p_offset, phdr->p_filesz); + return NULL; + } + + if (phdr->p_type != PT_LOAD || !phdr->p_memsz) + continue; + + if (MI_PBT_PAGE_MODE_VALUE(phdr->p_flags) != MI_PBT_NON_PAGED_SEGMENT || + MI_PBT_SEGMENT_TYPE_VALUE(phdr->p_flags) == MI_PBT_HASH_SEGMENT || + MI_PBT_ACCESS_TYPE_VALUE(phdr->p_flags) == MI_PBT_NOTUSED_SEGMENT || + MI_PBT_ACCESS_TYPE_VALUE(phdr->p_flags) == MI_PBT_SHARED_SEGMENT) + continue; + + if (phdr->p_filesz < sizeof(struct se_fw_hdr)) + continue; + + sefw = (struct se_fw_hdr *)(fw->data + phdr->p_offset); + fw_size = le16_to_cpu(sefw->fw_size_in_items); + fw_end = le16_to_cpu(sefw->fw_offset) + fw_size * sizeof(u32); + cfg_idx_end = le16_to_cpu(sefw->cfg_idx_offset) + + le16_to_cpu(sefw->cfg_size_in_items) * sizeof(u8); + cfg_val_end = le16_to_cpu(sefw->cfg_val_offset) + + le16_to_cpu(sefw->cfg_size_in_items) * sizeof(u32); + + if (le32_to_cpu(sefw->magic) != SE_MAGIC_NUM || le32_to_cpu(sefw->version) != 1) + continue; + + if (le32_to_cpu(sefw->serial_protocol) != protocol) + continue; + + if (fw_size % 2 != 0) { + fw_size++; + sefw->fw_size_in_items = cpu_to_le16(fw_size); + } + + if (fw_size >= MAX_GENI_CFG_RAMn_CNT) { + dev_err(dev, + "Firmware size (%u) exceeds max allowed RAMn count (%u)\n", + fw_size, MAX_GENI_CFG_RAMn_CNT); + continue; + } + + if (fw_end > phdr->p_filesz || cfg_idx_end > phdr->p_filesz || + cfg_val_end > phdr->p_filesz) { + dev_err(dev, "Truncated or corrupt SE FW segment found at index %d\n", i); + continue; + } + + return sefw; + } + + dev_err(dev, "Failed to get %s protocol firmware\n", protocol_name[protocol]); + return NULL; +} + +/** + * geni_configure_xfer_mode() - Set the transfer mode. + * @se: Pointer to the concerned serial engine. + * @mode: SE data transfer mode. + * + * Set the transfer mode to either FIFO or DMA according to the mode specified + * by the protocol driver. + * + * Return: 0 if successful, otherwise return an error value. + */ +static int geni_configure_xfer_mode(struct geni_se *se, enum geni_se_xfer_mode mode) +{ + /* Configure SE FIFO, DMA or GSI mode. */ + switch (mode) { + case GENI_GPI_DMA: + geni_setbits32(se->base + SE_GENI_DMA_MODE_EN, GENI_DMA_MODE_EN); + writel(0x0, se->base + SE_IRQ_EN); + writel(DMA_RX_EVENT_EN | DMA_TX_EVENT_EN | GENI_M_EVENT_EN | GENI_S_EVENT_EN, + se->base + SE_GSI_EVENT_EN); + break; + + case GENI_SE_FIFO: + geni_clrbits32(se->base + SE_GENI_DMA_MODE_EN, GENI_DMA_MODE_EN); + writel(DMA_RX_IRQ_EN | DMA_TX_IRQ_EN | GENI_M_IRQ_EN | GENI_S_IRQ_EN, + se->base + SE_IRQ_EN); + writel(0x0, se->base + SE_GSI_EVENT_EN); + break; + + case GENI_SE_DMA: + geni_setbits32(se->base + SE_GENI_DMA_MODE_EN, GENI_DMA_MODE_EN); + writel(DMA_RX_IRQ_EN | DMA_TX_IRQ_EN | GENI_M_IRQ_EN | GENI_S_IRQ_EN, + se->base + SE_IRQ_EN); + writel(0x0, se->base + SE_GSI_EVENT_EN); + break; + + default: + dev_err(se->dev, "Invalid geni-se transfer mode: %d\n", mode); + return -EINVAL; + } + return 0; +} + +/** + * geni_enable_interrupts() - Enable interrupts. + * @se: Pointer to the concerned serial engine. + * + * Enable the required interrupts during the firmware load process. + */ +static void geni_enable_interrupts(struct geni_se *se) +{ + u32 val; + + /* Enable required interrupts. */ + writel(M_COMMON_GENI_M_IRQ_EN, se->base + SE_GENI_M_IRQ_EN); + + val = S_CMD_OVERRUN_EN | S_ILLEGAL_CMD_EN | S_CMD_CANCEL_EN | S_CMD_ABORT_EN | + S_GP_IRQ_0_EN | S_GP_IRQ_1_EN | S_GP_IRQ_2_EN | S_GP_IRQ_3_EN | + S_RX_FIFO_WR_ERR_EN | S_RX_FIFO_RD_ERR_EN; + writel(val, se->base + SE_GENI_S_IRQ_ENABLE); + + /* DMA mode configuration. */ + val = GENI_SE_DMA_RESET_DONE_EN | GENI_SE_DMA_AHB_ERR_EN | GENI_SE_DMA_DONE_EN; + writel(val, se->base + SE_DMA_TX_IRQ_EN_SET); + val = GENI_SE_DMA_FLUSH_DONE | GENI_SE_DMA_RESET_DONE_EN | GENI_SE_DMA_AHB_ERR_EN | + GENI_SE_DMA_DONE_EN; + writel(val, se->base + SE_DMA_RX_IRQ_EN_SET); +} + +/** + * geni_write_fw_revision() - Write the firmware revision. + * @se: Pointer to the concerned serial engine. + * @serial_protocol: serial protocol type. + * @fw_version: QUP firmware version. + * + * Write the firmware revision and protocol into the respective register. + */ +static void geni_write_fw_revision(struct geni_se *se, u16 serial_protocol, u16 fw_version) +{ + u32 reg; + + reg = FIELD_PREP(FW_REV_PROTOCOL_MSK, serial_protocol); + reg |= FIELD_PREP(FW_REV_VERSION_MSK, fw_version); + + writel(reg, se->base + SE_GENI_FW_REVISION); + writel(reg, se->base + SE_GENI_S_FW_REVISION); +} + +/** + * geni_load_se_fw() - Load Serial Engine specific firmware. + * @se: Pointer to the concerned serial engine. + * @fw: Pointer to the firmware structure. + * @mode: SE data transfer mode. + * @protocol: Protocol type to be used with the SE (e.g., UART, SPI, I2C). + * + * Load the protocol firmware into the IRAM of the Serial Engine. + * + * Return: 0 if successful, otherwise return an error value. + */ +static int geni_load_se_fw(struct geni_se *se, const struct firmware *fw, + enum geni_se_xfer_mode mode, enum geni_se_protocol_type protocol) +{ + const u32 *fw_data, *cfg_val_arr; + const u8 *cfg_idx_arr; + u32 i, reg_value; + int ret; + struct se_fw_hdr *hdr; + + hdr = geni_find_protocol_fw(se->dev, fw, protocol); + if (!hdr) + return -EINVAL; + + fw_data = (const u32 *)((u8 *)hdr + le16_to_cpu(hdr->fw_offset)); + cfg_idx_arr = (const u8 *)hdr + le16_to_cpu(hdr->cfg_idx_offset); + cfg_val_arr = (const u32 *)((u8 *)hdr + le16_to_cpu(hdr->cfg_val_offset)); + + ret = geni_icc_set_bw(se); + if (ret) + return ret; + + ret = geni_icc_enable(se); + if (ret) + return ret; + + ret = geni_se_resources_on(se); + if (ret) + goto out_icc_disable; + + /* + * Disable high-priority interrupts until all currently executing + * low-priority interrupts have been fully handled. + */ + geni_setbits32(se->wrapper->base + QUPV3_COMMON_CFG, FAST_SWITCH_TO_HIGH_DISABLE); + + /* Set AHB_M_CLK_CGC_ON to indicate hardware controls se-wrapper cgc clock. */ + geni_setbits32(se->wrapper->base + QUPV3_SE_AHB_M_CFG, AHB_M_CLK_CGC_ON); + + /* Let hardware to control common cgc. */ + geni_setbits32(se->wrapper->base + QUPV3_COMMON_CGC_CTRL, COMMON_CSR_SLV_CLK_CGC_ON); + + /* + * Setting individual bits in GENI_OUTPUT_CTRL activates corresponding output lines, + * allowing the hardware to drive data as configured. + */ + writel(0x0, se->base + GENI_OUTPUT_CTRL); + + /* Set SCLK and HCLK to program RAM */ + geni_setbits32(se->base + SE_GENI_CGC_CTRL, PROG_RAM_SCLK_OFF | PROG_RAM_HCLK_OFF); + writel(0x0, se->base + SE_GENI_CLK_CTRL); + geni_clrbits32(se->base + SE_GENI_CGC_CTRL, PROG_RAM_SCLK_OFF | PROG_RAM_HCLK_OFF); + + /* Enable required clocks for DMA CSR, TX and RX. */ + reg_value = AHB_SEC_SLV_CLK_CGC_ON | DMA_AHB_SLV_CLK_CGC_ON | + DMA_TX_CLK_CGC_ON | DMA_RX_CLK_CGC_ON; + geni_setbits32(se->base + SE_DMA_GENERAL_CFG, reg_value); + + /* Let hardware control CGC by default. */ + writel(DEFAULT_CGC_EN, se->base + SE_GENI_CGC_CTRL); + + /* Set version of the configuration register part of firmware. */ + writel(le16_to_cpu(hdr->cfg_version), se->base + SE_GENI_INIT_CFG_REVISION); + writel(le16_to_cpu(hdr->cfg_version), se->base + SE_GENI_S_INIT_CFG_REVISION); + + /* Configure GENI primitive table. */ + for (i = 0; i < le16_to_cpu(hdr->cfg_size_in_items); i++) + writel(cfg_val_arr[i], + se->base + SE_GENI_CFG_REG0 + (cfg_idx_arr[i] * sizeof(u32))); + + /* Configure condition for assertion of RX_RFR_WATERMARK condition. */ + reg_value = geni_se_get_rx_fifo_depth(se); + writel(reg_value - 2, se->base + SE_GENI_RX_RFR_WATERMARK_REG); + + /* Let hardware control CGC */ + geni_setbits32(se->base + GENI_OUTPUT_CTRL, DEFAULT_IO_OUTPUT_CTRL_MSK); + + ret = geni_configure_xfer_mode(se, mode); + if (ret) + goto out_resources_off; + + geni_enable_interrupts(se); + + geni_write_fw_revision(se, le16_to_cpu(hdr->serial_protocol), le16_to_cpu(hdr->fw_version)); + + /* Program RAM address space. */ + memcpy_toio(se->base + SE_GENI_CFG_RAMN, fw_data, + le16_to_cpu(hdr->fw_size_in_items) * sizeof(u32)); + + /* Put default values on GENI's output pads. */ + writel_relaxed(0x1, se->base + GENI_FORCE_DEFAULT_REG); + + /* Toggle SCLK/HCLK from high to low to finalize RAM programming and apply config. */ + geni_setbits32(se->base + SE_GENI_CGC_CTRL, PROG_RAM_SCLK_OFF | PROG_RAM_HCLK_OFF); + geni_setbits32(se->base + SE_GENI_CLK_CTRL, SER_CLK_SEL); + geni_clrbits32(se->base + SE_GENI_CGC_CTRL, PROG_RAM_SCLK_OFF | PROG_RAM_HCLK_OFF); + + /* Serial engine DMA interface is enabled. */ + geni_setbits32(se->base + SE_DMA_IF_EN, DMA_IF_EN); + + /* Enable or disable FIFO interface of the serial engine. */ + if (mode == GENI_SE_FIFO) + geni_clrbits32(se->base + SE_FIFO_IF_DISABLE, FIFO_IF_DISABLE); + else + geni_setbits32(se->base + SE_FIFO_IF_DISABLE, FIFO_IF_DISABLE); + +out_resources_off: + geni_se_resources_off(se); + +out_icc_disable: + geni_icc_disable(se); + return ret; +} + +/** + * geni_load_se_firmware() - Load firmware for SE based on protocol + * @se: Pointer to the concerned serial engine. + * @protocol: Protocol type to be used with the SE (e.g., UART, SPI, I2C). + * + * Retrieves the firmware name from device properties and sets the transfer mode + * (FIFO or GSI DMA) based on device tree configuration. Enforces FIFO mode for + * UART protocol due to lack of GSI DMA support. Requests the firmware and loads + * it into the SE. + * + * Return: 0 on success, negative error code on failure. + */ +int geni_load_se_firmware(struct geni_se *se, enum geni_se_protocol_type protocol) +{ + const char *fw_name; + const struct firmware *fw; + enum geni_se_xfer_mode mode = GENI_SE_FIFO; + int ret; + + if (protocol >= ARRAY_SIZE(protocol_name)) { + dev_err(se->dev, "Invalid geni-se protocol: %d", protocol); + return -EINVAL; + } + + ret = device_property_read_string(se->wrapper->dev, "firmware-name", &fw_name); + if (ret) { + dev_err(se->dev, "Failed to read firmware-name property: %d\n", ret); + return -EINVAL; + } + + if (of_property_read_bool(se->dev->of_node, "qcom,enable-gsi-dma")) + mode = GENI_GPI_DMA; + + /* GSI mode is not supported by the UART driver; therefore, setting FIFO mode */ + if (protocol == GENI_SE_UART) + mode = GENI_SE_FIFO; + + ret = request_firmware(&fw, fw_name, se->dev); + if (ret) { + if (ret == -ENOENT) + return -EPROBE_DEFER; + + dev_err(se->dev, "Failed to request firmware '%s' for protocol %d: ret: %d\n", + fw_name, protocol, ret); + return ret; + } + + ret = geni_load_se_fw(se, fw, mode, protocol); + release_firmware(fw); + + if (ret) { + dev_err(se->dev, "Failed to load SE firmware for protocol %d: ret: %d\n", + protocol, ret); + return ret; + } + + dev_dbg(se->dev, "Firmware load for %s protocol is successful for xfer mode: %d\n", + protocol_name[protocol], mode); + return 0; +} +EXPORT_SYMBOL_GPL(geni_load_se_firmware); + static int geni_se_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; diff --git a/include/linux/soc/qcom/geni-se.h b/include/linux/soc/qcom/geni-se.h index 2996a3c28ef3..0a984e2579fe 100644 --- a/include/linux/soc/qcom/geni-se.h +++ b/include/linux/soc/qcom/geni-se.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2017-2018, The Linux Foundation. All rights reserved. + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. */ #ifndef _LINUX_QCOM_GENI_SE @@ -36,6 +37,7 @@ enum geni_se_protocol_type { GENI_SE_I2C, GENI_SE_I3C, GENI_SE_SPI_SLAVE, + GENI_SE_INVALID_PROTO = 255, }; struct geni_wrapper; @@ -531,5 +533,7 @@ void geni_icc_set_tag(struct geni_se *se, u32 tag); int geni_icc_enable(struct geni_se *se); int geni_icc_disable(struct geni_se *se); + +int geni_load_se_firmware(struct geni_se *se, enum geni_se_protocol_type protocol); #endif #endif -- cgit v1.2.3 From bdf780fbcef5df4d365404a178e7f845a317b4e9 Mon Sep 17 00:00:00 2001 From: Huisong Li Date: Thu, 11 Sep 2025 19:24:07 +0800 Subject: ACPI: processor: idle: Rearrange declarations in header file Group all of the declarations of functions that belong to the ACPI processor idle driver together in one place in processor.h. While at it, drop the unnecessary extern modifier from the declaraions of two functions. Signed-off-by: Huisong Li Link: https://patch.msgid.link/20250911112408.1668431-3-lihuisong@huawei.com [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- include/acpi/processor.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 2976a6d0c54f..6ee4a69412de 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -425,6 +425,8 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); void acpi_processor_register_idle_driver(void); void acpi_processor_unregister_idle_driver(void); +int acpi_processor_ffh_lpi_probe(unsigned int cpu); +int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi); #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ /* in processor_thermal.c */ @@ -447,11 +449,6 @@ static inline void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy) } #endif /* CONFIG_CPU_FREQ */ -#ifdef CONFIG_ACPI_PROCESSOR_IDLE -extern int acpi_processor_ffh_lpi_probe(unsigned int cpu); -extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi); -#endif - void acpi_processor_init_invariance_cppc(void); #endif -- cgit v1.2.3 From 09a1b33c080f6ac700fadc67c8471e67bf75fda4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Aug 2025 12:33:11 -0400 Subject: preparations to taking MNT_WRITE_HOLD out of ->mnt_flags We have an unpleasant wart in accessibility rules for struct mount. There are per-superblock lists of mounts, used by sb_prepare_remount_readonly() to check if any of those is currently claimed for write access and to block further attempts to get write access on those until we are done. As soon as it is attached to a filesystem, mount becomes reachable via that list. Only sb_prepare_remount_readonly() traverses it and it only accesses a few members of struct mount. Unfortunately, ->mnt_flags is one of those and it is modified - MNT_WRITE_HOLD set and then cleared. It is done under mount_lock, so from the locking rules POV everything's fine. However, it has easily overlooked implications - once mount has been attached to a filesystem, it has to be treated as globally visible. In particular, initializing ->mnt_flags *must* be done either prior to that point or under mount_lock. All other members are still private at that point. Life gets simpler if we move that bit (and that's *all* that can get touched by access via this list) out of ->mnt_flags. It's not even hard to do - currently the list is implemented as list_head one, anchored in super_block->s_mounts and linked via mount->mnt_instance. As the first step, switch it to hlist-like open-coded structure - address of the first mount in the set is stored in ->s_mounts and ->mnt_instance replaced with ->mnt_next_for_sb and ->mnt_pprev_for_sb - the former either NULL or pointing to the next mount in set, the latter - address of either ->s_mounts or ->mnt_next_for_sb in the previous element of the set. In the next commit we'll steal the LSB of ->mnt_pprev_for_sb as replacement for MNT_WRITE_HOLD. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/mount.h | 4 +++- fs/namespace.c | 38 +++++++++++++++++++++++++++++--------- fs/super.c | 3 +-- include/linux/fs.h | 4 +++- 4 files changed, 36 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/mount.h b/fs/mount.h index 04d0eadc4c10..b208f69f69d7 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -64,7 +64,9 @@ struct mount { #endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ - struct list_head mnt_instance; /* mount instance on sb->s_mounts */ + struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ + struct mount * __aligned(1) *mnt_pprev_for_sb; + /* except that LSB of pprev will be stolen */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ diff --git a/fs/namespace.c b/fs/namespace.c index 4e9ed4edd854..342dfd882b13 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -730,6 +730,27 @@ static inline void mnt_unhold_writers(struct mount *mnt) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } +static inline void mnt_del_instance(struct mount *m) +{ + struct mount **p = m->mnt_pprev_for_sb; + struct mount *next = m->mnt_next_for_sb; + + if (next) + next->mnt_pprev_for_sb = p; + *p = next; +} + +static inline void mnt_add_instance(struct mount *m, struct super_block *s) +{ + struct mount *first = s->s_mounts; + + if (first) + first->mnt_pprev_for_sb = &m->mnt_next_for_sb; + m->mnt_next_for_sb = first; + m->mnt_pprev_for_sb = &s->s_mounts; + s->s_mounts = m; +} + static int mnt_make_readonly(struct mount *mnt) { int ret; @@ -743,7 +764,6 @@ static int mnt_make_readonly(struct mount *mnt) int sb_prepare_remount_readonly(struct super_block *sb) { - struct mount *mnt; int err = 0; /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ @@ -751,9 +771,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) return -EBUSY; lock_mount_hash(); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { - err = mnt_hold_writers(mnt); + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (!(m->mnt.mnt_flags & MNT_READONLY)) { + err = mnt_hold_writers(m); if (err) break; } @@ -763,9 +783,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (m->mnt.mnt_flags & MNT_WRITE_HOLD) + m->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } unlock_mount_hash(); @@ -1207,7 +1227,7 @@ static void setup_mnt(struct mount *m, struct dentry *root) m->mnt_parent = m; lock_mount_hash(); - list_add_tail(&m->mnt_instance, &s->s_mounts); + mnt_add_instance(m, s); unlock_mount_hash(); } @@ -1425,7 +1445,7 @@ static void mntput_no_expire(struct mount *mnt) mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); - list_del(&mnt->mnt_instance); + mnt_del_instance(mnt); if (unlikely(!list_empty(&mnt->mnt_expire))) list_del(&mnt->mnt_expire); diff --git a/fs/super.c b/fs/super.c index 7f876f32343a..3b0f49e1b817 100644 --- a/fs/super.c +++ b/fs/super.c @@ -323,7 +323,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, if (!s) return NULL; - INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -408,7 +407,7 @@ static void __put_super(struct super_block *s) list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); - WARN_ON(!list_empty(&s->s_mounts)); + WARN_ON(s->s_mounts); call_rcu(&s->rcu, destroy_super_rcu); } } diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..0e9c7f1460dc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1324,6 +1324,8 @@ struct sb_writers { struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; +struct mount; + struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ @@ -1358,7 +1360,7 @@ struct super_block { __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ - struct list_head s_mounts; /* list of mounts; _not_ for fs use */ + struct mount *s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; struct backing_dev_info *s_bdi; -- cgit v1.2.3 From 3371fa2f27134fc4ec7d40b2ae7b9e92c3b2527e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Aug 2025 13:37:12 -0400 Subject: struct mount: relocate MNT_WRITE_HOLD bit ... from ->mnt_flags to LSB of ->mnt_pprev_for_sb. This is safe - we always set and clear it within the same mount_lock scope, so we won't interfere with list operations - traversals are always forward, so they don't even look at ->mnt_prev_for_sb and both insertions and removals are in mount_lock scopes of their own, so that bit will be clear in *all* mount instances during those. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/mount.h | 25 ++++++++++++++++++++++++- fs/namespace.c | 34 +++++++++++++++++----------------- include/linux/mount.h | 3 +-- 3 files changed, 42 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/fs/mount.h b/fs/mount.h index b208f69f69d7..40cf16544317 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -66,7 +66,8 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ struct mount * __aligned(1) *mnt_pprev_for_sb; - /* except that LSB of pprev will be stolen */ + /* except that LSB of pprev is stolen */ +#define WRITE_HOLD 1 /* ... for use by mnt_hold_writers() */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ @@ -244,4 +245,26 @@ static inline struct mount *topmost_overmount(struct mount *m) return m; } +static inline bool __test_write_hold(struct mount * __aligned(1) *val) +{ + return (unsigned long)val & WRITE_HOLD; +} + +static inline bool test_write_hold(const struct mount *m) +{ + return __test_write_hold(m->mnt_pprev_for_sb); +} + +static inline void set_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + | WRITE_HOLD); +} + +static inline void clear_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + & ~WRITE_HOLD); +} + struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index 342dfd882b13..714e159ed9cd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -509,20 +509,20 @@ int mnt_get_write_access(struct vfsmount *m) mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass - * MNT_WRITE_HOLD loop below, so that the slowpath can see our - * incremented count after it has set MNT_WRITE_HOLD. + * WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion, if the task - * setting MNT_WRITE_HOLD got preempted on a remote + * setting WRITE_HOLD got preempted on a remote * CPU, and it prevents life lock if the task setting - * MNT_WRITE_HOLD has a lower priority and is bound to + * WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); @@ -533,7 +533,7 @@ int mnt_get_write_access(struct vfsmount *m) } /* * The barrier pairs with the barrier sb_start_ro_state_change() making - * sure that if we see MNT_WRITE_HOLD cleared, we will also see + * sure that if we see WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. @@ -672,15 +672,15 @@ EXPORT_SYMBOL(mnt_drop_write_file); * @mnt. * * Context: This function expects lock_mount_hash() to be held serializing - * setting MNT_WRITE_HOLD. + * setting WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + set_write_hold(mnt); /* - * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * After storing WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); @@ -696,9 +696,9 @@ static inline int mnt_hold_writers(struct mount *mnt) * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. - * MNT_WRITE_HOLD protects against this scenario, because + * WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on - * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) @@ -720,14 +720,14 @@ static inline int mnt_hold_writers(struct mount *mnt) */ static inline void mnt_unhold_writers(struct mount *mnt) { - if (!(mnt->mnt_flags & MNT_WRITE_HOLD)) + if (!test_write_hold(mnt)) return; /* - * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * MNT_READONLY must become visible before ~WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + clear_write_hold(mnt); } static inline void mnt_del_instance(struct mount *m) @@ -766,7 +766,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) { int err = 0; - /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + /* Racy optimization. Recheck the counter under WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; @@ -784,8 +784,8 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { - if (m->mnt.mnt_flags & MNT_WRITE_HOLD) - m->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + if (test_write_hold(m)) + clear_write_hold(m); } unlock_mount_hash(); diff --git a/include/linux/mount.h b/include/linux/mount.h index 18e4b97f8a98..85e97b9340ff 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -33,7 +33,6 @@ enum mount_flags { MNT_NOSYMFOLLOW = 0x80, MNT_SHRINKABLE = 0x100, - MNT_WRITE_HOLD = 0x200, MNT_INTERNAL = 0x4000, @@ -52,7 +51,7 @@ enum mount_flags { | MNT_READONLY | MNT_NOSYMFOLLOW, MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME, - MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | + MNT_INTERNAL_FLAGS = MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_LOCKED }; -- cgit v1.2.3 From a79765248649de77771c24f7be08ff4c96f16f7a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Aug 2025 02:48:13 -0400 Subject: constify {__,}mnt_is_readonly() Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index 54066c9b8da0..4b74fabced43 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -428,7 +428,7 @@ out_free_cache: * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ -bool __mnt_is_readonly(struct vfsmount *mnt) +bool __mnt_is_readonly(const struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } @@ -468,7 +468,7 @@ static unsigned int mnt_get_writers(struct mount *mnt) #endif } -static int mnt_is_readonly(struct vfsmount *mnt) +static int mnt_is_readonly(const struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; diff --git a/include/linux/mount.h b/include/linux/mount.h index 85e97b9340ff..acfe7ef86a1b 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -76,7 +76,7 @@ extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern void mnt_make_shortterm(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(const struct path *path); -extern bool __mnt_is_readonly(struct vfsmount *mnt); +extern bool __mnt_is_readonly(const struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); -- cgit v1.2.3 From b67a8631a4a8f26a18fac236aaf61aa2412c7a0d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 13 Sep 2025 23:08:17 +0200 Subject: net: phy: remove mdio_board_info support from phylib After having removed mdio_board_info usage from dsa_loop, there's no user left. So let's drop support for it from phylib. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/01542a2e-05f5-4f13-acef-72632b33b5be@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/Makefile | 2 +- drivers/net/phy/mdio-boardinfo.c | 79 ------------------------------------- drivers/net/phy/mdio-boardinfo.h | 18 --------- drivers/net/phy/mdio_bus_provider.c | 33 ---------------- include/linux/phy.h | 10 ----- 5 files changed, 1 insertion(+), 141 deletions(-) delete mode 100644 drivers/net/phy/mdio-boardinfo.c delete mode 100644 drivers/net/phy/mdio-boardinfo.h (limited to 'include') diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 402a33d559de..76e0db40f879 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -8,7 +8,7 @@ mdio-bus-y += mdio_bus.o mdio_device.o ifdef CONFIG_PHYLIB # built-in whenever PHYLIB is built-in or module -obj-y += stubs.o mdio-boardinfo.o +obj-y += stubs.o endif libphy-$(CONFIG_SWPHY) += swphy.o diff --git a/drivers/net/phy/mdio-boardinfo.c b/drivers/net/phy/mdio-boardinfo.c deleted file mode 100644 index d3184e8f12ec..000000000000 --- a/drivers/net/phy/mdio-boardinfo.c +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * mdio-boardinfo - Collect pre-declarations for MDIO devices - */ - -#include -#include -#include -#include -#include -#include - -#include "mdio-boardinfo.h" - -static LIST_HEAD(mdio_board_list); -static DEFINE_MUTEX(mdio_board_lock); - -struct mdio_board_entry { - struct list_head list; - struct mdio_board_info board_info; -}; - -/** - * mdiobus_setup_mdiodev_from_board_info - create and setup MDIO devices - * from pre-collected board specific MDIO information - * @bus: Bus the board_info belongs to - * @cb: Callback to create device on bus - * Context: can sleep - */ -void mdiobus_setup_mdiodev_from_board_info(struct mii_bus *bus, - int (*cb) - (struct mii_bus *bus, - struct mdio_board_info *bi)) -{ - struct mdio_board_entry *be, *tmp; - - mutex_lock(&mdio_board_lock); - list_for_each_entry_safe(be, tmp, &mdio_board_list, list) { - struct mdio_board_info *bi = &be->board_info; - - if (strcmp(bus->id, bi->bus_id)) - continue; - - mutex_unlock(&mdio_board_lock); - cb(bus, bi); - mutex_lock(&mdio_board_lock); - } - mutex_unlock(&mdio_board_lock); -} -EXPORT_SYMBOL(mdiobus_setup_mdiodev_from_board_info); - -/** - * mdiobus_register_board_info - register MDIO devices for a given board - * @info: array of devices descriptors - * @n: number of descriptors provided - * Context: can sleep - * - * The board info passed can be marked with __initdata but be pointers - * such as platform_data etc. are copied as-is - */ -int mdiobus_register_board_info(const struct mdio_board_info *info, - unsigned int n) -{ - struct mdio_board_entry *be; - - be = kcalloc(n, sizeof(*be), GFP_KERNEL); - if (!be) - return -ENOMEM; - - for (int i = 0; i < n; i++, be++) { - be->board_info = info[i]; - mutex_lock(&mdio_board_lock); - list_add_tail(&be->list, &mdio_board_list); - mutex_unlock(&mdio_board_lock); - } - - return 0; -} -EXPORT_SYMBOL(mdiobus_register_board_info); diff --git a/drivers/net/phy/mdio-boardinfo.h b/drivers/net/phy/mdio-boardinfo.h deleted file mode 100644 index 0878b77878d4..000000000000 --- a/drivers/net/phy/mdio-boardinfo.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * mdio-boardinfo.h - board info interface internal to the mdio_bus - * component - */ - -#ifndef __MDIO_BOARD_INFO_H -#define __MDIO_BOARD_INFO_H - -struct mii_bus; -struct mdio_board_info; - -void mdiobus_setup_mdiodev_from_board_info(struct mii_bus *bus, - int (*cb) - (struct mii_bus *bus, - struct mdio_board_info *bi)); - -#endif /* __MDIO_BOARD_INFO_H */ diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c index f43973e73ea3..a2391d4b7e5c 100644 --- a/drivers/net/phy/mdio_bus_provider.c +++ b/drivers/net/phy/mdio_bus_provider.c @@ -29,8 +29,6 @@ #include #include -#include "mdio-boardinfo.h" - /** * mdiobus_alloc_size - allocate a mii_bus structure * @size: extra amount of memory to allocate for private storage. @@ -132,35 +130,6 @@ static void of_mdiobus_link_mdiodev(struct mii_bus *bus, } #endif -/** - * mdiobus_create_device - create a full MDIO device given - * a mdio_board_info structure - * @bus: MDIO bus to create the devices on - * @bi: mdio_board_info structure describing the devices - * - * Returns 0 on success or < 0 on error. - */ -static int mdiobus_create_device(struct mii_bus *bus, - struct mdio_board_info *bi) -{ - struct mdio_device *mdiodev; - int ret = 0; - - mdiodev = mdio_device_create(bus, bi->mdio_addr); - if (IS_ERR(mdiodev)) - return -ENODEV; - - strscpy(mdiodev->modalias, bi->modalias, - sizeof(mdiodev->modalias)); - mdiodev->dev.platform_data = (void *)bi->platform_data; - - ret = mdio_device_register(mdiodev); - if (ret) - mdio_device_free(mdiodev); - - return ret; -} - static struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr, bool c45) { struct phy_device *phydev = ERR_PTR(-ENODEV); @@ -404,8 +373,6 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) goto error; } - mdiobus_setup_mdiodev_from_board_info(bus, mdiobus_create_device); - bus->state = MDIOBUS_REGISTERED; dev_dbg(&bus->dev, "probed\n"); return 0; diff --git a/include/linux/phy.h b/include/linux/phy.h index 6f3b25cb7f4e..7da9e19471c9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2129,16 +2129,6 @@ int __phy_hwtstamp_set(struct phy_device *phydev, extern const struct bus_type mdio_bus_type; extern const struct class mdio_bus_class; -struct mdio_board_info { - const char *bus_id; - char modalias[MDIO_NAME_SIZE]; - int mdio_addr; - const void *platform_data; -}; - -int mdiobus_register_board_info(const struct mdio_board_info *info, - unsigned int n); - /** * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register -- cgit v1.2.3 From d69ae81efbc95c94a2760fc82d27cdab4c26fe76 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 17 Sep 2025 18:15:14 +0800 Subject: power: supply: core: Add resistance power supply property Some battery drivers provide the ability to export internal resistance as a parameter. Add internal_resistance power supply property for that purpose. Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- Documentation/ABI/testing/sysfs-class-power | 16 ++++++++++++++++ drivers/power/supply/power_supply_sysfs.c | 1 + include/linux/power_supply.h | 1 + 3 files changed, 18 insertions(+) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index 87a058e14e7e..e6cce423c632 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -553,6 +553,22 @@ Description: Integer > 0: representing full cycles Integer = 0: cycle_count info is not available +What: /sys/class/power_supply//internal_resistance +Date: August 2025 +Contact: linux-arm-msm@vger.kernel.org +Description: + Represent the battery's internal resistance, often referred + to as Equivalent Series Resistance (ESR). It is a dynamic + parameter that reflects the opposition to current flow within + the cell. It is not a fixed value but varies significantly + based on several operational conditions, including battery + state of charge (SoC), temperature, and whether the battery + is in a charging or discharging state. + + Access: Read + + Valid values: Represented in microohms + **USB Properties** What: /sys/class/power_supply//input_current_limit diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index 18e5e84a81c6..8ba08d456352 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -223,6 +223,7 @@ static struct power_supply_attr power_supply_attrs[] __ro_after_init = { POWER_SUPPLY_ATTR(MANUFACTURE_YEAR), POWER_SUPPLY_ATTR(MANUFACTURE_MONTH), POWER_SUPPLY_ATTR(MANUFACTURE_DAY), + POWER_SUPPLY_ATTR(INTERNAL_RESISTANCE), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(MODEL_NAME), POWER_SUPPLY_ATTR(MANUFACTURER), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index f21f806bfb38..f38da7c039d2 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -176,6 +176,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_MANUFACTURE_YEAR, POWER_SUPPLY_PROP_MANUFACTURE_MONTH, POWER_SUPPLY_PROP_MANUFACTURE_DAY, + POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, /* Properties of type `const char *' */ POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, -- cgit v1.2.3 From cd93fbdce5981c947f22015ded3ac6bd1939b0ad Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 17 Sep 2025 18:15:15 +0800 Subject: power: supply: core: Add state_of_health power supply property Add state_of_health power supply property to represent battery health percentage. Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- Documentation/ABI/testing/sysfs-class-power | 21 +++++++++++++++++++++ drivers/power/supply/power_supply_sysfs.c | 1 + include/linux/power_supply.h | 1 + 3 files changed, 23 insertions(+) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index e6cce423c632..4b21d5d23251 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -569,6 +569,27 @@ Description: Valid values: Represented in microohms +What: /sys/class/power_supply//state_of_health +Date: August 2025 +Contact: linux-arm-msm@vger.kernel.org +Description: + The state_of_health parameter quantifies the overall condition + of a battery as a percentage, reflecting its ability to deliver + rated performance relative to its original specifications. It is + dynamically computed using a combination of learned capacity + and impedance-based degradation indicators, both of which evolve + over the battery's lifecycle. + Note that the exact algorithms are kept secret by most battery + vendors and the value from different battery vendors cannot be + compared with each other as there is no vendor-agnostic definition + of "performance". Also this usually cannot be used for any + calculations (i.e. this is not the factor between charge_full and + charge_full_design). + + Access: Read + + Valid values: 0 - 100 (percent) + **USB Properties** What: /sys/class/power_supply//input_current_limit diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index 8ba08d456352..198405f7126f 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -224,6 +224,7 @@ static struct power_supply_attr power_supply_attrs[] __ro_after_init = { POWER_SUPPLY_ATTR(MANUFACTURE_MONTH), POWER_SUPPLY_ATTR(MANUFACTURE_DAY), POWER_SUPPLY_ATTR(INTERNAL_RESISTANCE), + POWER_SUPPLY_ATTR(STATE_OF_HEALTH), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(MODEL_NAME), POWER_SUPPLY_ATTR(MANUFACTURER), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index f38da7c039d2..360ffdf272da 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -177,6 +177,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_MANUFACTURE_MONTH, POWER_SUPPLY_PROP_MANUFACTURE_DAY, POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, + POWER_SUPPLY_PROP_STATE_OF_HEALTH, /* Properties of type `const char *' */ POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, -- cgit v1.2.3 From 05dfe654b5932322e297aba11dd6f3f26eea6ecb Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:36 +0300 Subject: net/mlx5: Remove unused 'offset' field from mlx5_sq_bfreg The 'offset' field was introduced in the original commit [1] and never used until commit [2], which added an unnecessary use. Remove the field and refactor the write-combining test to use a local variable instead. [1] commit a6d51b68611e ("net/mlx5: Introduce blue flame register allocator") [2] commit d98995b4bf98 ("net/mlx5: Reimplement write combining test") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/wc.c | 12 +++++++----- include/linux/mlx5/driver.h | 1 - 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wc.c b/drivers/net/ethernet/mellanox/mlx5/core/wc.c index 2f0316616fa4..276594586404 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wc.c @@ -255,7 +255,8 @@ static void mlx5_wc_destroy_sq(struct mlx5_wc_sq *sq) mlx5_wq_destroy(&sq->wq_ctrl); } -static void mlx5_wc_post_nop(struct mlx5_wc_sq *sq, bool signaled) +static void mlx5_wc_post_nop(struct mlx5_wc_sq *sq, unsigned int *offset, + bool signaled) { int buf_size = (1 << MLX5_CAP_GEN(sq->cq.mdev, log_bf_reg_size)) / 2; struct mlx5_wqe_ctrl_seg *ctrl; @@ -288,10 +289,10 @@ static void mlx5_wc_post_nop(struct mlx5_wc_sq *sq, bool signaled) */ wmb(); - __iowrite64_copy(sq->bfreg.map + sq->bfreg.offset, mmio_wqe, + __iowrite64_copy(sq->bfreg.map + *offset, mmio_wqe, sizeof(mmio_wqe) / 8); - sq->bfreg.offset ^= buf_size; + *offset ^= buf_size; } static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq) @@ -332,6 +333,7 @@ static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq) static void mlx5_core_test_wc(struct mlx5_core_dev *mdev) { + unsigned int offset = 0; unsigned long expires; struct mlx5_wc_sq *sq; int i, err; @@ -358,9 +360,9 @@ static void mlx5_core_test_wc(struct mlx5_core_dev *mdev) goto err_create_sq; for (i = 0; i < TEST_WC_NUM_WQES - 1; i++) - mlx5_wc_post_nop(sq, false); + mlx5_wc_post_nop(sq, &offset, false); - mlx5_wc_post_nop(sq, true); + mlx5_wc_post_nop(sq, &offset, true); expires = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES; do { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fcfc18bfeba9..5a85b6d91ba3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -434,7 +434,6 @@ struct mlx5_sq_bfreg { struct mlx5_uars_page *up; bool wc; u32 index; - unsigned int offset; }; struct mlx5_core_health { -- cgit v1.2.3 From aa4595d0ada65d5d44fa924a42a87c175d9d88e3 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:38 +0300 Subject: net/mlx5: Store the global doorbell in mlx5_priv The global doorbell is used for more than just Ethernet resources, so move it out of mlx5e_hw_objs into a common place (mlx5_priv), to avoid non-Ethernet modules (e.g. HWS, ASO) depending on Ethernet structs. Use this opportunity to consolidate it with the 'uar' pointer already there, which was used as an RX doorbell. Underneath the 'uar' pointer is identical to 'bfreg->up', so store a single resource and use that instead. For CQ doorbells, care is taken to always use bfreg->up->index instead of bfreg->index, which may refer to a subsequent UAR page from the same ALLOC_UAR batch on some NICs. This paves the way for cleanly supporting multiple doorbells in the Ethernet driver. Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/infiniband/hw/mlx5/cq.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 11 +---------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++----- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/main.c | 11 +++++------ drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/wc.c | 4 ++-- include/linux/mlx5/driver.h | 3 +-- 12 files changed, 29 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 9c8003a78334..a23b364e24ff 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -648,7 +648,7 @@ int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; struct mlx5_ib_cq *cq = to_mcq(ibcq); - void __iomem *uar_page = mdev->priv.uar->map; + void __iomem *uar_page = mdev->priv.bfreg.up->map; unsigned long irq_flags; int ret = 0; @@ -923,7 +923,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, cq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); - *index = dev->mdev->priv.uar->index; + *index = dev->mdev->priv.bfreg.up->index; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 1fd403713baf..35039a95dcfd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -145,7 +145,7 @@ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n", cq->cqn); - cq->uar = dev->priv.uar; + cq->uar = dev->priv.bfreg.up; cq->irqn = eq->core.irqn; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 31e7f59bc19b..b6b4ae7c59fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -810,7 +810,7 @@ static void mlx5e_build_common_cq_param(struct mlx5_core_dev *mdev, { void *cqc = param->cqc; - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) MLX5_SET(cqc, cqc, cqe_sz, CQE_STRIDE_128_PAD); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 391b4e9c9dc4..7c1d9a9ea464 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -334,7 +334,7 @@ static int mlx5e_ptp_alloc_txqsq(struct mlx5e_ptp *c, int txq_ix, sq->mdev = mdev; sq->ch_ix = MLX5E_PTP_CHANNEL_IX; sq->txq_ix = txq_ix; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); sq->stats = &c->priv->ptp_stats.sq[tc]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index 6ed3a32b7e22..e9e36358c39d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -163,17 +163,11 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) goto err_dealloc_transport_domain; } - err = mlx5_alloc_bfreg(mdev, &res->bfreg, false, false); - if (err) { - mlx5_core_err(mdev, "alloc bfreg failed, %d\n", err); - goto err_destroy_mkey; - } - if (create_tises) { err = mlx5e_create_tises(mdev, res->tisn); if (err) { mlx5_core_err(mdev, "alloc tises failed, %d\n", err); - goto err_destroy_bfreg; + goto err_destroy_mkey; } res->tisn_valid = true; } @@ -190,8 +184,6 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) return 0; -err_destroy_bfreg: - mlx5_free_bfreg(mdev, &res->bfreg); err_destroy_mkey: mlx5_core_destroy_mkey(mdev, res->mkey); err_dealloc_transport_domain: @@ -209,7 +201,6 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) mdev->mlx5e_res.dek_priv = NULL; if (res->tisn_valid) mlx5e_destroy_tises(mdev, res->tisn); - mlx5_free_bfreg(mdev, &res->bfreg); mlx5_core_destroy_mkey(mdev, res->mkey); mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); mlx5_core_dealloc_pd(mdev, res->pdn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index fe5f5ae433b7..22e3bc72a265 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1536,7 +1536,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->pdev = c->pdev; sq->mkey_be = c->mkey_be; sq->channel = c; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu) - ETH_FCS_LEN; sq->xsk_pool = xsk_pool; @@ -1621,7 +1621,7 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, int err; sq->channel = c; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->reserved_room = param->stop_room; param->wq.db_numa_node = cpu_to_node(c->cpu); @@ -1706,7 +1706,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, sq->priv = c->priv; sq->ch_ix = c->ix; sq->txq_ix = txq_ix; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); sq->max_sq_mpw_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev); @@ -1782,7 +1782,7 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev, MLX5_SET(sqc, sqc, flush_in_error_en, 1); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, uar_page, mdev->priv.bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, csp->wq_ctrl->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, csp->wq_ctrl->db.dma); @@ -2277,7 +2277,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) MLX5_SET(cqc, cqc, cq_period_mode, mlx5e_cq_period_mode(param->cq_period_mode)); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index f3c714ebd9cb..25499da177bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -307,7 +307,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry); MLX5_SET(eqc, eqc, log_eq_size, eq->fbc.log_sz); - MLX5_SET(eqc, eqc, uar_page, priv->uar->index); + MLX5_SET(eqc, eqc, uar_page, priv->bfreg.up->index); MLX5_SET(eqc, eqc, intr, vecidx); MLX5_SET(eqc, eqc, log_page_size, eq->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); @@ -320,7 +320,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, eq->eqn = MLX5_GET(create_eq_out, out, eq_number); eq->irqn = pci_irq_vector(dev->pdev, vecidx); eq->dev = dev; - eq->doorbell = priv->uar->map + MLX5_EQ_DOORBELL_OFFSET; + eq->doorbell = priv->bfreg.up->map + MLX5_EQ_DOORBELL_OFFSET; err = mlx5_debug_eq_add(dev, eq); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c index 58bd749b5e4d..129725159a93 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c @@ -100,7 +100,7 @@ static int create_aso_cq(struct mlx5_aso_cq *cq, void *cqc_data) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -129,7 +129,7 @@ static int mlx5_aso_create_cq(struct mlx5_core_dev *mdev, int numa_node, return -ENOMEM; MLX5_SET(cqc, cqc_data, log_cq_size, 1); - MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.bfreg.up->index); if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) MLX5_SET(cqc, cqc_data, cqe_sz, CQE_STRIDE_128_PAD); @@ -163,7 +163,7 @@ static int mlx5_aso_alloc_sq(struct mlx5_core_dev *mdev, int numa_node, struct mlx5_wq_param param; int err; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; param.db_numa_node = numa_node; param.buf_numa_node = numa_node; @@ -203,7 +203,7 @@ static int create_aso_sq(struct mlx5_core_dev *mdev, int pdn, MLX5_SET(sqc, sqc, ts_format, ts_format); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, uar_page, mdev->priv.bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index eb3ac98a2621..6b6d6b05b893 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1316,10 +1316,9 @@ static int mlx5_load(struct mlx5_core_dev *dev) { int err; - dev->priv.uar = mlx5_get_uars_page(dev); - if (IS_ERR(dev->priv.uar)) { - mlx5_core_err(dev, "Failed allocating uar, aborting\n"); - err = PTR_ERR(dev->priv.uar); + err = mlx5_alloc_bfreg(dev, &dev->priv.bfreg, false, false); + if (err) { + mlx5_core_err(dev, "Failed allocating bfreg, %d\n", err); return err; } @@ -1430,7 +1429,7 @@ err_eq_table: err_irq_table: mlx5_pagealloc_stop(dev); mlx5_events_stop(dev); - mlx5_put_uars_page(dev, dev->priv.uar); + mlx5_free_bfreg(dev, &dev->priv.bfreg); return err; } @@ -1455,7 +1454,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev) mlx5_irq_table_destroy(dev); mlx5_pagealloc_stop(dev); mlx5_events_stop(dev); - mlx5_put_uars_page(dev, dev->priv.uar); + mlx5_free_bfreg(dev, &dev->priv.bfreg); } int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index b0595c9b09e4..24ef7d66fa8a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -690,7 +690,7 @@ static int hws_send_ring_alloc_sq(struct mlx5_core_dev *mdev, size_t buf_sz; int err; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->mdev = mdev; param.db_numa_node = numa_node; @@ -764,7 +764,7 @@ static int hws_send_ring_create_sq(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(sqc, sqc, ts_format, ts_format); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, uar_page, mdev->priv.bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); @@ -940,7 +940,7 @@ static int hws_send_ring_create_cq(struct mlx5_core_dev *mdev, (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas)); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -963,7 +963,7 @@ static int hws_send_ring_open_cq(struct mlx5_core_dev *mdev, if (!cqc_data) return -ENOMEM; - MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc_data, log_cq_size, ilog2(queue->num_entries)); err = hws_send_ring_alloc_cq(mdev, numa_node, queue, cqc_data, cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wc.c b/drivers/net/ethernet/mellanox/mlx5/core/wc.c index 276594586404..999d6216648a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wc.c @@ -94,7 +94,7 @@ static int create_wc_cq(struct mlx5_wc_cq *cq, void *cqc_data) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -116,7 +116,7 @@ static int mlx5_wc_create_cq(struct mlx5_core_dev *mdev, struct mlx5_wc_cq *cq) return -ENOMEM; MLX5_SET(cqc, cqc, log_cq_size, TEST_WC_LOG_CQ_SZ); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) MLX5_SET(cqc, cqc, cqe_sz, CQE_STRIDE_128_PAD); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5a85b6d91ba3..15c434fedff7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -612,7 +612,7 @@ struct mlx5_priv { struct mlx5_ft_pool *ft_pool; struct mlx5_bfreg_data bfregs; - struct mlx5_uars_page *uar; + struct mlx5_sq_bfreg bfreg; #ifdef CONFIG_MLX5_SF struct mlx5_vhca_state_notifier *vhca_state_notifier; struct mlx5_sf_dev_table *sf_dev_table; @@ -658,7 +658,6 @@ struct mlx5e_resources { u32 pdn; struct mlx5_td td; u32 mkey; - struct mlx5_sq_bfreg bfreg; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; bool tisn_valid; -- cgit v1.2.3 From a315b723e87ba4e4573e1e5c759d512f38bdc0b3 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:40 +0300 Subject: net/mlx5e: Prepare for using different CQ doorbells Completion queues (CQs) in mlx5 use the same global doorbell, which may become contended when accessed concurrently from many cores. This patch prepares the CQ management code for supporting different doorbells per CQ. This will be used in downstream patches to allow separate doorbells to be used by channels CQs. The main change is moving the 'uar' pointer from struct mlx5_core_cq to struct mlx5e_cq, as the uar page to be used is better off stored directly there. Other users of mlx5_core_cq also store the UAR to be used separately and therefore the pointer being removed is dead weight for them. As evidence, in this patch there are two users which set the mcq.uar pointer but didn't use it, Software Steering and old Innova CQ creation code. Instead, they rang the doorbell directly from another pointer. The 'uar' pointer added to struct mlx5e_cq remains in a hot cacheline (as before), because it may get accessed for each packet. Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 5 +---- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++++--- drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c | 1 - include/linux/mlx5/cq.h | 1 - 7 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 35039a95dcfd..e9f319a9bdd6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -145,7 +145,6 @@ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n", cq->cqn); - cq->uar = dev->priv.bfreg.up; cq->irqn = eq->core.irqn; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 9c73165653bf..1cbe3f3037bb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -344,6 +344,7 @@ struct mlx5e_cq { /* data path - accessed per napi poll */ u16 event_ctr; struct napi_struct *napi; + struct mlx5_uars_page *uar; struct mlx5_core_cq mcq; struct mlx5e_ch_stats *ch_stats; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 5dc04bbfc71b..6760bb0336df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -309,10 +309,7 @@ mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map, static inline void mlx5e_cq_arm(struct mlx5e_cq *cq) { - struct mlx5_core_cq *mcq; - - mcq = &cq->mcq; - mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc); + mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, cq->uar->map, cq->wq.cc); } static inline struct mlx5e_sq_dma * diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c1491ed3db1c..93505c82d6f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2189,6 +2189,7 @@ static void mlx5e_close_xdpredirect_sq(struct mlx5e_xdpsq *xdpsq) static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, struct net_device *netdev, struct workqueue_struct *workqueue, + struct mlx5_uars_page *uar, struct mlx5e_cq_param *param, struct mlx5e_cq *cq) { @@ -2220,6 +2221,7 @@ static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, cq->mdev = mdev; cq->netdev = netdev; cq->workqueue = workqueue; + cq->uar = uar; return 0; } @@ -2235,7 +2237,8 @@ static int mlx5e_alloc_cq(struct mlx5_core_dev *mdev, param->wq.db_numa_node = ccp->node; param->eq_ix = ccp->ix; - err = mlx5e_alloc_cq_common(mdev, ccp->netdev, ccp->wq, param, cq); + err = mlx5e_alloc_cq_common(mdev, ccp->netdev, ccp->wq, + mdev->priv.bfreg.up, param, cq); cq->napi = ccp->napi; cq->ch_stats = ccp->ch_stats; @@ -2280,7 +2283,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) MLX5_SET(cqc, cqc, cq_period_mode, mlx5e_cq_period_mode(param->cq_period_mode)); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); + MLX5_SET(cqc, cqc, uar_page, cq->uar->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -3593,7 +3596,8 @@ static int mlx5e_alloc_drop_cq(struct mlx5e_priv *priv, param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); param->wq.db_numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); - return mlx5e_alloc_cq_common(priv->mdev, priv->netdev, priv->wq, param, cq); + return mlx5e_alloc_cq_common(priv->mdev, priv->netdev, priv->wq, + mdev->priv.bfreg.up, param, cq); } int mlx5e_open_drop_rq(struct mlx5e_priv *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index c4de6bf8d1b6..cb1319974f83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -475,7 +475,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) *conn->cq.mcq.arm_db = 0; conn->cq.mcq.vector = 0; conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; - conn->cq.mcq.uar = fdev->conn_res.uar; tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet); mlx5_fpga_dbg(fdev, "Created CQ #0x%x\n", conn->cq.mcq.cqn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index 4fd4e8483382..077a77fde670 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1131,7 +1131,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, *cq->mcq.arm_db = cpu_to_be32(2 << 28); cq->mcq.vector = 0; - cq->mcq.uar = uar; cq->mdev = mdev; return cq; diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 991526039ccb..7ef2c7c7d803 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -41,7 +41,6 @@ struct mlx5_core_cq { int cqe_sz; __be32 *set_ci_db; __be32 *arm_db; - struct mlx5_uars_page *uar; refcount_t refcount; struct completion free; unsigned vector; -- cgit v1.2.3 From 71fb4832d50b01f0af2d257360c239879ce93a8e Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:41 +0300 Subject: net/mlx5e: Use multiple TX doorbells First, allocate more doorbells in mlx5e_create_mdev_resources: - one doorbell remains 'global' and will be used by all non-channel associated SQs (e.g. ASO, HWS, PTP, ...). - allocate additional 'num_doorbells' doorbells. This defaults to minimum between 8 and max number of channels. mlx5e_channel_pick_doorbell() now spreads out channel SQs across available doorbells. Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_common.c | 29 +++++++++++++++++++++- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 +++++++- include/linux/mlx5/driver.h | 4 +++ 3 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index e9e36358c39d..d13cebbc763a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -143,6 +143,7 @@ err_close_tises: int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) { struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; + unsigned int num_doorbells, i; int err; err = mlx5_core_alloc_pd(mdev, &res->pdn); @@ -163,11 +164,30 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) goto err_dealloc_transport_domain; } + num_doorbells = min(MLX5_DEFAULT_NUM_DOORBELLS, + mlx5e_get_max_num_channels(mdev)); + res->bfregs = kcalloc(num_doorbells, sizeof(*res->bfregs), GFP_KERNEL); + if (!res->bfregs) { + err = -ENOMEM; + goto err_destroy_mkey; + } + + for (i = 0; i < num_doorbells; i++) { + err = mlx5_alloc_bfreg(mdev, res->bfregs + i, false, false); + if (err) { + mlx5_core_warn(mdev, + "could only allocate %d/%d doorbells, err %d.\n", + i, num_doorbells, err); + break; + } + } + res->num_bfregs = i; + if (create_tises) { err = mlx5e_create_tises(mdev, res->tisn); if (err) { mlx5_core_err(mdev, "alloc tises failed, %d\n", err); - goto err_destroy_mkey; + goto err_destroy_bfregs; } res->tisn_valid = true; } @@ -184,6 +204,10 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) return 0; +err_destroy_bfregs: + for (i = 0; i < res->num_bfregs; i++) + mlx5_free_bfreg(mdev, res->bfregs + i); + kfree(res->bfregs); err_destroy_mkey: mlx5_core_destroy_mkey(mdev, res->mkey); err_dealloc_transport_domain: @@ -201,6 +225,9 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) mdev->mlx5e_res.dek_priv = NULL; if (res->tisn_valid) mlx5e_destroy_tises(mdev, res->tisn); + for (unsigned int i = 0; i < res->num_bfregs; i++) + mlx5_free_bfreg(mdev, res->bfregs + i); + kfree(res->bfregs); mlx5_core_destroy_mkey(mdev, res->mkey); mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); mlx5_core_dealloc_pd(mdev, res->pdn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 93505c82d6f0..0700656b3c3e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2752,7 +2752,16 @@ void mlx5e_trigger_napi_sched(struct napi_struct *napi) static void mlx5e_channel_pick_doorbell(struct mlx5e_channel *c) { - c->bfreg = &c->mdev->priv.bfreg; + struct mlx5e_hw_objs *hw_objs = &c->mdev->mlx5e_res.hw_objs; + + /* No dedicated Ethernet doorbells, use the global one. */ + if (hw_objs->num_bfregs == 0) { + c->bfreg = &c->mdev->priv.bfreg; + return; + } + + /* Round-robin between doorbells. */ + c->bfreg = hw_objs->bfregs + c->vec_ix % hw_objs->num_bfregs; } static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 15c434fedff7..99b34e4809ae 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -658,6 +658,8 @@ struct mlx5e_resources { u32 pdn; struct mlx5_td td; u32 mkey; + struct mlx5_sq_bfreg *bfregs; + unsigned int num_bfregs; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; bool tisn_valid; @@ -801,6 +803,8 @@ struct mlx5_db { int index; }; +#define MLX5_DEFAULT_NUM_DOORBELLS 8 + enum { MLX5_COMP_EQ_SIZE = 1024, }; -- cgit v1.2.3 From 6bdcb735fec6cb866b0d40634d4f23effba81074 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:43 +0300 Subject: devlink: Add a 'num_doorbells' driverinit param This parameter can be used by drivers to configure a different number of doorbells. Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Reviewed-by: Jiri Pirko Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-params.rst | 3 +++ include/net/devlink.h | 4 ++++ net/devlink/param.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'include') diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst index c51da4fba7e7..0a9c20d70122 100644 --- a/Documentation/networking/devlink/devlink-params.rst +++ b/Documentation/networking/devlink/devlink-params.rst @@ -148,3 +148,6 @@ own name. - The max number of Virtual Functions (VFs) exposed by the PF. after reboot/pci reset, 'sriov_totalvfs' entry under the device's sysfs directory will report this value. + * - ``num_doorbells`` + - u32 + - Controls the number of doorbells used by the device. diff --git a/include/net/devlink.h b/include/net/devlink.h index 8d4362f010e4..9e824f61e40f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -531,6 +531,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, + DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -598,6 +599,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME "total_vfs" #define DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME "num_doorbells" +#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/devlink/param.c b/net/devlink/param.c index 33134940c266..70e69523412c 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -107,6 +107,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME, .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, + .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME, + .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From 542a495cbaa6dc57a310da62b501fdf318657cad Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:25 +0200 Subject: tcp: AccECN core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change implements Accurate ECN without negotiation and AccECN Option (that will be added by later changes). Based on AccECN specifications: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN allows feeding back the number of CE (congestion experienced) marks accurately to the sender in contrast to RFC3168 ECN that can only signal one marks-seen-yes/no per RTT. Congestion control algorithms can take advantage of the accurate ECN information to fine-tune their congestion response to avoid drastic rate reduction when only mild congestion is encountered. With Accurate ECN, tp->received_ce (r.cep in AccECN spec) keeps track of how many segments have arrived with a CE mark. Accurate ECN uses ACE field (ECE, CWR, AE) to communicate the value back to the sender which updates tp->delivered_ce (s.cep) based on the feedback. This signalling channel is lossy when ACE field overflow occurs. Conservative strategy is selected here to deal with the ACE overflow, however, some strategies using the AccECN option later in the overall patchset mitigate against false overflows detected. The ACE field values on the wire are offset by TCP_ACCECN_CEP_INIT_OFFSET. Delivered_ce/received_ce count the real CE marks rather than forcing all downstream users to adapt to the wire offset. This patch uses the first 1-byte hole and the last 4-byte hole of the tcp_sock_write_txrx for 'received_ce_pending' and 'received_ce'. Also, the group size of tcp_sock_write_txrx is increased from 91 + 4 to 95 + 4 due to the new u32 received_ce member. Below are the trimmed pahole outcomes before and after this patch. [BEFORE THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* XXX 2 bytes hole, try to pack */ [...] u32 delivered_ce; /* 2576 4 */ u32 app_limited; /* 2580 4 */ u32 rcv_wnd; /* 2684 4 */ struct tcp_options_received rx_opt; /* 2688 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 161 */ } [AFTER THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] /* size: 3200, cachelines: 50, members: 164 */ } Signed-off-by: Ilpo Järvinen Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-2-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- .../networking/net_cachelines/tcp_sock.rst | 2 + include/linux/tcp.h | 3 + include/net/tcp.h | 15 ++++ include/net/tcp_ecn.h | 53 ++++++++++- net/ipv4/tcp.c | 5 +- net/ipv4/tcp_input.c | 100 +++++++++++++++++++-- net/ipv4/tcp_output.c | 9 +- 7 files changed, 175 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 7bbda5944ee2..31313a9adccc 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -101,6 +101,8 @@ u32 prr_delivered u32 prr_out read_mostly read_mostly tcp_rate_skb_sent,tcp_newly_delivered(tx);tcp_ack,tcp_rate_gen,tcp_clean_rtx_queue(rx) u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) +u32 received_ce read_mostly read_write +u8:4 received_ce_pending read_mostly read_write u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u64 first_tx_mstamp read_write tcp_rate_skb_sent diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d103cc0e7a35..90cee6e53527 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -287,6 +287,8 @@ struct tcp_sock { */ u8 nonagle : 4,/* Disable Nagle algorithm? */ rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ + unused2:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -299,6 +301,7 @@ struct tcp_sock { u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ + u32 received_ce; /* Like the above but for rcvd CE marked pkts */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp.h b/include/net/tcp.h index e25340459ce4..bc5159fe842e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -973,6 +973,14 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCP_ACCECN_CEP_ACE_MASK 0x7 +#define TCP_ACCECN_ACE_MAX_DELTA 6 + +/* To avoid/detect middlebox interference, not all counters start at 0. + * See draft-ietf-tcpm-accurate-ecn for the latest values. + */ +#define TCP_ACCECN_CEP_INIT_OFFSET 5 + /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ @@ -1782,11 +1790,18 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { + u32 ace; + /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; + ace = tcp_ecn_mode_accecn(tp) ? + ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) & + TCP_ACCECN_CEP_ACE_MASK) : 0; + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + (ace << 22) | ntohl(TCP_FLAG_ACK) | snd_wnd); } diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index b3430557676b..b0ed89dbad41 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -12,6 +12,7 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { + /* Do not set CWR if in AccECN mode! */ if (tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -19,8 +20,10 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) static inline void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) { - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; /* If the sender is telling us it has entered CWR, then its * cwnd may be very low (even just 1 packet), so we should ACK @@ -36,6 +39,52 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } +static inline u8 tcp_accecn_ace(const struct tcphdr *th) +{ + return (th->ae << 2) | (th->cwr << 1) | th->ece; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; + tp->received_ce_pending = 0; +} + +/* Updates Accurate ECN received counters from the received IP ECN field */ +static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) +{ + u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + u8 is_ce = INET_ECN_is_ce(ecnfield); + struct tcp_sock *tp = tcp_sk(sk); + + if (!INET_ECN_is_not_ect(ecnfield)) { + u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); + + /* As for accurate ECN, the TCP_ECN_SEEN flag is set by + * tcp_ecn_received_counters() when the ECN codepoint of + * received TCP data or ACK contains ECT(0), ECT(1), or CE. + */ + if (!tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_SEEN; + + /* ACE counter tracks *all* segments including pure ACKs */ + tp->received_ce += pcount; + tp->received_ce_pending = min(tp->received_ce_pending + pcount, + 0xfU); + } +} + +static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) +{ + u32 wire_ace; + + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; +} + static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1f643faa8b93..16456c10e5e8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,6 +271,7 @@ #include #include #include +#include #include #include #include @@ -3406,6 +3407,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tcp_accecn_init_counters(tp); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5138,6 +5140,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5145,7 +5148,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 91 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b2793e749cfd..98782134c2f4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -360,16 +360,25 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && + tcp_ecn_mode_rfc3168(tp)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } + /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by + * tcp_data_ecn_check() when the ECN codepoint of + * received TCP data contains ECT(0), ECT(1), or CE. + */ + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; } @@ -385,10 +394,64 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; - if (ece_ack) + if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } +/* Returns the ECN CE delta */ +static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, int flag) +{ + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + u32 delta, safe_delta; + u32 corrected_ace; + + /* Reordered ACK or uncertain due to lack of data to send and ts */ + if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) + return 0; + + if (!(flag & FLAG_SLOWPATH)) { + /* AccECN counter might overflow on large ACKs */ + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return 0; + } + + /* ACE field is not available during handshake */ + if (flag & FLAG_SYN_ACKED) + return 0; + + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + + corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; + delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return delta; + + safe_delta = delivered_pkts - + ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + + return safe_delta; +} + +static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, int *flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 delta; + + delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + if (delta > 0) { + tcp_count_delivered_ce(tp, delta); + *flag |= FLAG_ECE; + /* Recalculate header predictor */ + if (tp->pred_flags) + tcp_fast_path_on(tp); + } + return delta; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -3744,7 +3807,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) } /* Returns the number of packets newly acked or sacked by the current ACK */ -static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, + u32 ecn_count, int flag) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3752,8 +3816,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) - NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + + if (flag & FLAG_ECE) { + if (tcp_ecn_mode_rfc3168(tp)) + ecn_count = delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count); + } return delivered; } @@ -3774,6 +3842,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 delivered = tp->delivered; u32 lost = tp->lost; int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */ u32 prior_fack; sack_state.first_sackt = 0; @@ -3881,6 +3950,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + &flag); + tcp_in_ack_event(sk, flag); if (tp->tlp_high_seq) @@ -3905,7 +3979,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tcp_newly_delivered(sk, delivered, flag); + delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag); + lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); @@ -3914,12 +3989,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3940,7 +4019,7 @@ old_ack: &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); tcp_xmit_recovery(sk, rexmit); } @@ -6071,6 +6150,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); + tcp_ecn_received_counters(sk, skb); + /* We know that such packets are checksummed * on entry. */ @@ -6119,6 +6200,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); + tcp_ecn_received_counters(sk, skb); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6159,6 +6241,8 @@ validate: return; step5: + tcp_ecn_received_counters(sk, skb); + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { reason = -reason; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index be8ceefa5332..a3a6d3e91d84 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -328,7 +328,14 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_ecn_mode_rfc3168(tp)) { + if (!tcp_ecn_mode_any(tp)) + return; + + if (tcp_ecn_mode_accecn(tp)) { + INET_ECN_xmit(sk); + tcp_accecn_set_ace(th, tp); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; + } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { -- cgit v1.2.3 From 3cae34274c79e0c60ccd1c10516973af1aed2a7c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:26 +0200 Subject: tcp: accecn: AccECN negotiation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accurate ECN negotiation parts based on the specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN is negotiated using ECE, CWR and AE flags in the TCP header. TCP falls back into using RFC3168 ECN if one of the ends supports only RFC3168-style ECN. The AccECN negotiation includes reflecting IP ECN field value seen in SYN and SYNACK back using the same bits as negotiation to allow responding to SYN CE marks and to detect ECN field mangling. CE marks should not occur currently because SYN=1 segments are sent with Non-ECT in IP ECN field (but proposal exists to remove this restriction). Reflecting SYN IP ECN field in SYNACK is relatively simple. Reflecting SYNACK IP ECN field in the final/third ACK of the handshake is more challenging. Linux TCP code is not well prepared for using the final/third ACK a signalling channel which makes things somewhat complicated here. tcp_ecn sysctl can be used to select the highest ECN variant (Accurate ECN, ECN, No ECN) that is attemped to be negotiated and requested for incoming connection and outgoing connection: TCP_ECN_IN_NOECN_OUT_NOECN, TCP_ECN_IN_ECN_OUT_ECN, TCP_ECN_IN_ECN_OUT_NOECN, TCP_ECN_IN_ACCECN_OUT_ACCECN, TCP_ECN_IN_ACCECN_OUT_ECN, and TCP_ECN_IN_ACCECN_OUT_NOECN. After this patch, the size of tcp_request_sock remains unchanged and no new holes are added. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_request_sock { [...] u32 rcv_nxt; /* 352 4 */ u8 syn_tos; /* 356 1 */ /* size: 360, cachelines: 6, members: 16 */ } [AFTER THIS PATCH] struct tcp_request_sock { [...] u32 rcv_nxt; /* 352 4 */ u8 syn_tos; /* 356 1 */ bool accecn_ok; /* 357 1 */ u8 syn_ect_snt:2; /* 358: 0 1 */ u8 syn_ect_rcv:2; /* 358: 2 1 */ u8 accecn_fail_mode:4; /* 358: 4 1 */ /* size: 360, cachelines: 6, members: 20 */ } After this patch, the size of tcp_sock remains unchanged and no new holes are added. Also, 4 bits of the existing 2-byte hole are exploited. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 dup_ack_counter:2; /* 2761: 0 1 */ u8 tlp_retrans:1; /* 2761: 2 1 */ u8 unused:5; /* 2761: 3 1 */ u8 thin_lto:1; /* 2762: 0 1 */ u8 fastopen_connect:1; /* 2762: 1 1 */ u8 fastopen_no_cookie:1; /* 2762: 2 1 */ u8 fastopen_client_fail:2; /* 2762: 3 1 */ u8 frto:1; /* 2762: 5 1 */ /* XXX 2 bits hole, try to pack */ [...] u8 keepalive_probes; /* 2765 1 */ /* XXX 2 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 164 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 dup_ack_counter:2; /* 2761: 0 1 */ u8 tlp_retrans:1; /* 2761: 2 1 */ u8 syn_ect_snt:2; /* 2761: 3 1 */ u8 syn_ect_rcv:2; /* 2761: 5 1 */ u8 thin_lto:1; /* 2761: 7 1 */ u8 fastopen_connect:1; /* 2762: 0 1 */ u8 fastopen_no_cookie:1; /* 2762: 1 1 */ u8 fastopen_client_fail:2; /* 2762: 2 1 */ u8 frto:1; /* 2762: 4 1 */ /* XXX 3 bits hole, try to pack */ [...] u8 keepalive_probes; /* 2765 1 */ u8 accecn_fail_mode:4; /* 2766: 0 1 */ /* XXX 4 bits hole, try to pack */ /* XXX 1 byte hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 166 */ } Signed-off-by: Ilpo Järvinen Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-3-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 36 ++- .../networking/net_cachelines/tcp_sock.rst | 3 + include/linux/tcp.h | 8 +- include/net/tcp.h | 1 + include/net/tcp_ecn.h | 310 +++++++++++++++++++-- net/ipv4/syncookies.c | 4 + net/ipv4/sysctl_net_ipv4.c | 3 +- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 50 +++- net/ipv4/tcp_ipv4.c | 6 +- net/ipv4/tcp_minisocks.c | 24 +- net/ipv4/tcp_output.c | 10 +- net/ipv6/syncookies.c | 2 + net/ipv6/tcp_ipv6.c | 1 + 14 files changed, 400 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 9f5891c9b07b..3d8eb54b84f9 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -443,20 +443,28 @@ tcp_early_retrans - INTEGER tcp_ecn - INTEGER Control use of Explicit Congestion Notification (ECN) by TCP. - ECN is used only when both ends of the TCP connection indicate - support for it. This feature is useful in avoiding losses due - to congestion by allowing supporting routers to signal - congestion before having to drop packets. - - Possible values are: - - = ===================================================== - 0 Disable ECN. Neither initiate nor accept ECN. - 1 Enable ECN when requested by incoming connections and - also request ECN on outgoing connection attempts. - 2 Enable ECN when requested by incoming connections - but do not request ECN on outgoing connections. - = ===================================================== + ECN is used only when both ends of the TCP connection indicate support + for it. This feature is useful in avoiding losses due to congestion by + allowing supporting routers to signal congestion before having to drop + packets. A host that supports ECN both sends ECN at the IP layer and + feeds back ECN at the TCP layer. The highest variant of ECN feedback + that both peers support is chosen by the ECN negotiation (Accurate ECN, + ECN, or no ECN). + + The highest negotiated variant for incoming connection requests + and the highest variant requested by outgoing connection + attempts: + + ===== ==================== ==================== + Value Incoming connections Outgoing connections + ===== ==================== ==================== + 0 No ECN No ECN + 1 ECN ECN + 2 ECN No ECN + 3 AccECN AccECN + 4 AccECN ECN + 5 AccECN No ECN + ===== ==================== ==================== Default: 2 diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 31313a9adccc..4f71ece7c655 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -103,6 +103,9 @@ u32 delivered read_mostly read_w u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u8:2 syn_ect_snt write_mostly read_write +u8:2 syn_ect_rcv read_mostly read_write +u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u64 first_tx_mstamp read_write tcp_rate_skb_sent diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 90cee6e53527..b8432bed546d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -168,6 +168,10 @@ struct tcp_request_sock { * after data-in-SYN. */ u8 syn_tos; + bool accecn_ok; + u8 syn_ect_snt: 2, + syn_ect_rcv: 2, + accecn_fail_mode:4; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -375,7 +379,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + syn_ect_snt:2, /* AccECN ECT memory, only */ + syn_ect_rcv:2; /* ... needed during 3WHS + first seqno */ u8 thin_lto : 1,/* Use linear timeouts for thin streams */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ @@ -391,6 +396,7 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ + u8 accecn_fail_mode:4; /* AccECN failure handling */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ diff --git a/include/net/tcp.h b/include/net/tcp.h index bc5159fe842e..da8c6640ead3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -972,6 +972,7 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR) #define TCP_ACCECN_CEP_ACE_MASK 0x7 #define TCP_ACCECN_ACE_MAX_DELTA 6 diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index b0ed89dbad41..da0b355418bd 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -4,12 +4,26 @@ #include #include +#include #include #include #include #include +/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is + * attemped to be negotiated and requested for incoming connection + * and outgoing connection, respectively. + */ +enum tcp_ecn_mode { + TCP_ECN_IN_NOECN_OUT_NOECN = 0, + TCP_ECN_IN_ECN_OUT_ECN = 1, + TCP_ECN_IN_ECN_OUT_NOECN = 2, + TCP_ECN_IN_ACCECN_OUT_ACCECN = 3, + TCP_ECN_IN_ACCECN_OUT_ECN = 4, + TCP_ECN_IN_ACCECN_OUT_NOECN = 5, +}; + static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { /* Do not set CWR if in AccECN mode! */ @@ -39,19 +53,125 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } +/* tp->accecn_fail_mode */ +#define TCP_ACCECN_ACE_FAIL_SEND BIT(0) +#define TCP_ACCECN_ACE_FAIL_RECV BIT(1) +#define TCP_ACCECN_OPT_FAIL_SEND BIT(2) +#define TCP_ACCECN_OPT_FAIL_RECV BIT(3) + +static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; +} + +static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV; +} + +static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND; +} + +static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV; +} + +static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) +{ + tp->accecn_fail_mode |= mode; +} + static inline u8 tcp_accecn_ace(const struct tcphdr *th) { return (th->ae << 2) | (th->cwr << 1) | th->ece; } -static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +/* Infer the ECT value our SYN arrived with from the echoed ACE field */ +static inline int tcp_accecn_extract_syn_ect(u8 ace) { - tp->received_ce = 0; - tp->received_ce_pending = 0; + /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */ + static const int ace_to_ecn[8] = { + INET_ECN_ECT_0, /* 0b000 (Undefined) */ + INET_ECN_ECT_1, /* 0b001 (Undefined) */ + INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */ + INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */ + INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */ + INET_ECN_ECT_1, /* 0b101 (Reserved) */ + INET_ECN_CE, /* 0b110 (CE is received) */ + INET_ECN_ECT_1 /* 0b111 (Undefined) */ + }; + + return ace_to_ecn[ace & 0x7]; +} + +/* Check ECN field transition to detect invalid transitions */ +static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv) +{ + if (rcv == snt) + return true; + + /* Non-ECT altered to something or something became non-ECT */ + if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT) + return false; + /* CE -> ECT(0/1)? */ + if (snt == INET_ECN_CE) + return false; + return true; +} + +static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, + u8 sent_ect) +{ + u8 ect = tcp_accecn_extract_syn_ect(ace); + struct tcp_sock *tp = tcp_sk(sk); + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + return true; + + if (!tcp_ect_transition_valid(sent_ect, ect)) { + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + return false; + } + + return true; +} + +/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ +static inline void tcp_accecn_third_ack(struct sock *sk, + const struct sk_buff *skb, u8 sent_ect) +{ + u8 ace = tcp_accecn_ace(tcp_hdr(skb)); + struct tcp_sock *tp = tcp_sk(sk); + + switch (ace) { + case 0x0: + /* Invalid value */ + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + break; + case 0x7: + case 0x5: + case 0x1: + /* Unused but legal values */ + break; + default: + /* Validation only applies to first non-data packet */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && + !TCP_SKB_CB(skb)->sacked && + tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { + if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && + !tp->delivered_ce) + tp->delivered_ce++; + } + break; + } } /* Updates Accurate ECN received counters from the received IP ECN field */ -static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) +static inline void tcp_ecn_received_counters(struct sock *sk, + const struct sk_buff *skb) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); @@ -74,27 +194,152 @@ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_bu } } -static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) +/* AccECN specification, 5.1: [...] a server can determine that it + * negotiated AccECN as [...] if the ACK contains an ACE field with + * the value 0b010 to 0b111 (decimal 2 to 7). + */ +static inline bool cookie_accecn_ok(const struct tcphdr *th) { - u32 wire_ace; + return tcp_accecn_ace(th) > 0x1; +} + +/* Used to form the ACE flags for SYN/ACK */ +static inline u16 tcp_accecn_reflector_flags(u8 ect) +{ + /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN. + * Below is an excerpt from the 1st block of Table 2 of AccECN spec, + * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE + */ + static const u8 ecn_to_ace_flags[4] = { + 0b010, /* Not-ECT is received */ + 0b011, /* ECT(1) is received */ + 0b100, /* ECT(0) is received */ + 0b110 /* CE is received */ + }; + + return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]); +} - wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; - th->ece = !!(wire_ace & 0x1); - th->cwr = !!(wire_ace & 0x2); - th->ae = !!(wire_ace & 0x4); +/* AccECN specification, 3.1.2: If a TCP server that implements AccECN + * receives a SYN with the three TCP header flags (AE, CWR and ECE) set + * to any combination other than 000, 011 or 111, it MUST negotiate the + * use of AccECN as if they had been set to 111. + */ +static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) +{ + u8 ace = tcp_accecn_ace(th); + + return ace && ace != 0x3; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; tp->received_ce_pending = 0; } -static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, - const struct tcphdr *th) +/* Used for make_synack to form the ACE flags */ +static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) { - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) + /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received + * from SYN. Below is an excerpt from Table 2 of the AccECN spec: + * +====================+====================================+ + * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK | + * | received on SYN | AE CWR ECE | + * +====================+====================================+ + * | Not-ECT | 0 1 0 | + * | ECT(1) | 0 1 1 | + * | ECT(0) | 1 0 0 | + * | CE | 1 1 0 | + * +====================+====================================+ + */ + th->ae = !!(ect & INET_ECN_ECT_0); + th->cwr = ect != INET_ECN_ECT_0; + th->ece = ect == INET_ECN_ECT_1; +} + +static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, + struct tcphdr *th) +{ + u32 wire_ace; + + /* The final packet of the 3WHS or anything like it must reflect + * the SYN/ACK ECT instead of putting CEP into ACE field, such + * case show up in tcp_flags. + */ + if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) { + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; + } +} + +/* See Table 2 of the AccECN draft */ +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, + u8 ip_dsfield) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 ace = tcp_accecn_ace(th); + + switch (ace) { + case 0x0: + case 0x7: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | No ECN | 0 0 0 | Not ECN | + * | AccECN | Broken | 1 1 1 | Not ECN | + * +========+========+============+=============+ + */ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + break; + case 0x1: + case 0x5: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | Nonce | 1 0 1 | (Reserved) | + * | AccECN | ECN | 0 0 1 | Classic ECN | + * | Nonce | AccECN | 0 0 1 | Classic ECN | + * | ECN | AccECN | 0 0 1 | Classic ECN | + * +========+========+============+=============+ + */ + if (tcp_ecn_mode_pending(tp)) + /* Downgrade from AccECN, or requested initially */ + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + break; + default: + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + if (INET_ECN_is_ce(ip_dsfield) && + tcp_accecn_validate_syn_feedback(sk, ace, + tp->syn_ect_snt)) { + tp->received_ce++; + tp->received_ce_pending++; + } + break; + } } -static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, - const struct tcphdr *th) +static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, + const struct sk_buff *skb) { + if (tcp_ecn_mode_pending(tp)) { + if (!tcp_accecn_syn_requested(th)) { + /* Downgrade to classic ECN feedback */ + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } else { + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + } + } if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } @@ -110,7 +355,7 @@ static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, /* Packet ECN state for a SYN-ACK */ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { - const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (tcp_ecn_disabled(tp)) @@ -118,6 +363,13 @@ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) INET_ECN_xmit(sk); + + if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) { + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + TCP_SKB_CB(skb)->tcp_flags |= + tcp_accecn_reflector_flags(tp->syn_ect_rcv); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } } /* Packet ECN state for a SYN. */ @@ -125,8 +377,13 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; + bool use_ecn, use_accecn; + u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn); + + use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN; + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); @@ -142,23 +399,32 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) INET_ECN_xmit(sk); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + if (use_accecn) { + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } } } static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) { /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. */ - TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + } } static inline void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { - if (inet_rsk(req)->ecn_ok) + if (tcp_rsk(req)->accecn_ok) + tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); + else if (inet_rsk(req)->ecn_ok) th->ece = 1; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eb0819463fae..569befcf021b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -12,6 +12,7 @@ #include #include #include +#include #include static siphash_aligned_key_t syncookie_secret[2]; @@ -403,6 +404,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); + struct tcp_request_sock *treq; struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4; @@ -428,6 +430,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } ireq = inet_rsk(req); + treq = tcp_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); @@ -483,6 +486,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); + treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3a43010d726f..268f8b86e8a7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,6 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; +static int tcp_ecn_mode_max = 2; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -728,7 +729,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, + .extra2 = &tcp_ecn_mode_max, }, { .procname = "tcp_ecn_fallback", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 16456c10e5e8..7261ee6dd875 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3407,6 +3407,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 98782134c2f4..8449a5a3e368 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3665,8 +3665,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } +static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 flags = 0; + + if (accecn_reflector) + flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv); + __tcp_send_ack(sk, tp->rcv_nxt, flags); +} + /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -3696,7 +3706,7 @@ static void tcp_send_challenge_ack(struct sock *sk) WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, accecn_reflector); } } @@ -3863,7 +3873,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; } goto old_ack; @@ -5907,6 +5917,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); + bool accecn_reflector = false; SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ @@ -6004,7 +6015,7 @@ step1: if (tp->syn_fastopen && !tp->data_segs_in && sk->sk_state == TCP_ESTABLISHED) tcp_fastopen_active_disable(sk); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); SKB_DR_SET(reason, TCP_RESET); goto discard; } @@ -6015,6 +6026,8 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { + if (tcp_ecn_mode_accecn(tp)) + accecn_reflector = true; if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && @@ -6024,7 +6037,7 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, accecn_reflector); SKB_DR_SET(reason, TCP_INVALID_SYN); goto discard; } @@ -6493,7 +6506,8 @@ consume: * state to ESTABLISHED..." */ - tcp_ecn_rcv_synack(tp, th); + if (tcp_ecn_mode_any(tp)) + tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -6565,7 +6579,7 @@ consume: TCP_DELACK_MAX, false); goto consume; } - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp)); return -1; } @@ -6624,7 +6638,7 @@ consume: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - tcp_ecn_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -6806,7 +6820,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } /* accept old ack during closing */ if ((int)reason < 0) { - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); reason = -reason; goto discard; } @@ -6853,9 +6867,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); + if (tcp_ecn_mode_accecn(tp)) + tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); if (sk->sk_shutdown & SEND_SHUTDOWN) tcp_shutdown(sk, SEND_SHUTDOWN); + break; case TCP_FIN_WAIT1: { @@ -7025,6 +7042,15 @@ static void tcp_ecn_create_request(struct request_sock *req, bool ect, ecn_ok; u32 ecn_ok_dst; + if (tcp_accecn_syn_requested(th) && + READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { + inet_rsk(req)->ecn_ok = 1; + tcp_rsk(req)->accecn_ok = 1; + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + return; + } + if (!th_ecn) return; @@ -7032,7 +7058,8 @@ static void tcp_ecn_create_request(struct request_sock *req, ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; - if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + if (((!ect || th->res1 || th->ae) && ecn_ok) || + tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; @@ -7050,6 +7077,9 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; + tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->syn_ect_rcv = 0; + tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2a0602035729..6162f8dbe9d2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -1189,7 +1190,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { - const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; @@ -1202,6 +1203,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); tos = READ_ONCE(inet_sk(sk)->tos); @@ -3558,7 +3560,7 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { - net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c2ae07d8d5d..a4b8be6fdcdc 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -20,6 +20,7 @@ */ #include +#include #include #include #include @@ -451,12 +452,23 @@ void tcp_openreq_init_rwin(struct request_sock *req, ireq->rcv_wscale = rcv_wscale; } -static void tcp_ecn_openreq_child(struct tcp_sock *tp, - const struct request_sock *req) +static void tcp_ecn_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? - TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + const struct tcp_request_sock *treq = tcp_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + + if (treq->accecn_ok) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tcp_ecn_received_counters(sk, skb); + } else { + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? + TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); + } } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -621,7 +633,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); + tcp_ecn_openreq_child(newsk, req, skb); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a3a6d3e91d84..deb9b085a8a2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -332,8 +332,9 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, return; if (tcp_ecn_mode_accecn(tp)) { - INET_ECN_xmit(sk); - tcp_accecn_set_ace(th, tp); + if (!tcp_accecn_ace_fail_recv(tp)) + INET_ECN_xmit(sk); + tcp_accecn_set_ace(tp, skb, th); skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ @@ -3356,7 +3357,10 @@ start: tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback */ + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check covers both + * cases. + */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index f0ee1a909771..7e007f013ec8 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -16,6 +16,7 @@ #include #include #include +#include #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -264,6 +265,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); + tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, dst); if (!ret) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 08dabc47a6e7..5f0a138f4220 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -544,6 +544,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK; __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); -- cgit v1.2.3 From 9a011277445583bab002fbf5043fab0ea03dc5dd Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:27 +0200 Subject: tcp: accecn: add AccECN rx byte counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These three byte counters track IP ECN field payload byte sums for all arriving (acceptable) packets for ECT0, ECT1, and CE. The AccECN option (added by a later patch in the series) echoes these counters back to sender side; therefore, it is placed within the group of tcp_sock_write_txrx. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_txrx is increased from 95 + 4 to 107 + 4 and an extra 4-byte hole is created but will be exploited in later patches: [BEFORE THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] /* size: 3200, cachelines: 50, members: 166 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 received_ecn_bytes[3];/* 2584 12 */ u32 app_limited; /* 2596 4 */ u32 rcv_wnd; /* 2600 4 */ struct tcp_options_received rx_opt; /* 2604 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 167 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-4-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- .../networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 4 +++ include/net/tcp_ecn.h | 29 +++++++++++++++++++++- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_input.c | 7 +++--- net/ipv4/tcp_minisocks.c | 2 +- 6 files changed, 40 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 4f71ece7c655..5a2b0af57364 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -102,6 +102,7 @@ u32 prr_out read_mostly read_m u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write +u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b8432bed546d..012d01347b3c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -306,6 +306,10 @@ struct tcp_sock { u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 received_ce; /* Like the above but for rcvd CE marked pkts */ + u32 received_ecn_bytes[3]; /* received byte counters for three ECN + * types: INET_ECN_ECT_1, INET_ECN_ECT_0, + * and INET_ECN_CE + */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index da0b355418bd..1a41a459aa07 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -171,7 +171,7 @@ static inline void tcp_accecn_third_ack(struct sock *sk, /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, u32 len) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); @@ -191,9 +191,24 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce += pcount; tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); + + if (len > 0) + tp->received_ecn_bytes[ecnfield - 1] += len; } } +/* AccECN specification, 2.2: [...] A Data Receiver maintains four counters + * initialized at the start of the half-connection. [...] These byte counters + * reflect only the TCP payload length, excluding TCP header and TCP options. + */ +static inline void tcp_ecn_received_counters_payload(struct sock *sk, + const struct sk_buff *skb) +{ + const struct tcphdr *th = (const struct tcphdr *)skb->data; + + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); +} + /* AccECN specification, 5.1: [...] a server can determine that it * negotiated AccECN as [...] if the ACK contains an ACE field with * the value 0b010 to 0b111 (decimal 2 to 7). @@ -232,10 +247,22 @@ static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) return ace && ace != 0x3; } +static inline void __tcp_accecn_init_bytes_counters(int *counter_array) +{ + BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); + BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); + BUILD_BUG_ON(INET_ECN_CE != 0x3); + + counter_array[INET_ECN_ECT_1 - 1] = 0; + counter_array[INET_ECN_ECT_0 - 1] = 0; + counter_array[INET_ECN_CE - 1] = 0; +} + static inline void tcp_accecn_init_counters(struct tcp_sock *tp) { tp->received_ce = 0; tp->received_ce_pending = 0; + __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); } /* Used for make_synack to form the ACE flags */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7261ee6dd875..a45a4184b603 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5142,6 +5142,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5149,7 +5150,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 107 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8449a5a3e368..636a63383412 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6163,7 +6163,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, 0); /* We know that such packets are checksummed * on entry. @@ -6213,7 +6213,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, + len - tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6254,7 +6255,7 @@ validate: return; step5: - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters_payload(sk, skb); reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a4b8be6fdcdc..1dbcc09ff7a9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,7 +463,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? TCP_ECN_MODE_RFC3168 : -- cgit v1.2.3 From b5e74132dfbe60329b3ff0e5c485039f2e31605c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:30 +0200 Subject: tcp: accecn: AccECN option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Accurate ECN allows echoing back the sum of bytes for each IP ECN field value in the received packets using AccECN option. This change implements AccECN option tx & rx side processing without option send control related features that are added by a later change. Based on specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt (Some features of the spec will be added in the later changes rather than in this one). A full-length AccECN option is always attempted but if it does not fit, the minimum length is selected based on the counters that have changed since the last update. The AccECN option (with 24-bit fields) often ends in odd sizes so the option write code tries to take advantage of some nop used to pad the other TCP options. The delivered_ecn_bytes pairs with received_ecn_bytes similar to how delivered_ce pairs with received_ce. In contrast to ACE field, however, the option is not always available to update delivered_ecn_bytes. For ACK w/o AccECN option, the delivered bytes calculated based on the cumulative ACK+SACK information are assigned to one of the counters using an estimation heuristic to select the most likely ECN byte counter. Any estimation error is corrected when the next AccECN option arrives. It may occur that the heuristic gets too confused when there are enough different byte counter deltas between ACKs with the AccECN option in which case the heuristic just gives up on updating the counters for a while. tcp_ecn_option sysctl can be used to select option sending mode for AccECN: TCP_ECN_OPTION_DISABLED, TCP_ECN_OPTION_MINIMUM, and TCP_ECN_OPTION_FULL. This patch increases the size of tcp_info struct, as there is no existing holes for new u32 variables. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ /* size: 248, cachelines: 4, members: 61 */ } [AFTER THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ __u32 tcpi_received_ce; /* 248 4 */ __u32 tcpi_delivered_e1_bytes; /* 252 4 */ __u32 tcpi_delivered_e0_bytes; /* 256 4 */ __u32 tcpi_delivered_ce_bytes; /* 260 4 */ __u32 tcpi_received_e1_bytes; /* 264 4 */ __u32 tcpi_received_e0_bytes; /* 268 4 */ __u32 tcpi_received_ce_bytes; /* 272 4 */ /* size: 280, cachelines: 5, members: 68 */ } This patch uses the existing 1-byte holes in the tcp_sock_write_txrx group for new u8 members, but adds a 4-byte hole in tcp_sock_write_rx group after the new u32 delivered_ecn_bytes[3] member. Therefore, the group size of tcp_sock_write_rx is increased from 96 to 112. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4; /* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2728 0 */ [...] /* size: 3200, cachelines: 50, members: 167 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ u32 delivered_ecn_bytes[3];/* 2672 12 */ /* XXX 4 bytes hole, try to pack */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2744 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-7-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 19 +++ .../networking/net_cachelines/tcp_sock.rst | 3 + include/linux/tcp.h | 9 +- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 13 ++ include/net/tcp_ecn.h | 89 ++++++++++- include/uapi/linux/tcp.h | 7 + net/ipv4/sysctl_net_ipv4.c | 9 ++ net/ipv4/tcp.c | 15 +- net/ipv4/tcp_input.c | 94 +++++++++++- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 165 ++++++++++++++++++++- 12 files changed, 412 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 3d8eb54b84f9..1c206501b973 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -468,6 +468,25 @@ tcp_ecn - INTEGER Default: 2 +tcp_ecn_option - INTEGER + Control Accurate ECN (AccECN) option sending when AccECN has been + successfully negotiated during handshake. Send logic inhibits + sending AccECN options regarless of this setting when no AccECN + option has been seen for the reverse direction. + + Possible values are: + + = ============================================================ + 0 Never send AccECN option. This also disables sending AccECN + option in SYN/ACK during handshake. + 1 Send AccECN option sparingly according to the minimum option + rules outlined in draft-ietf-tcpm-accurate-ecn. + 2 Send AccECN option on every packet whenever it fits into TCP + option space. + = ============================================================ + + Default: 2 + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 5a2b0af57364..b941151f8c0a 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -104,8 +104,11 @@ u32 delivered_ce read_mostly read_w u32 received_ce read_mostly read_write u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u32[3] delivered_ecn_bytes read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write +u8:2 accecn_minlen write_mostly read_write +u8:2 est_ecnfield read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 012d01347b3c..73557656cb2d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -122,8 +122,9 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ - u8 saw_unknown:1, /* Received unknown option */ - unused:7; + u8 accecn:6, /* AccECN index in header, 0=no options */ + saw_unknown:1, /* Received unknown option */ + unused:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -293,6 +294,9 @@ struct tcp_sock { rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ + est_ecnfield:2,/* ECN field for AccECN delivered estimates */ + unused3:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -337,6 +341,7 @@ struct tcp_sock { u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_rtt_last_tsecr; + u32 delivered_ecn_bytes[3]; u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 54a7d187f62a..acbb7dd497e1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -148,6 +148,7 @@ struct netns_ipv4 { struct local_ports ip_local_ports; u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_option; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index da8c6640ead3..6be29129465e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -213,6 +213,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ +#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ +#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt @@ -230,6 +232,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 +#define TCPOLEN_ACCECN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 @@ -243,6 +246,13 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 +#define TCPOLEN_ACCECN_PERFIELD 3 + +/* Maximum number of byte counters in AccECN option + size */ +#define TCP_ACCECN_NUMFIELDS 3 +#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ + TCPOLEN_ACCECN_PERFIELD * \ + TCP_ACCECN_NUMFIELDS) /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -981,6 +991,9 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) * See draft-ietf-tcpm-accurate-ecn for the latest values. */ #define TCP_ACCECN_CEP_INIT_OFFSET 5 +#define TCP_ACCECN_E1B_INIT_OFFSET 1 +#define TCP_ACCECN_E0B_INIT_OFFSET 1 +#define TCP_ACCECN_CEB_INIT_OFFSET 0 /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 1a41a459aa07..08c7f4757e4e 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -24,6 +24,13 @@ enum tcp_ecn_mode { TCP_ECN_IN_ACCECN_OUT_NOECN = 5, }; +/* AccECN option sending when AccECN has been successfully negotiated */ +enum tcp_accecn_option { + TCP_ACCECN_OPTION_DISABLED = 0, + TCP_ACCECN_OPTION_MINIMUM = 1, + TCP_ACCECN_OPTION_FULL = 2, +}; + static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { /* Do not set CWR if in AccECN mode! */ @@ -169,6 +176,79 @@ static inline void tcp_accecn_third_ack(struct sock *sk, } } +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). + */ +static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return 1; + case INET_ECN_CE: + return 2; + case INET_ECN_ECT_0: + return 3; + } + return 0; +} + +/* Maps IP ECN field ECT/CE code point to AccECN option field value offset. + * Some fields do not start from zero, to detect zeroing by middleboxes. + */ +static inline u32 tcp_accecn_field_init_offset(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return TCP_ACCECN_E1B_INIT_OFFSET; + case INET_ECN_CE: + return TCP_ACCECN_CEB_INIT_OFFSET; + case INET_ECN_ECT_0: + return TCP_ACCECN_E0B_INIT_OFFSET; + } + return 0; +} + +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option, + bool order) +{ + /* Based on Table 5 of the AccECN spec to map (option, order) to + * the corresponding ECN conuters (ECT-1, ECT-0, or CE). + */ + static const u8 optfield_lookup[2][3] = { + /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */ + { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 }, + /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */ + { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 } + }; + + return optfield_lookup[order][option % 3]; +} + +/* Handles AccECN option ECT and CE 24-bit byte counters update into + * the u32 value in tcp_sock. As we're processing TCP options, it is + * safe to access from - 1. + */ +static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, + u32 init_offset) +{ + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & + 0xFFFFFFU; + u32 delta = (truncated - *cnt) & 0xFFFFFFU; + + /* If delta has the highest bit set (24th bit) indicating + * negative, sign extend to correct an estimation using + * sign_extend32(delta, 24 - 1) + */ + delta = sign_extend32(delta, 23); + *cnt += delta; + return (s32)delta; +} + /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, u32 len) @@ -192,8 +272,12 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); - if (len > 0) + if (len > 0) { + u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); tp->received_ecn_bytes[ecnfield - 1] += len; + tp->accecn_minlen = max_t(u8, tp->accecn_minlen, + minlen); + } } } @@ -263,6 +347,9 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) tp->received_ce = 0; tp->received_ce_pending = 0; __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); + __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); + tp->accecn_minlen = 0; + tp->est_ecnfield = 0; } /* Used for make_synack to form the ACE flags */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index bdac8c42fa82..53e0e85b52be 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -316,6 +316,13 @@ struct tcp_info { * in milliseconds, including any * unfinished recovery. */ + __u32 tcpi_received_ce; /* # of CE marks received */ + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ + __u32 tcpi_delivered_e0_bytes; + __u32 tcpi_delivered_ce_bytes; + __u32 tcpi_received_e1_bytes; + __u32 tcpi_received_e0_bytes; + __u32 tcpi_received_ce_bytes; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 268f8b86e8a7..4a697acb4e85 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -731,6 +731,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_ecn_mode_max, }, + { + .procname = "tcp_ecn_option", + .data = &init_net.ipv4.sysctl_tcp_ecn_option, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a45a4184b603..8c4a4b8666fc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include +#include #include #include #include @@ -4155,6 +4156,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; @@ -4281,6 +4285,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_received_ce = tp->received_ce; + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; + info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; + info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -5162,12 +5174,13 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 112); } void __init tcp_init(void) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8b48a3c00945..e898a76c485e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -384,6 +385,73 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) } } +/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp, + const struct sk_buff *skb, + u32 delivered_bytes, int flag) +{ + u8 estimate_ecnfield = tp->est_ecnfield; + bool ambiguous_ecn_bytes_incr = false; + bool first_changed = false; + unsigned int optlen; + bool order1, res; + unsigned int i; + u8 *ptr; + + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (estimate_ecnfield) { + u8 ecnfield = estimate_ecnfield - 1; + + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; + return true; + } + return false; + } + + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; + optlen = ptr[1] - 2; + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return false; + order1 = (ptr[0] == TCPOPT_ACCECN1); + ptr += 2; + + res = !!estimate_ecnfield; + for (i = 0; i < 3; i++) { + u32 init_offset; + u8 ecnfield; + s32 delta; + u32 *cnt; + + if (optlen < TCPOLEN_ACCECN_PERFIELD) + break; + + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); + init_offset = tcp_accecn_field_init_offset(ecnfield); + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); + if (delta && delta < 0) { + res = false; + ambiguous_ecn_bytes_incr = true; + } + if (delta && ecnfield != estimate_ecnfield) { + if (!first_changed) { + tp->est_ecnfield = ecnfield; + first_changed = true; + } else { + res = false; + ambiguous_ecn_bytes_incr = true; + } + } + + optlen -= TCPOLEN_ACCECN_PERFIELD; + ptr += TCPOLEN_ACCECN_PERFIELD; + } + if (ambiguous_ecn_bytes_incr) + tp->est_ecnfield = 0; + + return res; +} + static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; @@ -400,7 +468,8 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int flag) + u32 delivered_pkts, u32 delivered_bytes, + int flag) { const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); @@ -411,6 +480,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; + tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) @@ -436,12 +507,14 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, } static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int *flag) + u32 delivered_pkts, u32 delivered_bytes, + int *flag) { struct tcp_sock *tp = tcp_sk(sk); u32 delta; - delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + delta = __tcp_accecn_process(sk, skb, delivered_pkts, + delivered_bytes, *flag); if (delta > 0) { tcp_count_delivered_ce(tp, delta); *flag |= FLAG_ECE; @@ -3973,6 +4046,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); @@ -4012,6 +4086,7 @@ no_queue: if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ @@ -4139,6 +4214,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { @@ -4230,6 +4306,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + /* Save offset of AccECN option in TCP header */ + opt_rx->accecn = (ptr - 2) - (__u8 *)th; + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. @@ -4290,11 +4372,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { - if (tcp_parse_aligned_timestamp(tp, th)) + if (tcp_parse_aligned_timestamp(tp, th)) { + tp->rx_opt.accecn = 0; return true; + } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -6119,6 +6204,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6162f8dbe9d2..aa8dbfe20924 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3561,6 +3561,7 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; + net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5be2b3eb73d3..34e5c83bbace 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,6 +385,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) +#define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -406,6 +407,8 @@ struct tcp_out_options { u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 num_accecn_fields:7, /* number of AccECN fields needed */ + use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ @@ -603,6 +606,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp, return ptr; } +/* Initial values for AccECN option, ordered is based on ECN field bits + * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. + */ +static const u32 synack_ecn_bytes[3] = { 0, 0, 0 }; + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -621,6 +629,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts, struct tcp_key *key) { + u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */ + u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ @@ -656,15 +666,71 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } + if (OPTION_ACCECN & options) { + const u32 *ecn_bytes = opts->use_synack_ecn_bytes ? + synack_ecn_bytes : + tp->received_ecn_bytes; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ce_idx = INET_ECN_CE - 1; + u32 e0b; + u32 e1b; + u32 ceb; + u8 len; + + e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; + e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; + ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; + len = TCPOLEN_ACCECN_BASE + + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; + + if (opts->num_accecn_fields == 2) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + } else if (opts->num_accecn_fields == 1) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + leftover_highbyte = e1b & 0xff; + leftover_lowbyte = TCPOPT_NOP; + } else if (opts->num_accecn_fields == 0) { + leftover_highbyte = TCPOPT_ACCECN1; + leftover_lowbyte = len; + } else if (opts->num_accecn_fields == 3) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + *ptr++ = htonl(((e0b & 0xffffff) << 8) | + TCPOPT_NOP); + } + if (tp) + tp->accecn_minlen = 0; + } + if (unlikely(OPTION_SACK_ADVERTISE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_WSCALE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | + u8 highbyte = TCPOPT_NOP; + + /* Do not split the leftover 2-byte to fit into a single + * NOP, i.e., replace this NOP only when 1 byte is leftover + * within leftover_highbyte. + */ + if (unlikely(leftover_highbyte != TCPOPT_NOP && + leftover_lowbyte == TCPOPT_NOP)) { + highbyte = leftover_highbyte; + leftover_highbyte = TCPOPT_NOP; + } + *ptr++ = htonl((highbyte << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); @@ -675,11 +741,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, tp->duplicate_sack : tp->selective_acks; int this_sack; - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { @@ -688,6 +756,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, } tp->rx_opt.dsack = 0; + } else if (unlikely(leftover_highbyte != TCPOPT_NOP || + leftover_lowbyte != TCPOPT_NOP)) { + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | + (TCPOPT_NOP << 8) | + TCPOPT_NOP); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { @@ -768,6 +844,61 @@ static void mptcp_set_option_cond(const struct request_sock *req, } } +static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) +{ + /* How much there's room for combining with the alignment padding? */ + if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == + OPTION_SACK_ADVERTISE) + return 2; + else if (opts->options & OPTION_WSCALE) + return 1; + return 0; +} + +/* Calculates how long AccECN option will fit to @remaining option space. + * + * AccECN option can sometimes replace NOPs used for alignment of other + * TCP options (up to @max_combine_saving available). + * + * Only solutions with at least @required AccECN fields are accepted. + * + * Returns: The size of the AccECN option excluding space repurposed from + * the alignment of the other options. + */ +static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, + int remaining) +{ + int size = TCP_ACCECN_MAXSIZE; + int max_combine_saving; + int align_size; + + if (opts->use_synack_ecn_bytes) + max_combine_saving = tcp_synack_options_combine_saving(opts); + else + max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0; + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + while (opts->num_accecn_fields >= required) { + /* Pad to dword if cannot combine */ + if ((size & 0x3) > max_combine_saving) + align_size = ALIGN(size, 4); + else + align_size = ALIGN_DOWN(size, 4); + + if (remaining >= align_size) { + size = align_size; + break; + } + + opts->num_accecn_fields--; + size -= TCPOLEN_ACCECN_PERFIELD; + } + if (opts->num_accecn_fields < required) + return 0; + + opts->options |= OPTION_ACCECN; + return size; +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -850,6 +981,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && + tcp_ecn_mode_accecn(tp) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE)) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -867,6 +1007,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; + struct tcp_request_sock *treq = tcp_rsk(req); if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; @@ -929,6 +1070,13 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + if (treq->accecn_ok && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); @@ -1001,6 +1149,13 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } + if (tcp_ecn_mode_accecn(tp) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; -- cgit v1.2.3 From aa55a7dde7ec506bb23448a5005ae3f4f809d022 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:31 +0200 Subject: tcp: accecn: AccECN option send control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of sending the option in every ACK, limit sending to those ACKs where the option is necessary: - Handshake - "Change-triggered ACK" + the ACK following it. The 2nd ACK is necessary to unambiguously indicate which of the ECN byte counters in increasing. The first ACK has two counters increasing due to the ecnfield edge. - ACKs with CE to allow CEP delta validations to take advantage of the option. - Force option to be sent every at least once per 2^22 bytes. The check is done using the bit edges of the byte counters (avoids need for extra variables). - AccECN option beacon to send a few times per RTT even if nothing in the ECN state requires that. The default is 3 times per RTT, and its period can be set via sysctl_tcp_ecn_option_beacon. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_tx is increased from 89 to 97 due to the new u64 accecn_opt_tstamp member: [BEFORE THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ struct list_head tsorted_sent_queue; /* 2496 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2521 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ u64 accecn_opt_tstamp; /* 2596 8 */ struct list_head tsorted_sent_queue; /* 2504 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2529 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2529 0 */ u8 nonagle:4; /* 2529: 0 1 */ u8 rate_app_limited:1; /* 2529: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2530: 0 1 */ u8 unused2:4; /* 2530: 4 1 */ u8 accecn_minlen:2; /* 2531: 0 1 */ u8 est_ecnfield:2; /* 2531: 2 1 */ u8 accecn_opt_demand:2; /* 2531: 4 1 */ u8 prev_ecnfield:2; /* 2531: 6 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2636 0 */ [...] /* size: 3200, cachelines: 50, members: 173 */ } Signed-off-by: Chia-Yu Chang Co-developed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-8-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 6 +++ .../networking/net_cachelines/tcp_sock.rst | 3 ++ include/linux/tcp.h | 4 +- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 ++ include/net/tcp_ecn.h | 52 ++++++++++++++++++++++ net/ipv4/sysctl_net_ipv4.c | 9 ++++ net/ipv4/tcp.c | 5 ++- net/ipv4/tcp_input.c | 4 +- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 2 + net/ipv4/tcp_output.c | 26 ++++++++--- 12 files changed, 107 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 1c206501b973..a06cb99d66dc 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -487,6 +487,12 @@ tcp_ecn_option - INTEGER Default: 2 +tcp_ecn_option_beacon - INTEGER + Control Accurate ECN (AccECN) option sending frequency per RTT and it + takes effect only when tcp_ecn_option is set to 2. + + Default: 3 (AccECN will be send at least 3 times per RTT) + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index b941151f8c0a..d4dc01800945 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -109,6 +109,9 @@ u8:2 syn_ect_snt write_mostly read_w u8:2 syn_ect_rcv read_mostly read_write u8:2 accecn_minlen write_mostly read_write u8:2 est_ecnfield read_write +u8:2 accecn_opt_demand read_mostly read_write +u8:2 prev_ecnfield read_write +u64 accecn_opt_tstamp read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 73557656cb2d..f637b659b35a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -275,6 +275,7 @@ struct tcp_sock { u32 mdev_us; /* medium deviation */ u32 rtt_seq; /* sequence number to update rttvar */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ + u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set @@ -296,7 +297,8 @@ struct tcp_sock { unused2:4; u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ est_ecnfield:2,/* ECN field for AccECN delivered estimates */ - unused3:4; + accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */ + prev_ecnfield:2; /* ECN bits from the previous segment */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index acbb7dd497e1..34eb3aecb3f2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -149,6 +149,7 @@ struct netns_ipv4 { u8 sysctl_tcp_ecn; u8 sysctl_tcp_ecn_option; + u8 sysctl_tcp_ecn_option_beacon; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6be29129465e..78dd7b8a4145 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -100,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U +/* Default sending frequency of accurate ECN option per RTT */ +#define TCP_ACCECN_OPTION_BEACON 3 + /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 08c7f4757e4e..133fb6b79500 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -176,6 +176,17 @@ static inline void tcp_accecn_third_ack(struct sock *sk, } } +/* Demand the minimum # to send AccECN optnio */ +static inline void tcp_accecn_opt_demand_min(struct sock *sk, + u8 opt_demand_min) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 opt_demand; + + opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand); + tp->accecn_opt_demand = opt_demand; +} + /* Maps IP ECN field ECT/CE code point to AccECN option field number, given * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). */ @@ -256,6 +267,7 @@ static inline void tcp_ecn_received_counters(struct sock *sk, u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); struct tcp_sock *tp = tcp_sk(sk); + bool ecn_edge; if (!INET_ECN_is_not_ect(ecnfield)) { u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); @@ -274,9 +286,34 @@ static inline void tcp_ecn_received_counters(struct sock *sk, if (len > 0) { u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); + u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1]; + u32 bytes_mask = GENMASK_U32(31, 22); + tp->received_ecn_bytes[ecnfield - 1] += len; tp->accecn_minlen = max_t(u8, tp->accecn_minlen, minlen); + + /* Send AccECN option at least once per 2^22-byte + * increase in any ECN byte counter. + */ + if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) & + bytes_mask) { + tcp_accecn_opt_demand_min(sk, 1); + } + } + } + + ecn_edge = tp->prev_ecnfield != ecnfield; + if (ecn_edge || is_ce) { + tp->prev_ecnfield = ecnfield; + /* Demand Accurate ECN change-triggered ACKs. Two ACK are + * demanded to indicate unambiguously the ecnfield value + * in the latter ACK. + */ + if (tcp_ecn_mode_accecn(tp)) { + if (ecn_edge) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + tp->accecn_opt_demand = 2; } } } @@ -349,6 +386,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); tp->accecn_minlen = 0; + tp->accecn_opt_demand = 0; tp->est_ecnfield = 0; } @@ -431,6 +469,7 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + tp->accecn_opt_demand = 2; if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { @@ -451,6 +490,7 @@ static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, } else { tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + tp->prev_ecnfield = tp->syn_ect_rcv; tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); } } @@ -542,4 +582,16 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) th->ece = 1; } +static inline bool tcp_accecn_option_beacon_check(const struct sock *sk) +{ + u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon); + const struct tcp_sock *tp = tcp_sk(sk); + + if (!ecn_beacon) + return false; + + return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >= + (tp->srtt_us >> 3); +} + #endif /* _LINUX_TCP_ECN_H */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4a697acb4e85..24dbc603cc44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -740,6 +740,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_ecn_option_beacon", + .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c4a4b8666fc..090f9ac43d4c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3410,6 +3410,8 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); + tp->prev_ecnfield = 0; + tp->accecn_opt_tstamp = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5134,11 +5136,12 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e898a76c485e..87154fd86167 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6121,8 +6121,10 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { - if (tcp_ecn_mode_accecn(tp)) + if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; + tcp_accecn_opt_demand_min(sk, 1); + } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index aa8dbfe20924..6a63be1f6461 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3562,6 +3562,7 @@ static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; + net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1dbcc09ff7a9..193343494558 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,8 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 34e5c83bbace..f897c2594954 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -705,8 +705,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(((e0b & 0xffffff) << 8) | TCPOPT_NOP); } - if (tp) + if (tp) { tp->accecn_minlen = 0; + tp->accecn_opt_tstamp = tp->tcp_mstamp; + if (tp->accecn_opt_demand) + tp->accecn_opt_demand--; + } } if (unlikely(OPTION_SACK_ADVERTISE & options)) { @@ -1149,11 +1153,16 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } - if (tcp_ecn_mode_accecn(tp) && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { - opts->use_synack_ecn_bytes = 0; - size += tcp_options_fit_accecn(opts, tp->accecn_minlen, - MAX_TCP_OPTION_SPACE - size); + if (tcp_ecn_mode_accecn(tp)) { + int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); + + if (ecn_opt && + (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || + tcp_accecn_option_beacon_check(sk))) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, @@ -2863,6 +2872,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0; tcp_mstamp_refresh(tp); + + /* AccECN option beacon depends on mstamp, it may change mss */ + if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk)) + mss_now = tcp_current_mss(sk); + if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); -- cgit v1.2.3 From b40671b5ee588c8a61b2d0eacbad32ffc57e9a8f Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:32 +0200 Subject: tcp: accecn: AccECN option failure handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AccECN option may fail in various way, handle these: - Attempt to negotiate the use of AccECN on the 1st retransmitted SYN - From the 2nd retransmitted SYN, stop AccECN negotiation - Remove option from SYN/ACK rexmits to handle blackholes - If no option arrives in SYN/ACK, assume Option is not usable - If an option arrives later, re-enabled - If option is zeroed, disable AccECN option processing This patch use existing padding bits in tcp_request_sock and holes in tcp_sock without increasing the size. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-9-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 4 +++- include/net/tcp_ecn.h | 51 +++++++++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++++++++++-- net/ipv4/tcp_minisocks.c | 14 +++++++++++++ net/ipv4/tcp_output.c | 11 ++++++++--- 7 files changed, 111 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f637b659b35a..3ca5ed02de6d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -173,6 +173,7 @@ struct tcp_request_sock { u8 syn_ect_snt: 2, syn_ect_rcv: 2, accecn_fail_mode:4; + u8 saw_accecn_opt :2; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -407,7 +408,8 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ - u8 accecn_fail_mode:4; /* AccECN failure handling */ + u8 accecn_fail_mode:4, /* AccECN failure handling */ + saw_accecn_opt:2; /* An AccECN option was seen */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 133fb6b79500..f13e5cd2b1ac 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -91,6 +91,11 @@ static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) tp->accecn_fail_mode |= mode; } +#define TCP_ACCECN_OPT_NOT_SEEN 0x0 +#define TCP_ACCECN_OPT_EMPTY_SEEN 0x1 +#define TCP_ACCECN_OPT_COUNTER_SEEN 0x2 +#define TCP_ACCECN_OPT_FAIL_SEEN 0x3 + static inline u8 tcp_accecn_ace(const struct tcphdr *th) { return (th->ae << 2) | (th->cwr << 1) | th->ece; @@ -146,6 +151,14 @@ static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, return true; } +static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, + u8 saw_opt) +{ + tp->saw_accecn_opt = saw_opt; + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); +} + /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ static inline void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, u8 sent_ect) @@ -428,9 +441,35 @@ static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, } } +static inline u8 tcp_accecn_option_init(const struct sk_buff *skb, + u8 opt_offset) +{ + u8 *ptr = skb_transport_header(skb) + opt_offset; + unsigned int optlen = ptr[1] - 2; + + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return TCP_ACCECN_OPT_FAIL_SEEN; + ptr += 2; + + /* Detect option zeroing: an AccECN connection "MAY check that the + * initial value of the EE0B field or the EE1B field is non-zero" + */ + if (optlen < TCPOLEN_ACCECN_PERFIELD) + return TCP_ACCECN_OPT_EMPTY_SEEN; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + if (optlen < TCPOLEN_ACCECN_PERFIELD * 3) + return TCP_ACCECN_OPT_COUNTER_SEEN; + ptr += TCPOLEN_ACCECN_PERFIELD * 2; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + + return TCP_ACCECN_OPT_COUNTER_SEEN; +} + /* See Table 2 of the AccECN draft */ -static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, - u8 ip_dsfield) +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, + const struct tcphdr *th, u8 ip_dsfield) { struct tcp_sock *tp = tcp_sk(sk); u8 ace = tcp_accecn_ace(th); @@ -469,7 +508,13 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; - tp->accecn_opt_demand = 2; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tp->accecn_opt_demand = 2; + } if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 53e0e85b52be..dce3113787a7 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -323,6 +323,8 @@ struct tcp_info { __u32 tcpi_received_e1_bytes; __u32 tcpi_received_e0_bytes; __u32 tcpi_received_ce_bytes; + __u16 tcpi_accecn_fail_mode; + __u16 tcpi_accecn_opt_seen; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 090f9ac43d4c..5b5c655ded1d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3409,6 +3409,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered = 0; tp->delivered_ce = 0; tp->accecn_fail_mode = 0; + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; @@ -4287,6 +4288,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; + info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 87154fd86167..5732f2d4329c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -398,7 +398,22 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, unsigned int i; u8 *ptr; + if (tcp_accecn_opt_fail_recv(tp)) + return false; + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (!tp->saw_accecn_opt) { + /* Too late to enable after this point due to + * potential counter wraps + */ + if (tp->bytes_sent >= (1 << 23) - 1) { + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + } + return false; + } + if (estimate_ecnfield) { u8 ecnfield = estimate_ecnfield - 1; @@ -415,6 +430,13 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, order1 = (ptr[0] == TCPOPT_ACCECN1); ptr += 2; + if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); + } + res = !!estimate_ecnfield; for (i = 0; i < 3; i++) { u32 init_offset; @@ -6123,7 +6145,13 @@ step1: if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; - tcp_accecn_opt_demand_min(sk, 1); + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tcp_accecn_opt_demand_min(sk, 1); + } } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && @@ -6606,7 +6634,8 @@ consume: */ if (tcp_ecn_mode_any(tp)) - tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); + tcp_ecn_rcv_synack(sk, skb, th, + TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -7177,6 +7206,8 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_rsk(req)->accecn_fail_mode = 0; tcp_rsk(req)->syn_ect_rcv = 0; tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 193343494558..327095ef95ef 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); @@ -678,6 +679,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool own_req; tmp_opt.saw_tstamp = 0; + tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); @@ -855,6 +857,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); + + tcp_rsk(req)->saw_accecn_opt = saw_opt; + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; + + tcp_rsk(req)->accecn_fail_mode |= fail_mode; + } + } + /* For Fast Open no more processing is needed (sk is the * child socket). */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f897c2594954..65b90f73daa0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -985,9 +985,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } - /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + /* Simultaneous open SYN/ACK needs AccECN option but not SYN. + * It is attempted to negotiate the use of AccECN also on the first + * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" + * of AccECN draft. + */ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && tcp_ecn_mode_accecn(tp) && + inet_csk(sk)->icsk_retransmits < 2 && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && remaining >= TCPOLEN_ACCECN_BASE)) { opts->use_synack_ecn_bytes = 1; @@ -1076,7 +1081,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, if (treq->accecn_ok && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && - remaining >= TCPOLEN_ACCECN_BASE) { + req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } @@ -1156,7 +1161,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp)) { int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - if (ecn_opt && + if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; -- cgit v1.2.3 From fe2cddc648f0d7cdf7377e1cb5a8c7dc5547e290 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:33 +0200 Subject: tcp: accecn: AccECN option ceb/cep and ACE field multi-wrap heuristics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AccECN option ceb/cep heuristic algorithm is from AccECN spec Appendix A.2.2 to mitigate against false ACE field overflows. Armed with ceb delta from option, delivered bytes, and delivered packets it is possible to estimate how many times ACE field wrapped. This calculation is necessary only if more than one wrap is possible. Without SACK, delivered bytes and packets are not always trustworthy in which case TCP falls back to the simpler no-or-all wraps ceb algorithm. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-10-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 78dd7b8a4145..7c51a0a5ace8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -256,6 +256,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ TCPOLEN_ACCECN_PERFIELD * \ TCP_ACCECN_NUMFIELDS) +#define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */ /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5732f2d4329c..9fdc6ce25eb1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -493,16 +493,19 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delivered_pkts, u32 delivered_bytes, int flag) { + u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1]; const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); - u32 delta, safe_delta; + u32 delta, safe_delta, d_ceb; + bool opt_deltas_valid; u32 corrected_ace; /* Reordered ACK or uncertain due to lack of data to send and ts */ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; - tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + opt_deltas_valid = tcp_accecn_process_option(tp, skb, + delivered_bytes, flag); if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ @@ -525,6 +528,35 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, safe_delta = delivered_pkts - ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + if (opt_deltas_valid) { + d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb; + if (!d_ceb) + return delta; + + if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) && + (tcp_is_sack(tp) || + ((1 << inet_csk(sk)->icsk_ca_state) & + (TCPF_CA_Open | TCPF_CA_CWR)))) { + u32 est_d_cep; + + if (delivered_bytes <= d_ceb) + return safe_delta; + + est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb * + delivered_pkts, + delivered_bytes); + return min(safe_delta, + delta + + (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK)); + } + + if (d_ceb > delta * tp->mss_cache) + return safe_delta; + if (d_ceb < + safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) + return delta; + } + return safe_delta; } -- cgit v1.2.3 From 3fbb2a6f3a70c27a6a2be80d131970608c0f84d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:42 +0000 Subject: ipv6: make ipv6_pinfo.saddr_cache a boolean ipv6_pinfo.saddr_cache is either NULL or &np->saddr. We do not need 8 bytes, a boolean is enough. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-2-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 4 ++-- include/net/ip6_route.h | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/route.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index f43314517396..55c4d1e4dd7d 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -216,10 +216,10 @@ struct inet6_cork { struct ipv6_pinfo { struct in6_addr saddr; struct in6_pktinfo sticky_pktinfo; - const struct in6_addr *daddr_cache; #ifdef CONFIG_IPV6_SUBTREES - const struct in6_addr *saddr_cache; + bool saddr_cache; #endif + const struct in6_addr *daddr_cache; __be32 flow_label; __u32 frag_size; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 59f48ca3abdf..223c02d42688 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -230,7 +230,7 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, const struct in6_addr *daddr, - const struct in6_addr *saddr) + bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -238,7 +238,7 @@ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES - np->saddr_cache = saddr; + np->saddr_cache = saddr_set; #endif } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1992621e3f3f..c342f8daea7f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 333e43434dd7..1947ccdb00df 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); } return dst; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9d64c13bab5e..82ff6e1293d0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1102,7 +1102,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, */ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || #ifdef CONFIG_IPV6_SUBTREES - ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, + np->saddr_cache ? &np->saddr : NULL) || #endif (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { dst_release(dst); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3371f16b7a3e..e1b0aebf8bf9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3036,9 +3036,9 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, &sk->sk_v6_daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? - &np->saddr : + true : #endif - NULL); + false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f0a138f4220..ecc3a87cd8c4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -299,7 +299,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); icsk->icsk_ext_hdr_len = 0; if (opt) @@ -1459,7 +1459,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, false); newnp->saddr = ireq->ir_v6_loc_addr; -- cgit v1.2.3 From 5489f333ef993bfceebce9ae98944f04eaafcc30 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:43 +0000 Subject: ipv6: make ipv6_pinfo.daddr_cache a boolean ipv6_pinfo.daddr_cache is either NULL or &sk->sk_v6_daddr We do not need 8 bytes, a boolean is enough. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-3-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 2 +- include/net/ip6_route.h | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/route.c | 3 +-- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 55c4d1e4dd7d..8e6d9f8b3dc8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -219,7 +219,7 @@ struct ipv6_pinfo { #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif - const struct in6_addr *daddr_cache; + bool daddr_cache; __be32 flow_label; __u32 frag_size; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 223c02d42688..7c5512baa4b2 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -229,14 +229,14 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) * Store a destination cache entry in a socket */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, + bool daddr_set, bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); sk_setup_caps(sk, dst); - np->daddr_cache = daddr; + np->daddr_cache = daddr_set; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr_set; #endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c342f8daea7f..1b0314644e0c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 1947ccdb00df..ea5cf3fdfdd6 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); } return dst; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 82ff6e1293d0..f904739e99b9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1100,7 +1100,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, * sockets. * 2. oif also should be the same. */ - if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, + np->daddr_cache ? &sk->sk_v6_daddr : NULL) || #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache ? &np->saddr : NULL) || diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e1b0aebf8bf9..aee6a10b112a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3032,8 +3032,7 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, #endif ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? true : diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ecc3a87cd8c4..c7271f6359db 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -299,7 +299,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); icsk->icsk_ext_hdr_len = 0; if (opt) @@ -1459,7 +1459,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, false); + ip6_dst_store(newsk, dst, false, false); newnp->saddr = ireq->ir_v6_loc_addr; -- cgit v1.2.3 From b76543b21fbcfbb96332fd80cc0d85bbcd72d8f0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:45 +0000 Subject: ipv6: reorganise struct ipv6_pinfo Move fields used in tx fast path at the beginning of the structure, and seldom used ones at the end. Note that rxopt is also in the first cache line. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-5-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 8e6d9f8b3dc8..43b7bb828738 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -214,18 +214,21 @@ struct inet6_cork { /* struct ipv6_pinfo - ipv6 private area */ struct ipv6_pinfo { + /* Used in tx path (inet6_csk_route_socket(), ip6_xmit()) */ struct in6_addr saddr; - struct in6_pktinfo sticky_pktinfo; + __be32 flow_label; + u32 dst_cookie; + struct ipv6_txoptions __rcu *opt; + s16 hop_limit; + u8 pmtudisc; + u8 tclass; #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif bool daddr_cache; - __be32 flow_label; - __u32 frag_size; - - s16 hop_limit; u8 mcast_hops; + u32 frag_size; int ucast_oif; int mcast_oif; @@ -233,7 +236,7 @@ struct ipv6_pinfo { /* pktoption flags */ union { struct { - __u16 srcrt:1, + u16 srcrt:1, osrcrt:1, rxinfo:1, rxoinfo:1, @@ -250,29 +253,25 @@ struct ipv6_pinfo { recvfragsize:1; /* 1 bits hole */ } bits; - __u16 all; + u16 all; } rxopt; /* sockopt flags */ - __u8 srcprefs; /* 001: prefer temporary address + u8 srcprefs; /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ - __u8 pmtudisc; - __u8 min_hopcount; - __u8 tclass; + u8 min_hopcount; __be32 rcv_flowinfo; + struct in6_pktinfo sticky_pktinfo; - __u32 dst_cookie; + struct sk_buff *pktoptions; + struct sk_buff *rxpmtu; + struct inet6_cork cork; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_fl_socklist __rcu *ipv6_fl_list; - - struct ipv6_txoptions __rcu *opt; - struct sk_buff *pktoptions; - struct sk_buff *rxpmtu; - struct inet6_cork cork; }; /* We currently use available bits from inet_sk(sk)->inet_flags, -- cgit v1.2.3 From 4effb335b5dab08cb6e2c38d038910f8b527cfc9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:48 +0000 Subject: net: group sk_backlog and sk_receive_queue UDP receivers suffer from sk_rmem_alloc updates, currently sharing a cache line with fields that need to be read-mostly (sock_read_rx group): 1) RFS enabled hosts read sk_napi_id from __udpv6_queue_rcv_skb(). 2) sk->sk_rcvbuf is read from __udp_enqueue_schedule_skb() /* --- cacheline 3 boundary (192 bytes) --- */ struct { atomic_t rmem_alloc; /* 0xc0 0x4 */ // Oops int len; /* 0xc4 0x4 */ struct sk_buff * head; /* 0xc8 0x8 */ struct sk_buff * tail; /* 0xd0 0x8 */ } sk_backlog; /* 0xc0 0x18 */ __u8 __cacheline_group_end__sock_write_rx[0]; /* 0xd8 0 */ __u8 __cacheline_group_begin__sock_read_rx[0]; /* 0xd8 0 */ struct dst_entry * sk_rx_dst; /* 0xd8 0x8 */ int sk_rx_dst_ifindex;/* 0xe0 0x4 */ u32 sk_rx_dst_cookie; /* 0xe4 0x4 */ unsigned int sk_ll_usec; /* 0xe8 0x4 */ unsigned int sk_napi_id; /* 0xec 0x4 */ u16 sk_busy_poll_budget;/* 0xf0 0x2 */ u8 sk_prefer_busy_poll;/* 0xf2 0x1 */ u8 sk_userlocks; /* 0xf3 0x1 */ int sk_rcvbuf; /* 0xf4 0x4 */ struct sk_filter * sk_filter; /* 0xf8 0x8 */ Move sk_error (which is less often dirtied) there. Alternative would be to cache align sock_read_rx but this has more implications/risks. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-8-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 0fd465935334..867dc44140d4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -394,7 +394,6 @@ struct sock { atomic_t sk_drops; __s32 sk_peek_off; - struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with @@ -412,6 +411,7 @@ struct sock { } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc + struct sk_buff_head sk_error_queue; __cacheline_group_end(sock_write_rx); __cacheline_group_begin(sock_read_rx); -- cgit v1.2.3 From 9db27c80622bd612549ea213390500f7377ee3e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:49 +0000 Subject: udp: add udp_drops_inc() helper Generic sk_drops_inc() reads sk->sk_drop_counters. We know the precise location for UDP sockets. Move sk_drop_counters out of sock_read_rxtx so that sock_write_rxtx starts at a cache line boundary. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-9-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/net/sock.h | 2 +- include/net/udp.h | 5 +++++ net/core/sock.c | 1 - net/ipv4/udp.c | 12 ++++++------ net/ipv6/udp.c | 6 +++--- 5 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 867dc44140d4..82bcdb7d7e67 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -451,7 +451,6 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -568,6 +567,7 @@ struct sock { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif + struct numa_drop_counters *sk_drop_counters; struct rcu_head sk_rcu; netns_tracker ns_tracker; struct xarray sk_user_frags; diff --git a/include/net/udp.h b/include/net/udp.h index 93b159f30e88..a08822e294b0 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -295,6 +295,11 @@ static inline void udp_lib_init_sock(struct sock *sk) set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } +static inline void udp_drops_inc(struct sock *sk) +{ + numa_drop_add(&udp_sk(sk)->drop_counters, 1); +} + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { diff --git a/net/core/sock.c b/net/core/sock.c index 1f8ef4d8bcd9..21742da19e45 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4444,7 +4444,6 @@ static int __init sock_struct_check(void) #ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); #endif - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 658ae8782799..25143f932447 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1790,7 +1790,7 @@ uncharge_drop: atomic_sub(skb->truesize, &sk->sk_rmem_alloc); drop: - sk_drops_inc(sk); + udp_drops_inc(sk); busylock_release(busy); return err; } @@ -1855,7 +1855,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); - sk_drops_inc(sk); + udp_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2011,7 +2011,7 @@ try_again: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } @@ -2081,7 +2081,7 @@ try_again: if (unlikely(err)) { if (!peeking) { - sk_drops_inc(sk); + udp_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2452,7 +2452,7 @@ csum_error: __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2537,7 +2537,7 @@ start_lookup: nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - sk_drops_inc(sk); + udp_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e87d0ef861f8..9f4d340d1e3a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -524,7 +524,7 @@ try_again: } if (unlikely(err)) { if (!peeking) { - sk_drops_inc(sk); + udp_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -908,7 +908,7 @@ csum_error: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -1013,7 +1013,7 @@ start_lookup: } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - sk_drops_inc(sk); + udp_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, -- cgit v1.2.3 From 3cd04c8f4afed71a48edef0db5255afc249c2feb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:50 +0000 Subject: udp: make busylock per socket While having all spinlocks packed into an array was a space saver, this also caused NUMA imbalance and hash collisions. UDPv6 socket size becomes 1600 after this patch. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-10-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/udp.h | 1 + include/net/udp.h | 1 + net/ipv4/udp.c | 20 ++------------------ 3 files changed, 4 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 6ed008ab1665..e554890c4415 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -109,6 +109,7 @@ struct udp_sock { */ struct hlist_node tunnel_list; struct numa_drop_counters drop_counters; + spinlock_t busylock ____cacheline_aligned_in_smp; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index a08822e294b0..eecd64097f91 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -289,6 +289,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); sk->sk_drop_counters = &up->drop_counters; + spin_lock_init(&up->busylock); skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 25143f932447..7d1444821ee5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1689,17 +1689,11 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) * to relieve pressure on the receive_queue spinlock shared by consumer. * Under flood, this means that only one producer can be in line * trying to acquire the receive_queue spinlock. - * These busylock can be allocated on a per cpu manner, instead of a - * per socket one (that would consume a cache line per socket) */ -static int udp_busylocks_log __read_mostly; -static spinlock_t *udp_busylocks __read_mostly; - -static spinlock_t *busylock_acquire(void *ptr) +static spinlock_t *busylock_acquire(struct sock *sk) { - spinlock_t *busy; + spinlock_t *busy = &udp_sk(sk)->busylock; - busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); spin_lock(busy); return busy; } @@ -3997,7 +3991,6 @@ static void __init bpf_iter_register(void) void __init udp_init(void) { unsigned long limit; - unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; @@ -4006,15 +3999,6 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - /* 16 spinlocks per cpu */ - udp_busylocks_log = ilog2(nr_cpu_ids) + 4; - udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, - GFP_KERNEL); - if (!udp_busylocks) - panic("UDP: failed to alloc udp_busylocks\n"); - for (i = 0; i < (1U << udp_busylocks_log); i++) - spin_lock_init(udp_busylocks + i); - if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); -- cgit v1.2.3 From 6b88293aae7fb78872e5cc1ec36e2f750ae12e38 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Wed, 10 Sep 2025 14:32:58 -0400 Subject: mtd: nand: move nand_check_erased_ecc_chunk() to nand/core The check function for bitflips in erased blocks will be needed by the Realtek ECC engine driver (which is currently under development). Right now it is located in raw/nand_base.c. While this is sufficient for the current usecases, there is no real dependency for an ECC engine on the raw nand library. Move the function over to a more generic place in core library. Suggested-by: Miquel Raynal Signed-off-by: Markus Stockhausen Signed-off-by: Miquel Raynal --- drivers/mtd/nand/core.c | 131 +++++++++++++++++++++++++++++++++++++++ drivers/mtd/nand/raw/nand_base.c | 131 --------------------------------------- include/linux/mtd/nand.h | 5 ++ include/linux/mtd/rawnand.h | 5 -- 4 files changed, 136 insertions(+), 136 deletions(-) (limited to 'include') diff --git a/drivers/mtd/nand/core.c b/drivers/mtd/nand/core.c index 7737b1a4a177..3e76d127715f 100644 --- a/drivers/mtd/nand/core.c +++ b/drivers/mtd/nand/core.c @@ -12,6 +12,137 @@ #include #include +/** + * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data + * @buf: buffer to test + * @len: buffer length + * @bitflips_threshold: maximum number of bitflips + * + * Check if a buffer contains only 0xff, which means the underlying region + * has been erased and is ready to be programmed. + * The bitflips_threshold specify the maximum number of bitflips before + * considering the region is not erased. + * Note: The logic of this function has been extracted from the memweight + * implementation, except that nand_check_erased_buf function exit before + * testing the whole buffer if the number of bitflips exceed the + * bitflips_threshold value. + * + * Returns a positive number of bitflips less than or equal to + * bitflips_threshold, or -ERROR_CODE for bitflips in excess of the + * threshold. + */ +static int nand_check_erased_buf(void *buf, int len, int bitflips_threshold) +{ + const unsigned char *bitmap = buf; + int bitflips = 0; + int weight; + + for (; len && ((uintptr_t)bitmap) % sizeof(long); + len--, bitmap++) { + weight = hweight8(*bitmap); + bitflips += BITS_PER_BYTE - weight; + if (unlikely(bitflips > bitflips_threshold)) + return -EBADMSG; + } + + for (; len >= sizeof(long); + len -= sizeof(long), bitmap += sizeof(long)) { + unsigned long d = *((unsigned long *)bitmap); + if (d == ~0UL) + continue; + weight = hweight_long(d); + bitflips += BITS_PER_LONG - weight; + if (unlikely(bitflips > bitflips_threshold)) + return -EBADMSG; + } + + for (; len > 0; len--, bitmap++) { + weight = hweight8(*bitmap); + bitflips += BITS_PER_BYTE - weight; + if (unlikely(bitflips > bitflips_threshold)) + return -EBADMSG; + } + + return bitflips; +} + +/** + * nand_check_erased_ecc_chunk - check if an ECC chunk contains (almost) only + * 0xff data + * @data: data buffer to test + * @datalen: data length + * @ecc: ECC buffer + * @ecclen: ECC length + * @extraoob: extra OOB buffer + * @extraooblen: extra OOB length + * @bitflips_threshold: maximum number of bitflips + * + * Check if a data buffer and its associated ECC and OOB data contains only + * 0xff pattern, which means the underlying region has been erased and is + * ready to be programmed. + * The bitflips_threshold specify the maximum number of bitflips before + * considering the region as not erased. + * + * Note: + * 1/ ECC algorithms are working on pre-defined block sizes which are usually + * different from the NAND page size. When fixing bitflips, ECC engines will + * report the number of errors per chunk, and the NAND core infrastructure + * expect you to return the maximum number of bitflips for the whole page. + * This is why you should always use this function on a single chunk and + * not on the whole page. After checking each chunk you should update your + * max_bitflips value accordingly. + * 2/ When checking for bitflips in erased pages you should not only check + * the payload data but also their associated ECC data, because a user might + * have programmed almost all bits to 1 but a few. In this case, we + * shouldn't consider the chunk as erased, and checking ECC bytes prevent + * this case. + * 3/ The extraoob argument is optional, and should be used if some of your OOB + * data are protected by the ECC engine. + * It could also be used if you support subpages and want to attach some + * extra OOB data to an ECC chunk. + * + * Returns a positive number of bitflips less than or equal to + * bitflips_threshold, or -ERROR_CODE for bitflips in excess of the + * threshold. In case of success, the passed buffers are filled with 0xff. + */ +int nand_check_erased_ecc_chunk(void *data, int datalen, + void *ecc, int ecclen, + void *extraoob, int extraooblen, + int bitflips_threshold) +{ + int data_bitflips = 0, ecc_bitflips = 0, extraoob_bitflips = 0; + + data_bitflips = nand_check_erased_buf(data, datalen, + bitflips_threshold); + if (data_bitflips < 0) + return data_bitflips; + + bitflips_threshold -= data_bitflips; + + ecc_bitflips = nand_check_erased_buf(ecc, ecclen, bitflips_threshold); + if (ecc_bitflips < 0) + return ecc_bitflips; + + bitflips_threshold -= ecc_bitflips; + + extraoob_bitflips = nand_check_erased_buf(extraoob, extraooblen, + bitflips_threshold); + if (extraoob_bitflips < 0) + return extraoob_bitflips; + + if (data_bitflips) + memset(data, 0xff, datalen); + + if (ecc_bitflips) + memset(ecc, 0xff, ecclen); + + if (extraoob_bitflips) + memset(extraoob, 0xff, extraooblen); + + return data_bitflips + ecc_bitflips + extraoob_bitflips; +} +EXPORT_SYMBOL(nand_check_erased_ecc_chunk); + /** * nanddev_isbad() - Check if a block is bad * @nand: NAND device diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 13e4060bd1b6..c7d9501f646b 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -2783,137 +2783,6 @@ int nand_set_features(struct nand_chip *chip, int addr, return nand_set_features_op(chip, addr, subfeature_param); } -/** - * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data - * @buf: buffer to test - * @len: buffer length - * @bitflips_threshold: maximum number of bitflips - * - * Check if a buffer contains only 0xff, which means the underlying region - * has been erased and is ready to be programmed. - * The bitflips_threshold specify the maximum number of bitflips before - * considering the region is not erased. - * Note: The logic of this function has been extracted from the memweight - * implementation, except that nand_check_erased_buf function exit before - * testing the whole buffer if the number of bitflips exceed the - * bitflips_threshold value. - * - * Returns a positive number of bitflips less than or equal to - * bitflips_threshold, or -ERROR_CODE for bitflips in excess of the - * threshold. - */ -static int nand_check_erased_buf(void *buf, int len, int bitflips_threshold) -{ - const unsigned char *bitmap = buf; - int bitflips = 0; - int weight; - - for (; len && ((uintptr_t)bitmap) % sizeof(long); - len--, bitmap++) { - weight = hweight8(*bitmap); - bitflips += BITS_PER_BYTE - weight; - if (unlikely(bitflips > bitflips_threshold)) - return -EBADMSG; - } - - for (; len >= sizeof(long); - len -= sizeof(long), bitmap += sizeof(long)) { - unsigned long d = *((unsigned long *)bitmap); - if (d == ~0UL) - continue; - weight = hweight_long(d); - bitflips += BITS_PER_LONG - weight; - if (unlikely(bitflips > bitflips_threshold)) - return -EBADMSG; - } - - for (; len > 0; len--, bitmap++) { - weight = hweight8(*bitmap); - bitflips += BITS_PER_BYTE - weight; - if (unlikely(bitflips > bitflips_threshold)) - return -EBADMSG; - } - - return bitflips; -} - -/** - * nand_check_erased_ecc_chunk - check if an ECC chunk contains (almost) only - * 0xff data - * @data: data buffer to test - * @datalen: data length - * @ecc: ECC buffer - * @ecclen: ECC length - * @extraoob: extra OOB buffer - * @extraooblen: extra OOB length - * @bitflips_threshold: maximum number of bitflips - * - * Check if a data buffer and its associated ECC and OOB data contains only - * 0xff pattern, which means the underlying region has been erased and is - * ready to be programmed. - * The bitflips_threshold specify the maximum number of bitflips before - * considering the region as not erased. - * - * Note: - * 1/ ECC algorithms are working on pre-defined block sizes which are usually - * different from the NAND page size. When fixing bitflips, ECC engines will - * report the number of errors per chunk, and the NAND core infrastructure - * expect you to return the maximum number of bitflips for the whole page. - * This is why you should always use this function on a single chunk and - * not on the whole page. After checking each chunk you should update your - * max_bitflips value accordingly. - * 2/ When checking for bitflips in erased pages you should not only check - * the payload data but also their associated ECC data, because a user might - * have programmed almost all bits to 1 but a few. In this case, we - * shouldn't consider the chunk as erased, and checking ECC bytes prevent - * this case. - * 3/ The extraoob argument is optional, and should be used if some of your OOB - * data are protected by the ECC engine. - * It could also be used if you support subpages and want to attach some - * extra OOB data to an ECC chunk. - * - * Returns a positive number of bitflips less than or equal to - * bitflips_threshold, or -ERROR_CODE for bitflips in excess of the - * threshold. In case of success, the passed buffers are filled with 0xff. - */ -int nand_check_erased_ecc_chunk(void *data, int datalen, - void *ecc, int ecclen, - void *extraoob, int extraooblen, - int bitflips_threshold) -{ - int data_bitflips = 0, ecc_bitflips = 0, extraoob_bitflips = 0; - - data_bitflips = nand_check_erased_buf(data, datalen, - bitflips_threshold); - if (data_bitflips < 0) - return data_bitflips; - - bitflips_threshold -= data_bitflips; - - ecc_bitflips = nand_check_erased_buf(ecc, ecclen, bitflips_threshold); - if (ecc_bitflips < 0) - return ecc_bitflips; - - bitflips_threshold -= ecc_bitflips; - - extraoob_bitflips = nand_check_erased_buf(extraoob, extraooblen, - bitflips_threshold); - if (extraoob_bitflips < 0) - return extraoob_bitflips; - - if (data_bitflips) - memset(data, 0xff, datalen); - - if (ecc_bitflips) - memset(ecc, 0xff, ecclen); - - if (extraoob_bitflips) - memset(extraoob, 0xff, extraooblen); - - return data_bitflips + ecc_bitflips + extraoob_bitflips; -} -EXPORT_SYMBOL(nand_check_erased_ecc_chunk); - /** * nand_read_page_raw_notsupp - dummy read raw page function * @chip: nand chip info structure diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 07486168d104..09c8c93e4dba 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1136,4 +1136,9 @@ static inline bool nanddev_bbt_is_initialized(struct nand_device *nand) int nanddev_mtd_erase(struct mtd_info *mtd, struct erase_info *einfo); int nanddev_mtd_max_bad_blocks(struct mtd_info *mtd, loff_t offs, size_t len); +int nand_check_erased_ecc_chunk(void *data, int datalen, + void *ecc, int ecclen, + void *extraoob, int extraooblen, + int threshold); + #endif /* __LINUX_MTD_NAND_H */ diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index e84522e31301..d30bdc3fcfd7 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1519,11 +1519,6 @@ int rawnand_sw_bch_correct(struct nand_chip *chip, unsigned char *buf, unsigned char *read_ecc, unsigned char *calc_ecc); void rawnand_sw_bch_cleanup(struct nand_chip *chip); -int nand_check_erased_ecc_chunk(void *data, int datalen, - void *ecc, int ecclen, - void *extraoob, int extraooblen, - int threshold); - int nand_ecc_choose_conf(struct nand_chip *chip, const struct nand_ecc_caps *caps, int oobavail); -- cgit v1.2.3 From 2ad49ae7330b8a456edf639c92241a343641a763 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Wed, 27 Aug 2025 10:25:36 -0500 Subject: RDMA/irdma: Introduce GEN3 vPort driver support In the IPU model, a function can host one or more logical network endpoints called vPorts. Each vPort may be associated with either a physical or an internal communication port, and can be RDMA capable. A vPort features a netdev and, if RDMA capable, must have an associated ib_dev. This change introduces a GEN3 auxiliary vPort driver responsible for registering a verbs device for every RDMA-capable vPort. Additionally, the UAPI is updated to prevent the binding of GEN3 devices to older user-space providers. Signed-off-by: Mustafa Ismail Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20250827152545.2056-8-tatyana.e.nikolova@intel.com Tested-by: Jacob Moroni Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/main.c | 120 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/irdma/main.h | 2 + drivers/infiniband/hw/irdma/verbs.c | 10 ++- include/uapi/rdma/irdma-abi.h | 1 + 4 files changed, 132 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 162f1fab32b5..95957d52883d 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ #include "main.h" +#include MODULE_ALIAS("i40iw"); MODULE_DESCRIPTION("Intel(R) Ethernet Protocol Driver for RDMA"); @@ -46,6 +47,113 @@ void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev) ibdev_warn(to_ibdev(dev), "MTU setting [%d] too low for RDMA traffic. Minimum MTU is 1280 for IPv6\\n", mtu); } +static void ig3rdma_idc_vport_event_handler(struct iidc_rdma_vport_dev_info *cdev_info, + struct iidc_rdma_event *event) +{ + struct irdma_device *iwdev = auxiliary_get_drvdata(cdev_info->adev); + struct irdma_l2params l2params = {}; + + if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE)) { + ibdev_dbg(&iwdev->ibdev, "CLNT: new MTU = %d\n", iwdev->netdev->mtu); + if (iwdev->vsi.mtu != iwdev->netdev->mtu) { + l2params.mtu = iwdev->netdev->mtu; + l2params.mtu_changed = true; + irdma_log_invalid_mtu(l2params.mtu, &iwdev->rf->sc_dev); + irdma_change_l2params(&iwdev->vsi, &l2params); + } + } +} + +static int ig3rdma_vport_probe(struct auxiliary_device *aux_dev, + const struct auxiliary_device_id *id) +{ + struct iidc_rdma_vport_auxiliary_dev *idc_adev = + container_of(aux_dev, struct iidc_rdma_vport_auxiliary_dev, adev); + struct auxiliary_device *aux_core_dev = idc_adev->vdev_info->core_adev; + struct irdma_pci_f *rf = auxiliary_get_drvdata(aux_core_dev); + struct irdma_l2params l2params = {}; + struct irdma_device *iwdev; + int err; + + if (!rf) { + WARN_ON_ONCE(1); + return -ENOMEM; + } + iwdev = ib_alloc_device(irdma_device, ibdev); + /* Fill iwdev info */ + iwdev->is_vport = true; + iwdev->rf = rf; + iwdev->vport_id = idc_adev->vdev_info->vport_id; + iwdev->netdev = idc_adev->vdev_info->netdev; + iwdev->init_state = INITIAL_STATE; + iwdev->roce_cwnd = IRDMA_ROCE_CWND_DEFAULT; + iwdev->roce_ackcreds = IRDMA_ROCE_ACKCREDS_DEFAULT; + iwdev->rcv_wnd = IRDMA_CM_DEFAULT_RCV_WND_SCALED; + iwdev->rcv_wscale = IRDMA_CM_DEFAULT_RCV_WND_SCALE; + iwdev->roce_mode = true; + iwdev->push_mode = false; + + l2params.mtu = iwdev->netdev->mtu; + + err = irdma_rt_init_hw(iwdev, &l2params); + if (err) + goto err_rt_init; + + err = irdma_ib_register_device(iwdev); + if (err) + goto err_ibreg; + + auxiliary_set_drvdata(aux_dev, iwdev); + + ibdev_dbg(&iwdev->ibdev, + "INIT: Gen[%d] vport[%d] probe success. dev_name = %s, core_dev_name = %s, netdev=%s\n", + rf->rdma_ver, idc_adev->vdev_info->vport_id, + dev_name(&aux_dev->dev), + dev_name(&idc_adev->vdev_info->core_adev->dev), + netdev_name(idc_adev->vdev_info->netdev)); + + return 0; +err_ibreg: + irdma_rt_deinit_hw(iwdev); +err_rt_init: + ib_dealloc_device(&iwdev->ibdev); + + return err; +} + +static void ig3rdma_vport_remove(struct auxiliary_device *aux_dev) +{ + struct iidc_rdma_vport_auxiliary_dev *idc_adev = + container_of(aux_dev, struct iidc_rdma_vport_auxiliary_dev, adev); + struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); + + ibdev_dbg(&iwdev->ibdev, + "INIT: Gen[%d] dev_name = %s, core_dev_name = %s, netdev=%s\n", + iwdev->rf->rdma_ver, dev_name(&aux_dev->dev), + dev_name(&idc_adev->vdev_info->core_adev->dev), + netdev_name(idc_adev->vdev_info->netdev)); + + irdma_ib_unregister_device(iwdev); +} + +static const struct auxiliary_device_id ig3rdma_vport_auxiliary_id_table[] = { + {.name = "idpf.8086.rdma.vdev", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, ig3rdma_vport_auxiliary_id_table); + +static struct iidc_rdma_vport_auxiliary_drv ig3rdma_vport_auxiliary_drv = { + .adrv = { + .name = "vdev", + .id_table = ig3rdma_vport_auxiliary_id_table, + .probe = ig3rdma_vport_probe, + .remove = ig3rdma_vport_remove, + }, + .event_handler = ig3rdma_idc_vport_event_handler, +}; + + static int __init irdma_init_module(void) { int ret; @@ -74,6 +182,17 @@ static int __init irdma_init_module(void) return ret; } + + ret = auxiliary_driver_register(&ig3rdma_vport_auxiliary_drv.adrv); + if (ret) { + auxiliary_driver_unregister(&ig3rdma_core_auxiliary_drv.adrv); + auxiliary_driver_unregister(&icrdma_core_auxiliary_drv.adrv); + auxiliary_driver_unregister(&i40iw_auxiliary_drv); + pr_err("Failed ig3rdma vport auxiliary_driver_register() ret=%d\n", + ret); + + return ret; + } irdma_register_notifiers(); return 0; @@ -85,6 +204,7 @@ static void __exit irdma_exit_module(void) auxiliary_driver_unregister(&icrdma_core_auxiliary_drv.adrv); auxiliary_driver_unregister(&i40iw_auxiliary_drv); auxiliary_driver_unregister(&ig3rdma_core_auxiliary_drv.adrv); + auxiliary_driver_unregister(&ig3rdma_vport_auxiliary_drv.adrv); } module_init(irdma_init_module); diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 7300f8ab49ca..c64d772dc644 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -354,12 +354,14 @@ struct irdma_device { u32 rcv_wnd; u16 mac_ip_table_idx; u16 vsi_num; + u16 vport_id; u8 rcv_wscale; u8 iw_status; bool roce_mode:1; bool roce_dcqcn_en:1; bool dcb_vlan_mode:1; bool iw_ooo:1; + bool is_vport:1; enum init_completion_state init_state; wait_queue_head_t suspend_wq; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 894c1f7bcb43..7f59bdd85da0 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -292,6 +292,10 @@ static int irdma_alloc_ucontext(struct ib_ucontext *uctx, ucontext->iwdev = iwdev; ucontext->abi_ver = req.userspace_ver; + if (!(req.comp_mask & IRDMA_SUPPORT_WQE_FORMAT_V2) && + uk_attrs->hw_rev >= IRDMA_GEN_3) + return -EOPNOTSUPP; + if (req.comp_mask & IRDMA_ALLOC_UCTX_USE_RAW_ATTR) ucontext->use_raw_attrs = true; @@ -4891,5 +4895,9 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev) struct irdma_device *iwdev = to_iwdev(ibdev); irdma_rt_deinit_hw(iwdev); - irdma_ctrl_deinit_hw(iwdev->rf); + if (!iwdev->is_vport) { + irdma_ctrl_deinit_hw(iwdev->rf); + if (iwdev->rf->vchnl_wq) + destroy_workqueue(iwdev->rf->vchnl_wq); + } } diff --git a/include/uapi/rdma/irdma-abi.h b/include/uapi/rdma/irdma-abi.h index bb18f15489e3..4e42054cca33 100644 --- a/include/uapi/rdma/irdma-abi.h +++ b/include/uapi/rdma/irdma-abi.h @@ -25,6 +25,7 @@ enum irdma_memreg_type { enum { IRDMA_ALLOC_UCTX_USE_RAW_ATTR = 1 << 0, IRDMA_ALLOC_UCTX_MIN_HW_WQ_SIZE = 1 << 1, + IRDMA_SUPPORT_WQE_FORMAT_V2 = 1 << 3, }; struct irdma_alloc_ucontext_req { -- cgit v1.2.3 From 563e1feb5f6ed579acb55850f1bbb831aecf645a Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 27 Aug 2025 10:25:41 -0500 Subject: RDMA/irdma: Add SRQ support Implement verb API and UAPI changes to support SRQ functionality in GEN3 devices. Signed-off-by: Faisal Latif Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20250827152545.2056-13-tatyana.e.nikolova@intel.com Tested-by: Jacob Moroni Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/ctrl.c | 236 +++++++++++++++++- drivers/infiniband/hw/irdma/defs.h | 36 ++- drivers/infiniband/hw/irdma/hw.c | 21 +- drivers/infiniband/hw/irdma/irdma.h | 1 + drivers/infiniband/hw/irdma/main.h | 12 +- drivers/infiniband/hw/irdma/type.h | 66 +++++ drivers/infiniband/hw/irdma/uk.c | 162 +++++++++++- drivers/infiniband/hw/irdma/user.h | 41 ++++ drivers/infiniband/hw/irdma/utils.c | 27 ++ drivers/infiniband/hw/irdma/verbs.c | 475 +++++++++++++++++++++++++++++++++++- drivers/infiniband/hw/irdma/verbs.h | 25 ++ include/uapi/rdma/irdma-abi.h | 15 +- 12 files changed, 1103 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 8cc457214537..ef2e46a22c3f 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -412,7 +412,8 @@ int irdma_sc_qp_init(struct irdma_sc_qp *qp, struct irdma_qp_init_info *info) pble_obj_cnt = info->pd->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; if ((info->virtual_map && info->sq_pa >= pble_obj_cnt) || - (info->virtual_map && info->rq_pa >= pble_obj_cnt)) + (!info->qp_uk_init_info.srq_uk && + info->virtual_map && info->rq_pa >= pble_obj_cnt)) return -EINVAL; qp->llp_stream_handle = (void *)(-1); @@ -446,6 +447,208 @@ int irdma_sc_qp_init(struct irdma_sc_qp *qp, struct irdma_qp_init_info *info) return 0; } +/** + * irdma_sc_srq_init - init sc_srq structure + * @srq: srq sc struct + * @info: parameters for srq init + */ +int irdma_sc_srq_init(struct irdma_sc_srq *srq, + struct irdma_srq_init_info *info) +{ + u32 srq_size_quanta; + int ret_code; + + ret_code = irdma_uk_srq_init(&srq->srq_uk, &info->srq_uk_init_info); + if (ret_code) + return ret_code; + + srq->dev = info->pd->dev; + srq->pd = info->pd; + srq->vsi = info->vsi; + srq->srq_pa = info->srq_pa; + srq->first_pm_pbl_idx = info->first_pm_pbl_idx; + srq->pasid = info->pasid; + srq->pasid_valid = info->pasid_valid; + srq->srq_limit = info->srq_limit; + srq->leaf_pbl_size = info->leaf_pbl_size; + srq->virtual_map = info->virtual_map; + srq->tph_en = info->tph_en; + srq->arm_limit_event = info->arm_limit_event; + srq->tph_val = info->tph_value; + srq->shadow_area_pa = info->shadow_area_pa; + + /* Smallest SRQ size is 256B i.e. 8 quanta */ + srq_size_quanta = max((u32)IRDMA_SRQ_MIN_QUANTA, + srq->srq_uk.srq_size * + srq->srq_uk.wqe_size_multiplier); + srq->hw_srq_size = irdma_get_encoded_wqe_size(srq_size_quanta, + IRDMA_QUEUE_TYPE_SRQ); + + return 0; +} + +/** + * irdma_sc_srq_create - send srq create CQP WQE + * @srq: srq sc struct + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static int irdma_sc_srq_create(struct irdma_sc_srq *srq, u64 scratch, + bool post_sq) +{ + struct irdma_sc_cqp *cqp; + __le64 *wqe; + u64 hdr; + + cqp = srq->pd->dev->cqp; + if (srq->srq_uk.srq_id < cqp->dev->hw_attrs.min_hw_srq_id || + srq->srq_uk.srq_id > + (cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].max_cnt - 1)) + return -EINVAL; + + wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 0, + FIELD_PREP(IRDMA_CQPSQ_SRQ_SRQ_LIMIT, srq->srq_limit) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQSIZE, srq->hw_srq_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQ_WQE_SIZE, srq->srq_uk.wqe_size)); + set_64bit_val(wqe, 8, (uintptr_t)srq); + set_64bit_val(wqe, 16, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PD_ID, srq->pd->pd_id)); + set_64bit_val(wqe, 32, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR, + srq->srq_pa >> + IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR_S)); + set_64bit_val(wqe, 40, + FIELD_PREP(IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR, + srq->shadow_area_pa >> + IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR_S)); + set_64bit_val(wqe, 48, + FIELD_PREP(IRDMA_CQPSQ_SRQ_FIRST_PM_PBL_IDX, + srq->first_pm_pbl_idx)); + + hdr = srq->srq_uk.srq_id | + FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_CREATE_SRQ) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_LEAF_PBL_SIZE, srq->leaf_pbl_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_VIRTMAP, srq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_ARM_LIMIT_EVENT, + srq->arm_limit_event) | + FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + + dma_wmb(); /* make sure WQE is written before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + print_hex_dump_debug("WQE: SRQ_CREATE WQE", DUMP_PREFIX_OFFSET, 16, 8, + wqe, IRDMA_CQP_WQE_SIZE * 8, false); + if (post_sq) + irdma_sc_cqp_post_sq(cqp); + + return 0; +} + +/** + * irdma_sc_srq_modify - send modify_srq CQP WQE + * @srq: srq sc struct + * @info: parameters for srq modification + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static int irdma_sc_srq_modify(struct irdma_sc_srq *srq, + struct irdma_modify_srq_info *info, u64 scratch, + bool post_sq) +{ + struct irdma_sc_cqp *cqp; + __le64 *wqe; + u64 hdr; + + cqp = srq->dev->cqp; + if (srq->srq_uk.srq_id < cqp->dev->hw_attrs.min_hw_srq_id || + srq->srq_uk.srq_id > + (cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].max_cnt - 1)) + return -EINVAL; + + wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 0, + FIELD_PREP(IRDMA_CQPSQ_SRQ_SRQ_LIMIT, info->srq_limit) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQSIZE, srq->hw_srq_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQ_WQE_SIZE, srq->srq_uk.wqe_size)); + set_64bit_val(wqe, 8, + FIELD_PREP(IRDMA_CQPSQ_SRQ_SRQCTX, srq->srq_uk.srq_id)); + set_64bit_val(wqe, 16, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PD_ID, srq->pd->pd_id)); + set_64bit_val(wqe, 32, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR, + srq->srq_pa >> + IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR_S)); + set_64bit_val(wqe, 40, + FIELD_PREP(IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR, + srq->shadow_area_pa >> + IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR_S)); + set_64bit_val(wqe, 48, + FIELD_PREP(IRDMA_CQPSQ_SRQ_FIRST_PM_PBL_IDX, + srq->first_pm_pbl_idx)); + + hdr = srq->srq_uk.srq_id | + FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_MODIFY_SRQ) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_LEAF_PBL_SIZE, srq->leaf_pbl_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_VIRTMAP, srq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_ARM_LIMIT_EVENT, + info->arm_limit_event) | + FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + dma_wmb(); /* make sure WQE is written before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + print_hex_dump_debug("WQE: SRQ_MODIFY WQE", DUMP_PREFIX_OFFSET, 16, 8, + wqe, IRDMA_CQP_WQE_SIZE * 8, false); + if (post_sq) + irdma_sc_cqp_post_sq(cqp); + + return 0; +} + +/** + * irdma_sc_srq_destroy - send srq_destroy CQP WQE + * @srq: srq sc struct + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static int irdma_sc_srq_destroy(struct irdma_sc_srq *srq, u64 scratch, + bool post_sq) +{ + struct irdma_sc_cqp *cqp; + __le64 *wqe; + u64 hdr; + + cqp = srq->dev->cqp; + + wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 8, (uintptr_t)srq); + + hdr = srq->srq_uk.srq_id | + FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_DESTROY_SRQ) | + FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + dma_wmb(); /* make sure WQE is written before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + print_hex_dump_debug("WQE: SRQ_DESTROY WQE", DUMP_PREFIX_OFFSET, 16, + 8, wqe, IRDMA_CQP_WQE_SIZE * 8, false); + if (post_sq) + irdma_sc_cqp_post_sq(cqp); + + return 0; +} + /** * irdma_sc_qp_create - create qp * @qp: sc qp @@ -837,6 +1040,7 @@ static void irdma_sc_qp_setctx_roce_gen_3(struct irdma_sc_qp *qp, FIELD_PREP(IRDMAQPC_ISQP1, roce_info->is_qp1) | FIELD_PREP(IRDMAQPC_ROCE_TVER, roce_info->roce_tver) | FIELD_PREP(IRDMAQPC_IPV4, udp->ipv4) | + FIELD_PREP(IRDMAQPC_USE_SRQ, !qp->qp_uk.srq_uk ? 0 : 1) | FIELD_PREP(IRDMAQPC_INSERTVLANTAG, udp->insert_vlan_tag); set_64bit_val(qp_ctx, 0, qw0); set_64bit_val(qp_ctx, 8, qp->sq_pa); @@ -921,6 +1125,9 @@ static void irdma_sc_qp_setctx_roce_gen_3(struct irdma_sc_qp *qp, FIELD_PREP(IRDMAQPC_LOCAL_IPADDR0, udp->local_ipaddr[0])); set_64bit_val(qp_ctx, 200, FIELD_PREP(IRDMAQPC_THIGH, roce_info->t_high) | + FIELD_PREP(IRDMAQPC_SRQ_ID, + !qp->qp_uk.srq_uk ? + 0 : qp->qp_uk.srq_uk->srq_id) | FIELD_PREP(IRDMAQPC_TLOW, roce_info->t_low)); set_64bit_val(qp_ctx, 208, roce_info->pd_id | FIELD_PREP(IRDMAQPC_STAT_INDEX_GEN3, info->stats_idx) | @@ -2219,6 +2426,14 @@ u8 irdma_get_encoded_wqe_size(u32 wqsize, enum irdma_queue_type queue_type) { u8 encoded_size = 0; + if (queue_type == IRDMA_QUEUE_TYPE_SRQ) { + /* Smallest SRQ size is 256B (8 quanta) that gets + * encoded to 0. + */ + encoded_size = ilog2(wqsize) - 3; + + return encoded_size; + } /* cqp sq's hw coded value starts from 1 for size of 4 * while it starts from 0 for qp' wq's. */ @@ -4585,7 +4800,7 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, case IRDMA_AE_SRQ_LIMIT: info->srq = true; /* [63:6] from CMPL_CTXT, [5:0] from WQDESCIDX. */ - info->compl_ctx = compl_ctx | info->wqe_idx; + info->compl_ctx = compl_ctx; ae_src = IRDMA_AE_SOURCE_RSVD; break; case IRDMA_AE_PRIV_OPERATION_DENIED: @@ -6161,6 +6376,22 @@ static int irdma_exec_cqp_cmd(struct irdma_sc_dev *dev, &pcmdinfo->in.u.mc_modify.info, pcmdinfo->in.u.mc_modify.scratch); break; + case IRDMA_OP_SRQ_CREATE: + status = irdma_sc_srq_create(pcmdinfo->in.u.srq_create.srq, + pcmdinfo->in.u.srq_create.scratch, + pcmdinfo->post_sq); + break; + case IRDMA_OP_SRQ_MODIFY: + status = irdma_sc_srq_modify(pcmdinfo->in.u.srq_modify.srq, + &pcmdinfo->in.u.srq_modify.info, + pcmdinfo->in.u.srq_modify.scratch, + pcmdinfo->post_sq); + break; + case IRDMA_OP_SRQ_DESTROY: + status = irdma_sc_srq_destroy(pcmdinfo->in.u.srq_destroy.srq, + pcmdinfo->in.u.srq_destroy.scratch, + pcmdinfo->post_sq); + break; default: status = -EOPNOTSUPP; break; @@ -6318,6 +6549,7 @@ int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, dev->protocol_used = info->protocol_used; /* Setup the hardware limits, hmc may limit further */ dev->hw_attrs.min_hw_qp_id = IRDMA_MIN_IW_QP_ID; + dev->hw_attrs.min_hw_srq_id = IRDMA_MIN_IW_SRQ_ID; dev->hw_attrs.min_hw_aeq_size = IRDMA_MIN_AEQ_ENTRIES; if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) dev->hw_attrs.max_hw_aeq_size = IRDMA_MAX_AEQ_ENTRIES_GEN_3; diff --git a/drivers/infiniband/hw/irdma/defs.h b/drivers/infiniband/hw/irdma/defs.h index d8f5ad23770b..408058b6ba55 100644 --- a/drivers/infiniband/hw/irdma/defs.h +++ b/drivers/infiniband/hw/irdma/defs.h @@ -140,7 +140,11 @@ enum irdma_protocol_used { #define IRDMA_QP_SW_MAX_RQ_QUANTA 32768 #define IRDMA_MAX_QP_WRS(max_quanta_per_wr) \ ((IRDMA_QP_SW_MAX_WQ_QUANTA - IRDMA_SQ_RSVD) / (max_quanta_per_wr)) +#define IRDMA_SRQ_MIN_QUANTA 8 #define IRDMA_SRQ_MAX_QUANTA 262144 +#define IRDMA_MAX_SRQ_WRS \ + ((IRDMA_SRQ_MAX_QUANTA - IRDMA_RQ_RSVD) / IRDMA_MAX_QUANTA_PER_WR) + #define IRDMAQP_TERM_SEND_TERM_AND_FIN 0 #define IRDMAQP_TERM_SEND_TERM_ONLY 1 #define IRDMAQP_TERM_SEND_FIN_ONLY 2 @@ -236,9 +240,12 @@ enum irdma_cqp_op_type { IRDMA_OP_ADD_LOCAL_MAC_ENTRY = 46, IRDMA_OP_DELETE_LOCAL_MAC_ENTRY = 47, IRDMA_OP_CQ_MODIFY = 48, + IRDMA_OP_SRQ_CREATE = 49, + IRDMA_OP_SRQ_MODIFY = 50, + IRDMA_OP_SRQ_DESTROY = 51, /* Must be last entry*/ - IRDMA_MAX_CQP_OPS = 49, + IRDMA_MAX_CQP_OPS = 52, }; /* CQP SQ WQES */ @@ -248,6 +255,9 @@ enum irdma_cqp_op_type { #define IRDMA_CQP_OP_CREATE_CQ 0x03 #define IRDMA_CQP_OP_MODIFY_CQ 0x04 #define IRDMA_CQP_OP_DESTROY_CQ 0x05 +#define IRDMA_CQP_OP_CREATE_SRQ 0x06 +#define IRDMA_CQP_OP_MODIFY_SRQ 0x07 +#define IRDMA_CQP_OP_DESTROY_SRQ 0x08 #define IRDMA_CQP_OP_ALLOC_STAG 0x09 #define IRDMA_CQP_OP_REG_MR 0x0a #define IRDMA_CQP_OP_QUERY_STAG 0x0b @@ -519,6 +529,7 @@ enum irdma_cqp_op_type { #define IRDMA_CQ_ERROR BIT_ULL(55) #define IRDMA_CQ_SQ BIT_ULL(62) +#define IRDMA_CQ_SRQ BIT_ULL(52) #define IRDMA_CQ_VALID BIT_ULL(63) #define IRDMA_CQ_IMMVALID BIT_ULL(62) #define IRDMA_CQ_UDSMACVALID BIT_ULL(61) @@ -628,6 +639,24 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_QP_DBSHADOWADDR IRDMA_CQPHC_QPCTX +#define IRDMA_CQPSQ_SRQ_RQSIZE GENMASK_ULL(3, 0) +#define IRDMA_CQPSQ_SRQ_RQ_WQE_SIZE GENMASK_ULL(5, 4) +#define IRDMA_CQPSQ_SRQ_SRQ_LIMIT GENMASK_ULL(43, 32) +#define IRDMA_CQPSQ_SRQ_SRQCTX GENMASK_ULL(63, 6) +#define IRDMA_CQPSQ_SRQ_PD_ID GENMASK_ULL(39, 16) +#define IRDMA_CQPSQ_SRQ_SRQ_ID GENMASK_ULL(15, 0) +#define IRDMA_CQPSQ_SRQ_OP GENMASK_ULL(37, 32) +#define IRDMA_CQPSQ_SRQ_LEAF_PBL_SIZE GENMASK_ULL(45, 44) +#define IRDMA_CQPSQ_SRQ_VIRTMAP BIT_ULL(47) +#define IRDMA_CQPSQ_SRQ_TPH_EN BIT_ULL(60) +#define IRDMA_CQPSQ_SRQ_ARM_LIMIT_EVENT BIT_ULL(61) +#define IRDMA_CQPSQ_SRQ_FIRST_PM_PBL_IDX GENMASK_ULL(27, 0) +#define IRDMA_CQPSQ_SRQ_TPH_VALUE GENMASK_ULL(7, 0) +#define IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR_S 8 +#define IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR GENMASK_ULL(63, 8) +#define IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR_S 6 +#define IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR GENMASK_ULL(63, 6) + #define IRDMA_CQPSQ_CQ_CQSIZE GENMASK_ULL(20, 0) #define IRDMA_CQPSQ_CQ_CQCTX GENMASK_ULL(62, 0) #define IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD GENMASK(17, 0) @@ -779,6 +808,11 @@ enum irdma_cqp_op_type { #define IRDMAQPC_INSERTL2TAG2 BIT_ULL(11) #define IRDMAQPC_LIMIT GENMASK_ULL(13, 12) +#define IRDMAQPC_USE_SRQ BIT_ULL(10) +#define IRDMAQPC_SRQ_ID GENMASK_ULL(15, 0) +#define IRDMAQPC_PASID GENMASK_ULL(19, 0) +#define IRDMAQPC_PASID_VALID BIT_ULL(11) + #define IRDMAQPC_ECN_EN BIT_ULL(14) #define IRDMAQPC_DROPOOOSEG BIT_ULL(15) #define IRDMAQPC_DUPACK_THRESH GENMASK_ULL(18, 16) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 2931d1a879e9..27b9623c2b09 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -269,6 +269,7 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) struct irdma_sc_qp *qp = NULL; struct irdma_qp_host_ctx_info *ctx_info = NULL; struct irdma_device *iwdev = rf->iwdev; + struct irdma_sc_srq *srq; unsigned long flags; u32 aeqcnt = 0; @@ -320,6 +321,9 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) iwqp->last_aeq = info->ae_id; spin_unlock_irqrestore(&iwqp->lock, flags); ctx_info = &iwqp->ctx_info; + } else if (info->srq) { + if (info->ae_id != IRDMA_AE_SRQ_LIMIT) + continue; } else { if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR && info->ae_id != IRDMA_AE_CQP_DEFERRED_COMPLETE) @@ -417,6 +421,12 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) } irdma_cq_rem_ref(&iwcq->ibcq); break; + case IRDMA_AE_SRQ_LIMIT: + srq = (struct irdma_sc_srq *)(uintptr_t)info->compl_ctx; + irdma_srq_event(srq); + break; + case IRDMA_AE_SRQ_CATASTROPHIC_ERROR: + break; case IRDMA_AE_CQP_DEFERRED_COMPLETE: /* Remove completed CQP requests from pending list * and notify about those CQP ops completion. @@ -1839,7 +1849,9 @@ static void irdma_get_used_rsrc(struct irdma_device *iwdev) iwdev->rf->used_qps = find_first_zero_bit(iwdev->rf->allocated_qps, iwdev->rf->max_qp); iwdev->rf->used_cqs = find_first_zero_bit(iwdev->rf->allocated_cqs, - iwdev->rf->max_cq); + iwdev->rf->max_cq); + iwdev->rf->used_srqs = find_first_zero_bit(iwdev->rf->allocated_srqs, + iwdev->rf->max_srq); iwdev->rf->used_mrs = find_first_zero_bit(iwdev->rf->allocated_mrs, iwdev->rf->max_mr); } @@ -2056,7 +2068,8 @@ static void irdma_set_hw_rsrc(struct irdma_pci_f *rf) rf->allocated_qps = (void *)(rf->mem_rsrc + (sizeof(struct irdma_arp_entry) * rf->arp_table_size)); rf->allocated_cqs = &rf->allocated_qps[BITS_TO_LONGS(rf->max_qp)]; - rf->allocated_mrs = &rf->allocated_cqs[BITS_TO_LONGS(rf->max_cq)]; + rf->allocated_srqs = &rf->allocated_cqs[BITS_TO_LONGS(rf->max_cq)]; + rf->allocated_mrs = &rf->allocated_srqs[BITS_TO_LONGS(rf->max_srq)]; rf->allocated_pds = &rf->allocated_mrs[BITS_TO_LONGS(rf->max_mr)]; rf->allocated_ahs = &rf->allocated_pds[BITS_TO_LONGS(rf->max_pd)]; rf->allocated_mcgs = &rf->allocated_ahs[BITS_TO_LONGS(rf->max_ah)]; @@ -2084,12 +2097,14 @@ static u32 irdma_calc_mem_rsrc_size(struct irdma_pci_f *rf) rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_qp); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_mr); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_cq); + rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_srq); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_pd); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->arp_table_size); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_ah); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_mcg); rsrc_size += sizeof(struct irdma_qp **) * rf->max_qp; rsrc_size += sizeof(struct irdma_cq **) * rf->max_cq; + rsrc_size += sizeof(struct irdma_srq **) * rf->max_srq; return rsrc_size; } @@ -2117,6 +2132,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) rf->max_qp = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_QP].cnt; rf->max_mr = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt; rf->max_cq = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt; + rf->max_srq = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].cnt; rf->max_pd = rf->sc_dev.hw_attrs.max_hw_pds; rf->arp_table_size = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_ARP].cnt; rf->max_ah = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].cnt; @@ -2136,6 +2152,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) set_bit(0, rf->allocated_mrs); set_bit(0, rf->allocated_qps); set_bit(0, rf->allocated_cqs); + set_bit(0, rf->allocated_srqs); set_bit(0, rf->allocated_pds); set_bit(0, rf->allocated_arps); set_bit(0, rf->allocated_ahs); diff --git a/drivers/infiniband/hw/irdma/irdma.h b/drivers/infiniband/hw/irdma/irdma.h index e012f795bce8..ff938a01d70c 100644 --- a/drivers/infiniband/hw/irdma/irdma.h +++ b/drivers/infiniband/hw/irdma/irdma.h @@ -162,6 +162,7 @@ struct irdma_hw_attrs { u32 max_done_count; u32 max_sleep_count; u32 max_cqp_compl_wait_time_ms; + u32 min_hw_srq_id; u16 max_stat_inst; u16 max_stat_idx; }; diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 6922cfaac6d0..886b30da188a 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -274,6 +274,8 @@ struct irdma_pci_f { u32 max_mr; u32 max_qp; u32 max_cq; + u32 max_srq; + u32 next_srq; u32 max_ah; u32 next_ah; u32 max_mcg; @@ -287,6 +289,7 @@ struct irdma_pci_f { u32 mr_stagmask; u32 used_pds; u32 used_cqs; + u32 used_srqs; u32 used_mrs; u32 used_qps; u32 arp_table_size; @@ -298,6 +301,7 @@ struct irdma_pci_f { unsigned long *allocated_ws_nodes; unsigned long *allocated_qps; unsigned long *allocated_cqs; + unsigned long *allocated_srqs; unsigned long *allocated_mrs; unsigned long *allocated_pds; unsigned long *allocated_mcgs; @@ -421,6 +425,11 @@ static inline struct irdma_pci_f *dev_to_rf(struct irdma_sc_dev *dev) return container_of(dev, struct irdma_pci_f, sc_dev); } +static inline struct irdma_srq *to_iwsrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct irdma_srq, ibsrq); +} + /** * irdma_alloc_resource - allocate a resource * @iwdev: device pointer @@ -516,7 +525,8 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, void irdma_cq_add_ref(struct ib_cq *ibcq); void irdma_cq_rem_ref(struct ib_cq *ibcq); void irdma_cq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_cq *cq); - +void irdma_srq_event(struct irdma_sc_srq *srq); +void irdma_srq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_srq *srq); void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf); int irdma_hw_modify_qp(struct irdma_device *iwdev, struct irdma_qp *iwqp, struct irdma_modify_qp_info *info, bool wait); diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index f681baedd029..43dcdc7b846c 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -242,6 +242,7 @@ enum irdma_syn_rst_handling { enum irdma_queue_type { IRDMA_QUEUE_TYPE_SQ_RQ = 0, IRDMA_QUEUE_TYPE_CQP, + IRDMA_QUEUE_TYPE_SRQ, }; struct irdma_sc_dev; @@ -732,6 +733,51 @@ struct irdma_modify_cq_info { bool cq_resize:1; }; +struct irdma_srq_init_info { + struct irdma_sc_pd *pd; + struct irdma_sc_vsi *vsi; + u64 srq_pa; + u64 shadow_area_pa; + u32 first_pm_pbl_idx; + u32 pasid; + u32 srq_size; + u16 srq_limit; + u8 pasid_valid; + u8 wqe_size; + u8 leaf_pbl_size; + u8 virtual_map; + u8 tph_en; + u8 arm_limit_event; + u8 tph_value; + u8 pbl_chunk_size; + struct irdma_srq_uk_init_info srq_uk_init_info; +}; + +struct irdma_sc_srq { + struct irdma_sc_dev *dev; + struct irdma_sc_vsi *vsi; + struct irdma_sc_pd *pd; + struct irdma_srq_uk srq_uk; + void *back_srq; + u64 srq_pa; + u64 shadow_area_pa; + u32 first_pm_pbl_idx; + u32 pasid; + u32 hw_srq_size; + u16 srq_limit; + u8 pasid_valid; + u8 leaf_pbl_size; + u8 virtual_map; + u8 tph_en; + u8 arm_limit_event; + u8 tph_val; +}; + +struct irdma_modify_srq_info { + u16 srq_limit; + u8 arm_limit_event; +}; + struct irdma_create_qp_info { bool ord_valid:1; bool tcp_ctx_valid:1; @@ -1038,6 +1084,7 @@ struct irdma_qp_host_ctx_info { }; u32 send_cq_num; u32 rcv_cq_num; + u32 srq_id; u32 rem_endpoint_idx; u16 stats_idx; bool srq_valid:1; @@ -1337,6 +1384,8 @@ void irdma_sc_cq_resize(struct irdma_sc_cq *cq, struct irdma_modify_cq_info *inf int irdma_sc_static_hmc_pages_allocated(struct irdma_sc_cqp *cqp, u64 scratch, u8 hmc_fn_id, bool post_sq, bool poll_registers); +int irdma_sc_srq_init(struct irdma_sc_srq *srq, + struct irdma_srq_init_info *info); void sc_vsi_update_stats(struct irdma_sc_vsi *vsi); struct cqp_info { @@ -1580,6 +1629,23 @@ struct cqp_info { struct irdma_dma_mem query_buff_mem; u64 scratch; } query_rdma; + + struct { + struct irdma_sc_srq *srq; + u64 scratch; + } srq_create; + + struct { + struct irdma_sc_srq *srq; + struct irdma_modify_srq_info info; + u64 scratch; + } srq_modify; + + struct { + struct irdma_sc_srq *srq; + u64 scratch; + } srq_destroy; + } u; }; diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 38c54e59cc2e..e7ffde792781 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -198,6 +198,26 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, return wqe; } +__le64 *irdma_srq_get_next_recv_wqe(struct irdma_srq_uk *srq, u32 *wqe_idx) +{ + int ret_code; + __le64 *wqe; + + if (IRDMA_RING_FULL_ERR(srq->srq_ring)) + return NULL; + + IRDMA_ATOMIC_RING_MOVE_HEAD(srq->srq_ring, *wqe_idx, ret_code); + if (ret_code) + return NULL; + + if (!*wqe_idx) + srq->srwqe_polarity = !srq->srwqe_polarity; + /* rq_wqe_size_multiplier is no of 32 byte quanta in one rq wqe */ + wqe = srq->srq_base[*wqe_idx * (srq->wqe_size_multiplier)].elem; + + return wqe; +} + /** * irdma_qp_get_next_recv_wqe - get next qp's rcv wqe * @qp: hw qp ptr @@ -317,6 +337,58 @@ int irdma_uk_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, return 0; } +/** + * irdma_uk_srq_post_receive - post a receive wqe to a shared rq + * @srq: shared rq ptr + * @info: post rq information + */ +int irdma_uk_srq_post_receive(struct irdma_srq_uk *srq, + struct irdma_post_rq_info *info) +{ + u32 wqe_idx, i, byte_off; + u32 addl_frag_cnt; + __le64 *wqe; + u64 hdr; + + if (srq->max_srq_frag_cnt < info->num_sges) + return -EINVAL; + + wqe = irdma_srq_get_next_recv_wqe(srq, &wqe_idx); + if (!wqe) + return -ENOMEM; + + addl_frag_cnt = info->num_sges > 1 ? info->num_sges - 1 : 0; + srq->wqe_ops.iw_set_fragment(wqe, 0, info->sg_list, + srq->srwqe_polarity); + + for (i = 1, byte_off = 32; i < info->num_sges; i++) { + srq->wqe_ops.iw_set_fragment(wqe, byte_off, &info->sg_list[i], + srq->srwqe_polarity); + byte_off += 16; + } + + /* if not an odd number set valid bit in next fragment */ + if (srq->uk_attrs->hw_rev >= IRDMA_GEN_2 && !(info->num_sges & 0x01) && + info->num_sges) { + srq->wqe_ops.iw_set_fragment(wqe, byte_off, NULL, + srq->srwqe_polarity); + if (srq->uk_attrs->hw_rev == IRDMA_GEN_2) + ++addl_frag_cnt; + } + + set_64bit_val(wqe, 16, (u64)info->wr_id); + hdr = FIELD_PREP(IRDMAQPSQ_ADDFRAGCNT, addl_frag_cnt) | + FIELD_PREP(IRDMAQPSQ_VALID, srq->srwqe_polarity); + + dma_wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + set_64bit_val(srq->shadow_area, 0, (wqe_idx + 1) % srq->srq_ring.size); + + return 0; +} + /** * irdma_uk_rdma_read - rdma read command * @qp: hw qp ptr @@ -973,6 +1045,8 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, u64 comp_ctx, qword0, qword2, qword3; __le64 *cqe; struct irdma_qp_uk *qp; + struct irdma_srq_uk *srq; + u8 is_srq; struct irdma_ring *pring = NULL; u32 wqe_idx; int ret_code; @@ -1046,8 +1120,14 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, } info->q_type = (u8)FIELD_GET(IRDMA_CQ_SQ, qword3); + is_srq = (u8)FIELD_GET(IRDMA_CQ_SRQ, qword3); info->error = (bool)FIELD_GET(IRDMA_CQ_ERROR, qword3); info->ipv4 = (bool)FIELD_GET(IRDMACQ_IPV4, qword3); + get_64bit_val(cqe, 8, &comp_ctx); + if (is_srq) + get_64bit_val(cqe, 40, (u64 *)&qp); + else + qp = (struct irdma_qp_uk *)(unsigned long)comp_ctx; if (info->error) { info->major_err = FIELD_GET(IRDMA_CQ_MAJERR, qword3); info->minor_err = FIELD_GET(IRDMA_CQ_MINERR, qword3); @@ -1085,7 +1165,22 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->qp_handle = (irdma_qp_handle)(unsigned long)qp; info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3); - if (info->q_type == IRDMA_CQE_QTYPE_RQ) { + if (info->q_type == IRDMA_CQE_QTYPE_RQ && is_srq) { + srq = qp->srq_uk; + + get_64bit_val(cqe, 8, &info->wr_id); + info->bytes_xfered = (u32)FIELD_GET(IRDMACQ_PAYLDLEN, qword0); + + if (qword3 & IRDMACQ_STAG) { + info->stag_invalid_set = true; + info->inv_stag = (u32)FIELD_GET(IRDMACQ_INVSTAG, + qword2); + } else { + info->stag_invalid_set = false; + } + IRDMA_RING_MOVE_TAIL(srq->srq_ring); + pring = &srq->srq_ring; + } else if (info->q_type == IRDMA_CQE_QTYPE_RQ && !is_srq) { u32 array_idx; array_idx = wqe_idx / qp->rq_wqe_size_multiplier; @@ -1210,10 +1305,10 @@ exit: } /** - * irdma_qp_round_up - return round up qp wq depth + * irdma_round_up_wq - return round up qp wq depth * @wqdepth: wq depth in quanta to round up */ -static int irdma_qp_round_up(u32 wqdepth) +static int irdma_round_up_wq(u32 wqdepth) { int scount = 1; @@ -1268,7 +1363,7 @@ int irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, u32 sq_size, u8 shift, { u32 min_size = (u32)uk_attrs->min_hw_wq_size << shift; - *sqdepth = irdma_qp_round_up((sq_size << shift) + IRDMA_SQ_RSVD); + *sqdepth = irdma_round_up_wq((sq_size << shift) + IRDMA_SQ_RSVD); if (*sqdepth < min_size) *sqdepth = min_size; @@ -1290,7 +1385,7 @@ int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, { u32 min_size = (u32)uk_attrs->min_hw_wq_size << shift; - *rqdepth = irdma_qp_round_up((rq_size << shift) + IRDMA_RQ_RSVD); + *rqdepth = irdma_round_up_wq((rq_size << shift) + IRDMA_RQ_RSVD); if (*rqdepth < min_size) *rqdepth = min_size; @@ -1300,6 +1395,26 @@ int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, return 0; } +/* + * irdma_get_srqdepth - get SRQ depth (quanta) + * @uk_attrs: qp HW attributes + * @srq_size: SRQ size + * @shift: shift which determines size of WQE + * @srqdepth: depth of SRQ + */ +int irdma_get_srqdepth(struct irdma_uk_attrs *uk_attrs, u32 srq_size, u8 shift, + u32 *srqdepth) +{ + *srqdepth = irdma_round_up_wq((srq_size << shift) + IRDMA_RQ_RSVD); + + if (*srqdepth < ((u32)uk_attrs->min_hw_wq_size << shift)) + *srqdepth = uk_attrs->min_hw_wq_size << shift; + else if (*srqdepth > uk_attrs->max_hw_srq_quanta) + return -EINVAL; + + return 0; +} + static const struct irdma_wqe_uk_ops iw_wqe_uk_ops = { .iw_copy_inline_data = irdma_copy_inline_data, .iw_inline_data_size_to_quanta = irdma_inline_data_size_to_quanta, @@ -1335,6 +1450,42 @@ static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp, IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->initial_ring, move_cnt); } +/** + * irdma_uk_srq_init - initialize shared qp + * @srq: hw srq (user and kernel) + * @info: srq initialization info + * + * Initializes the vars used in both user and kernel mode. + * The size of the wqe depends on number of max fragments + * allowed. Then size of wqe * the number of wqes should be the + * amount of memory allocated for srq. + */ +int irdma_uk_srq_init(struct irdma_srq_uk *srq, + struct irdma_srq_uk_init_info *info) +{ + u8 rqshift; + + srq->uk_attrs = info->uk_attrs; + if (info->max_srq_frag_cnt > srq->uk_attrs->max_hw_wq_frags) + return -EINVAL; + + irdma_get_wqe_shift(srq->uk_attrs, info->max_srq_frag_cnt, 0, &rqshift); + srq->srq_caps = info->srq_caps; + srq->srq_base = info->srq; + srq->shadow_area = info->shadow_area; + srq->srq_id = info->srq_id; + srq->srwqe_polarity = 0; + srq->srq_size = info->srq_size; + srq->wqe_size = rqshift; + srq->max_srq_frag_cnt = min(srq->uk_attrs->max_hw_wq_frags, + ((u32)2 << rqshift) - 1); + IRDMA_RING_INIT(srq->srq_ring, srq->srq_size); + srq->wqe_size_multiplier = 1 << rqshift; + srq->wqe_ops = iw_wqe_uk_ops; + + return 0; +} + /** * irdma_uk_calc_shift_wq - calculate WQE shift for both SQ and RQ * @ukinfo: qp initialization info @@ -1461,6 +1612,7 @@ int irdma_uk_qp_init(struct irdma_qp_uk *qp, struct irdma_qp_uk_init_info *info) qp->wqe_ops = iw_wqe_uk_ops_gen_1; else qp->wqe_ops = iw_wqe_uk_ops; + qp->srq_uk = info->srq_uk; return ret_code; } diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index 5f489feda32c..cf324f1c539e 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -60,6 +60,7 @@ enum irdma_device_caps_const { IRDMA_GATHER_STATS_BUF_SIZE = 1024, IRDMA_MIN_IW_QP_ID = 0, IRDMA_MAX_IW_QP_ID = 262143, + IRDMA_MIN_IW_SRQ_ID = 0, IRDMA_MIN_CEQID = 0, IRDMA_MAX_CEQID = 1023, IRDMA_CEQ_MAX_COUNT = IRDMA_MAX_CEQID + 1, @@ -148,6 +149,8 @@ enum irdma_qp_caps { IRDMA_PUSH_MODE = 8, }; +struct irdma_srq_uk; +struct irdma_srq_uk_init_info; struct irdma_qp_uk; struct irdma_cq_uk; struct irdma_qp_uk_init_info; @@ -301,6 +304,39 @@ int irdma_uk_calc_depth_shift_sq(struct irdma_qp_uk_init_info *ukinfo, u32 *sq_depth, u8 *sq_shift); int irdma_uk_calc_depth_shift_rq(struct irdma_qp_uk_init_info *ukinfo, u32 *rq_depth, u8 *rq_shift); +int irdma_uk_srq_init(struct irdma_srq_uk *srq, + struct irdma_srq_uk_init_info *info); +int irdma_uk_srq_post_receive(struct irdma_srq_uk *srq, + struct irdma_post_rq_info *info); + +struct irdma_srq_uk { + u32 srq_caps; + struct irdma_qp_quanta *srq_base; + struct irdma_uk_attrs *uk_attrs; + __le64 *shadow_area; + struct irdma_ring srq_ring; + struct irdma_ring initial_ring; + u32 srq_id; + u32 srq_size; + u32 max_srq_frag_cnt; + struct irdma_wqe_uk_ops wqe_ops; + u8 srwqe_polarity; + u8 wqe_size; + u8 wqe_size_multiplier; + u8 deferred_flag; +}; + +struct irdma_srq_uk_init_info { + struct irdma_qp_quanta *srq; + struct irdma_uk_attrs *uk_attrs; + __le64 *shadow_area; + u64 *srq_wrid_array; + u32 srq_id; + u32 srq_caps; + u32 srq_size; + u32 max_srq_frag_cnt; +}; + struct irdma_sq_uk_wr_trk_info { u64 wrid; u32 wr_len; @@ -345,6 +381,7 @@ struct irdma_qp_uk { bool destroy_pending:1; /* Indicates the QP is being destroyed */ void *back_qp; u8 dbg_rq_flushed; + struct irdma_srq_uk *srq_uk; u8 sq_flush_seen; u8 rq_flush_seen; }; @@ -384,6 +421,7 @@ struct irdma_qp_uk_init_info { u8 rq_shift; int abi_ver; bool legacy_mode; + struct irdma_srq_uk *srq_uk; }; struct irdma_cq_uk_init_info { @@ -399,6 +437,7 @@ struct irdma_cq_uk_init_info { __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, u16 quanta, u32 total_size, struct irdma_post_sq_info *info); +__le64 *irdma_srq_get_next_recv_wqe(struct irdma_srq_uk *srq, u32 *wqe_idx); __le64 *irdma_qp_get_next_recv_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx); void irdma_uk_clean_cq(void *q, struct irdma_cq_uk *cq); int irdma_nop(struct irdma_qp_uk *qp, u64 wr_id, bool signaled, bool post_sq); @@ -410,5 +449,7 @@ int irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, u32 sq_size, u8 shift, u32 *wqdepth); int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, u32 *wqdepth); +int irdma_get_srqdepth(struct irdma_uk_attrs *uk_attrs, u32 srq_size, u8 shift, + u32 *srqdepth); void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx); #endif /* IRDMA_USER_H */ diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 0b12a875dbe9..8b94d87b0192 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -697,6 +697,9 @@ static const char *const irdma_cqp_cmd_names[IRDMA_MAX_CQP_OPS] = { [IRDMA_OP_ADD_LOCAL_MAC_ENTRY] = "Add Local MAC Entry Cmd", [IRDMA_OP_DELETE_LOCAL_MAC_ENTRY] = "Delete Local MAC Entry Cmd", [IRDMA_OP_CQ_MODIFY] = "CQ Modify Cmd", + [IRDMA_OP_SRQ_CREATE] = "Create SRQ Cmd", + [IRDMA_OP_SRQ_MODIFY] = "Modify SRQ Cmd", + [IRDMA_OP_SRQ_DESTROY] = "Destroy SRQ Cmd", }; static const struct irdma_cqp_err_info irdma_noncrit_err_list[] = { @@ -1167,6 +1170,30 @@ void irdma_free_qp_rsrc(struct irdma_qp *iwqp) kfree(iwqp->kqp.rq_wrid_mem); } +/** + * irdma_srq_wq_destroy - send srq destroy cqp + * @rf: RDMA PCI function + * @srq: hardware control srq + */ +void irdma_srq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_srq *srq) +{ + struct irdma_cqp_request *cqp_request; + struct cqp_cmds_info *cqp_info; + + cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = IRDMA_OP_SRQ_DESTROY; + cqp_info->post_sq = 1; + cqp_info->in.u.srq_destroy.srq = srq; + cqp_info->in.u.srq_destroy.scratch = (uintptr_t)cqp_request; + + irdma_handle_cqp_op(rf, cqp_request); + irdma_put_cqp_request(&rf->cqp, cqp_request); +} + /** * irdma_cq_wq_destroy - send cq destroy cqp * @rf: RDMA PCI function diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index da0f56e0c897..1134a3546d91 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -59,6 +59,9 @@ static int irdma_query_device(struct ib_device *ibdev, #define HCA_CLOCK_TIMESTAMP_MASK 0x1ffff if (hw_attrs->uk_attrs.hw_rev >= IRDMA_GEN_2) props->timestamp_mask = HCA_CLOCK_TIMESTAMP_MASK; + props->max_srq = rf->max_srq - rf->used_srqs; + props->max_srq_wr = IRDMA_MAX_SRQ_WRS; + props->max_srq_sge = hw_attrs->uk_attrs.max_hw_wq_frags; return 0; } @@ -336,6 +339,8 @@ static int irdma_alloc_ucontext(struct ib_ucontext *uctx, uresp.comp_mask |= IRDMA_ALLOC_UCTX_USE_RAW_ATTR; uresp.min_hw_wq_size = uk_attrs->min_hw_wq_size; uresp.comp_mask |= IRDMA_ALLOC_UCTX_MIN_HW_WQ_SIZE; + uresp.max_hw_srq_quanta = uk_attrs->max_hw_srq_quanta; + uresp.comp_mask |= IRDMA_ALLOC_UCTX_MAX_HW_SRQ_QUANTA; if (ib_copy_to_udata(udata, &uresp, min(sizeof(uresp), udata->outlen))) { rdma_user_mmap_entry_remove(ucontext->db_mmap_entry); @@ -347,6 +352,8 @@ static int irdma_alloc_ucontext(struct ib_ucontext *uctx, spin_lock_init(&ucontext->cq_reg_mem_list_lock); INIT_LIST_HEAD(&ucontext->qp_reg_mem_list); spin_lock_init(&ucontext->qp_reg_mem_list_lock); + INIT_LIST_HEAD(&ucontext->srq_reg_mem_list); + spin_lock_init(&ucontext->srq_reg_mem_list_lock); return 0; @@ -571,7 +578,11 @@ static void irdma_setup_virt_qp(struct irdma_device *iwdev, if (iwpbl->pbl_allocated) { init_info->virtual_map = true; init_info->sq_pa = qpmr->sq_pbl.idx; - init_info->rq_pa = qpmr->rq_pbl.idx; + /* Need to use contiguous buffer for RQ of QP + * in case it is associated with SRQ. + */ + init_info->rq_pa = init_info->qp_uk_init_info.srq_uk ? + qpmr->rq_pa : qpmr->rq_pbl.idx; } else { init_info->sq_pa = qpmr->sq_pbl.addr; init_info->rq_pa = qpmr->rq_pbl.addr; @@ -940,6 +951,18 @@ static int irdma_create_qp(struct ib_qp *ibqp, struct irdma_uk_attrs *uk_attrs = &dev->hw_attrs.uk_attrs; struct irdma_qp_init_info init_info = {}; struct irdma_qp_host_ctx_info *ctx_info; + struct irdma_srq *iwsrq; + bool srq_valid = false; + u32 srq_id = 0; + + if (init_attr->srq) { + iwsrq = to_iwsrq(init_attr->srq); + srq_valid = true; + srq_id = iwsrq->srq_num; + init_attr->cap.max_recv_sge = uk_attrs->max_hw_wq_frags; + init_attr->cap.max_recv_wr = 4; + init_info.qp_uk_init_info.srq_uk = &iwsrq->sc_srq.srq_uk; + } err_code = irdma_validate_qp_attrs(init_attr, iwdev); if (err_code) @@ -1046,6 +1069,8 @@ static int irdma_create_qp(struct ib_qp *ibqp, } ctx_info = &iwqp->ctx_info; + ctx_info->srq_valid = srq_valid; + ctx_info->srq_id = srq_id; ctx_info->send_cq_num = iwqp->iwscq->sc_cq.cq_uk.cq_id; ctx_info->rcv_cq_num = iwqp->iwrcq->sc_cq.cq_uk.cq_id; @@ -1171,6 +1196,7 @@ static int irdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->qp_context = iwqp->ibqp.qp_context; init_attr->send_cq = iwqp->ibqp.send_cq; init_attr->recv_cq = iwqp->ibqp.recv_cq; + init_attr->srq = iwqp->ibqp.srq; init_attr->cap = attr->cap; return 0; @@ -1833,6 +1859,24 @@ exit: return err; } +/** + * irdma_srq_free_rsrc - free up resources for srq + * @rf: RDMA PCI function + * @iwsrq: srq ptr + */ +static void irdma_srq_free_rsrc(struct irdma_pci_f *rf, struct irdma_srq *iwsrq) +{ + struct irdma_sc_srq *srq = &iwsrq->sc_srq; + + if (!iwsrq->user_mode) { + dma_free_coherent(rf->sc_dev.hw->device, iwsrq->kmem.size, + iwsrq->kmem.va, iwsrq->kmem.pa); + iwsrq->kmem.va = NULL; + } + + irdma_free_rsrc(rf, rf->allocated_srqs, srq->srq_uk.srq_id); +} + /** * irdma_cq_free_rsrc - free up resources for cq * @rf: RDMA PCI function @@ -1896,6 +1940,22 @@ static int irdma_process_resize_list(struct irdma_cq *iwcq, return cnt; } +/** + * irdma_destroy_srq - destroy srq + * @ibsrq: srq pointer + * @udata: user data + */ +static int irdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) +{ + struct irdma_device *iwdev = to_iwdev(ibsrq->device); + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_sc_srq *srq = &iwsrq->sc_srq; + + irdma_srq_wq_destroy(iwdev->rf, srq); + irdma_srq_free_rsrc(iwdev->rf, iwsrq); + return 0; +} + /** * irdma_destroy_cq - destroy cq * @ib_cq: cq pointer @@ -2084,6 +2144,293 @@ error: return ret; } +/** + * irdma_srq_event - event notification for srq limit + * @srq: shared srq struct + */ +void irdma_srq_event(struct irdma_sc_srq *srq) +{ + struct irdma_srq *iwsrq = container_of(srq, struct irdma_srq, sc_srq); + struct ib_srq *ibsrq = &iwsrq->ibsrq; + struct ib_event event; + + srq->srq_limit = 0; + + if (!ibsrq->event_handler) + return; + + event.device = ibsrq->device; + event.element.port_num = 1; + event.element.srq = ibsrq; + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + ibsrq->event_handler(&event, ibsrq->srq_context); +} + +/** + * irdma_modify_srq - modify srq request + * @ibsrq: srq's pointer for modify + * @attr: access attributes + * @attr_mask: state mask + * @udata: user data + */ +static int irdma_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) +{ + struct irdma_device *iwdev = to_iwdev(ibsrq->device); + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_cqp_request *cqp_request; + struct irdma_pci_f *rf = iwdev->rf; + struct irdma_modify_srq_info *info; + struct cqp_cmds_info *cqp_info; + int status; + + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (!(attr_mask & IB_SRQ_LIMIT)) + return 0; + + if (attr->srq_limit > iwsrq->sc_srq.srq_uk.srq_size) + return -EINVAL; + + /* Execute this cqp op synchronously, so we can update srq_limit + * upon successful completion. + */ + cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); + if (!cqp_request) + return -ENOMEM; + + cqp_info = &cqp_request->info; + info = &cqp_info->in.u.srq_modify.info; + info->srq_limit = attr->srq_limit; + if (info->srq_limit > 0xFFF) + info->srq_limit = 0xFFF; + info->arm_limit_event = 1; + + cqp_info->cqp_cmd = IRDMA_OP_SRQ_MODIFY; + cqp_info->post_sq = 1; + cqp_info->in.u.srq_modify.srq = &iwsrq->sc_srq; + cqp_info->in.u.srq_modify.scratch = (uintptr_t)cqp_request; + status = irdma_handle_cqp_op(rf, cqp_request); + irdma_put_cqp_request(&rf->cqp, cqp_request); + if (status) + return status; + + iwsrq->sc_srq.srq_limit = info->srq_limit; + + return 0; +} + +static int irdma_setup_umode_srq(struct irdma_device *iwdev, + struct irdma_srq *iwsrq, + struct irdma_srq_init_info *info, + struct ib_udata *udata) +{ +#define IRDMA_CREATE_SRQ_MIN_REQ_LEN \ + offsetofend(struct irdma_create_srq_req, user_shadow_area) + struct irdma_create_srq_req req = {}; + struct irdma_ucontext *ucontext; + struct irdma_srq_mr *srqmr; + struct irdma_pbl *iwpbl; + unsigned long flags; + + iwsrq->user_mode = true; + ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext, + ibucontext); + + if (udata->inlen < IRDMA_CREATE_SRQ_MIN_REQ_LEN) + return -EINVAL; + + if (ib_copy_from_udata(&req, udata, + min(sizeof(req), udata->inlen))) + return -EFAULT; + + spin_lock_irqsave(&ucontext->srq_reg_mem_list_lock, flags); + iwpbl = irdma_get_pbl((unsigned long)req.user_srq_buf, + &ucontext->srq_reg_mem_list); + spin_unlock_irqrestore(&ucontext->srq_reg_mem_list_lock, flags); + if (!iwpbl) + return -EPROTO; + + iwsrq->iwpbl = iwpbl; + srqmr = &iwpbl->srq_mr; + + if (iwpbl->pbl_allocated) { + info->virtual_map = true; + info->pbl_chunk_size = 1; + info->first_pm_pbl_idx = srqmr->srq_pbl.idx; + info->leaf_pbl_size = 1; + } else { + info->srq_pa = srqmr->srq_pbl.addr; + } + info->shadow_area_pa = srqmr->shadow; + + return 0; +} + +static int irdma_setup_kmode_srq(struct irdma_device *iwdev, + struct irdma_srq *iwsrq, + struct irdma_srq_init_info *info, u32 depth, + u8 shift) +{ + struct irdma_srq_uk_init_info *ukinfo = &info->srq_uk_init_info; + struct irdma_dma_mem *mem = &iwsrq->kmem; + u32 size, ring_size; + + ring_size = depth * IRDMA_QP_WQE_MIN_SIZE; + size = ring_size + (IRDMA_SHADOW_AREA_SIZE << 3); + + mem->size = ALIGN(size, 256); + mem->va = dma_alloc_coherent(iwdev->rf->hw.device, mem->size, + &mem->pa, GFP_KERNEL); + if (!mem->va) + return -ENOMEM; + + ukinfo->srq = mem->va; + ukinfo->srq_size = depth >> shift; + ukinfo->shadow_area = mem->va + ring_size; + + info->shadow_area_pa = info->srq_pa + ring_size; + info->srq_pa = mem->pa; + + return 0; +} + +/** + * irdma_create_srq - create srq + * @ibsrq: ib's srq pointer + * @initattrs: attributes for srq + * @udata: user data for create srq + */ +static int irdma_create_srq(struct ib_srq *ibsrq, + struct ib_srq_init_attr *initattrs, + struct ib_udata *udata) +{ + struct irdma_device *iwdev = to_iwdev(ibsrq->device); + struct ib_srq_attr *attr = &initattrs->attr; + struct irdma_pd *iwpd = to_iwpd(ibsrq->pd); + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_srq_uk_init_info *ukinfo; + struct irdma_cqp_request *cqp_request; + struct irdma_srq_init_info info = {}; + struct irdma_pci_f *rf = iwdev->rf; + struct irdma_uk_attrs *uk_attrs; + struct cqp_cmds_info *cqp_info; + int err_code = 0; + u32 depth; + u8 shift; + + uk_attrs = &rf->sc_dev.hw_attrs.uk_attrs; + ukinfo = &info.srq_uk_init_info; + + if (initattrs->srq_type != IB_SRQT_BASIC) + return -EOPNOTSUPP; + + if (!(uk_attrs->feature_flags & IRDMA_FEATURE_SRQ) || + attr->max_sge > uk_attrs->max_hw_wq_frags) + return -EINVAL; + + refcount_set(&iwsrq->refcnt, 1); + spin_lock_init(&iwsrq->lock); + err_code = irdma_alloc_rsrc(rf, rf->allocated_srqs, rf->max_srq, + &iwsrq->srq_num, &rf->next_srq); + if (err_code) + return err_code; + + ukinfo->max_srq_frag_cnt = attr->max_sge; + ukinfo->uk_attrs = uk_attrs; + ukinfo->srq_id = iwsrq->srq_num; + + irdma_get_wqe_shift(ukinfo->uk_attrs, ukinfo->max_srq_frag_cnt, 0, + &shift); + + err_code = irdma_get_srqdepth(ukinfo->uk_attrs, attr->max_wr, + shift, &depth); + if (err_code) + return err_code; + + /* Actual SRQ size in WRs for ring and HW */ + ukinfo->srq_size = depth >> shift; + + /* Max postable WRs to SRQ */ + iwsrq->max_wr = (depth - IRDMA_RQ_RSVD) >> shift; + attr->max_wr = iwsrq->max_wr; + + if (udata) + err_code = irdma_setup_umode_srq(iwdev, iwsrq, &info, udata); + else + err_code = irdma_setup_kmode_srq(iwdev, iwsrq, &info, depth, + shift); + + if (err_code) + goto free_rsrc; + + info.vsi = &iwdev->vsi; + info.pd = &iwpd->sc_pd; + + err_code = irdma_sc_srq_init(&iwsrq->sc_srq, &info); + if (err_code) + goto free_dmem; + + cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); + if (!cqp_request) { + err_code = -ENOMEM; + goto free_dmem; + } + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = IRDMA_OP_SRQ_CREATE; + cqp_info->post_sq = 1; + cqp_info->in.u.srq_create.srq = &iwsrq->sc_srq; + cqp_info->in.u.srq_create.scratch = (uintptr_t)cqp_request; + err_code = irdma_handle_cqp_op(rf, cqp_request); + irdma_put_cqp_request(&rf->cqp, cqp_request); + if (err_code) + goto free_dmem; + + if (udata) { + struct irdma_create_srq_resp resp = {}; + + resp.srq_id = iwsrq->srq_num; + resp.srq_size = ukinfo->srq_size; + if (ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen))) { + err_code = -EPROTO; + goto srq_destroy; + } + } + + return 0; + +srq_destroy: + irdma_srq_wq_destroy(rf, &iwsrq->sc_srq); + +free_dmem: + if (!iwsrq->user_mode) + dma_free_coherent(rf->hw.device, iwsrq->kmem.size, + iwsrq->kmem.va, iwsrq->kmem.pa); +free_rsrc: + irdma_free_rsrc(rf, rf->allocated_srqs, iwsrq->srq_num); + return err_code; +} + +/** + * irdma_query_srq - get SRQ attributes + * @ibsrq: the SRQ to query + * @attr: the attributes of the SRQ + */ +static int irdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + + attr->max_wr = iwsrq->max_wr; + attr->max_sge = iwsrq->sc_srq.srq_uk.max_srq_frag_cnt; + attr->srq_limit = iwsrq->sc_srq.srq_limit; + + return 0; +} + static inline int cq_validate_flags(u32 flags, u8 hw_rev) { /* GEN1 does not support CQ create flags */ @@ -2536,6 +2883,7 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, struct irdma_mr *iwmr = iwpbl->iwmr; struct irdma_qp_mr *qpmr = &iwpbl->qp_mr; struct irdma_cq_mr *cqmr = &iwpbl->cq_mr; + struct irdma_srq_mr *srqmr = &iwpbl->srq_mr; struct irdma_hmc_pble *hmc_p; u64 *arr = iwmr->pgaddrmem; u32 pg_size, total; @@ -2555,7 +2903,10 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, total = req->sq_pages + req->rq_pages; hmc_p = &qpmr->sq_pbl; qpmr->shadow = (dma_addr_t)arr[total]; - + /* Need to use physical address for RQ of QP + * in case it is associated with SRQ. + */ + qpmr->rq_pa = (dma_addr_t)arr[req->sq_pages]; if (lvl) { ret = irdma_check_mem_contiguous(arr, req->sq_pages, pg_size); @@ -2575,6 +2926,18 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, hmc_p->addr = arr[req->sq_pages]; } break; + case IRDMA_MEMREG_TYPE_SRQ: + hmc_p = &srqmr->srq_pbl; + srqmr->shadow = (dma_addr_t)arr[req->rq_pages]; + if (lvl) + ret = irdma_check_mem_contiguous(arr, req->rq_pages, + pg_size); + + if (!ret) + hmc_p->idx = palloc->level1.idx; + else + hmc_p->addr = arr[0]; + break; case IRDMA_MEMREG_TYPE_CQ: hmc_p = &cqmr->cq_pbl; @@ -3045,6 +3408,37 @@ static int irdma_reg_user_mr_type_qp(struct irdma_mem_reg_req req, return 0; } +static int irdma_reg_user_mr_type_srq(struct irdma_mem_reg_req req, + struct ib_udata *udata, + struct irdma_mr *iwmr) +{ + struct irdma_device *iwdev = to_iwdev(iwmr->ibmr.device); + struct irdma_pbl *iwpbl = &iwmr->iwpbl; + struct irdma_ucontext *ucontext; + unsigned long flags; + u32 total; + int err; + u8 lvl; + + total = req.rq_pages + IRDMA_SHADOW_PGCNT; + if (total > iwmr->page_cnt) + return -EINVAL; + + lvl = req.rq_pages > 1 ? PBLE_LEVEL_1 : PBLE_LEVEL_0; + err = irdma_handle_q_mem(iwdev, &req, iwpbl, lvl); + if (err) + return err; + + ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext, + ibucontext); + spin_lock_irqsave(&ucontext->srq_reg_mem_list_lock, flags); + list_add_tail(&iwpbl->list, &ucontext->srq_reg_mem_list); + iwpbl->on_list = true; + spin_unlock_irqrestore(&ucontext->srq_reg_mem_list_lock, flags); + + return 0; +} + static int irdma_reg_user_mr_type_cq(struct irdma_mem_reg_req req, struct ib_udata *udata, struct irdma_mr *iwmr) @@ -3135,6 +3529,12 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, if (err) goto error; + break; + case IRDMA_MEMREG_TYPE_SRQ: + err = irdma_reg_user_mr_type_srq(req, udata, iwmr); + if (err) + goto error; + break; case IRDMA_MEMREG_TYPE_CQ: err = irdma_reg_user_mr_type_cq(req, udata, iwmr); @@ -3455,6 +3855,14 @@ static void irdma_del_memlist(struct irdma_mr *iwmr, } spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); break; + case IRDMA_MEMREG_TYPE_SRQ: + spin_lock_irqsave(&ucontext->srq_reg_mem_list_lock, flags); + if (iwpbl->on_list) { + iwpbl->on_list = false; + list_del(&iwpbl->list); + } + spin_unlock_irqrestore(&ucontext->srq_reg_mem_list_lock, flags); + break; default: break; } @@ -3673,6 +4081,47 @@ static int irdma_post_send(struct ib_qp *ibqp, return err; } +/** + * irdma_post_srq_recv - post receive wr for kernel application + * @ibsrq: ib srq pointer + * @ib_wr: work request for receive + * @bad_wr: bad wr caused an error + */ +static int irdma_post_srq_recv(struct ib_srq *ibsrq, + const struct ib_recv_wr *ib_wr, + const struct ib_recv_wr **bad_wr) +{ + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_srq_uk *uksrq = &iwsrq->sc_srq.srq_uk; + struct irdma_post_rq_info post_recv = {}; + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&iwsrq->lock, flags); + while (ib_wr) { + if (ib_wr->num_sge > uksrq->max_srq_frag_cnt) { + err = -EINVAL; + goto out; + } + post_recv.num_sges = ib_wr->num_sge; + post_recv.wr_id = ib_wr->wr_id; + post_recv.sg_list = ib_wr->sg_list; + err = irdma_uk_srq_post_receive(uksrq, &post_recv); + if (err) + goto out; + + ib_wr = ib_wr->next; + } + +out: + spin_unlock_irqrestore(&iwsrq->lock, flags); + + if (err) + *bad_wr = ib_wr; + + return err; +} + /** * irdma_post_recv - post receive wr for kernel application * @ibqp: ib qp pointer @@ -3692,6 +4141,11 @@ static int irdma_post_recv(struct ib_qp *ibqp, iwqp = to_iwqp(ibqp); ukqp = &iwqp->sc_qp.qp_uk; + if (ukqp->srq_uk) { + *bad_wr = ib_wr; + return -EINVAL; + } + spin_lock_irqsave(&iwqp->lock, flags); while (ib_wr) { post_recv.num_sges = ib_wr->num_sge; @@ -4780,6 +5234,18 @@ static enum rdma_link_layer irdma_get_link_layer(struct ib_device *ibdev, return IB_LINK_LAYER_ETHERNET; } +static const struct ib_device_ops irdma_gen1_dev_ops = { + .dealloc_driver = irdma_ib_dealloc_device, +}; + +static const struct ib_device_ops irdma_gen3_dev_ops = { + .create_srq = irdma_create_srq, + .destroy_srq = irdma_destroy_srq, + .modify_srq = irdma_modify_srq, + .post_srq_recv = irdma_post_srq_recv, + .query_srq = irdma_query_srq, +}; + static const struct ib_device_ops irdma_roce_dev_ops = { .attach_mcast = irdma_attach_mcast, .create_ah = irdma_create_ah, @@ -4850,6 +5316,7 @@ static const struct ib_device_ops irdma_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_cq, irdma_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_mw, irdma_mr, ibmw), INIT_RDMA_OBJ_SIZE(ib_qp, irdma_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_srq, irdma_srq, ibsrq), }; /** @@ -4897,6 +5364,10 @@ static void irdma_init_rdma_device(struct irdma_device *iwdev) iwdev->ibdev.num_comp_vectors = iwdev->rf->ceqs_count; iwdev->ibdev.dev.parent = &pcidev->dev; ib_set_device_ops(&iwdev->ibdev, &irdma_dev_ops); + if (iwdev->rf->rdma_ver == IRDMA_GEN_1) + ib_set_device_ops(&iwdev->ibdev, &irdma_gen1_dev_ops); + if (iwdev->rf->rdma_ver >= IRDMA_GEN_3) + ib_set_device_ops(&iwdev->ibdev, &irdma_gen3_dev_ops); } /** diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index fcb163c45252..2817122ba989 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -8,6 +8,7 @@ #define IRDMA_PKEY_TBL_SZ 1 #define IRDMA_DEFAULT_PKEY 0xFFFF +#define IRDMA_SHADOW_PGCNT 1 struct irdma_ucontext { struct ib_ucontext ibucontext; @@ -17,6 +18,8 @@ struct irdma_ucontext { spinlock_t cq_reg_mem_list_lock; /* protect CQ memory list */ struct list_head qp_reg_mem_list; spinlock_t qp_reg_mem_list_lock; /* protect QP memory list */ + struct list_head srq_reg_mem_list; + spinlock_t srq_reg_mem_list_lock; /* protect SRQ memory list */ int abi_ver; u8 legacy_mode : 1; u8 use_raw_attrs : 1; @@ -65,10 +68,16 @@ struct irdma_cq_mr { bool split; }; +struct irdma_srq_mr { + struct irdma_hmc_pble srq_pbl; + dma_addr_t shadow; +}; + struct irdma_qp_mr { struct irdma_hmc_pble sq_pbl; struct irdma_hmc_pble rq_pbl; dma_addr_t shadow; + dma_addr_t rq_pa; struct page *sq_page; }; @@ -85,6 +94,7 @@ struct irdma_pbl { union { struct irdma_qp_mr qp_mr; struct irdma_cq_mr cq_mr; + struct irdma_srq_mr srq_mr; }; bool pbl_allocated:1; @@ -112,6 +122,21 @@ struct irdma_mr { struct irdma_pbl iwpbl; }; +struct irdma_srq { + struct ib_srq ibsrq; + struct irdma_sc_srq sc_srq __aligned(64); + struct irdma_dma_mem kmem; + u64 *srq_wrid_mem; + refcount_t refcnt; + spinlock_t lock; /* for poll srq */ + struct irdma_pbl *iwpbl; + struct irdma_sge *sg_list; + u16 srq_head; + u32 srq_num; + u32 max_wr; + bool user_mode:1; +}; + struct irdma_cq { struct ib_cq ibcq; struct irdma_sc_cq sc_cq; diff --git a/include/uapi/rdma/irdma-abi.h b/include/uapi/rdma/irdma-abi.h index 4e42054cca33..f7788d33376b 100644 --- a/include/uapi/rdma/irdma-abi.h +++ b/include/uapi/rdma/irdma-abi.h @@ -20,11 +20,13 @@ enum irdma_memreg_type { IRDMA_MEMREG_TYPE_MEM = 0, IRDMA_MEMREG_TYPE_QP = 1, IRDMA_MEMREG_TYPE_CQ = 2, + IRDMA_MEMREG_TYPE_SRQ = 3, }; enum { IRDMA_ALLOC_UCTX_USE_RAW_ATTR = 1 << 0, IRDMA_ALLOC_UCTX_MIN_HW_WQ_SIZE = 1 << 1, + IRDMA_ALLOC_UCTX_MAX_HW_SRQ_QUANTA = 1 << 2, IRDMA_SUPPORT_WQE_FORMAT_V2 = 1 << 3, }; @@ -55,7 +57,8 @@ struct irdma_alloc_ucontext_resp { __u8 rsvd2; __aligned_u64 comp_mask; __u16 min_hw_wq_size; - __u8 rsvd3[6]; + __u32 max_hw_srq_quanta; + __u8 rsvd3[2]; }; struct irdma_alloc_pd_resp { @@ -72,6 +75,16 @@ struct irdma_create_cq_req { __aligned_u64 user_shadow_area; }; +struct irdma_create_srq_req { + __aligned_u64 user_srq_buf; + __aligned_u64 user_shadow_area; +}; + +struct irdma_create_srq_resp { + __u32 srq_id; + __u32 srq_size; +}; + struct irdma_create_qp_req { __aligned_u64 user_wqe_bufs; __aligned_u64 user_compl_ctx; -- cgit v1.2.3 From 1b34cbbf4f011a121ef7b2d7d6e6920a036d5285 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 16 Sep 2025 17:20:59 +0800 Subject: crypto: af_alg - Disallow concurrent writes in af_alg_sendmsg Issuing two writes to the same af_alg socket is bogus as the data will be interleaved in an unpredictable fashion. Furthermore, concurrent writes may create inconsistencies in the internal socket state. Disallow this by adding a new ctx->write field that indiciates exclusive ownership for writing. Fixes: 8ff590903d5 ("crypto: algif_skcipher - User-space interface for skcipher operations") Reported-by: Muhammad Alifa Ramdhan Reported-by: Bing-Jhong Billy Jheng Signed-off-by: Herbert Xu --- crypto/af_alg.c | 7 +++++++ include/crypto/if_alg.h | 10 ++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 407f2c238f2c..ca6fdcc6c54a 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -970,6 +970,12 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, } lock_sock(sk); + if (ctx->write) { + release_sock(sk); + return -EBUSY; + } + ctx->write = true; + if (ctx->init && !ctx->more) { if (ctx->used) { err = -EINVAL; @@ -1105,6 +1111,7 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, unlock: af_alg_data_wakeup(sk); + ctx->write = false; release_sock(sk); return copied ?: err; diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index f7b3b93f3a49..0c70f3a55575 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -135,6 +135,7 @@ struct af_alg_async_req { * SG? * @enc: Cryptographic operation to be performed when * recvmsg is invoked. + * @write: True if we are in the middle of a write. * @init: True if metadata has been sent. * @len: Length of memory allocated for this data structure. * @inflight: Non-zero when AIO requests are in flight. @@ -151,10 +152,11 @@ struct af_alg_ctx { size_t used; atomic_t rcvused; - bool more; - bool merge; - bool enc; - bool init; + u32 more:1, + merge:1, + enc:1, + write:1, + init:1; unsigned int len; -- cgit v1.2.3 From a3d076b0567e729d5f21a95525c4d096b1f59e79 Mon Sep 17 00:00:00 2001 From: Akiva Goldberger Date: Wed, 17 Sep 2025 16:27:58 +0300 Subject: net/mlx5: Add uar access and odp page fault counters Add bar_uar_access, odp_local_triggered_page_fault, and odp_remote_triggered_page_fault counters to the query_vnic_env command. Additionally, add corresponding capabilities bits to the HCA CAP. Signed-off-by: Akiva Goldberger Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758115678-643464-1-git-send-email-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 097b1b7ada63..0cf187e13def 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1958,7 +1958,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_rqt[0x5]; u8 reserved_at_390[0x3]; u8 log_max_rqt_size[0x5]; - u8 reserved_at_398[0x3]; + u8 reserved_at_398[0x1]; + u8 vnic_env_cnt_bar_uar_access[0x1]; + u8 vnic_env_cnt_odp_page_fault[0x1]; u8 log_max_tis_per_sq[0x5]; u8 ext_stride_num_range[0x1]; @@ -4019,7 +4021,13 @@ struct mlx5_ifc_vnic_diagnostic_statistics_bits { u8 handled_pkt_steering_fail[0x40]; - u8 reserved_at_360[0xc80]; + u8 bar_uar_access[0x20]; + + u8 odp_local_triggered_page_fault[0x20]; + + u8 odp_remote_triggered_page_fault[0x20]; + + u8 reserved_at_3c0[0xc20]; }; struct mlx5_ifc_traffic_counter_bits { -- cgit v1.2.3 From 00c94ca2b99e6610e483f92e531b319eeaed94aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:29 -0700 Subject: psp: base PSP device support Add a netlink family for PSP and allow drivers to register support. The "PSP device" is its own object. This allows us to perform more flexible reference counting / lifetime control than if PSP information was part of net_device. In the future we should also be able to "delegate" PSP access to software devices, such as *vlan, veth or netkit more easily. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-3-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 96 +++++++++++++++ include/linux/netdevice.h | 4 + include/net/psp.h | 12 ++ include/net/psp/functions.h | 14 +++ include/net/psp/types.h | 100 ++++++++++++++++ include/uapi/linux/psp.h | 42 +++++++ net/Kconfig | 1 + net/Makefile | 1 + net/psp/Kconfig | 13 ++ net/psp/Makefile | 5 + net/psp/psp-nl-gen.c | 65 ++++++++++ net/psp/psp-nl-gen.h | 30 +++++ net/psp/psp.h | 31 +++++ net/psp/psp_main.c | 139 ++++++++++++++++++++++ net/psp/psp_nl.c | 223 +++++++++++++++++++++++++++++++++++ tools/net/ynl/Makefile.deps | 1 + 16 files changed, 777 insertions(+) create mode 100644 Documentation/netlink/specs/psp.yaml create mode 100644 include/net/psp.h create mode 100644 include/net/psp/functions.h create mode 100644 include/net/psp/types.h create mode 100644 include/uapi/linux/psp.h create mode 100644 net/psp/Kconfig create mode 100644 net/psp/Makefile create mode 100644 net/psp/psp-nl-gen.c create mode 100644 net/psp/psp-nl-gen.h create mode 100644 net/psp/psp.h create mode 100644 net/psp/psp_main.c create mode 100644 net/psp/psp_nl.c (limited to 'include') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml new file mode 100644 index 000000000000..706f4baf8764 --- /dev/null +++ b/Documentation/netlink/specs/psp.yaml @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +--- +name: psp + +doc: + PSP Security Protocol Generic Netlink family. + +definitions: + - + type: enum + name: version + entries: [hdr0-aes-gcm-128, hdr0-aes-gcm-256, + hdr0-aes-gmac-128, hdr0-aes-gmac-256] + +attribute-sets: + - + name: dev + attributes: + - + name: id + doc: PSP device ID. + type: u32 + checks: + min: 1 + - + name: ifindex + doc: ifindex of the main netdevice linked to the PSP device. + type: u32 + - + name: psp-versions-cap + doc: Bitmask of PSP versions supported by the device. + type: u32 + enum: version + enum-as-flags: true + - + name: psp-versions-ena + doc: Bitmask of currently enabled (accepted on Rx) PSP versions. + type: u32 + enum: version + enum-as-flags: true + +operations: + list: + - + name: dev-get + doc: Get / dump information about PSP capable devices on the system. + attribute-set: dev + do: + request: + attributes: + - id + reply: &dev-all + attributes: + - id + - ifindex + - psp-versions-cap + - psp-versions-ena + pre: psp-device-get-locked + post: psp-device-unlock + dump: + reply: *dev-all + - + name: dev-add-ntf + doc: Notification about device appearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-del-ntf + doc: Notification about device disappearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-set + doc: Set the configuration of a PSP device. + attribute-set: dev + do: + request: + attributes: + - id + - psp-versions-ena + reply: + attributes: [] + pre: psp-device-get-locked + post: psp-device-unlock + - + name: dev-change-ntf + doc: Notification about device configuration being changed. + notify: dev-get + mcgrp: mgmt + +mcast-groups: + list: + - + name: mgmt + +... diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f5a840c07cf1..1c54d44805fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1906,6 +1906,7 @@ enum netdev_reg_state { * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data + * @psp_dev: PSP crypto device registered for this netdev * * @dev_addr: Hw address (before bcast, * because most packets are unicast) @@ -2310,6 +2311,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MCTP) struct mctp_dev __rcu *mctp_ptr; #endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_dev __rcu *psp_dev; +#endif /* * Cache lines mostly used on receive path (including eth_type_trans()) diff --git a/include/net/psp.h b/include/net/psp.h new file mode 100644 index 000000000000..33bb4d1dc46e --- /dev/null +++ b/include/net/psp.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_ALL_H +#define __NET_PSP_ALL_H + +#include +#include +#include + +/* Do not add any code here. Put it in the sub-headers instead. */ + +#endif /* __NET_PSP_ALL_H */ diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h new file mode 100644 index 000000000000..074f9df9afc3 --- /dev/null +++ b/include/net/psp/functions.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_HELPERS_H +#define __NET_PSP_HELPERS_H + +#include + +/* Driver-facing API */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, + struct psp_dev_caps *psd_caps, void *priv_ptr); +void psp_dev_unregister(struct psp_dev *psd); + +#endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h new file mode 100644 index 000000000000..d242b1ecee7d --- /dev/null +++ b/include/net/psp/types.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_H +#define __NET_PSP_H + +#include +#include + +struct netlink_ext_ack; + +#define PSP_DEFAULT_UDP_PORT 1000 + +struct psphdr { + u8 nexthdr; + u8 hdrlen; + u8 crypt_offset; + u8 verfl; + __be32 spi; + __be64 iv; + __be64 vc[]; /* optional */ +}; + +#define PSP_SPI_KEY_ID GENMASK(30, 0) +#define PSP_SPI_KEY_PHASE BIT(31) + +#define PSPHDR_CRYPT_OFFSET GENMASK(5, 0) + +#define PSPHDR_VERFL_SAMPLE BIT(7) +#define PSPHDR_VERFL_DROP BIT(6) +#define PSPHDR_VERFL_VERSION GENMASK(5, 2) +#define PSPHDR_VERFL_VIRT BIT(1) +#define PSPHDR_VERFL_ONE BIT(0) + +#define PSP_HDRLEN_NOOPT ((sizeof(struct psphdr) - 8) / 8) + +/** + * struct psp_dev_config - PSP device configuration + * @versions: PSP versions enabled on the device + */ +struct psp_dev_config { + u32 versions; +}; + +/** + * struct psp_dev - PSP device struct + * @main_netdev: original netdevice of this PSP device + * @ops: driver callbacks + * @caps: device capabilities + * @drv_priv: driver priv pointer + * @lock: instance lock, protects all fields + * @refcnt: reference count for the instance + * @id: instance id + * @config: current device configuration + * + * @rcu: RCU head for freeing the structure + */ +struct psp_dev { + struct net_device *main_netdev; + + struct psp_dev_ops *ops; + struct psp_dev_caps *caps; + void *drv_priv; + + struct mutex lock; + refcount_t refcnt; + + u32 id; + + struct psp_dev_config config; + + struct rcu_head rcu; +}; + +/** + * struct psp_dev_caps - PSP device capabilities + */ +struct psp_dev_caps { + /** + * @versions: mask of supported PSP versions + * Set this field to 0 to indicate PSP is not supported at all. + */ + u32 versions; +}; + +#define PSP_MAX_KEY 32 + +/** + * struct psp_dev_ops - netdev driver facing PSP callbacks + */ +struct psp_dev_ops { + /** + * @set_config: set configuration of a PSP device + * Driver can inspect @psd->config for the previous configuration. + * Core will update @psd->config with @config on success. + */ + int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, + struct netlink_ext_ack *extack); +}; + +#endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h new file mode 100644 index 000000000000..4a404f085190 --- /dev/null +++ b/include/uapi/linux/psp.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_PSP_H +#define _UAPI_LINUX_PSP_H + +#define PSP_FAMILY_NAME "psp" +#define PSP_FAMILY_VERSION 1 + +enum psp_version { + PSP_VERSION_HDR0_AES_GCM_128, + PSP_VERSION_HDR0_AES_GCM_256, + PSP_VERSION_HDR0_AES_GMAC_128, + PSP_VERSION_HDR0_AES_GMAC_256, +}; + +enum { + PSP_A_DEV_ID = 1, + PSP_A_DEV_IFINDEX, + PSP_A_DEV_PSP_VERSIONS_CAP, + PSP_A_DEV_PSP_VERSIONS_ENA, + + __PSP_A_DEV_MAX, + PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) +}; + +enum { + PSP_CMD_DEV_GET = 1, + PSP_CMD_DEV_ADD_NTF, + PSP_CMD_DEV_DEL_NTF, + PSP_CMD_DEV_SET, + PSP_CMD_DEV_CHANGE_NTF, + + __PSP_CMD_MAX, + PSP_CMD_MAX = (__PSP_CMD_MAX - 1) +}; + +#define PSP_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/Kconfig b/net/Kconfig index d5865cf19799..4b563aea4c23 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -82,6 +82,7 @@ config NET_CRC32C menu "Networking options" source "net/packet/Kconfig" +source "net/psp/Kconfig" source "net/unix/Kconfig" source "net/tls/Kconfig" source "net/xfrm/Kconfig" diff --git a/net/Makefile b/net/Makefile index aac960c41db6..90e3d72bf58b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_INET_PSP) += psp/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/psp/Kconfig b/net/psp/Kconfig new file mode 100644 index 000000000000..55f9dd87446b --- /dev/null +++ b/net/psp/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# PSP configuration +# +config INET_PSP + bool "PSP Security Protocol support" + depends on INET + help + Enable kernel support for the PSP protocol. + For more information see: + https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf + + If unsure, say N. diff --git a/net/psp/Makefile b/net/psp/Makefile new file mode 100644 index 000000000000..41b51d06e560 --- /dev/null +++ b/net/psp/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_INET_PSP) += psp.o + +psp-y := psp_main.o psp_nl.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c new file mode 100644 index 000000000000..859712e7c2c1 --- /dev/null +++ b/net/psp/psp-nl-gen.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "psp-nl-gen.h" + +#include + +/* PSP_CMD_DEV_GET - do */ +static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_DEV_SET - do */ +static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), +}; + +/* Ops table for psp */ +static const struct genl_split_ops psp_nl_ops[] = { + { + .cmd = PSP_CMD_DEV_GET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_get_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_get_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_DEV_GET, + .dumpit = psp_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = PSP_CMD_DEV_SET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_set_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_set_nl_policy, + .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group psp_nl_mcgrps[] = { + [PSP_NLGRP_MGMT] = { "mgmt", }, +}; + +struct genl_family psp_nl_family __ro_after_init = { + .name = PSP_FAMILY_NAME, + .version = PSP_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = psp_nl_ops, + .n_split_ops = ARRAY_SIZE(psp_nl_ops), + .mcgrps = psp_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(psp_nl_mcgrps), +}; diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h new file mode 100644 index 000000000000..a099686cab5d --- /dev/null +++ b/net/psp/psp-nl-gen.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_PSP_GEN_H +#define _LINUX_PSP_GEN_H + +#include +#include + +#include + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + PSP_NLGRP_MGMT, +}; + +extern struct genl_family psp_nl_family; + +#endif /* _LINUX_PSP_GEN_H */ diff --git a/net/psp/psp.h b/net/psp/psp.h new file mode 100644 index 000000000000..94d0cc31a61f --- /dev/null +++ b/net/psp/psp.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PSP_PSP_H +#define __PSP_PSP_H + +#include +#include +#include +#include +#include + +extern struct xarray psp_devs; +extern struct mutex psp_devs_lock; + +void psp_dev_destroy(struct psp_dev *psd); +int psp_dev_check_access(struct psp_dev *psd, struct net *net); + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); + +static inline void psp_dev_get(struct psp_dev *psd) +{ + refcount_inc(&psd->refcnt); +} + +static inline void psp_dev_put(struct psp_dev *psd) +{ + if (refcount_dec_and_test(&psd->refcnt)) + psp_dev_destroy(psd); +} + +#endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c new file mode 100644 index 000000000000..e09499b7b14a --- /dev/null +++ b/net/psp/psp_main.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp.h" +#include "psp-nl-gen.h" + +DEFINE_XARRAY_ALLOC1(psp_devs); +struct mutex psp_devs_lock; + +/** + * DOC: PSP locking + * + * psp_devs_lock protects the psp_devs xarray. + * Ordering is take the psp_devs_lock and then the instance lock. + * Each instance is protected by RCU, and has a refcount. + * When driver unregisters the instance gets flushed, but struct sticks around. + */ + +/** + * psp_dev_check_access() - check if user in a given net ns can access PSP dev + * @psd: PSP device structure user is trying to access + * @net: net namespace user is in + * + * Return: 0 if PSP device should be visible in @net, errno otherwise. + */ +int psp_dev_check_access(struct psp_dev *psd, struct net *net) +{ + if (dev_net(psd->main_netdev) == net) + return 0; + return -ENOENT; +} + +/** + * psp_dev_create() - create and register PSP device + * @netdev: main netdevice + * @psd_ops: driver callbacks + * @psd_caps: device capabilities + * @priv_ptr: back-pointer to driver private data + * + * Return: pointer to allocated PSP device, or ERR_PTR. + */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, + struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, + void *priv_ptr) +{ + struct psp_dev *psd; + static u32 last_id; + int err; + + if (WARN_ON(!psd_caps->versions || + !psd_ops->set_config)) + return ERR_PTR(-EINVAL); + + psd = kzalloc(sizeof(*psd), GFP_KERNEL); + if (!psd) + return ERR_PTR(-ENOMEM); + + psd->main_netdev = netdev; + psd->ops = psd_ops; + psd->caps = psd_caps; + psd->drv_priv = priv_ptr; + + mutex_init(&psd->lock); + refcount_set(&psd->refcnt, 1); + + mutex_lock(&psp_devs_lock); + err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b, + &last_id, GFP_KERNEL); + if (err) { + mutex_unlock(&psp_devs_lock); + kfree(psd); + return ERR_PTR(err); + } + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF); + + rcu_assign_pointer(netdev->psp_dev, psd); + + mutex_unlock(&psd->lock); + + return psd; +} +EXPORT_SYMBOL(psp_dev_create); + +void psp_dev_destroy(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + xa_erase(&psp_devs, psd->id); + mutex_unlock(&psp_devs_lock); + + mutex_destroy(&psd->lock); + kfree_rcu(psd, rcu); +} + +/** + * psp_dev_unregister() - unregister PSP device + * @psd: PSP device structure + */ +void psp_dev_unregister(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + mutex_lock(&psd->lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF); + + /* Wait until psp_dev_destroy() to call xa_erase() to prevent a + * different psd from being added to the xarray with this id, while + * there are still references to this psd being held. + */ + xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); + mutex_unlock(&psp_devs_lock); + + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); + + psd->ops = NULL; + psd->drv_priv = NULL; + + mutex_unlock(&psd->lock); + + psp_dev_put(psd); +} +EXPORT_SYMBOL(psp_dev_unregister); + +static int __init psp_init(void) +{ + mutex_init(&psp_devs_lock); + + return genl_register_family(&psp_nl_family); +} + +subsys_initcall(psp_init); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c new file mode 100644 index 000000000000..fda5ce800f82 --- /dev/null +++ b/net/psp/psp_nl.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp-nl-gen.h" +#include "psp.h" + +/* Netlink helpers */ + +static struct sk_buff *psp_nl_reply_new(struct genl_info *info) +{ + struct sk_buff *rsp; + void *hdr; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return NULL; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + nlmsg_free(rsp); + return NULL; + } + + return rsp; +} + +static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info) +{ + /* Note that this *only* works with a single message per skb! */ + nlmsg_end(rsp, (struct nlmsghdr *)rsp->data); + + return genlmsg_reply(rsp, info); +} + +/* Device stuff */ + +static struct psp_dev * +psp_device_get_and_lock(struct net *net, struct nlattr *dev_id) +{ + struct psp_dev *psd; + int err; + + mutex_lock(&psp_devs_lock); + psd = xa_load(&psp_devs, nla_get_u32(dev_id)); + if (!psd) { + mutex_unlock(&psp_devs_lock); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + err = psp_dev_check_access(psd, net); + if (err) { + mutex_unlock(&psd->lock); + return ERR_PTR(err); + } + + return psd; +} + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID)) + return -EINVAL; + + info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info), + info->attrs[PSP_A_DEV_ID]); + return PTR_ERR_OR_ZERO(info->user_ptr[0]); +} + +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + + mutex_unlock(&psd->lock); +} + +static int +psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + + if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev), + PSP_NLGRP_MGMT)) + return; + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + genl_info_init_ntf(&info, &psp_nl_family, cmd); + if (psp_nl_dev_fill(psd, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_MGMT, GFP_KERNEL); +} + +int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_dev_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_dev_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} + +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct psp_dev_config new_config; + struct sk_buff *rsp; + int err; + + memcpy(&new_config, &psd->config, sizeof(new_config)); + + if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) { + new_config.versions = + nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]); + if (new_config.versions & ~psd->caps->versions) { + NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(info->extack, "No settings present"); + return -EINVAL; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + if (memcmp(&new_config, &psd->config, sizeof(new_config))) { + err = psd->ops->set_config(psd, &new_config, info->extack); + if (err) + goto err_free_rsp; + + memcpy(&psd->config, &new_config, sizeof(new_config)); + } + + psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF); + + return psp_nl_reply_send(rsp, info); + +err_free_rsp: + nlmsg_free(rsp); + return err; +} diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 90686e241157..865fd2e8519e 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -31,6 +31,7 @@ CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN_H,ovpn.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) +CFLAGS_psp:=$(call get_hdr_inc,_LINUX_PSP_H,psp.h) CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ $(call get_hdr_inc,__LINUX_IF_ADDR_H,if_addr.h) CFLAGS_rt-link:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ -- cgit v1.2.3 From ed8a507b748336902525aa79e3573552534e8b3e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:30 -0700 Subject: net: modify core data structures for PSP datapath support Add pointers to psp data structures to core networking structs, and an SKB extension to carry the PSP information from the drivers to the socket layer. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-4-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 3 +++ include/net/inet_timewait_sock.h | 3 +++ include/net/psp/functions.h | 6 ++++++ include/net/psp/types.h | 7 +++++++ include/net/sock.h | 4 ++++ net/core/skbuff.c | 4 ++++ net/ipv4/af_inet.c | 2 ++ net/ipv4/tcp_minisocks.c | 2 ++ 8 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 62e7addccdf6..78ecfa7d00d0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4901,6 +4901,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, +#endif +#if IS_ENABLED(CONFIG_INET_PSP) + SKB_EXT_PSP, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 67a313575780..c1295246216c 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -81,6 +81,9 @@ struct inet_timewait_sock { struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; +#endif }; #define tw_tclass tw_tos diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 074f9df9afc3..d0043bd14299 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -5,10 +5,16 @@ #include +struct inet_timewait_sock; + /* Driver-facing API */ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr); void psp_dev_unregister(struct psp_dev *psd); +/* Kernel-facing API */ +static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } + #endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h index d242b1ecee7d..4922fc8d42fd 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -84,6 +84,13 @@ struct psp_dev_caps { #define PSP_MAX_KEY 32 +struct psp_skb_ext { + __be32 spi; + u16 dev_id; + u8 generation; + u8 version; +}; + /** * struct psp_dev_ops - netdev driver facing PSP callbacks */ diff --git a/include/net/sock.h b/include/net/sock.h index 0fd465935334..d1d3d36e39ae 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -249,6 +249,7 @@ struct sk_filter; * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy + * @psp_assoc: PSP association, if socket is PSP-secured * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags @@ -450,6 +451,9 @@ struct sock { #endif #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; +#endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; #endif struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 23b776cd9879..d331e607edfb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -5062,6 +5063,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif +#if IS_ENABLED(CONFIG_INET_PSP) + [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 76e38092cd8a..e298dacb4a06 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #include @@ -158,6 +159,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); + psp_sk_assoc_free(sk); } EXPORT_SYMBOL(inet_sock_destruct); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 327095ef95ef..ddb67015ba28 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -24,6 +24,7 @@ #include #include #include +#include static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -392,6 +393,7 @@ void tcp_twsk_destructor(struct sock *sk) } #endif tcp_ao_destroy_sock(sk, true); + psp_twsk_assoc_free(inet_twsk(sk)); } void tcp_twsk_purge(struct list_head *net_exit_list) -- cgit v1.2.3 From 659a2899a57da59f433182eba571881884d6323e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:31 -0700 Subject: tcp: add datapath logic for PSP with inline key exchange Add validation points and state propagation to support PSP key exchange inline, on TCP connections. The expectation is that application will use some well established mechanism like TLS handshake to establish a secure channel over the connection and if both endpoints are PSP-capable - exchange and install PSP keys. Because the connection can existing in PSP-unsecured and PSP-secured state we need to make sure that there are no race conditions or retransmission leaks. On Tx - mark packets with the skb->decrypted bit when PSP key is at the enqueue time. Drivers should only encrypt packets with this bit set. This prevents retransmissions getting encrypted when original transmission was not. Similarly to TLS, we'll use sk->sk_validate_xmit_skb to make sure PSP skbs can't "escape" via a PSP-unaware device without being encrypted. On Rx - validation is done under socket lock. This moves the validation point later than xfrm, for example. Please see the documentation patch for more details on the flow of securing a connection, but for the purpose of this patch what's important is that we want to enforce the invariant that once connection is secured any skb in the receive queue has been encrypted with PSP. Add GRO and coalescing checks to prevent PSP authenticated data from being combined with cleartext data, or data with non-matching PSP state. On Rx, check skb's with psp_skb_coalesce_diff() at points before psp_sk_rx_policy_check(). After skb's are policy checked and on the socket receive queue, skb_cmp_decrypted() is sufficient for checking for coalescable PSP state. On Tx, tcp_write_collapse_fence() should be called when transitioning a socket into PSP Tx state to prevent data sent as cleartext from being coalesced with PSP encapsulated data. This change only adds the validation points, for ease of review. Subsequent change will add the ability to install keys, and flesh the enforcement logic out Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-5-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 6 ++++ include/net/psp/functions.h | 77 +++++++++++++++++++++++++++++++++++++++++++ net/core/gro.c | 2 ++ net/ipv4/inet_timewait_sock.c | 2 ++ net/ipv4/ip_output.c | 5 ++- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_ipv4.c | 14 +++++++- net/ipv4/tcp_minisocks.c | 18 ++++++++++ net/ipv4/tcp_output.c | 17 ++++++---- net/ipv6/tcp_ipv6.c | 11 +++++++ net/psp/Kconfig | 1 + 11 files changed, 147 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index d8ff24a33459..58d91ccc56e0 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -127,6 +127,8 @@ FN(CANXL_RX_INVALID_FRAME) \ FN(PFMEMALLOC) \ FN(DUALPI2_STEP_DROP) \ + FN(PSP_INPUT) \ + FN(PSP_OUTPUT) \ FNe(MAX) /** @@ -610,6 +612,10 @@ enum skb_drop_reason { * threshold of DualPI2 qdisc. */ SKB_DROP_REASON_DUALPI2_STEP_DROP, + /** @SKB_DROP_REASON_PSP_INPUT: PSP input checks failed */ + SKB_DROP_REASON_PSP_INPUT, + /** @SKB_DROP_REASON_PSP_OUTPUT: PSP output checks failed */ + SKB_DROP_REASON_PSP_OUTPUT, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index d0043bd14299..1ccc5fc238b8 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -3,6 +3,8 @@ #ifndef __NET_PSP_HELPERS_H #define __NET_PSP_HELPERS_H +#include +#include #include struct inet_timewait_sock; @@ -14,7 +16,82 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); /* Kernel-facing API */ +#if IS_ENABLED(CONFIG_INET_PSP) static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void +psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } +static inline void +psp_reply_set_decrypted(struct sk_buff *skb) { } + +static inline void +psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) +{ +} + +static inline unsigned long +__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, + unsigned long diffs) +{ + return diffs; +} + +static inline enum skb_drop_reason +psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +static inline enum skb_drop_reason +psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) +{ + return 0; +} + +static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) +{ + return NULL; +} +#else +static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void +psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } +static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } +static inline void +psp_reply_set_decrypted(struct sk_buff *skb) { } + +static inline void +psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } + +static inline unsigned long +__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, + unsigned long diffs) +{ + return diffs; +} + +static inline enum skb_drop_reason +psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +static inline enum skb_drop_reason +psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) +{ + return 0; +} + +static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) +{ + return NULL; +} +#endif + +static inline unsigned long +psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two) +{ + return __psp_skb_coalesce_diff(one, two, 0); +} #endif /* __NET_PSP_HELPERS_H */ diff --git a/net/core/gro.c b/net/core/gro.c index b350e5b69549..5ba4504cfd28 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include #include @@ -376,6 +377,7 @@ static void gro_list_prepare(const struct list_head *head, diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); diffs |= gro_list_prepare_tc_ext(skb, p, diffs); + diffs |= __psp_skb_coalesce_diff(skb, p, diffs); } NAPI_GRO_CB(p)->same_flow = !diffs; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b5426b8ee92..1f83f333b8ac 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -16,6 +16,7 @@ #include #include #include +#include /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash @@ -219,6 +220,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, refcount_set(&tw->tw_refcnt, 0); __module_get(tw->tw_prot->owner); + psp_twsk_init(tw, sk); } return tw; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2b96651d719b..5ca97ede979c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -84,6 +84,7 @@ #include #include #include +#include static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -1665,8 +1666,10 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - if (orig_sk) + if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); + psp_reply_set_decrypted(nskb); + } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5b5c655ded1d..d6d0d970e014 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -277,6 +277,7 @@ #include #include #include +#include #include #include @@ -705,6 +706,7 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6a63be1f6461..f27f6f865a48 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -1907,6 +1908,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) enum skb_drop_reason reason; struct sock *rsk; + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; @@ -1968,6 +1973,7 @@ csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -2069,7 +2075,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || - memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) + memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || + /* prior to PSP Rx policy check, retain exact PSP metadata */ + psp_skb_coalesce_diff(tail, skb)) goto no_coalesce; __skb_pull(skb, hdrlen); @@ -2437,6 +2445,10 @@ do_time_wait: __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ddb67015ba28..2ec8c6f1cdcc 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -105,9 +105,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); struct tcp_options_received tmp_opt; + enum skb_drop_reason psp_drop; bool paws_reject = false; int ts_recent_stamp; + /* Instead of dropping immediately, wait to see what value is + * returned. We will accept a non psp-encapsulated syn in the + * case where TCP_TW_SYN is returned. + */ + psp_drop = psp_twsk_rx_policy_check(tw, skb); + tmp_opt.saw_tstamp = 0; ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { @@ -125,6 +132,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ + if (psp_drop) + goto out_put; + /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, @@ -195,6 +205,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ + if (psp_drop) + goto out_put; + if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this @@ -248,6 +261,9 @@ kill: return TCP_TW_SYN; } + if (psp_drop) + goto out_put; + if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); @@ -266,6 +282,8 @@ kill: return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } + +out_put: inet_twsk_put(tw); return TCP_TW_SUCCESS; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 388c45859469..223d7feeb19d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -358,13 +359,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags) +static void tcp_init_nondata_skb(struct sk_buff *skb, struct sock *sk, + u32 seq, u16 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); + psp_enqueue_set_decrypted(sk, skb); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -1656,6 +1659,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); @@ -3778,7 +3782,7 @@ void tcp_send_fin(struct sock *sk) skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ - tcp_init_nondata_skb(skb, tp->write_seq, + tcp_init_nondata_skb(skb, sk, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } @@ -3806,7 +3810,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority, /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + tcp_init_nondata_skb(skb, sk, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ @@ -4303,7 +4307,7 @@ int tcp_connect(struct sock *sk) /* SYN eats a sequence byte, write_seq updated by * tcp_connect_queue_skb(). */ - tcp_init_nondata_skb(buff, tp->write_seq, TCPHDR_SYN); + tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); @@ -4428,7 +4432,8 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags) /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags); + tcp_init_nondata_skb(buff, sk, + tcp_acceptable_seq(sk), TCPHDR_ACK | flags); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. @@ -4474,7 +4479,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) * end to send an ack. Don't queue or clone SKB, just * send it. */ - tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); + tcp_init_nondata_skb(skb, sk, tp->snd_una - !urgent, TCPHDR_ACK); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f0a138f4220..4da8eb9183d7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -973,6 +974,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); + psp_reply_set_decrypted(buff); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; @@ -1605,6 +1607,10 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. @@ -1684,6 +1690,7 @@ csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1988,6 +1995,10 @@ do_time_wait: __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; diff --git a/net/psp/Kconfig b/net/psp/Kconfig index 55f9dd87446b..5e3908a40945 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -5,6 +5,7 @@ config INET_PSP bool "PSP Security Protocol support" depends on INET + select SKB_DECRYPTED help Enable kernel support for the PSP protocol. For more information see: -- cgit v1.2.3 From 117f02a49b7719b210d154a0d0e728001bf4af06 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:32 -0700 Subject: psp: add op for rotation of device key Rotating the device key is a key part of the PSP protocol design. Some external daemon needs to do it once a day, or so. Add a netlink op to perform this operation. Add a notification group for informing users that key has been rotated and they should rekey (next rotation will cut them off). Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-6-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 21 +++++++++++++++++++ include/net/psp/types.h | 5 +++++ include/uapi/linux/psp.h | 3 +++ net/psp/psp-nl-gen.c | 15 ++++++++++++++ net/psp/psp-nl-gen.h | 2 ++ net/psp/psp_main.c | 3 ++- net/psp/psp_nl.c | 40 ++++++++++++++++++++++++++++++++++++ 7 files changed, 88 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 706f4baf8764..054cc02b65ad 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -88,9 +88,30 @@ operations: notify: dev-get mcgrp: mgmt + - + name: key-rotate + doc: Rotate the device key. + attribute-set: dev + do: + request: + attributes: + - id + reply: + attributes: + - id + pre: psp-device-get-locked + post: psp-device-unlock + - + name: key-rotate-ntf + doc: Notification about device key getting rotated. + notify: key-rotate + mcgrp: use + mcast-groups: list: - name: mgmt + - + name: use ... diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 4922fc8d42fd..66327fa80c92 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -102,6 +102,11 @@ struct psp_dev_ops { */ int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, struct netlink_ext_ack *extack); + + /** + * @key_rotate: rotate the device key + */ + int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index 4a404f085190..cbfbf3f0f364 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -32,11 +32,14 @@ enum { PSP_CMD_DEV_DEL_NTF, PSP_CMD_DEV_SET, PSP_CMD_DEV_CHANGE_NTF, + PSP_CMD_KEY_ROTATE, + PSP_CMD_KEY_ROTATE_NTF, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) }; #define PSP_MCGRP_MGMT "mgmt" +#define PSP_MCGRP_USE "use" #endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 859712e7c2c1..7f49577ac72f 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -21,6 +21,11 @@ static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), }; +/* PSP_CMD_KEY_ROTATE - do */ +static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -46,10 +51,20 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_KEY_ROTATE, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_key_rotate_doit, + .post_doit = psp_device_unlock, + .policy = psp_key_rotate_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { [PSP_NLGRP_MGMT] = { "mgmt", }, + [PSP_NLGRP_USE] = { "use", }, }; struct genl_family psp_nl_family __ro_after_init = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index a099686cab5d..00a2d4ec59e4 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -20,9 +20,11 @@ psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); enum { PSP_NLGRP_MGMT, + PSP_NLGRP_USE, }; extern struct genl_family psp_nl_family; diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index e09499b7b14a..f60155493afc 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -54,7 +54,8 @@ psp_dev_create(struct net_device *netdev, int err; if (WARN_ON(!psd_caps->versions || - !psd_ops->set_config)) + !psd_ops->set_config || + !psd_ops->key_rotate)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index fda5ce800f82..75f2702c1029 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -221,3 +221,43 @@ err_free_rsp: nlmsg_free(rsp); return err; } + +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct genl_info ntf_info; + struct sk_buff *ntf, *rsp; + int err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + genl_info_init_ntf(&ntf_info, &psp_nl_family, PSP_CMD_KEY_ROTATE_NTF); + ntf = psp_nl_reply_new(&ntf_info); + if (!ntf) { + err = -ENOMEM; + goto err_free_rsp; + } + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(ntf, PSP_A_DEV_ID, psd->id)) { + err = -EMSGSIZE; + goto err_free_ntf; + } + + err = psd->ops->key_rotate(psd, info->extack); + if (err) + goto err_free_ntf; + + nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_USE, GFP_KERNEL); + return psp_nl_reply_send(rsp, info); + +err_free_ntf: + nlmsg_free(ntf); +err_free_rsp: + nlmsg_free(rsp); + return err; +} -- cgit v1.2.3 From 8c511c1df380780b8a81050767dbfe7ca518d3a2 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 16 Sep 2025 17:09:33 -0700 Subject: net: move sk_validate_xmit_skb() to net/core/dev.c Move definition of sk_validate_xmit_skb() from net/core/sock.c to net/core/dev.c. This change is in preparation of the next patch, where sk_validate_xmit_skb() will need to cast sk to a tcp_timewait_sock *, and access member fields. Including linux/tcp.h from linux/sock.h creates a circular dependency, and dev.c is the only current call site of this function. Reviewed-by: Willem de Bruijn Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-7-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 22 ---------------------- net/core/dev.c | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index d1d3d36e39ae..bf92029a88d6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2960,28 +2960,6 @@ sk_requests_wifi_status(struct sock *sk) return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); } -/* Checks if this SKB belongs to an HW offloaded socket - * and whether any SW fallbacks are required based on dev. - * Check decrypted mark in case skb_orphan() cleared socket. - */ -static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, - struct net_device *dev) -{ -#ifdef CONFIG_SOCK_VALIDATE_XMIT - struct sock *sk = skb->sk; - - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { - skb = sk->sk_validate_xmit_skb(sk, dev, skb); - } else if (unlikely(skb_is_decrypted(skb))) { - pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); - kfree_skb(skb); - skb = NULL; - } -#endif - - return skb; -} - /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ diff --git a/net/core/dev.c b/net/core/dev.c index 2522d9d8f0e4..384e59d7e715 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3907,6 +3907,28 @@ sw_checksum: } EXPORT_SYMBOL(skb_csum_hwoffload_help); +/* Checks if this SKB belongs to an HW offloaded socket + * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. + */ +static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, + struct net_device *dev) +{ +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sock *sk = skb->sk; + + if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { + skb = sk->sk_validate_xmit_skb(sk, dev, skb); + } else if (unlikely(skb_is_decrypted(skb))) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; + } +#endif + + return skb; +} + static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, struct net_device *dev) { -- cgit v1.2.3 From 0917bb139eed467a6376db903ad7a67981ec1420 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 16 Sep 2025 17:09:34 -0700 Subject: net: tcp: allow tcp_timewait_sock to validate skbs before handing to device Provide a callback to validate skb's originating from tcp timewait socks before passing to the device layer. Full socks have a sk_validate_xmit_skb member for checking that a device is capable of performing offloads required for transmitting an skb. With psp, tcp timewait socks will inherit the crypto state from their corresponding full socks. Any ACKs or RSTs that originate from a tcp timewait sock carrying psp state should be psp encapsulated. Reviewed-by: Willem de Bruijn Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-8-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/inet_timewait_sock.h | 5 +++++ net/core/dev.c | 14 ++++++++++++-- net/ipv4/inet_timewait_sock.c | 3 +++ 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index c1295246216c..3a31c74c9e15 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -84,6 +84,11 @@ struct inet_timewait_sock { #if IS_ENABLED(CONFIG_INET_PSP) struct psp_assoc __rcu *psp_assoc; #endif +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff* (*tw_validate_xmit_skb)(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); +#endif }; #define tw_tclass tw_tos diff --git a/net/core/dev.c b/net/core/dev.c index 384e59d7e715..5e22d062bac5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3915,10 +3915,20 @@ static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { #ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); struct sock *sk = skb->sk; - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { - skb = sk->sk_validate_xmit_skb(sk, dev, skb); + sk_validate = NULL; + if (sk) { + if (sk_fullsock(sk)) + sk_validate = sk->sk_validate_xmit_skb; + else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT) + sk_validate = inet_twsk(sk)->tw_validate_xmit_skb; + } + + if (sk_validate) { + skb = sk_validate(sk, dev, skb); } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 1f83f333b8ac..2ca2912f61f4 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -212,6 +212,9 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); timer_setup(&tw->tw_timer, tw_timer_handler, 0); +#ifdef CONFIG_SOCK_VALIDATE_XMIT + tw->tw_validate_xmit_skb = NULL; +#endif /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this -- cgit v1.2.3 From 6b46ca260e2290e3453d1355ab5b6d283d73d780 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:35 -0700 Subject: net: psp: add socket security association code Add the ability to install PSP Rx and Tx crypto keys on TCP connections. Netlink ops are provided for both operations. Rx side combines allocating a new Rx key and installing it on the socket. Theoretically these are separate actions, but in practice they will always be used one after the other. We can add distinct "alloc" and "install" ops later. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-9-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 70 +++++++++ include/net/psp/functions.h | 114 +++++++++++++-- include/net/psp/types.h | 57 ++++++++ include/uapi/linux/psp.h | 21 +++ net/psp/Kconfig | 1 + net/psp/Makefile | 2 +- net/psp/psp-nl-gen.c | 39 +++++ net/psp/psp-nl-gen.h | 7 + net/psp/psp.h | 22 +++ net/psp/psp_main.c | 26 +++- net/psp/psp_nl.c | 232 +++++++++++++++++++++++++++++ net/psp/psp_sock.c | 274 +++++++++++++++++++++++++++++++++++ 12 files changed, 854 insertions(+), 11 deletions(-) create mode 100644 net/psp/psp_sock.c (limited to 'include') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 054cc02b65ad..944429e5c9a8 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -38,6 +38,44 @@ attribute-sets: type: u32 enum: version enum-as-flags: true + - + name: assoc + attributes: + - + name: dev-id + doc: PSP device ID. + type: u32 + checks: + min: 1 + - + name: version + doc: | + PSP versions (AEAD and protocol version) used by this association, + dictates the size of the key. + type: u32 + enum: version + - + name: rx-key + type: nest + nested-attributes: keys + - + name: tx-key + type: nest + nested-attributes: keys + - + name: sock-fd + doc: Sockets which should be bound to the association immediately. + type: u32 + - + name: keys + attributes: + - + name: key + type: binary + - + name: spi + doc: Security Parameters Index (SPI) of the association. + type: u32 operations: list: @@ -107,6 +145,38 @@ operations: notify: key-rotate mcgrp: use + - + name: rx-assoc + doc: Allocate a new Rx key + SPI pair, associate it with a socket. + attribute-set: assoc + do: + request: + attributes: + - dev-id + - version + - sock-fd + reply: + attributes: + - dev-id + - rx-key + pre: psp-assoc-device-get-locked + post: psp-device-unlock + - + name: tx-assoc + doc: Add a PSP Tx association. + attribute-set: assoc + do: + request: + attributes: + - dev-id + - version + - tx-key + - sock-fd + reply: + attributes: [] + pre: psp-assoc-device-get-locked + post: psp-device-unlock + mcast-groups: list: - diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 1ccc5fc238b8..0d7141230f47 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -4,7 +4,9 @@ #define __NET_PSP_HELPERS_H #include +#include #include +#include #include struct inet_timewait_sock; @@ -16,41 +18,130 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); /* Kernel-facing API */ +void psp_assoc_put(struct psp_assoc *pas); + +static inline void *psp_assoc_drv_data(struct psp_assoc *pas) +{ + return pas->drv_data; +} + #if IS_ENABLED(CONFIG_INET_PSP) -static inline void psp_sk_assoc_free(struct sock *sk) { } -static inline void -psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } -static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } -static inline void -psp_reply_set_decrypted(struct sk_buff *skb) { } +unsigned int psp_key_size(u32 version); +void psp_sk_assoc_free(struct sock *sk); +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); +void psp_twsk_assoc_free(struct inet_timewait_sock *tw); +void psp_reply_set_decrypted(struct sk_buff *skb); + +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return rcu_dereference_check(sk->psp_assoc, lockdep_sock_is_held(sk)); +} static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { + struct psp_assoc *pas; + + pas = psp_sk_assoc(sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { + struct psp_skb_ext *a, *b; + + a = skb_ext_find(one, SKB_EXT_PSP); + b = skb_ext_find(two, SKB_EXT_PSP); + + diffs |= (!!a) ^ (!!b); + if (!diffs && unlikely(a)) + diffs |= memcmp(a, b, sizeof(*a)); return diffs; } +static inline bool +psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas) +{ + bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + u32 seq = TCP_SKB_CB(skb)->seq; + bool pure_fin; + + pure_fin = fin && end_seq - seq == 1; + + return seq == end_seq || (pure_fin && seq == pas->upgrade_seq); +} + +static inline bool +psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas) +{ + return pse && pas->rx.spi == pse->spi && + pas->generation == pse->generation && + pas->version == pse->version && + pas->dev_id == pse->dev_id; +} + +static inline enum skb_drop_reason +__psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); + + if (!pas) + return pse ? SKB_DROP_REASON_PSP_INPUT : 0; + + if (likely(psp_pse_matches_pas(pse, pas))) { + if (unlikely(!pas->peer_tx)) + pas->peer_tx = 1; + + return 0; + } + + if (!pse) { + if (!pas->tx.spi || + (!pas->peer_tx && psp_is_allowed_nondata(skb, pas))) + return 0; + } + + return SKB_DROP_REASON_PSP_INPUT; +} + static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { - return 0; + return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk)); } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { - return 0; + return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); +} + +static inline struct psp_assoc *psp_sk_get_assoc_rcu(struct sock *sk) +{ + struct inet_timewait_sock *tw; + struct psp_assoc *pas; + int state; + + state = 1 << READ_ONCE(sk->sk_state); + if (!sk_is_inet(sk) || state & TCPF_NEW_SYN_RECV) + return NULL; + + tw = inet_twsk(sk); + pas = state & TCPF_TIME_WAIT ? rcu_dereference(tw->psp_assoc) : + rcu_dereference(sk->psp_assoc); + return pas; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { - return NULL; + if (!skb->decrypted || !skb->sk) + return NULL; + + return psp_sk_get_assoc_rcu(skb->sk); } #else static inline void psp_sk_assoc_free(struct sock *sk) { } @@ -60,6 +151,11 @@ static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void psp_reply_set_decrypted(struct sk_buff *skb) { } +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return NULL; +} + static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 66327fa80c92..b0e32e7165a3 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -51,6 +51,7 @@ struct psp_dev_config { * @refcnt: reference count for the instance * @id: instance id * @config: current device configuration + * @active_assocs: list of registered associations * * @rcu: RCU head for freeing the structure */ @@ -68,6 +69,8 @@ struct psp_dev { struct psp_dev_config config; + struct list_head active_assocs; + struct rcu_head rcu; }; @@ -80,6 +83,12 @@ struct psp_dev_caps { * Set this field to 0 to indicate PSP is not supported at all. */ u32 versions; + + /** + * @assoc_drv_spc: size of driver-specific state in Tx assoc + * Determines the size of struct psp_assoc::drv_spc + */ + u32 assoc_drv_spc; }; #define PSP_MAX_KEY 32 @@ -91,6 +100,32 @@ struct psp_skb_ext { u8 version; }; +struct psp_key_parsed { + __be32 spi; + u8 key[PSP_MAX_KEY]; +}; + +struct psp_assoc { + struct psp_dev *psd; + + u16 dev_id; + u8 generation; + u8 version; + u8 peer_tx; + + u32 upgrade_seq; + + struct psp_key_parsed tx; + struct psp_key_parsed rx; + + refcount_t refcnt; + struct rcu_head rcu; + struct work_struct work; + struct list_head assocs_list; + + u8 drv_data[] __aligned(8); +}; + /** * struct psp_dev_ops - netdev driver facing PSP callbacks */ @@ -107,6 +142,28 @@ struct psp_dev_ops { * @key_rotate: rotate the device key */ int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); + + /** + * @rx_spi_alloc: allocate an Rx SPI+key pair + * Allocate an Rx SPI and resulting derived key. + * This key should remain valid until key rotation. + */ + int (*rx_spi_alloc)(struct psp_dev *psd, u32 version, + struct psp_key_parsed *assoc, + struct netlink_ext_ack *extack); + + /** + * @tx_key_add: add a Tx key to the device + * Install an association in the device. Core will allocate space + * for the driver to use at drv_data. + */ + int (*tx_key_add)(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack); + /** + * @tx_key_del: remove a Tx key from the device + * Remove an association from the device. + */ + void (*tx_key_del)(struct psp_dev *psd, struct psp_assoc *pas); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index cbfbf3f0f364..607c42c39ba5 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -26,6 +26,25 @@ enum { PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) }; +enum { + PSP_A_ASSOC_DEV_ID = 1, + PSP_A_ASSOC_VERSION, + PSP_A_ASSOC_RX_KEY, + PSP_A_ASSOC_TX_KEY, + PSP_A_ASSOC_SOCK_FD, + + __PSP_A_ASSOC_MAX, + PSP_A_ASSOC_MAX = (__PSP_A_ASSOC_MAX - 1) +}; + +enum { + PSP_A_KEYS_KEY = 1, + PSP_A_KEYS_SPI, + + __PSP_A_KEYS_MAX, + PSP_A_KEYS_MAX = (__PSP_A_KEYS_MAX - 1) +}; + enum { PSP_CMD_DEV_GET = 1, PSP_CMD_DEV_ADD_NTF, @@ -34,6 +53,8 @@ enum { PSP_CMD_DEV_CHANGE_NTF, PSP_CMD_KEY_ROTATE, PSP_CMD_KEY_ROTATE_NTF, + PSP_CMD_RX_ASSOC, + PSP_CMD_TX_ASSOC, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) diff --git a/net/psp/Kconfig b/net/psp/Kconfig index 5e3908a40945..a7d24691a7e1 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -6,6 +6,7 @@ config INET_PSP bool "PSP Security Protocol support" depends on INET select SKB_DECRYPTED + select SOCK_VALIDATE_XMIT help Enable kernel support for the PSP protocol. For more information see: diff --git a/net/psp/Makefile b/net/psp/Makefile index 41b51d06e560..eb5ff3c5bfb2 100644 --- a/net/psp/Makefile +++ b/net/psp/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_INET_PSP) += psp.o -psp-y := psp_main.o psp_nl.o psp-nl-gen.o +psp-y := psp_main.o psp_nl.o psp_sock.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 7f49577ac72f..9fdd6f831803 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -10,6 +10,12 @@ #include +/* Common nested types */ +const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1] = { + [PSP_A_KEYS_KEY] = { .type = NLA_BINARY, }, + [PSP_A_KEYS_SPI] = { .type = NLA_U32, }, +}; + /* PSP_CMD_DEV_GET - do */ static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), @@ -26,6 +32,21 @@ static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* PSP_CMD_RX_ASSOC - do */ +static const struct nla_policy psp_rx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + +/* PSP_CMD_TX_ASSOC - do */ +static const struct nla_policy psp_tx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_TX_KEY] = NLA_POLICY_NESTED(psp_keys_nl_policy), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -60,6 +81,24 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_DEV_ID, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_RX_ASSOC, + .pre_doit = psp_assoc_device_get_locked, + .doit = psp_nl_rx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_rx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_TX_ASSOC, + .pre_doit = psp_assoc_device_get_locked, + .doit = psp_nl_tx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_tx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index 00a2d4ec59e4..25268ed11fb5 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -11,8 +11,13 @@ #include +/* Common nested types */ +extern const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1]; + int psp_device_get_locked(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); @@ -21,6 +26,8 @@ int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info); enum { PSP_NLGRP_MGMT, diff --git a/net/psp/psp.h b/net/psp/psp.h index 94d0cc31a61f..defd3e3fd5e7 100644 --- a/net/psp/psp.h +++ b/net/psp/psp.h @@ -4,6 +4,7 @@ #define __PSP_PSP_H #include +#include #include #include #include @@ -17,15 +18,36 @@ int psp_dev_check_access(struct psp_dev *psd, struct net *net); void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); +struct psp_assoc *psp_assoc_create(struct psp_dev *psd); +struct psp_dev *psp_dev_get_for_sock(struct sock *sk); +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack); +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack); + static inline void psp_dev_get(struct psp_dev *psd) { refcount_inc(&psd->refcnt); } +static inline bool psp_dev_tryget(struct psp_dev *psd) +{ + return refcount_inc_not_zero(&psd->refcnt); +} + static inline void psp_dev_put(struct psp_dev *psd) { if (refcount_dec_and_test(&psd->refcnt)) psp_dev_destroy(psd); } +static inline bool psp_dev_is_registered(struct psp_dev *psd) +{ + lockdep_assert_held(&psd->lock); + return !!psd->ops; +} + #endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index f60155493afc..a1ae3c8920c3 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -55,7 +55,10 @@ psp_dev_create(struct net_device *netdev, if (WARN_ON(!psd_caps->versions || !psd_ops->set_config || - !psd_ops->key_rotate)) + !psd_ops->key_rotate || + !psd_ops->rx_spi_alloc || + !psd_ops->tx_key_add || + !psd_ops->tx_key_del)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); @@ -68,6 +71,7 @@ psp_dev_create(struct net_device *netdev, psd->drv_priv = priv_ptr; mutex_init(&psd->lock); + INIT_LIST_HEAD(&psd->active_assocs); refcount_set(&psd->refcnt, 1); mutex_lock(&psp_devs_lock); @@ -107,6 +111,8 @@ void psp_dev_destroy(struct psp_dev *psd) */ void psp_dev_unregister(struct psp_dev *psd) { + struct psp_assoc *pas, *next; + mutex_lock(&psp_devs_lock); mutex_lock(&psd->lock); @@ -119,6 +125,9 @@ void psp_dev_unregister(struct psp_dev *psd) xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); mutex_unlock(&psp_devs_lock); + list_for_each_entry_safe(pas, next, &psd->active_assocs, assocs_list) + psp_dev_tx_key_del(psd, pas); + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); psd->ops = NULL; @@ -130,6 +139,21 @@ void psp_dev_unregister(struct psp_dev *psd) } EXPORT_SYMBOL(psp_dev_unregister); +unsigned int psp_key_size(u32 version) +{ + switch (version) { + case PSP_VERSION_HDR0_AES_GCM_128: + case PSP_VERSION_HDR0_AES_GMAC_128: + return 16; + case PSP_VERSION_HDR0_AES_GCM_256: + case PSP_VERSION_HDR0_AES_GMAC_256: + return 32; + default: + return 0; + } +} +EXPORT_SYMBOL(psp_key_size); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index 75f2702c1029..1b1d08fce637 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -79,9 +79,12 @@ void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct socket *socket = info->user_ptr[1]; struct psp_dev *psd = info->user_ptr[0]; mutex_unlock(&psd->lock); + if (socket) + sockfd_put(socket); } static int @@ -261,3 +264,232 @@ err_free_rsp: nlmsg_free(rsp); return err; } + +/* Key etc. */ + +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket; + struct psp_dev *psd; + struct nlattr *id; + int fd, err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_SOCK_FD)) + return -EINVAL; + + fd = nla_get_u32(info->attrs[PSP_A_ASSOC_SOCK_FD]); + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + if (!sk_is_tcp(socket->sk)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[PSP_A_ASSOC_SOCK_FD], + "Unsupported socket family and type"); + err = -EOPNOTSUPP; + goto err_sock_put; + } + + psd = psp_dev_get_for_sock(socket->sk); + if (psd) { + err = psp_dev_check_access(psd, genl_info_net(info)); + if (err) { + psp_dev_put(psd); + psd = NULL; + } + } + + if (!psd && GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_DEV_ID)) { + err = -EINVAL; + goto err_sock_put; + } + + id = info->attrs[PSP_A_ASSOC_DEV_ID]; + if (psd) { + mutex_lock(&psd->lock); + if (id && psd->id != nla_get_u32(id)) { + mutex_unlock(&psd->lock); + NL_SET_ERR_MSG_ATTR(info->extack, id, + "Device id vs socket mismatch"); + err = -EINVAL; + goto err_psd_put; + } + + psp_dev_put(psd); + } else { + psd = psp_device_get_and_lock(genl_info_net(info), id); + if (IS_ERR(psd)) { + err = PTR_ERR(psd); + goto err_sock_put; + } + } + + info->user_ptr[0] = psd; + info->user_ptr[1] = socket; + + return 0; + +err_psd_put: + psp_dev_put(psd); +err_sock_put: + sockfd_put(socket); + return err; +} + +static int +psp_nl_parse_key(struct genl_info *info, u32 attr, struct psp_key_parsed *key, + unsigned int key_sz) +{ + struct nlattr *nest = info->attrs[attr]; + struct nlattr *tb[PSP_A_KEYS_SPI + 1]; + u32 spi; + int err; + + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + psp_keys_nl_policy, info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_KEY) || + NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_SPI)) + return -EINVAL; + + if (nla_len(tb[PSP_A_KEYS_KEY]) != key_sz) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "incorrect key length"); + return -EINVAL; + } + + spi = nla_get_u32(tb[PSP_A_KEYS_SPI]); + if (!(spi & PSP_SPI_KEY_ID)) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "invalid SPI: lower 31b must be non-zero"); + return -EINVAL; + } + + key->spi = cpu_to_be32(spi); + memcpy(key->key, nla_data(tb[PSP_A_KEYS_KEY]), key_sz); + + return 0; +} + +static int +psp_nl_put_key(struct sk_buff *skb, u32 attr, u32 version, + struct psp_key_parsed *key) +{ + int key_sz = psp_key_size(version); + void *nest; + + nest = nla_nest_start(skb, attr); + + if (nla_put_u32(skb, PSP_A_KEYS_SPI, be32_to_cpu(key->spi)) || + nla_put(skb, PSP_A_KEYS_KEY, key_sz, key->key)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + + return 0; +} + +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct psp_assoc *pas; + struct sk_buff *rsp; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + pas = psp_assoc_create(psd); + if (!pas) { + err = -ENOMEM; + goto err_free_rsp; + } + pas->version = version; + + err = psd->ops->rx_spi_alloc(psd, version, &key, info->extack); + if (err) + goto err_free_pas; + + if (nla_put_u32(rsp, PSP_A_ASSOC_DEV_ID, psd->id) || + psp_nl_put_key(rsp, PSP_A_ASSOC_RX_KEY, version, &key)) { + err = -EMSGSIZE; + goto err_free_pas; + } + + err = psp_sock_assoc_set_rx(socket->sk, pas, &key, info->extack); + if (err) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_SOCK_FD]); + goto err_free_pas; + } + psp_assoc_put(pas); + + return psp_nl_reply_send(rsp, info); + +err_free_pas: + psp_assoc_put(pas); +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct sk_buff *rsp; + unsigned int key_sz; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION) || + GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_TX_KEY)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + key_sz = psp_key_size(version); + if (!key_sz) + return -EINVAL; + + err = psp_nl_parse_key(info, PSP_A_ASSOC_TX_KEY, &key, key_sz); + if (err < 0) + return err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + err = psp_sock_assoc_set_tx(socket->sk, psd, version, &key, + info->extack); + if (err) + goto err_free_msg; + + return psp_nl_reply_send(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c new file mode 100644 index 000000000000..8ebccee94593 --- /dev/null +++ b/net/psp/psp_sock.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include +#include "psp.h" + +struct psp_dev *psp_dev_get_for_sock(struct sock *sk) +{ + struct dst_entry *dst; + struct psp_dev *psd; + + dst = sk_dst_get(sk); + if (!dst) + return NULL; + + rcu_read_lock(); + psd = rcu_dereference(dst->dev->psp_dev); + if (psd && !psp_dev_tryget(psd)) + psd = NULL; + rcu_read_unlock(); + + dst_release(dst); + + return psd; +} + +static struct sk_buff * +psp_validate_xmit(struct sock *sk, struct net_device *dev, struct sk_buff *skb) +{ + struct psp_assoc *pas; + bool good; + + rcu_read_lock(); + pas = psp_skb_get_assoc_rcu(skb); + good = !pas || rcu_access_pointer(dev->psp_dev) == pas->psd; + rcu_read_unlock(); + if (!good) { + kfree_skb_reason(skb, SKB_DROP_REASON_PSP_OUTPUT); + return NULL; + } + + return skb; +} + +struct psp_assoc *psp_assoc_create(struct psp_dev *psd) +{ + struct psp_assoc *pas; + + lockdep_assert_held(&psd->lock); + + pas = kzalloc(struct_size(pas, drv_data, psd->caps->assoc_drv_spc), + GFP_KERNEL_ACCOUNT); + if (!pas) + return NULL; + + pas->psd = psd; + pas->dev_id = psd->id; + psp_dev_get(psd); + refcount_set(&pas->refcnt, 1); + + list_add_tail(&pas->assocs_list, &psd->active_assocs); + + return pas; +} + +static struct psp_assoc *psp_assoc_dummy(struct psp_assoc *pas) +{ + struct psp_dev *psd = pas->psd; + size_t sz; + + lockdep_assert_held(&psd->lock); + + sz = struct_size(pas, drv_data, psd->caps->assoc_drv_spc); + return kmemdup(pas, sz, GFP_KERNEL); +} + +static int psp_dev_tx_key_add(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack) +{ + return psd->ops->tx_key_add(psd, pas, extack); +} + +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas) +{ + if (pas->tx.spi) + psd->ops->tx_key_del(psd, pas); + list_del(&pas->assocs_list); +} + +static void psp_assoc_free(struct work_struct *work) +{ + struct psp_assoc *pas = container_of(work, struct psp_assoc, work); + struct psp_dev *psd = pas->psd; + + mutex_lock(&psd->lock); + if (psd->ops) + psp_dev_tx_key_del(psd, pas); + mutex_unlock(&psd->lock); + psp_dev_put(psd); + kfree(pas); +} + +static void psp_assoc_free_queue(struct rcu_head *head) +{ + struct psp_assoc *pas = container_of(head, struct psp_assoc, rcu); + + INIT_WORK(&pas->work, psp_assoc_free); + schedule_work(&pas->work); +} + +/** + * psp_assoc_put() - release a reference on a PSP association + * @pas: association to release + */ +void psp_assoc_put(struct psp_assoc *pas) +{ + if (pas && refcount_dec_and_test(&pas->refcnt)) + call_rcu(&pas->rcu, psp_assoc_free_queue); +} + +void psp_sk_assoc_free(struct sock *sk) +{ + struct psp_assoc *pas = rcu_dereference_protected(sk->psp_assoc, 1); + + rcu_assign_pointer(sk->psp_assoc, NULL); + psp_assoc_put(pas); +} + +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + int err; + + memcpy(&pas->rx, key, sizeof(*key)); + + lock_sock(sk); + + if (psp_sk_assoc(sk)) { + NL_SET_ERR_MSG(extack, "Socket already has PSP state"); + err = -EBUSY; + goto exit_unlock; + } + + refcount_inc(&pas->refcnt); + rcu_assign_pointer(sk->psp_assoc, pas); + err = 0; + +exit_unlock: + release_sock(sk); + + return err; +} + +static int psp_sock_recv_queue_check(struct sock *sk, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse; + struct sk_buff *skb; + + skb_rbtree_walk(skb, &tcp_sk(sk)->out_of_order_queue) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + + skb_queue_walk(&sk->sk_receive_queue, skb) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + return 0; +} + +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + struct psp_assoc *pas, *dummy; + int err; + + lock_sock(sk); + + pas = psp_sk_assoc(sk); + if (!pas) { + NL_SET_ERR_MSG(extack, "Socket has no Rx key"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->psd != psd) { + NL_SET_ERR_MSG(extack, "Rx key from different device"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->version != version) { + NL_SET_ERR_MSG(extack, + "PSP version mismatch with existing state"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->tx.spi) { + NL_SET_ERR_MSG(extack, "Tx key already set"); + err = -EBUSY; + goto exit_unlock; + } + + err = psp_sock_recv_queue_check(sk, pas); + if (err) { + NL_SET_ERR_MSG(extack, "Socket has incompatible segments already in the recv queue"); + goto exit_unlock; + } + + /* Pass a fake association to drivers to make sure they don't + * try to store pointers to it. For re-keying we'll need to + * re-allocate the assoc structures. + */ + dummy = psp_assoc_dummy(pas); + if (!dummy) { + err = -ENOMEM; + goto exit_unlock; + } + + memcpy(&dummy->tx, key, sizeof(*key)); + err = psp_dev_tx_key_add(psd, dummy, extack); + if (err) + goto exit_free_dummy; + + memcpy(pas->drv_data, dummy->drv_data, psd->caps->assoc_drv_spc); + memcpy(&pas->tx, key, sizeof(*key)); + + WRITE_ONCE(sk->sk_validate_xmit_skb, psp_validate_xmit); + tcp_write_collapse_fence(sk); + pas->upgrade_seq = tcp_sk(sk)->rcv_nxt; + +exit_free_dummy: + kfree(dummy); +exit_unlock: + release_sock(sk); + return err; +} + +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) +{ + struct psp_assoc *pas = psp_sk_assoc(sk); + + if (pas) + refcount_inc(&pas->refcnt); + rcu_assign_pointer(tw->psp_assoc, pas); + tw->tw_validate_xmit_skb = psp_validate_xmit; +} + +void psp_twsk_assoc_free(struct inet_timewait_sock *tw) +{ + struct psp_assoc *pas = rcu_dereference_protected(tw->psp_assoc, 1); + + rcu_assign_pointer(tw->psp_assoc, NULL); + psp_assoc_put(pas); +} + +void psp_reply_set_decrypted(struct sk_buff *skb) +{ + struct psp_assoc *pas; + + rcu_read_lock(); + pas = psp_sk_get_assoc_rcu(skb->sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; + rcu_read_unlock(); +} +EXPORT_IPV6_MOD_GPL(psp_reply_set_decrypted); -- cgit v1.2.3 From e97269257fe437910cddc7c642a636ca3cf9fb1d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:36 -0700 Subject: net: psp: update the TCP MSS to reflect PSP packet overhead PSP eats 40B of header space. Adjust MSS appropriately. We can either modify tcp_mtu_to_mss() / tcp_mss_to_mtu() or reuse icsk_ext_hdr_len. The former option is more TCP specific and has runtime overhead. The latter is a bit of a hack as PSP is not an ext_hdr. If one squints hard enough, UDP encap is just a more practical version of IPv6 exthdr, so go with the latter. Happy to change. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-10-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/functions.h | 14 ++++++++++++++ include/net/psp/types.h | 3 +++ net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/ipv6_sockglue.c | 6 +++++- net/ipv6/tcp_ipv6.c | 6 +++--- net/psp/psp_sock.c | 5 +++++ 6 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 0d7141230f47..183a3c9216b7 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -143,6 +144,14 @@ static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) return psp_sk_get_assoc_rcu(skb->sk); } + +static inline unsigned int psp_sk_overhead(const struct sock *sk) +{ + int psp_encap = sizeof(struct udphdr) + PSP_HDR_SIZE + PSP_TRL_SIZE; + bool has_psp = rcu_access_pointer(sk->psp_assoc); + + return has_psp ? psp_encap : 0; +} #else static inline void psp_sk_assoc_free(struct sock *sk) { } static inline void @@ -182,6 +191,11 @@ static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { return NULL; } + +static inline unsigned int psp_sk_overhead(const struct sock *sk) +{ + return 0; +} #endif static inline unsigned long diff --git a/include/net/psp/types.h b/include/net/psp/types.h index b0e32e7165a3..f93ad0e6c04f 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -93,6 +93,9 @@ struct psp_dev_caps { #define PSP_MAX_KEY 32 +#define PSP_HDR_SIZE 16 /* We don't support optional fields, yet */ +#define PSP_TRL_SIZE 16 /* AES-GCM/GMAC trailer size */ + struct psp_skb_ext { __be32 spi; u16 dev_id; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f27f6f865a48..b1fcf3e4e1ce 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -294,9 +294,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; sk_daddr_set(sk, daddr); - inet_csk(sk)->icsk_ext_hdr_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk); if (inet_opt) - inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e66ec623972e..a61e742794f9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -107,7 +108,10 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; + + icsk->icsk_ext_hdr_len = + psp_sk_overhead(sk) + + opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4da8eb9183d7..43d1109e2180 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -302,10 +302,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); - icsk->icsk_ext_hdr_len = 0; + icsk->icsk_ext_hdr_len = psp_sk_overhead(sk); if (opt) - icsk->icsk_ext_hdr_len = opt->opt_flen + - opt->opt_nflen; + icsk->icsk_ext_hdr_len += opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index 8ebccee94593..10e1fda30aa0 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -180,6 +180,7 @@ int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, u32 version, struct psp_key_parsed *key, struct netlink_ext_ack *extack) { + struct inet_connection_sock *icsk; struct psp_assoc *pas, *dummy; int err; @@ -236,6 +237,10 @@ int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, tcp_write_collapse_fence(sk); pas->upgrade_seq = tcp_sk(sk)->rcv_nxt; + icsk = inet_csk(sk); + icsk->icsk_ext_hdr_len += psp_sk_overhead(sk); + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); + exit_free_dummy: kfree(dummy); exit_unlock: -- cgit v1.2.3 From e78851058b35deb9f2d60ecf698fbf7ae7790d09 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:37 -0700 Subject: psp: track generations of device key There is a (somewhat theoretical in absence of multi-host support) possibility that another entity will rotate the key and we won't know. This may lead to accepting packets with matching SPI but which used different crypto keys than we expected. The PSP Architecture specification mentions that an implementation should track device key generation when device keys are managed by the NIC. Some PSP implementations may opt to include this key generation state in decryption metadata each time a device key is used to decrypt a packet. If that is the case, that key generation counter can also be used when policy checking a decrypted skb against a psp_assoc. This is an optional feature that is not explicitly part of the PSP spec, but can provide additional security in the case where an attacker may have the ability to force key rotations faster than rekeying can occur. Since we're tracking "key generations" more explicitly now, maintain different lists for associations from different generations. This way we can catch stale associations (the user space should listen to rotation notifications and change the keys). Drivers can "opt out" of generation tracking by setting the generation value to 0. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-11-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/types.h | 10 ++++++++++ net/psp/psp.h | 1 + net/psp/psp_main.c | 6 +++++- net/psp/psp_nl.c | 10 ++++++++++ net/psp/psp_sock.c | 16 ++++++++++++++++ 5 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/psp/types.h b/include/net/psp/types.h index f93ad0e6c04f..ec218747ced0 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -50,8 +50,12 @@ struct psp_dev_config { * @lock: instance lock, protects all fields * @refcnt: reference count for the instance * @id: instance id + * @generation: current generation of the device key * @config: current device configuration * @active_assocs: list of registered associations + * @prev_assocs: associations which use old (but still usable) + * device key + * @stale_assocs: associations which use a rotated out key * * @rcu: RCU head for freeing the structure */ @@ -67,13 +71,19 @@ struct psp_dev { u32 id; + u8 generation; + struct psp_dev_config config; struct list_head active_assocs; + struct list_head prev_assocs; + struct list_head stale_assocs; struct rcu_head rcu; }; +#define PSP_GEN_VALID_MASK 0x7f + /** * struct psp_dev_caps - PSP device capabilities */ diff --git a/net/psp/psp.h b/net/psp/psp.h index defd3e3fd5e7..0f34e1a23fdd 100644 --- a/net/psp/psp.h +++ b/net/psp/psp.h @@ -27,6 +27,7 @@ int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, u32 version, struct psp_key_parsed *key, struct netlink_ext_ack *extack); +void psp_assocs_key_rotated(struct psp_dev *psd); static inline void psp_dev_get(struct psp_dev *psd) { diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index a1ae3c8920c3..98ad8c85b58e 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -72,6 +72,8 @@ psp_dev_create(struct net_device *netdev, mutex_init(&psd->lock); INIT_LIST_HEAD(&psd->active_assocs); + INIT_LIST_HEAD(&psd->prev_assocs); + INIT_LIST_HEAD(&psd->stale_assocs); refcount_set(&psd->refcnt, 1); mutex_lock(&psp_devs_lock); @@ -125,7 +127,9 @@ void psp_dev_unregister(struct psp_dev *psd) xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); mutex_unlock(&psp_devs_lock); - list_for_each_entry_safe(pas, next, &psd->active_assocs, assocs_list) + list_splice_init(&psd->active_assocs, &psd->prev_assocs); + list_splice_init(&psd->prev_assocs, &psd->stale_assocs); + list_for_each_entry_safe(pas, next, &psd->stale_assocs, assocs_list) psp_dev_tx_key_del(psd, pas); rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index 1b1d08fce637..8aaca62744c3 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -230,6 +230,7 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) struct psp_dev *psd = info->user_ptr[0]; struct genl_info ntf_info; struct sk_buff *ntf, *rsp; + u8 prev_gen; int err; rsp = psp_nl_reply_new(info); @@ -249,10 +250,19 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) goto err_free_ntf; } + /* suggest the next gen number, driver can override */ + prev_gen = psd->generation; + psd->generation = (prev_gen + 1) & PSP_GEN_VALID_MASK; + err = psd->ops->key_rotate(psd, info->extack); if (err) goto err_free_ntf; + WARN_ON_ONCE((psd->generation && psd->generation == prev_gen) || + psd->generation & ~PSP_GEN_VALID_MASK); + + psp_assocs_key_rotated(psd); + nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, 0, PSP_NLGRP_USE, GFP_KERNEL); diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index 10e1fda30aa0..afa966c6b69d 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -60,6 +60,7 @@ struct psp_assoc *psp_assoc_create(struct psp_dev *psd) pas->psd = psd; pas->dev_id = psd->id; + pas->generation = psd->generation; psp_dev_get(psd); refcount_set(&pas->refcnt, 1); @@ -248,6 +249,21 @@ exit_unlock: return err; } +void psp_assocs_key_rotated(struct psp_dev *psd) +{ + struct psp_assoc *pas, *next; + + /* Mark the stale associations as invalid, they will no longer + * be able to Rx any traffic. + */ + list_for_each_entry_safe(pas, next, &psd->prev_assocs, assocs_list) + pas->generation |= ~PSP_GEN_VALID_MASK; + list_splice_init(&psd->prev_assocs, &psd->stale_assocs); + list_splice_init(&psd->active_assocs, &psd->prev_assocs); + + /* TODO: we should inform the sockets that got shut down */ +} + void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { struct psp_assoc *pas = psp_sk_assoc(sk); -- cgit v1.2.3 From fc724515741a1b86ca0457825fdb784ab038e92c Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Tue, 16 Sep 2025 17:09:40 -0700 Subject: psp: provide encapsulation helper for drivers Create a new function psp_encapsulate(), which takes a TCP packet and PSP encapsulates it according to the "Transport Mode Packet Format" section of the PSP Architecture Specification. psp_encapsulate() does not push a PSP trailer onto the skb. Both IPv6 and IPv4 are supported. Virtualization cookie is not included. Reviewed-by: Willem de Bruijn Signed-off-by: Raed Salem Signed-off-by: Rahul Rameshbabu Signed-off-by: Cosmin Ratiu Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-14-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/functions.h | 2 ++ include/net/psp/types.h | 2 ++ net/psp/psp_main.c | 65 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) (limited to 'include') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 183a3c9216b7..0a539e1b39f4 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -17,6 +17,8 @@ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr); void psp_dev_unregister(struct psp_dev *psd); +bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, __be16 sport); /* Kernel-facing API */ void psp_assoc_put(struct psp_assoc *pas); diff --git a/include/net/psp/types.h b/include/net/psp/types.h index ec218747ced0..d9688e66cf09 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -20,6 +20,8 @@ struct psphdr { __be64 vc[]; /* optional */ }; +#define PSP_ENCAP_HLEN (sizeof(struct udphdr) + sizeof(struct psphdr)) + #define PSP_SPI_KEY_ID GENMASK(30, 0) #define PSP_SPI_KEY_PHASE BIT(31) diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index 98ad8c85b58e..e026880fa1a2 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include #include #include +#include #include "psp.h" #include "psp-nl-gen.h" @@ -158,6 +160,69 @@ unsigned int psp_key_size(u32 version) } EXPORT_SYMBOL(psp_key_size); +static void psp_write_headers(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, unsigned int udp_len, __be16 sport) +{ + struct udphdr *uh = udp_hdr(skb); + struct psphdr *psph = (struct psphdr *)(uh + 1); + + uh->dest = htons(PSP_DEFAULT_UDP_PORT); + uh->source = udp_flow_src_port(net, skb, 0, 0, false); + uh->check = 0; + uh->len = htons(udp_len); + + psph->nexthdr = IPPROTO_TCP; + psph->hdrlen = PSP_HDRLEN_NOOPT; + psph->crypt_offset = 0; + psph->verfl = FIELD_PREP(PSPHDR_VERFL_VERSION, ver) | + FIELD_PREP(PSPHDR_VERFL_ONE, 1); + psph->spi = spi; + memset(&psph->iv, 0, sizeof(psph->iv)); +} + +/* Encapsulate a TCP packet with PSP by adding the UDP+PSP headers and filling + * them in. + */ +bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, __be16 sport) +{ + u32 network_len = skb_network_header_len(skb); + u32 ethr_len = skb_mac_header_len(skb); + u32 bufflen = ethr_len + network_len; + + if (skb_cow_head(skb, PSP_ENCAP_HLEN)) + return false; + + skb_push(skb, PSP_ENCAP_HLEN); + skb->mac_header -= PSP_ENCAP_HLEN; + skb->network_header -= PSP_ENCAP_HLEN; + skb->transport_header -= PSP_ENCAP_HLEN; + memmove(skb->data, skb->data + PSP_ENCAP_HLEN, bufflen); + + if (skb->protocol == htons(ETH_P_IP)) { + ip_hdr(skb)->protocol = IPPROTO_UDP; + be16_add_cpu(&ip_hdr(skb)->tot_len, PSP_ENCAP_HLEN); + ip_hdr(skb)->check = 0; + ip_hdr(skb)->check = + ip_fast_csum((u8 *)ip_hdr(skb), ip_hdr(skb)->ihl); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ipv6_hdr(skb)->nexthdr = IPPROTO_UDP; + be16_add_cpu(&ipv6_hdr(skb)->payload_len, PSP_ENCAP_HLEN); + } else { + return false; + } + + skb_set_inner_ipproto(skb, IPPROTO_TCP); + skb_set_inner_transport_header(skb, skb_transport_offset(skb) + + PSP_ENCAP_HLEN); + skb->encapsulation = 1; + psp_write_headers(net, skb, spi, ver, + skb->len - skb_transport_offset(skb), sport); + + return true; +} +EXPORT_SYMBOL(psp_dev_encapsulate); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); -- cgit v1.2.3 From 0eddb8023cee546eb05658ef3322234de8461f3b Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Tue, 16 Sep 2025 17:09:44 -0700 Subject: psp: provide decapsulation and receive helper for drivers Create psp_dev_rcv(), which drivers can call to psp decapsulate and attach a psp_skb_ext to an skb. psp_dev_rcv() only supports what the PSP architecture specification refers to as "transport mode" packets, where the L3 header is either IPv6 or IPv4. Reviewed-by: Willem de Bruijn Signed-off-by: Raed Salem Signed-off-by: Rahul Rameshbabu Signed-off-by: Cosmin Ratiu Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-18-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/functions.h | 1 + net/psp/psp_main.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) (limited to 'include') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 0a539e1b39f4..91ba06733321 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -19,6 +19,7 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, u8 ver, __be16 sport); +int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv); /* Kernel-facing API */ void psp_assoc_put(struct psp_assoc *pas); diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index e026880fa1a2..b4b756f87382 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -223,6 +223,94 @@ bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, } EXPORT_SYMBOL(psp_dev_encapsulate); +/* Receive handler for PSP packets. + * + * Presently it accepts only already-authenticated packets and does not + * support optional fields, such as virtualization cookies. The caller should + * ensure that skb->data is pointing to the mac header, and that skb->mac_len + * is set. + */ +int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) +{ + int l2_hlen = 0, l3_hlen, encap; + struct psp_skb_ext *pse; + struct psphdr *psph; + struct ethhdr *eth; + struct udphdr *uh; + __be16 proto; + bool is_udp; + + eth = (struct ethhdr *)skb->data; + proto = __vlan_get_protocol(skb, eth->h_proto, &l2_hlen); + if (proto == htons(ETH_P_IP)) + l3_hlen = sizeof(struct iphdr); + else if (proto == htons(ETH_P_IPV6)) + l3_hlen = sizeof(struct ipv6hdr); + else + return -EINVAL; + + if (unlikely(!pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN))) + return -EINVAL; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); + + is_udp = iph->protocol == IPPROTO_UDP; + l3_hlen = iph->ihl * 4; + if (l3_hlen != sizeof(struct iphdr) && + !pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN)) + return -EINVAL; + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); + + is_udp = ipv6h->nexthdr == IPPROTO_UDP; + } + + if (unlikely(!is_udp)) + return -EINVAL; + + uh = (struct udphdr *)(skb->data + l2_hlen + l3_hlen); + if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT))) + return -EINVAL; + + pse = skb_ext_add(skb, SKB_EXT_PSP); + if (!pse) + return -EINVAL; + + psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + + sizeof(struct udphdr)); + pse->spi = psph->spi; + pse->dev_id = dev_id; + pse->generation = generation; + pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl); + + encap = PSP_ENCAP_HLEN; + encap += strip_icv ? PSP_TRL_SIZE : 0; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); + + iph->protocol = psph->nexthdr; + iph->tot_len = htons(ntohs(iph->tot_len) - encap); + iph->check = 0; + iph->check = ip_fast_csum((u8 *)iph, iph->ihl); + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); + + ipv6h->nexthdr = psph->nexthdr; + ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap); + } + + memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen); + skb_pull(skb, PSP_ENCAP_HLEN); + + if (strip_icv) + pskb_trim(skb, skb->len - PSP_TRL_SIZE); + + return 0; +} +EXPORT_SYMBOL(psp_dev_rcv); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); -- cgit v1.2.3 From 6684b91d04b408843bd65e2120f6db2159e4f40b Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 16 Sep 2025 21:08:38 -0700 Subject: bnxt_en: Implement ethtool .get_tunable() for ETHTOOL_PFC_PREVENTION_TOUT Return the current PFC watchdog timeout value if it is supported. Reviewed-by: Andy Gospodarek Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250917040839.1924698-10-michael.chan@broadcom.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 18 ++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 21 ++++++++++++ include/linux/bnxt/hsi.h | 40 +++++++++++++++++++++++ 4 files changed, 81 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c49a4755a94d..d59612d1e176 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -14748,6 +14748,23 @@ static bool bnxt_fw_pre_resv_vnics(struct bnxt *bp) return false; } +static void bnxt_hwrm_pfcwd_qcaps(struct bnxt *bp) +{ + struct hwrm_queue_pfcwd_timeout_qcaps_output *resp; + struct hwrm_queue_pfcwd_timeout_qcaps_input *req; + int rc; + + bp->max_pfcwd_tmo_ms = 0; + rc = hwrm_req_init(bp, req, HWRM_QUEUE_PFCWD_TIMEOUT_QCAPS); + if (rc) + return; + resp = hwrm_req_hold(bp, req); + rc = hwrm_req_send_silent(bp, req); + if (!rc) + bp->max_pfcwd_tmo_ms = le16_to_cpu(resp->max_pfcwd_timeout); + hwrm_req_drop(bp, req); +} + static int bnxt_fw_init_one_p1(struct bnxt *bp) { int rc; @@ -14825,6 +14842,7 @@ static int bnxt_fw_init_one_p2(struct bnxt *bp) if (bnxt_fw_pre_resv_vnics(bp)) bp->fw_cap |= BNXT_FW_CAP_PRE_RESV_VNICS; + bnxt_hwrm_pfcwd_qcaps(bp); bnxt_hwrm_func_qcfg(bp); bnxt_hwrm_vnic_qcaps(bp); bnxt_hwrm_port_led_qcaps(bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index c0f46eaf91c0..06a4c2afdf8a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2426,6 +2426,8 @@ struct bnxt { u8 max_q; u8 num_tc; + u16 max_pfcwd_tmo_ms; + u8 tph_mode; unsigned int current_interval; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 2830a2b17a27..d94a8a2bf0f9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4399,6 +4399,23 @@ static int bnxt_get_eee(struct net_device *dev, struct ethtool_keee *edata) return 0; } +static int bnxt_hwrm_pfcwd_qcfg(struct bnxt *bp, u16 *val) +{ + struct hwrm_queue_pfcwd_timeout_qcfg_output *resp; + struct hwrm_queue_pfcwd_timeout_qcfg_input *req; + int rc; + + rc = hwrm_req_init(bp, req, HWRM_QUEUE_PFCWD_TIMEOUT_QCFG); + if (rc) + return rc; + resp = hwrm_req_hold(bp, req); + rc = hwrm_req_send(bp, req); + if (!rc) + *val = le16_to_cpu(resp->pfcwd_timeout_value); + hwrm_req_drop(bp, req); + return rc; +} + static int bnxt_set_tunable(struct net_device *dev, const struct ethtool_tunable *tuna, const void *data) @@ -4431,6 +4448,10 @@ static int bnxt_get_tunable(struct net_device *dev, case ETHTOOL_RX_COPYBREAK: *(u32 *)data = bp->rx_copybreak; break; + case ETHTOOL_PFC_PREVENTION_TOUT: + if (!bp->max_pfcwd_tmo_ms) + return -EOPNOTSUPP; + return bnxt_hwrm_pfcwd_qcfg(bp, data); default: return -EOPNOTSUPP; } diff --git a/include/linux/bnxt/hsi.h b/include/linux/bnxt/hsi.h index 8c5dac3b3ef3..23e7b1290a92 100644 --- a/include/linux/bnxt/hsi.h +++ b/include/linux/bnxt/hsi.h @@ -6751,6 +6751,46 @@ struct hwrm_queue_dscp2pri_cfg_output { u8 valid; }; +/* hwrm_queue_pfcwd_timeout_qcaps_input (size:128b/16B) */ +struct hwrm_queue_pfcwd_timeout_qcaps_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_queue_pfcwd_timeout_qcaps_output (size:128b/16B) */ +struct hwrm_queue_pfcwd_timeout_qcaps_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 max_pfcwd_timeout; + u8 unused_0[5]; + u8 valid; +}; + +/* hwrm_queue_pfcwd_timeout_qcfg_input (size:128b/16B) */ +struct hwrm_queue_pfcwd_timeout_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_queue_pfcwd_timeout_qcfg_output (size:128b/16B) */ +struct hwrm_queue_pfcwd_timeout_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 pfcwd_timeout_value; + u8 unused_0[5]; + u8 valid; +}; + /* hwrm_vnic_alloc_input (size:192b/24B) */ struct hwrm_vnic_alloc_input { __le16 req_type; -- cgit v1.2.3 From fa18932afb29b51530508f28adcc824a8c00b712 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 16 Sep 2025 21:08:39 -0700 Subject: bnxt_en: Implement ethtool .set_tunable() for ETHTOOL_PFC_PREVENTION_TOUT Support the setting of the tunable if it is supported by firmware. The supported range is 0 to the maximum msec value reported by firmware. PFC_STORM_PREVENTION_AUTO is also supported and 0 means it is disabled. Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250917040839.1924698-11-michael.chan@broadcom.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 24 ++++++++++++++++++++++- include/linux/bnxt/hsi.h | 21 ++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index d94a8a2bf0f9..be32ef8f5c96 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4416,12 +4416,25 @@ static int bnxt_hwrm_pfcwd_qcfg(struct bnxt *bp, u16 *val) return rc; } +static int bnxt_hwrm_pfcwd_cfg(struct bnxt *bp, u16 val) +{ + struct hwrm_queue_pfcwd_timeout_cfg_input *req; + int rc; + + rc = hwrm_req_init(bp, req, HWRM_QUEUE_PFCWD_TIMEOUT_CFG); + if (rc) + return rc; + req->pfcwd_timeout_value = cpu_to_le16(val); + rc = hwrm_req_send(bp, req); + return rc; +} + static int bnxt_set_tunable(struct net_device *dev, const struct ethtool_tunable *tuna, const void *data) { struct bnxt *bp = netdev_priv(dev); - u32 rx_copybreak; + u32 rx_copybreak, val; switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: @@ -4434,6 +4447,15 @@ static int bnxt_set_tunable(struct net_device *dev, bp->rx_copybreak = rx_copybreak; } return 0; + case ETHTOOL_PFC_PREVENTION_TOUT: + if (BNXT_VF(bp) || !bp->max_pfcwd_tmo_ms) + return -EOPNOTSUPP; + + val = *(u16 *)data; + if (val > bp->max_pfcwd_tmo_ms && + val != PFC_STORM_PREVENTION_AUTO) + return -EINVAL; + return bnxt_hwrm_pfcwd_cfg(bp, val); default: return -EOPNOTSUPP; } diff --git a/include/linux/bnxt/hsi.h b/include/linux/bnxt/hsi.h index 23e7b1290a92..47c34990cf23 100644 --- a/include/linux/bnxt/hsi.h +++ b/include/linux/bnxt/hsi.h @@ -6771,6 +6771,27 @@ struct hwrm_queue_pfcwd_timeout_qcaps_output { u8 valid; }; +/* hwrm_queue_pfcwd_timeout_cfg_input (size:192b/24B) */ +struct hwrm_queue_pfcwd_timeout_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 pfcwd_timeout_value; + u8 unused_0[6]; +}; + +/* hwrm_queue_pfcwd_timeout_cfg_output (size:128b/16B) */ +struct hwrm_queue_pfcwd_timeout_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + /* hwrm_queue_pfcwd_timeout_qcfg_input (size:128b/16B) */ struct hwrm_queue_pfcwd_timeout_qcfg_input { __le16 req_type; -- cgit v1.2.3 From f72e2cff13aefe305fc8fc6afe4f43626e4ad88c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Sep 2025 15:48:01 +0200 Subject: compiler_types: Add __assume macro Make the statement attribute "assume" with a new __assume macro available. The assume attribute is used to indicate that a certain condition is assumed to be true. Compilers may or may not use this indication to generate optimized code. If this condition is violated at runtime, the behavior is undefined. Note that the clang documentation states that optimizers may react differently to this attribute, and this may even have a negative performance impact. Therefore this attribute should be used with care. Signed-off-by: Heiko Carstens Reviewed-by: Nathan Chancellor Signed-off-by: Alexander Gordeev --- include/linux/compiler_types.h | 23 +++++++++++++++++++++++ init/Kconfig | 10 ++++++++++ 2 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 16755431fc11..2f3e80bf9f35 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -329,6 +329,29 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* + * The assume attribute is used to indicate that a certain condition is + * assumed to be true. If this condition is violated at runtime, the behavior + * is undefined. Compilers may or may not use this indication to generate + * optimized code. + * + * Note that the clang documentation states that optimizers may react + * differently to this attribute, and this may even have a negative + * performance impact. Therefore this attribute should be used with care. + * + * Optional: only supported since gcc >= 13 + * Optional: only supported since clang >= 19 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html#index-assume-statement-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#id13 + * + */ +#ifdef CONFIG_CC_HAS_ASSUME +# define __assume(expr) __attribute__((__assume__(expr))) +#else +# define __assume(expr) +#endif + /* * Optional: only supported since gcc >= 15 * Optional: only supported since clang >= 18 diff --git a/init/Kconfig b/init/Kconfig index 836320251219..dabec90f18f6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -112,6 +112,16 @@ config TOOLS_SUPPORT_RELR config CC_HAS_ASM_INLINE def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null) +config CC_HAS_ASSUME + bool + # clang needs to be at least 19.1.0 since the meaning of the assume + # attribute changed: + # https://github.com/llvm/llvm-project/commit/c44fa3e8a9a44c2e9a575768a3c185354b9f6c17 + default y if CC_IS_CLANG && CLANG_VERSION >= 190100 + # supported since gcc 13.1.0 + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106654 + default y if CC_IS_GCC && GCC_VERSION >= 130100 + config CC_HAS_NO_PROFILE_FN_ATTR def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror) -- cgit v1.2.3 From 84eaf4359c36b0ba888f571a964138d22ba5914f Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 17 Sep 2025 02:58:11 -0700 Subject: net: ethtool: add get_rx_ring_count callback to optimize RX ring queries Add a new optional get_rx_ring_count callback in ethtool_ops to allow drivers to provide the number of RX rings directly without going through the full get_rxnfc flow classification interface. Create ethtool_get_rx_ring_count() to use .get_rx_ring_count if available, falling back to get_rxnfc() otherwise. It needs to be non-static, given it will be called by other ethtool functions laters, as those calling get_rxfh(). Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250917-gxrings-v4-4-dae520e2e1cb@debian.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 ++ net/ethtool/common.c | 20 ++++++++++++++++++++ net/ethtool/common.h | 2 ++ net/ethtool/ioctl.c | 8 +++----- 4 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index d7d757e72554..c869b7f8bce8 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -968,6 +968,7 @@ struct kernel_ethtool_ts_info { * @reset: Reset (part of) the device, as specified by a bitmask of * flags from &enum ethtool_reset_flags. Returns a negative * error code or zero. + * @get_rx_ring_count: Return the number of RX rings * @get_rxfh_key_size: Get the size of the RX flow hash key. * Returns zero if not supported for this specific device. * @get_rxfh_indir_size: Get the size of the RX flow hash indirection table. @@ -1162,6 +1163,7 @@ struct ethtool_ops { int (*set_rxnfc)(struct net_device *, struct ethtool_rxnfc *); int (*flash_device)(struct net_device *, struct ethtool_flash *); int (*reset)(struct net_device *, u32 *); + u32 (*get_rx_ring_count)(struct net_device *dev); u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 4f58648a27ad..10460ea3717c 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -577,6 +577,26 @@ int __ethtool_get_link(struct net_device *dev) return netif_running(dev) && dev->ethtool_ops->get_link(dev); } +int ethtool_get_rx_ring_count(struct net_device *dev) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc rx_rings = {}; + int ret; + + if (ops->get_rx_ring_count) + return ops->get_rx_ring_count(dev); + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret < 0) + return ret; + + return rx_rings.data; +} + static int ethtool_get_rxnfc_rule_count(struct net_device *dev) { const struct ethtool_ops *ops = dev->ethtool_ops; diff --git a/net/ethtool/common.h b/net/ethtool/common.h index c4d084dde5bf..1609cf4e53eb 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -54,6 +54,8 @@ void ethtool_ringparam_get_cfg(struct net_device *dev, struct kernel_ethtool_ringparam *kparam, struct netlink_ext_ack *extack); +int ethtool_get_rx_ring_count(struct net_device *dev); + int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); int ethtool_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index a0f3de76cea0..8493ee200601 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1212,23 +1212,21 @@ static noinline_for_stack int ethtool_get_rxrings(struct net_device *dev, u32 cmd, void __user *useraddr) { - const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxnfc info; size_t info_size; int ret; - if (!ops->get_rxnfc) - return -EOPNOTSUPP; - info_size = sizeof(info); ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (ret) return ret; - ret = ops->get_rxnfc(dev, &info, NULL); + ret = ethtool_get_rx_ring_count(dev); if (ret < 0) return ret; + info.data = ret; + return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); } -- cgit v1.2.3 From 87ebb628a5acb892eba41ef1d8989beb8f036034 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2025 13:53:37 +0000 Subject: net: clear sk->sk_ino in sk_set_socket(sk, NULL) Andrei Vagin reported that blamed commit broke CRIU. Indeed, while we want to keep sk_uid unchanged when a socket is cloned, we want to clear sk->sk_ino. Otherwise, sock_diag might report multiple sockets sharing the same inode number. Move the clearing part from sock_orphan() to sk_set_socket(sk, NULL), called both from sock_orphan() and sk_clone_lock(). Fixes: 5d6b58c932ec ("net: lockless sock_i_ino()") Closes: https://lore.kernel.org/netdev/aMhX-VnXkYDpKd9V@google.com/ Closes: https://github.com/checkpoint-restore/criu/issues/2744 Reported-by: Andrei Vagin Signed-off-by: Eric Dumazet Acked-by: Andrei Vagin Link: https://patch.msgid.link/20250917135337.1736101-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index fb13322a11fc..2e14283c5be1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2061,6 +2061,9 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) if (sock) { WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } else { + /* Note: sk_uid is unchanged. */ + WRITE_ONCE(sk->sk_ino, 0); } } @@ -2082,8 +2085,6 @@ static inline void sock_orphan(struct sock *sk) sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; - /* Note: sk_uid is unchanged. */ - WRITE_ONCE(sk->sk_ino, 0); write_unlock_bh(&sk->sk_callback_lock); } -- cgit v1.2.3 From df8922afc37aa2111ca79a216653a629146763ad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 18 Sep 2025 13:59:15 -0600 Subject: io_uring/msg_ring: kill alloc_cache for io_kiocb allocations A recent commit: fc582cd26e88 ("io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU") fixed an issue with not deferring freeing of io_kiocb structs that msg_ring allocates to after the current RCU grace period. But this only covers requests that don't end up in the allocation cache. If a request goes into the alloc cache, it can get reused before it is sane to do so. A recent syzbot report would seem to indicate that there's something there, however it may very well just be because of the KASAN poisoning that the alloc_cache handles manually. Rather than attempt to make the alloc_cache sane for that use case, just drop the usage of the alloc_cache for msg_ring request payload data. Fixes: 50cf5f3842af ("io_uring/msg_ring: add an alloc cache for io_kiocb entries") Link: https://lore.kernel.org/io-uring/68cc2687.050a0220.139b6.0005.GAE@google.com/ Reported-by: syzbot+baa2e0f4e02df602583e@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- io_uring/io_uring.c | 4 ---- io_uring/msg_ring.c | 24 ++---------------------- 3 files changed, 2 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 80a178f3d896..12f5ee43850e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -420,9 +420,6 @@ struct io_ring_ctx { struct list_head defer_list; unsigned nr_drained; - struct io_alloc_cache msg_cache; - spinlock_t msg_lock; - #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bcec12256f34..93665cebe9bd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -290,7 +290,6 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); - io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_rsrc_cache_free(ctx); } @@ -337,9 +336,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_cmd), sizeof(struct io_async_cmd)); - spin_lock_init(&ctx->msg_lock); - ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); ret |= io_rsrc_cache_init(ctx); if (ret) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 4c2578f2efcb..5e5b94236d72 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,7 +11,6 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" -#include "alloc_cache.h" #include "msg_ring.h" /* All valid masks for MSG_RING */ @@ -76,13 +75,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); - if (spin_trylock(&ctx->msg_lock)) { - if (io_alloc_cache_put(&ctx->msg_cache, req)) - req = NULL; - spin_unlock(&ctx->msg_lock); - } - if (req) - kfree_rcu(req, rcu_head); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } @@ -104,26 +97,13 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } -static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) -{ - struct io_kiocb *req = NULL; - - if (spin_trylock(&ctx->msg_lock)) { - req = io_alloc_cache_get(&ctx->msg_cache); - spin_unlock(&ctx->msg_lock); - if (req) - return req; - } - return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); -} - static int io_msg_data_remote(struct io_ring_ctx *target_ctx, struct io_msg *msg) { struct io_kiocb *target; u32 flags = 0; - target = io_msg_get_kiocb(target_ctx); + target = kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO) ; if (unlikely(!target)) return -ENOMEM; -- cgit v1.2.3 From a660194dd101e937c319171ad99c3fbe466fd825 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Wed, 17 Sep 2025 12:02:07 -0700 Subject: arm64: Enable permission change on arm64 kernel block mappings This patch paves the path to enable huge mappings in vmalloc space and linear map space by default on arm64. For this we must ensure that we can handle any permission games on the kernel (init_mm) pagetable. Previously, __change_memory_common() used apply_to_page_range() which does not support changing permissions for block mappings. We move away from this by using the pagewalk API, similar to what riscv does right now. It is the responsibility of the caller to ensure that the range over which permissions are being changed falls on leaf mapping boundaries. For systems with BBML2, this will be handled in future patches by dyanmically splitting the mappings when required. Unlike apply_to_page_range(), the pagewalk API currently enforces the init_mm.mmap_lock to be held. To avoid the unnecessary bottleneck of the mmap_lock for our usecase, this patch extends this generic API to be used locklessly, so as to retain the existing behaviour for changing permissions. Apart from this reason, it is noted at [1] that KFENCE can manipulate kernel pgtable entries during softirqs. It does this by calling set_memory_valid() -> __change_memory_common(). This being a non-sleepable context, we cannot take the init_mm mmap lock. Add comments to highlight the conditions under which we can use the lockless variant - no underlying VMA, and the user having exclusive control over the range, thus guaranteeing no concurrent access. We require that the start and end of a given range do not partially overlap block mappings, or cont mappings. Return -EINVAL in case a partial block mapping is detected in any of the PGD/P4D/PUD/PMD levels; add a corresponding comment in update_range_prot() to warn that eliminating such a condition is the responsibility of the caller. Note that, the pte level callback may change permissions for a whole contpte block, and that will be done one pte at a time, as opposed to an atomic operation for the block mappings. This is fine as any access will decode either the old or the new permission until the TLBI. apply_to_page_range() currently performs all pte level callbacks while in lazy mmu mode. Since arm64 can optimize performance by batching barriers when modifying kernel pgtables in lazy mmu mode, we would like to continue to benefit from this optimisation. Unfortunately walk_kernel_page_table_range() does not use lazy mmu mode. However, since the pagewalk framework is not allocating any memory, we can safely bracket the whole operation inside lazy mmu mode ourselves. Therefore, wrap the call to walk_kernel_page_table_range() with the lazy MMU helpers. Link: https://lore.kernel.org/linux-arm-kernel/89d0ad18-4772-4d8f-ae8a-7c48d26a927e@arm.com/ [1] Signed-off-by: Dev Jain Signed-off-by: Yang Shi Reviewed-by: Ryan Roberts Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/mm/pageattr.c | 119 +++++++++++++++++++++++++++++++++++------------ include/linux/pagewalk.h | 3 ++ mm/pagewalk.c | 36 +++++++++----- 3 files changed, 115 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 667aff1efe49..c0648764c403 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,65 @@ struct page_change_data { pgprot_t clear_mask; }; +static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk) +{ + struct page_change_data *masks = walk->private; + + val &= ~(pgprot_val(masks->clear_mask)); + val |= (pgprot_val(masks->set_mask)); + + return val; +} + +static int pageattr_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t val = pudp_get(pud); + + if (pud_sect(val)) { + if (WARN_ON_ONCE((next - addr) != PUD_SIZE)) + return -EINVAL; + val = __pud(set_pageattr_masks(pud_val(val), walk)); + set_pud(pud, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t val = pmdp_get(pmd); + + if (pmd_sect(val)) { + if (WARN_ON_ONCE((next - addr) != PMD_SIZE)) + return -EINVAL; + val = __pmd(set_pageattr_masks(pmd_val(val), walk)); + set_pmd(pmd, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t val = __ptep_get(pte); + + val = __pte(set_pageattr_masks(pte_val(val), walk)); + __set_pte(pte, val); + + return 0; +} + +static const struct mm_walk_ops pageattr_ops = { + .pud_entry = pageattr_pud_entry, + .pmd_entry = pageattr_pmd_entry, + .pte_entry = pageattr_pte_entry, +}; + bool rodata_full __ro_after_init = true; bool can_set_direct_map(void) @@ -37,32 +97,35 @@ bool can_set_direct_map(void) arm64_kfence_can_set_direct_map() || is_realm_world(); } -static int change_page_range(pte_t *ptep, unsigned long addr, void *data) +static int update_range_prot(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) { - struct page_change_data *cdata = data; - pte_t pte = __ptep_get(ptep); + struct page_change_data data; + int ret; - pte = clear_pte_bit(pte, cdata->clear_mask); - pte = set_pte_bit(pte, cdata->set_mask); + data.set_mask = set_mask; + data.clear_mask = clear_mask; - __set_pte(ptep, pte); - return 0; + arch_enter_lazy_mmu_mode(); + + /* + * The caller must ensure that the range we are operating on does not + * partially overlap a block mapping, or a cont mapping. Any such case + * must be eliminated by splitting the mapping. + */ + ret = walk_kernel_page_table_range_lockless(start, start + size, + &pageattr_ops, NULL, &data); + arch_leave_lazy_mmu_mode(); + + return ret; } -/* - * This function assumes that the range is mapped with PAGE_SIZE pages. - */ static int __change_memory_common(unsigned long start, unsigned long size, - pgprot_t set_mask, pgprot_t clear_mask) + pgprot_t set_mask, pgprot_t clear_mask) { - struct page_change_data data; int ret; - data.set_mask = set_mask; - data.clear_mask = clear_mask; - - ret = apply_to_page_range(&init_mm, start, size, change_page_range, - &data); + ret = update_range_prot(start, size, set_mask, clear_mask); /* * If the memory is being made valid without changing any other bits @@ -174,32 +237,26 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) int set_direct_map_invalid_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(0), - .clear_mask = __pgprot(PTE_VALID), - }; + pgprot_t clear_mask = __pgprot(PTE_VALID); + pgprot_t set_mask = __pgprot(0); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } int set_direct_map_default_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(PTE_VALID | PTE_WRITE), - .clear_mask = __pgprot(PTE_RDONLY), - }; + pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE); + pgprot_t clear_mask = __pgprot(PTE_RDONLY); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } static int __set_memory_enc_dec(unsigned long addr, diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 682472c15495..88e18615dd72 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -134,6 +134,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_kernel_page_table_range_lockless(unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 648038247a8d..936689d8bcac 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -606,10 +606,32 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { - struct mm_struct *mm = &init_mm; + /* + * Kernel intermediate page tables are usually not freed, so the mmap + * read lock is sufficient. But there are some exceptions. + * E.g. memory hot-remove. In which case, the mmap lock is insufficient + * to prevent the intermediate kernel pages tables belonging to the + * specified address range from being freed. The caller should take + * other actions to prevent this race. + */ + mmap_assert_locked(&init_mm); + + return walk_kernel_page_table_range_lockless(start, end, ops, pgd, + private); +} + +/* + * Use this function to walk the kernel page tables locklessly. It should be + * guaranteed that the caller has exclusive access over the range they are + * operating on - that there should be no concurrent access, for example, + * changing permissions for vmalloc objects. + */ +int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end, + const struct mm_walk_ops *ops, pgd_t *pgd, void *private) +{ struct mm_walk walk = { .ops = ops, - .mm = mm, + .mm = &init_mm, .pgd = pgd, .private = private, .no_vma = true @@ -620,16 +642,6 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end, if (!check_ops_valid(ops)) return -EINVAL; - /* - * Kernel intermediate page tables are usually not freed, so the mmap - * read lock is sufficient. But there are some exceptions. - * E.g. memory hot-remove. In which case, the mmap lock is insufficient - * to prevent the intermediate kernel pages tables belonging to the - * specified address range from being freed. The caller should take - * other actions to prevent this race. - */ - mmap_assert_locked(mm); - return walk_pgd_range(start, end, &walk); } -- cgit v1.2.3 From 6b8ba0db92cd01450acaf375caf4c126aa913d72 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:08 +0000 Subject: ASoC: soc-dapm: add snd_soc_dapm_to_dev() Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. Some drivers need to get dev from dapm (which will be removed). We need such function. Add it. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87cy86x06z.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 3 ++- sound/soc/soc-dapm.c | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index ed39458b94bf..ccd36a198a13 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -587,7 +587,7 @@ struct snd_soc_dapm_context { unsigned int idle_bias_off:1; /* Use BIAS_OFF instead of STANDBY */ unsigned int suspend_bias_off:1; /* Use BIAS_OFF in suspend if the DAPM is idle */ - struct device *dev; /* from parent - for debug */ + struct device *dev; /* from parent - for debug */ /* REMOVE ME */ struct snd_soc_component *component; /* parent component */ struct snd_soc_card *card; /* parent card */ @@ -660,6 +660,7 @@ void snd_soc_dapm_connect_dai_link_widgets(struct snd_soc_card *card); int snd_soc_dapm_update_dai(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); int snd_soc_dapm_widget_name_cmp(struct snd_soc_dapm_widget *widget, const char *s); +struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm); /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index d74c096cc208..83cdf97edb9d 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -165,6 +165,15 @@ static void pop_dbg(struct device *dev, u32 pop_time, const char *fmt, ...) kfree(buf); } +struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm) +{ + if (dapm->component) + return dapm->component->dev; + + return dapm->card->dev; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_to_dev); + static bool dapm_dirty_widget(struct snd_soc_dapm_widget *w) { return !list_empty(&w->dirty); -- cgit v1.2.3 From c8df096bca84c9eb04b656015c8430d0b87ebbcf Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:13 +0000 Subject: ASoC: soc-dapm: add snd_soc_dapm_to_card() Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. Some drivers need to get card from dapm (which will be removed). We need such function. Add it. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87bjnqx06v.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 1 + sound/soc/soc-dapm.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index ccd36a198a13..dbb71e396feb 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -661,6 +661,7 @@ int snd_soc_dapm_update_dai(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); int snd_soc_dapm_widget_name_cmp(struct snd_soc_dapm_widget *widget, const char *s); struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm); +struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm); /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 83cdf97edb9d..cd8d38579886 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -174,6 +174,12 @@ struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm) } EXPORT_SYMBOL_GPL(snd_soc_dapm_to_dev); +struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm) +{ + return dapm->card; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_to_card); + static bool dapm_dirty_widget(struct snd_soc_dapm_widget *w) { return !list_empty(&w->dirty); -- cgit v1.2.3 From 96e311b561a2d393a786a2aeb50cd5e02d06afb3 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:17 +0000 Subject: ASoC: soc-dapm: use dapm->component instead of container_of() Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. Now, snd_soc_dapm_to_component() (A) will convert dapm to component by container_of() (a). (A) static inline struct snd_soc_component *snd_soc_dapm_to_component( struct snd_soc_dapm_context *dapm) { (a) return container_of(dapm, struct snd_soc_component, dapm); } dapm of component works, but dapm of card will be "unknown" pointer (= not NULL), because (a) is using "container_of()". OTOH, ASoC will call snd_soc_dapm_init() (X) to initialize dapm, and it will be called from snd_soc_bind_card() (p) (for card) or soc_probe_component() (q) (for component) with component pointer. (p) static int snd_soc_bind_card(...) { ... (X) snd_soc_dapm_init(..., NULL); ... ^^^^ } (q) static int soc_probe_component(...) { ... (X) snd_soc_dapm_init(..., component); ... ^^^^^^^^^ } And snd_soc_dapm_init() (X) will fill dapm->component (x) (X) void snd_soc_dapm_init(..., component, ...) { ... (x) dapm->component = component; ... } We can simply use dapm->component in snd_soc_dapm_to_component() (A). In this case, dapm of card (p) will be just NULL. Use dapm->component instead of container_of(). The picky note can be removed by this patch. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87a53ax06q.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 15 --------------- include/sound/soc-dapm.h | 1 + sound/soc/soc-dapm.c | 6 ++++++ 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 48e45cbe82e5..7322d5d4c0bd 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -260,21 +260,6 @@ struct snd_soc_component { #define for_each_component_dais_safe(component, dai, _dai)\ list_for_each_entry_safe(dai, _dai, &(component)->dai_list, list) -/** - * snd_soc_dapm_to_component() - Casts a DAPM context to the component it is - * embedded in - * @dapm: The DAPM context to cast to the component - * - * This function must only be used on DAPM contexts that are known to be part of - * a component (e.g. in a component driver). Otherwise the behavior is - * undefined. - */ -static inline struct snd_soc_component *snd_soc_dapm_to_component( - struct snd_soc_dapm_context *dapm) -{ - return container_of(dapm, struct snd_soc_component, dapm); -} - /** * snd_soc_component_get_dapm() - Returns the DAPM context associated with a * component diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index dbb71e396feb..c6470d391eef 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -662,6 +662,7 @@ int snd_soc_dapm_update_dai(struct snd_pcm_substream *substream, int snd_soc_dapm_widget_name_cmp(struct snd_soc_dapm_widget *widget, const char *s); struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm); struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm); +struct snd_soc_component *snd_soc_dapm_to_component(struct snd_soc_dapm_context *dapm); /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index cd8d38579886..4550bf33add2 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -180,6 +180,12 @@ struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm) } EXPORT_SYMBOL_GPL(snd_soc_dapm_to_card); +struct snd_soc_component *snd_soc_dapm_to_component(struct snd_soc_dapm_context *dapm) +{ + return dapm->component; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_to_component); + static bool dapm_dirty_widget(struct snd_soc_dapm_widget *w) { return !list_empty(&w->dirty); -- cgit v1.2.3 From a1c99b6097afe64ed493c05b522ee4d6f9b0094d Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:21 +0000 Subject: ASoC: soc-component: add snd_soc_component_to_dapm() Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. Current dapm of card/component are using "instance", but it will be "pointer" if snd_soc_dapm_context was removed from header. snd_soc_component_to_dapm() is needed to switch to the new style while maintaining compatibility Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/878qiux06m.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 7322d5d4c0bd..b954f34d6025 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -261,16 +261,19 @@ struct snd_soc_component { list_for_each_entry_safe(dai, _dai, &(component)->dai_list, list) /** - * snd_soc_component_get_dapm() - Returns the DAPM context associated with a + * snd_soc_component_to_dapm() - Returns the DAPM context associated with a * component * @component: The component for which to get the DAPM context */ -static inline struct snd_soc_dapm_context *snd_soc_component_get_dapm( +static inline struct snd_soc_dapm_context *snd_soc_component_to_dapm( struct snd_soc_component *component) { return &component->dapm; } +// FIXME +#define snd_soc_component_get_dapm snd_soc_component_to_dapm + /** * snd_soc_component_cache_sync() - Sync the register cache with the hardware * @component: COMPONENT to sync -- cgit v1.2.3 From e38a80c5c24f3058bd5da6f2910e2b672493f4f2 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:25 +0000 Subject: ASoC: soc-card: add snd_soc_card_to_dapm() Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. Current dapm of card/component are using "instance", but it will be "pointer" if snd_soc_dapm_context was removed from header. snd_soc_card_to_dapm() is needed to switch to the new style while maintaining compatibility Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/877byex06i.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 1fffef311c41..ddc508ff7b9b 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1120,6 +1120,11 @@ static inline int snd_soc_card_is_instantiated(struct snd_soc_card *card) return card && card->instantiated; } +static inline struct snd_soc_dapm_context *snd_soc_card_to_dapm(struct snd_soc_card *card) +{ + return &card->dapm; +} + /* SoC machine DAI configuration, glues a codec and cpu DAI together */ struct snd_soc_pcm_runtime { struct device *dev; -- cgit v1.2.3 From 3bc0a92cb2062fce54ddd97ad68ad6fe358c3ff0 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:29 +0000 Subject: ASoC: soc-dapm: remove suspend_bias_off from snd_soc_dapm_context We can directly use suspend_bias_off via snd_soc_component, no need to keep it on dapm. Remove it. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/875xdyx06e.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 1 - sound/soc/soc-dapm.c | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index c6470d391eef..498f8af79cfa 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -585,7 +585,6 @@ struct snd_soc_dapm_context { /* bit field */ unsigned int idle_bias_off:1; /* Use BIAS_OFF instead of STANDBY */ - unsigned int suspend_bias_off:1; /* Use BIAS_OFF in suspend if the DAPM is idle */ struct device *dev; /* from parent - for debug */ /* REMOVE ME */ struct snd_soc_component *component; /* parent component */ diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 4550bf33add2..b90d0adb7713 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2182,13 +2182,16 @@ end: static bool dapm_idle_bias_off(struct snd_soc_dapm_context *dapm) { + struct snd_soc_component *component = snd_soc_dapm_to_component(dapm); if (dapm->idle_bias_off) return true; switch (snd_power_get_state(dapm->card->snd_card)) { case SNDRV_CTL_POWER_D3hot: case SNDRV_CTL_POWER_D3cold: - return dapm->suspend_bias_off; + if (component) + return component->driver->suspend_bias_off; + fallthrough; default: break; } @@ -4823,7 +4826,6 @@ void snd_soc_dapm_init(struct snd_soc_dapm_context *dapm, if (component) { dapm->dev = component->dev; dapm->idle_bias_off = !component->driver->idle_bias_on; - dapm->suspend_bias_off = component->driver->suspend_bias_off; } else { dapm->dev = card->dev; } -- cgit v1.2.3 From 889dd56f8c03586e5489050e7457a405fae6a420 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:33 +0000 Subject: ASoC: soc-dapm: tidyup idle_bias handling - step1 Current soc-dapm is using "idle_bias_off", and its default settings came from snd_soc_component "idle_bias_on". It is complicated/confusable. Let's handling it as "idle_bias". Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/874itix06a.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 3 +-- sound/soc/codecs/wm8993.c | 2 +- sound/soc/codecs/wm8994.c | 6 +++--- sound/soc/intel/avs/boards/es8336.c | 2 +- sound/soc/intel/avs/boards/rt274.c | 2 +- sound/soc/intel/avs/boards/rt5640.c | 2 +- sound/soc/intel/boards/bytcht_cx2072x.c | 2 +- sound/soc/intel/boards/bytcht_es8316.c | 2 +- sound/soc/intel/boards/bytcr_rt5640.c | 2 +- sound/soc/intel/boards/bytcr_rt5651.c | 2 +- sound/soc/intel/boards/bytcr_wm5102.c | 2 +- sound/soc/intel/boards/sof_es8336.c | 2 +- sound/soc/soc-core.c | 4 ++-- sound/soc/soc-dapm.c | 20 ++++++++++---------- sound/soc/sof/sof-client-probes.c | 2 +- 15 files changed, 27 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 498f8af79cfa..9618a54a5348 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -583,8 +583,7 @@ struct snd_soc_dapm_update { struct snd_soc_dapm_context { enum snd_soc_bias_level bias_level; - /* bit field */ - unsigned int idle_bias_off:1; /* Use BIAS_OFF instead of STANDBY */ + bool idle_bias; /* Use BIAS_OFF instead of STANDBY when false */ struct device *dev; /* from parent - for debug */ /* REMOVE ME */ struct snd_soc_component *component; /* parent component */ diff --git a/sound/soc/codecs/wm8993.c b/sound/soc/codecs/wm8993.c index 9be4f6cadba3..75d923c2c9ca 100644 --- a/sound/soc/codecs/wm8993.c +++ b/sound/soc/codecs/wm8993.c @@ -1536,7 +1536,7 @@ static int wm8993_probe(struct snd_soc_component *component) * VMID as an output and can disable it. */ if (wm8993->pdata.lineout1_diff && wm8993->pdata.lineout2_diff) - dapm->idle_bias_off = 1; + dapm->idle_bias = false; return 0; diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index 240ec1bed234..128c3a59beac 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -4182,8 +4182,8 @@ static int wm8994_component_probe(struct snd_soc_component *component) wm8994->micdet_irq = control->pdata.micdet_irq; - /* By default use idle_bias_off, will override for WM8994 */ - dapm->idle_bias_off = 1; + /* By default use idle_bias false, will override for WM8994 */ + dapm->idle_bias = false; /* Set revision-specific configuration */ switch (control->type) { @@ -4191,7 +4191,7 @@ static int wm8994_component_probe(struct snd_soc_component *component) /* Single ended line outputs should have VMID on. */ if (!control->pdata.lineout1_diff || !control->pdata.lineout2_diff) - dapm->idle_bias_off = 0; + dapm->idle_bias = true; switch (control->revision) { case 2: diff --git a/sound/soc/intel/avs/boards/es8336.c b/sound/soc/intel/avs/boards/es8336.c index 12e4e0aba5fa..eb2b40894e3f 100644 --- a/sound/soc/intel/avs/boards/es8336.c +++ b/sound/soc/intel/avs/boards/es8336.c @@ -132,7 +132,7 @@ static int avs_es8336_codec_init(struct snd_soc_pcm_runtime *runtime) snd_jack_set_key(data->jack.jack, SND_JACK_BTN_0, KEY_PLAYPAUSE); snd_soc_component_set_jack(component, &data->jack, NULL); - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; return 0; } diff --git a/sound/soc/intel/avs/boards/rt274.c b/sound/soc/intel/avs/boards/rt274.c index 67d2c4585cdd..4055ecc60838 100644 --- a/sound/soc/intel/avs/boards/rt274.c +++ b/sound/soc/intel/avs/boards/rt274.c @@ -117,7 +117,7 @@ static int avs_rt274_codec_init(struct snd_soc_pcm_runtime *runtime) return ret; } - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; return 0; } diff --git a/sound/soc/intel/avs/boards/rt5640.c b/sound/soc/intel/avs/boards/rt5640.c index 706b84ffe1ef..97d1fa944188 100644 --- a/sound/soc/intel/avs/boards/rt5640.c +++ b/sound/soc/intel/avs/boards/rt5640.c @@ -67,7 +67,7 @@ static int avs_rt5640_codec_init(struct snd_soc_pcm_runtime *runtime) return ret; snd_soc_component_set_jack(codec_dai->component, jack, NULL); - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; return 0; } diff --git a/sound/soc/intel/boards/bytcht_cx2072x.c b/sound/soc/intel/boards/bytcht_cx2072x.c index 68a3d345dc25..27b63a853a48 100644 --- a/sound/soc/intel/boards/bytcht_cx2072x.c +++ b/sound/soc/intel/boards/bytcht_cx2072x.c @@ -77,7 +77,7 @@ static int byt_cht_cx2072x_init(struct snd_soc_pcm_runtime *rtd) byt_cht_cx2072x_acpi_gpios)) dev_warn(rtd->dev, "Unable to add GPIO mapping table\n"); - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; /* set the default PLL rate, the clock is handled by the codec driver */ ret = snd_soc_dai_set_sysclk(snd_soc_rtd_to_codec(rtd, 0), CX2072X_MCLK_EXTERNAL_PLL, diff --git a/sound/soc/intel/boards/bytcht_es8316.c b/sound/soc/intel/boards/bytcht_es8316.c index b384d38654e6..3b5f63112237 100644 --- a/sound/soc/intel/boards/bytcht_es8316.c +++ b/sound/soc/intel/boards/bytcht_es8316.c @@ -179,7 +179,7 @@ static int byt_cht_es8316_init(struct snd_soc_pcm_runtime *runtime) int num_routes; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; switch (BYT_CHT_ES8316_MAP(quirk)) { case BYT_CHT_ES8316_INTMIC_IN1_MAP: diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index bc846558480e..1e9b1903fae8 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -1324,7 +1324,7 @@ static int byt_rt5640_init(struct snd_soc_pcm_runtime *runtime) int num_routes = 0; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; jack_data->use_platform_clock = true; /* Start with RC clk for jack-detect (we disable MCLK below) */ diff --git a/sound/soc/intel/boards/bytcr_rt5651.c b/sound/soc/intel/boards/bytcr_rt5651.c index 604a35d380e9..ca540a66f22c 100644 --- a/sound/soc/intel/boards/bytcr_rt5651.c +++ b/sound/soc/intel/boards/bytcr_rt5651.c @@ -586,7 +586,7 @@ static int byt_rt5651_init(struct snd_soc_pcm_runtime *runtime) int report; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; /* Start with RC clk for jack-detect (we disable MCLK below) */ if (byt_rt5651_quirk & BYT_RT5651_MCLK_EN) diff --git a/sound/soc/intel/boards/bytcr_wm5102.c b/sound/soc/intel/boards/bytcr_wm5102.c index a6dfbcfdf74e..72c0e5941ae8 100644 --- a/sound/soc/intel/boards/bytcr_wm5102.c +++ b/sound/soc/intel/boards/bytcr_wm5102.c @@ -288,7 +288,7 @@ static int byt_wm5102_init(struct snd_soc_pcm_runtime *runtime) const struct snd_soc_dapm_route *custom_map = NULL; int ret, jack_type, num_routes = 0; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; ret = snd_soc_add_card_controls(card, byt_wm5102_controls, ARRAY_SIZE(byt_wm5102_controls)); diff --git a/sound/soc/intel/boards/sof_es8336.c b/sound/soc/intel/boards/sof_es8336.c index 1211a2b8a2a2..10b189ea88db 100644 --- a/sound/soc/intel/boards/sof_es8336.c +++ b/sound/soc/intel/boards/sof_es8336.c @@ -276,7 +276,7 @@ static int sof_es8316_init(struct snd_soc_pcm_runtime *runtime) int num_routes; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; if (quirk & SOC_ES8336_HEADSET_MIC1) { custom_map = sof_es8316_headset_mic1_map; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index cc9125ffe92a..9dd84d73046b 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -717,7 +717,7 @@ int snd_soc_suspend(struct device *dev) * means it's doing something, * otherwise fall through. */ - if (dapm->idle_bias_off) { + if (!dapm->idle_bias) { dev_dbg(component->dev, "ASoC: idle_bias_off CODEC on over suspend\n"); break; @@ -1652,7 +1652,7 @@ static int soc_probe_component(struct snd_soc_card *card, if (ret < 0) goto err_probe; - WARN(dapm->idle_bias_off && + WARN(!dapm->idle_bias && dapm->bias_level != SND_SOC_BIAS_OFF, "codec %s can not start from non-off bias with idle_bias_off==1\n", component->name); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index b90d0adb7713..cbe945f1dc9c 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2180,23 +2180,23 @@ end: dapm_seq_insert(w, down_list, false); } -static bool dapm_idle_bias_off(struct snd_soc_dapm_context *dapm) +static bool dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) { struct snd_soc_component *component = snd_soc_dapm_to_component(dapm); - if (dapm->idle_bias_off) - return true; + if (!dapm->idle_bias) + return false; switch (snd_power_get_state(dapm->card->snd_card)) { case SNDRV_CTL_POWER_D3hot: case SNDRV_CTL_POWER_D3cold: if (component) - return component->driver->suspend_bias_off; + return !component->driver->suspend_bias_off; fallthrough; default: break; } - return false; + return true; } /* @@ -2224,10 +2224,10 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event, trace_snd_soc_dapm_start(card, event); for_each_card_dapms(card, d) { - if (dapm_idle_bias_off(d)) - d->target_bias_level = SND_SOC_BIAS_OFF; - else + if (dapm_get_idle_bias(d)) d->target_bias_level = SND_SOC_BIAS_STANDBY; + else + d->target_bias_level = SND_SOC_BIAS_OFF; } dapm_reset(card); @@ -2291,7 +2291,7 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event, if (d->target_bias_level > bias) bias = d->target_bias_level; for_each_card_dapms(card, d) - if (!dapm_idle_bias_off(d)) + if (dapm_get_idle_bias(d)) d->target_bias_level = bias; trace_snd_soc_dapm_walk_done(card); @@ -4825,7 +4825,7 @@ void snd_soc_dapm_init(struct snd_soc_dapm_context *dapm, if (component) { dapm->dev = component->dev; - dapm->idle_bias_off = !component->driver->idle_bias_on; + dapm->idle_bias = component->driver->idle_bias_on; } else { dapm->dev = card->dev; } diff --git a/sound/soc/sof/sof-client-probes.c b/sound/soc/sof/sof-client-probes.c index 3ca8460774bb..aaf0ae4bf01f 100644 --- a/sound/soc/sof/sof-client-probes.c +++ b/sound/soc/sof/sof-client-probes.c @@ -525,7 +525,7 @@ static int sof_probes_client_probe(struct auxiliary_device *auxdev, card->dai_link = links; /* set idle_bias_off to prevent the core from resuming the card->dev */ - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; snd_soc_card_set_drvdata(card, cdev); -- cgit v1.2.3 From 2e7f0a86123d54a94fa3d309efdfbac02f2999b8 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:41 +0000 Subject: ASoC: soc-dapm: add snd_soc_dapm_get_bias_level() Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. Many drivers are directly using dapm->idle_bias, but it should get it via get_idle_bias() function. Makes it as global function. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/871pomx062.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 2 ++ sound/soc/soc-dapm.c | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 9618a54a5348..e978be4010b8 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -662,6 +662,8 @@ struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm); struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm); struct snd_soc_component *snd_soc_dapm_to_component(struct snd_soc_dapm_context *dapm); +bool snd_soc_dapm_get_idle_bias(struct snd_soc_dapm_context *dapm); + /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); void snd_soc_dapm_free(struct snd_soc_dapm_context *dapm); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index a263696ef8ac..209da43bc023 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2180,7 +2180,7 @@ end: dapm_seq_insert(w, down_list, false); } -static bool dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) +bool snd_soc_dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) { if (dapm->idle_bias) { struct snd_soc_component *component = snd_soc_dapm_to_component(dapm); @@ -2193,6 +2193,7 @@ static bool dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) return dapm->idle_bias; } +EXPORT_SYMBOL_GPL(snd_soc_dapm_get_idle_bias); /* * Scan each dapm widget for complete audio path. @@ -2219,7 +2220,7 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event, trace_snd_soc_dapm_start(card, event); for_each_card_dapms(card, d) { - if (dapm_get_idle_bias(d)) + if (snd_soc_dapm_get_idle_bias(d)) d->target_bias_level = SND_SOC_BIAS_STANDBY; else d->target_bias_level = SND_SOC_BIAS_OFF; @@ -2286,7 +2287,7 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event, if (d->target_bias_level > bias) bias = d->target_bias_level; for_each_card_dapms(card, d) - if (dapm_get_idle_bias(d)) + if (snd_soc_dapm_get_idle_bias(d)) d->target_bias_level = bias; trace_snd_soc_dapm_walk_done(card); -- cgit v1.2.3 From cb3c715d89607f8896c0f20fe528a08e7ebffea9 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 4 Sep 2025 05:21:45 +0000 Subject: ASoC: soc-dapm: add snd_soc_dapm_set_idle_bias() Because struct snd_soc_dapm_context is soc-dapm framework specific, user driver don't need to access its member directly, we would like to hide them. struct snd_soc_dapm_context will be removed from header in the future. Many drivers are directly setting dapm->idle_bias, but it will be impossible soon. adds snd_soc_dapm_set_idle_bias() for them. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87zfbavllj.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 1 + sound/soc/soc-dapm.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index e978be4010b8..75941324886b 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -663,6 +663,7 @@ struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm); struct snd_soc_component *snd_soc_dapm_to_component(struct snd_soc_dapm_context *dapm); bool snd_soc_dapm_get_idle_bias(struct snd_soc_dapm_context *dapm); +void snd_soc_dapm_set_idle_bias(struct snd_soc_dapm_context *dapm, bool on); /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 209da43bc023..51fb09d56c5a 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2195,6 +2195,12 @@ bool snd_soc_dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) } EXPORT_SYMBOL_GPL(snd_soc_dapm_get_idle_bias); +void snd_soc_dapm_set_idle_bias(struct snd_soc_dapm_context *dapm, bool on) +{ + dapm->idle_bias = on; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_set_idle_bias); + /* * Scan each dapm widget for complete audio path. * A complete path is a route that has valid endpoints i.e.:- -- cgit v1.2.3 From 76cffc3eb1bdee0a7e8cca090adfd46a740f1cb0 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:44 +0100 Subject: soundwire: bus: add of_sdw_find_device_by_node helper There has been more than 3 instances of this helper in multiple codec drivers, it does not make sense to keep duplicating this part of code. Lets add a helper of_sdw_find_device_by_node for codec drivers to use it. Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Acked-by: Vinod Koul Link: https://patch.msgid.link/20250909121954.225833-4-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- drivers/soundwire/slave.c | 6 ++++++ include/linux/soundwire/sdw.h | 9 +++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c index d2d99555ec5a..3d4d00188c26 100644 --- a/drivers/soundwire/slave.c +++ b/drivers/soundwire/slave.c @@ -273,4 +273,10 @@ int sdw_of_find_slaves(struct sdw_bus *bus) return 0; } +struct device *of_sdw_find_device_by_node(struct device_node *np) +{ + return bus_find_device_by_of_node(&sdw_bus_type, np); +} +EXPORT_SYMBOL_GPL(of_sdw_find_device_by_node); + MODULE_IMPORT_NS("SND_SOC_SDCA"); diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 0832776262ac..096213956d31 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -19,6 +19,7 @@ struct dentry; struct fwnode_handle; +struct device_node; struct sdw_bus; struct sdw_slave; @@ -1086,6 +1087,8 @@ int sdw_stream_add_slave(struct sdw_slave *slave, int sdw_stream_remove_slave(struct sdw_slave *slave, struct sdw_stream_runtime *stream); +struct device *of_sdw_find_device_by_node(struct device_node *np); + int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base); /* messaging and data APIs */ @@ -1119,6 +1122,12 @@ static inline int sdw_stream_remove_slave(struct sdw_slave *slave, return -EINVAL; } +static inline struct device *of_sdw_find_device_by_node(struct device_node *np) +{ + WARN_ONCE(1, "SoundWire API is disabled"); + return NULL; +} + /* messaging and data APIs */ static inline int sdw_read(struct sdw_slave *slave, u32 addr) { -- cgit v1.2.3 From 2e07017b28e8bbace4a4973d11d0646575d36f94 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:45 +0100 Subject: soundwire: bus: add sdw_slave_get_current_bank helper There has been 2 instances of this helper in codec drivers, it does not make sense to keep duplicating this part of code. Lets add a helper sdw_get_current_bank() for codec drivers to use it. Signed-off-by: Srinivas Kandagatla Acked-by: Vinod Koul Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-5-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- drivers/soundwire/bus.c | 12 ++++++++++++ include/linux/soundwire/sdw.h | 8 ++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 4fd5cac799c5..55c1db816534 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -1360,6 +1360,18 @@ int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base) } EXPORT_SYMBOL(sdw_slave_get_scale_index); +int sdw_slave_get_current_bank(struct sdw_slave *slave) +{ + int tmp; + + tmp = sdw_read(slave, SDW_SCP_CTRL); + if (tmp < 0) + return tmp; + + return FIELD_GET(SDW_SCP_STAT_CURR_BANK, tmp); +} +EXPORT_SYMBOL_GPL(sdw_slave_get_current_bank); + static int sdw_slave_set_frequency(struct sdw_slave *slave) { int scale_index; diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 096213956d31..e6a3476bcef1 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1089,6 +1089,8 @@ int sdw_stream_remove_slave(struct sdw_slave *slave, struct device *of_sdw_find_device_by_node(struct device_node *np); +int sdw_slave_get_current_bank(struct sdw_slave *sdev); + int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base); /* messaging and data APIs */ @@ -1128,6 +1130,12 @@ static inline struct device *of_sdw_find_device_by_node(struct device_node *np) return NULL; } +static inline int sdw_slave_get_current_bank(struct sdw_slave *sdev) +{ + WARN_ONCE(1, "SoundWire API is disabled"); + return -EINVAL; +} + /* messaging and data APIs */ static inline int sdw_read(struct sdw_slave *slave, u32 addr) { -- cgit v1.2.3 From ef1e734dbe257ce8bc42383b9977b5558f061288 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:48 +0300 Subject: power: supply: max77705_charger: use regfields for config registers Using regfields allows to cleanup masks and register offset definition, allowing to access register info by it's functional name. Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 105 ++++++++++++-------------------- include/linux/power/max77705_charger.h | 102 ++++++++++++++++--------------- 2 files changed, 93 insertions(+), 114 deletions(-) (limited to 'include') diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index 883affd18c8d..837e03bafcc6 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -74,8 +74,7 @@ static int max77705_charger_enable(struct max77705_charger_data *chg) { int rv; - rv = regmap_update_bits(chg->regmap, MAX77705_CHG_REG_CNFG_09, - MAX77705_CHG_EN_MASK, MAX77705_CHG_EN_MASK); + rv = regmap_field_write(chg->rfield[MAX77705_CHG_EN], 1); if (rv) dev_err(chg->dev, "unable to enable the charger: %d\n", rv); @@ -87,10 +86,7 @@ static void max77705_charger_disable(void *data) struct max77705_charger_data *chg = data; int rv; - rv = regmap_update_bits(chg->regmap, - MAX77705_CHG_REG_CNFG_09, - MAX77705_CHG_EN_MASK, - MAX77705_CHG_DISABLE); + rv = regmap_field_write(chg->rfield[MAX77705_CHG_EN], MAX77705_CHG_DISABLE); if (rv) dev_err(chg->dev, "unable to disable the charger: %d\n", rv); } @@ -134,10 +130,10 @@ static int max77705_check_battery(struct max77705_charger_data *chg, int *val) static int max77705_get_charge_type(struct max77705_charger_data *chg, int *val) { struct regmap *regmap = chg->regmap; - unsigned int reg_data; + unsigned int reg_data, chg_en; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, ®_data); - if (!MAX77705_CHARGER_CHG_CHARGING(reg_data)) { + regmap_field_read(chg->rfield[MAX77705_CHG_EN], &chg_en); + if (!chg_en) { *val = POWER_SUPPLY_CHARGE_TYPE_NONE; return 0; } @@ -162,10 +158,10 @@ static int max77705_get_charge_type(struct max77705_charger_data *chg, int *val) static int max77705_get_status(struct max77705_charger_data *chg, int *val) { struct regmap *regmap = chg->regmap; - unsigned int reg_data; + unsigned int reg_data, chg_en; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, ®_data); - if (!MAX77705_CHARGER_CHG_CHARGING(reg_data)) { + regmap_field_read(chg->rfield[MAX77705_CHG_EN], &chg_en); + if (!chg_en) { *val = POWER_SUPPLY_CHARGE_TYPE_NONE; return 0; } @@ -295,16 +291,11 @@ static int max77705_get_input_current(struct max77705_charger_data *chg, { unsigned int reg_data; int get_current = 0; - struct regmap *regmap = chg->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, ®_data); - - reg_data &= MAX77705_CHG_CHGIN_LIM_MASK; + regmap_field_read(chg->rfield[MAX77705_CHG_CHGIN_LIM], ®_data); if (reg_data <= 3) get_current = MAX77705_CURRENT_CHGIN_MIN; - else if (reg_data >= MAX77705_CHG_CHGIN_LIM_MASK) - get_current = MAX77705_CURRENT_CHGIN_MAX; else get_current = (reg_data + 1) * MAX77705_CURRENT_CHGIN_STEP; @@ -317,10 +308,8 @@ static int max77705_get_charge_current(struct max77705_charger_data *chg, int *val) { unsigned int reg_data; - struct regmap *regmap = chg->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_02, ®_data); - reg_data &= MAX77705_CHG_CC; + regmap_field_read(chg->rfield[MAX77705_CHG_CC_LIM], ®_data); *val = reg_data <= 0x2 ? MAX77705_CURRENT_CHGIN_MIN : reg_data * MAX77705_CURRENT_CHG_STEP; @@ -332,7 +321,6 @@ static int max77705_set_float_voltage(struct max77705_charger_data *chg, { int float_voltage_mv; unsigned int reg_data = 0; - struct regmap *regmap = chg->regmap; float_voltage_mv = float_voltage / 1000; reg_data = float_voltage_mv <= 4000 ? 0x0 : @@ -340,9 +328,7 @@ static int max77705_set_float_voltage(struct max77705_charger_data *chg, (float_voltage_mv <= 4200) ? (float_voltage_mv - 4000) / 50 : (((float_voltage_mv - 4200) / 10) + 0x04); - return regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_04, - MAX77705_CHG_CV_PRM_MASK, - (reg_data << MAX77705_CHG_CV_PRM_SHIFT)); + return regmap_field_write(chg->rfield[MAX77705_CHG_CV_PRM], reg_data); } static int max77705_get_float_voltage(struct max77705_charger_data *chg, @@ -350,10 +336,8 @@ static int max77705_get_float_voltage(struct max77705_charger_data *chg, { unsigned int reg_data = 0; int voltage_mv; - struct regmap *regmap = chg->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_04, ®_data); - reg_data &= MAX77705_CHG_PRM_MASK; + regmap_field_read(chg->rfield[MAX77705_CHG_CV_PRM], ®_data); voltage_mv = reg_data <= 0x04 ? reg_data * 50 + 4000 : (reg_data - 4) * 10 + 4200; *val = voltage_mv * 1000; @@ -418,7 +402,6 @@ static void max77705_chgin_isr_work(struct work_struct *work) static void max77705_charger_initialize(struct max77705_charger_data *chg) { - u8 reg_data; struct power_supply_battery_info *info; struct regmap *regmap = chg->regmap; @@ -429,45 +412,31 @@ static void max77705_charger_initialize(struct max77705_charger_data *chg) /* unlock charger setting protect */ /* slowest LX slope */ - reg_data = MAX77705_CHGPROT_MASK | MAX77705_SLOWEST_LX_SLOPE; - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_06, reg_data, - reg_data); + regmap_field_write(chg->rfield[MAX77705_CHGPROT], MAX77705_CHGPROT_UNLOCKED); + regmap_field_write(chg->rfield[MAX77705_LX_SLOPE], MAX77705_SLOWEST_LX_SLOPE); /* fast charge timer disable */ /* restart threshold disable */ /* pre-qual charge disable */ - reg_data = (MAX77705_FCHGTIME_DISABLE << MAX77705_FCHGTIME_SHIFT) | - (MAX77705_CHG_RSTRT_DISABLE << MAX77705_CHG_RSTRT_SHIFT) | - (MAX77705_CHG_PQEN_DISABLE << MAX77705_PQEN_SHIFT); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_01, - (MAX77705_FCHGTIME_MASK | - MAX77705_CHG_RSTRT_MASK | - MAX77705_PQEN_MASK), - reg_data); - - /* OTG off(UNO on), boost off */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_00, - MAX77705_OTG_CTRL, 0); + regmap_field_write(chg->rfield[MAX77705_FCHGTIME], MAX77705_FCHGTIME_DISABLE); + regmap_field_write(chg->rfield[MAX77705_CHG_RSTRT], MAX77705_CHG_RSTRT_DISABLE); + regmap_field_write(chg->rfield[MAX77705_CHG_PQEN], MAX77705_CHG_PQEN_DISABLE); + + regmap_field_write(chg->rfield[MAX77705_MODE], + MAX77705_CHG_MASK | MAX77705_BUCK_MASK); /* charge current 450mA(default) */ /* otg current limit 900mA */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_02, - MAX77705_OTG_ILIM_MASK, - MAX77705_OTG_ILIM_900 << MAX77705_OTG_ILIM_SHIFT); + regmap_field_write(chg->rfield[MAX77705_OTG_ILIM], MAX77705_OTG_ILIM_900); /* BAT to SYS OCP 4.80A */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_05, - MAX77705_REG_B2SOVRC_MASK, - MAX77705_B2SOVRC_4_8A << MAX77705_REG_B2SOVRC_SHIFT); + regmap_field_write(chg->rfield[MAX77705_REG_B2SOVRC], MAX77705_B2SOVRC_4_8A); + /* top off current 150mA */ /* top off timer 30min */ - reg_data = (MAX77705_TO_ITH_150MA << MAX77705_TO_ITH_SHIFT) | - (MAX77705_TO_TIME_30M << MAX77705_TO_TIME_SHIFT) | - (MAX77705_SYS_TRACK_DISABLE << MAX77705_SYS_TRACK_DIS_SHIFT); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_03, - (MAX77705_TO_ITH_MASK | - MAX77705_TO_TIME_MASK | - MAX77705_SYS_TRACK_DIS_MASK), reg_data); + regmap_field_write(chg->rfield[MAX77705_TO], MAX77705_TO_ITH_150MA); + regmap_field_write(chg->rfield[MAX77705_TO_TIME], MAX77705_TO_TIME_30M); + regmap_field_write(chg->rfield[MAX77705_SYS_TRACK], MAX77705_SYS_TRACK_DISABLE); /* cv voltage 4.2V or 4.35V */ /* MINVSYS 3.6V(default) */ @@ -478,25 +447,21 @@ static void max77705_charger_initialize(struct max77705_charger_data *chg) max77705_set_float_voltage(chg, info->voltage_max_design_uv); } - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, - MAX77705_VCHGIN_REG_MASK, MAX77705_VCHGIN_4_5); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, - MAX77705_WCIN_REG_MASK, MAX77705_WCIN_4_5); + regmap_field_write(chg->rfield[MAX77705_VCHGIN], MAX77705_VCHGIN_4_5); + regmap_field_write(chg->rfield[MAX77705_WCIN], MAX77705_WCIN_4_5); /* Watchdog timer */ regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_00, MAX77705_WDTEN_MASK, 0); /* VBYPSET=5.0V */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_11, MAX77705_VBYPSET_MASK, 0); + regmap_field_write(chg->rfield[MAX77705_VBYPSET], 0); /* Switching Frequency : 1.5MHz */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_08, MAX77705_REG_FSW_MASK, - (MAX77705_CHG_FSW_1_5MHz << MAX77705_REG_FSW_SHIFT)); + regmap_field_write(chg->rfield[MAX77705_REG_FSW], MAX77705_CHG_FSW_1_5MHz); /* Auto skip mode */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, MAX77705_REG_DISKIP_MASK, - (MAX77705_AUTO_SKIP << MAX77705_REG_DISKIP_SHIFT)); + regmap_field_write(chg->rfield[MAX77705_REG_DISKIP], MAX77705_AUTO_SKIP); } static int max77705_charger_probe(struct i2c_client *i2c) @@ -520,6 +485,14 @@ static int max77705_charger_probe(struct i2c_client *i2c) if (IS_ERR(chg->regmap)) return PTR_ERR(chg->regmap); + for (int i = 0; i < MAX77705_N_REGMAP_FIELDS; i++) { + chg->rfield[i] = devm_regmap_field_alloc(dev, chg->regmap, + max77705_reg_field[i]); + if (IS_ERR(chg->rfield[i])) + return dev_err_probe(dev, PTR_ERR(chg->rfield[i]), + "cannot allocate regmap field\n"); + } + ret = regmap_update_bits(chg->regmap, MAX77705_CHG_REG_INT_MASK, MAX77705_CHGIN_IM, 0); diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h index fdec9af9c541..a612795577b6 100644 --- a/include/linux/power/max77705_charger.h +++ b/include/linux/power/max77705_charger.h @@ -9,6 +9,8 @@ #ifndef __MAX77705_CHARGER_H #define __MAX77705_CHARGER_H __FILE__ +#include + /* MAX77705_CHG_REG_CHG_INT */ #define MAX77705_BYP_I BIT(0) #define MAX77705_INP_LIMIT_I BIT(1) @@ -63,7 +65,6 @@ #define MAX77705_BUCK_SHIFT 2 #define MAX77705_BOOST_SHIFT 3 #define MAX77705_WDTEN_SHIFT 4 -#define MAX77705_MODE_MASK GENMASK(3, 0) #define MAX77705_CHG_MASK BIT(MAX77705_CHG_SHIFT) #define MAX77705_UNO_MASK BIT(MAX77705_UNO_SHIFT) #define MAX77705_OTG_MASK BIT(MAX77705_OTG_SHIFT) @@ -74,34 +75,19 @@ #define MAX77705_OTG_CTRL (MAX77705_OTG_MASK | MAX77705_BOOST_MASK) /* MAX77705_CHG_REG_CNFG_01 */ -#define MAX77705_FCHGTIME_SHIFT 0 -#define MAX77705_FCHGTIME_MASK GENMASK(2, 0) -#define MAX77705_CHG_RSTRT_SHIFT 4 -#define MAX77705_CHG_RSTRT_MASK GENMASK(5, 4) #define MAX77705_FCHGTIME_DISABLE 0 #define MAX77705_CHG_RSTRT_DISABLE 0x3 -#define MAX77705_PQEN_SHIFT 7 -#define MAX77705_PQEN_MASK BIT(7) #define MAX77705_CHG_PQEN_DISABLE 0 #define MAX77705_CHG_PQEN_ENABLE 1 /* MAX77705_CHG_REG_CNFG_02 */ -#define MAX77705_OTG_ILIM_SHIFT 6 -#define MAX77705_OTG_ILIM_MASK GENMASK(7, 6) #define MAX77705_OTG_ILIM_500 0 #define MAX77705_OTG_ILIM_900 1 #define MAX77705_OTG_ILIM_1200 2 #define MAX77705_OTG_ILIM_1500 3 -#define MAX77705_CHG_CC GENMASK(5, 0) /* MAX77705_CHG_REG_CNFG_03 */ -#define MAX77705_TO_ITH_SHIFT 0 -#define MAX77705_TO_ITH_MASK GENMASK(2, 0) -#define MAX77705_TO_TIME_SHIFT 3 -#define MAX77705_TO_TIME_MASK GENMASK(5, 3) -#define MAX77705_SYS_TRACK_DIS_SHIFT 7 -#define MAX77705_SYS_TRACK_DIS_MASK BIT(7) #define MAX77705_TO_ITH_150MA 0 #define MAX77705_TO_TIME_30M 3 #define MAX77705_SYS_TRACK_ENABLE 0 @@ -110,15 +96,8 @@ /* MAX77705_CHG_REG_CNFG_04 */ #define MAX77705_CHG_MINVSYS_SHIFT 6 #define MAX77705_CHG_MINVSYS_MASK GENMASK(7, 6) -#define MAX77705_CHG_PRM_SHIFT 0 -#define MAX77705_CHG_PRM_MASK GENMASK(5, 0) - -#define MAX77705_CHG_CV_PRM_SHIFT 0 -#define MAX77705_CHG_CV_PRM_MASK GENMASK(5, 0) /* MAX77705_CHG_REG_CNFG_05 */ -#define MAX77705_REG_B2SOVRC_SHIFT 0 -#define MAX77705_REG_B2SOVRC_MASK GENMASK(3, 0) #define MAX77705_B2SOVRC_DISABLE 0 #define MAX77705_B2SOVRC_4_5A 6 #define MAX77705_B2SOVRC_4_8A 8 @@ -128,9 +107,8 @@ #define MAX77705_WDTCLR_SHIFT 0 #define MAX77705_WDTCLR_MASK GENMASK(1, 0) #define MAX77705_WDTCLR 1 -#define MAX77705_CHGPROT_MASK GENMASK(3, 2) -#define MAX77705_CHGPROT_UNLOCKED GENMASK(3, 2) -#define MAX77705_SLOWEST_LX_SLOPE GENMASK(6, 5) +#define MAX77705_CHGPROT_UNLOCKED 3 +#define MAX77705_SLOWEST_LX_SLOPE 3 /* MAX77705_CHG_REG_CNFG_07 */ #define MAX77705_CHG_FMBST 4 @@ -140,36 +118,14 @@ #define MAX77705_REG_FGSRC_MASK BIT(MAX77705_REG_FGSRC_SHIFT) /* MAX77705_CHG_REG_CNFG_08 */ -#define MAX77705_REG_FSW_SHIFT 0 -#define MAX77705_REG_FSW_MASK GENMASK(1, 0) #define MAX77705_CHG_FSW_3MHz 0 #define MAX77705_CHG_FSW_2MHz 1 #define MAX77705_CHG_FSW_1_5MHz 2 /* MAX77705_CHG_REG_CNFG_09 */ -#define MAX77705_CHG_CHGIN_LIM_MASK GENMASK(6, 0) -#define MAX77705_CHG_EN_MASK BIT(7) #define MAX77705_CHG_DISABLE 0 -#define MAX77705_CHARGER_CHG_CHARGING(_reg) \ - (((_reg) & MAX77705_CHG_EN_MASK) > 1) - - -/* MAX77705_CHG_REG_CNFG_10 */ -#define MAX77705_CHG_WCIN_LIM GENMASK(5, 0) - -/* MAX77705_CHG_REG_CNFG_11 */ -#define MAX77705_VBYPSET_SHIFT 0 -#define MAX77705_VBYPSET_MASK GENMASK(6, 0) /* MAX77705_CHG_REG_CNFG_12 */ -#define MAX77705_CHGINSEL_SHIFT 5 -#define MAX77705_CHGINSEL_MASK BIT(MAX77705_CHGINSEL_SHIFT) -#define MAX77705_WCINSEL_SHIFT 6 -#define MAX77705_WCINSEL_MASK BIT(MAX77705_WCINSEL_SHIFT) -#define MAX77705_VCHGIN_REG_MASK GENMASK(4, 3) -#define MAX77705_WCIN_REG_MASK GENMASK(2, 1) -#define MAX77705_REG_DISKIP_SHIFT 0 -#define MAX77705_REG_DISKIP_MASK BIT(MAX77705_REG_DISKIP_SHIFT) /* REG=4.5V, UVLO=4.7V */ #define MAX77705_VCHGIN_4_5 0 /* REG=4.5V, UVLO=4.7V */ @@ -183,9 +139,59 @@ #define MAX77705_CURRENT_CHGIN_MIN 100000 #define MAX77705_CURRENT_CHGIN_MAX 3200000 +enum max77705_field_idx { + MAX77705_CHGPROT, + MAX77705_CHG_EN, + MAX77705_CHG_CC_LIM, + MAX77705_CHG_CHGIN_LIM, + MAX77705_CHG_CV_PRM, + MAX77705_CHG_PQEN, + MAX77705_CHG_RSTRT, + MAX77705_CHG_WCIN, + MAX77705_FCHGTIME, + MAX77705_LX_SLOPE, + MAX77705_MODE, + MAX77705_OTG_ILIM, + MAX77705_REG_B2SOVRC, + MAX77705_REG_DISKIP, + MAX77705_REG_FSW, + MAX77705_SYS_TRACK, + MAX77705_TO, + MAX77705_TO_TIME, + MAX77705_VBYPSET, + MAX77705_VCHGIN, + MAX77705_WCIN, + MAX77705_N_REGMAP_FIELDS, +}; + +static const struct reg_field max77705_reg_field[MAX77705_N_REGMAP_FIELDS] = { + [MAX77705_MODE] = REG_FIELD(MAX77705_CHG_REG_CNFG_00, 0, 3), + [MAX77705_FCHGTIME] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 0, 2), + [MAX77705_CHG_RSTRT] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 4, 5), + [MAX77705_CHG_PQEN] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 7, 7), + [MAX77705_CHG_CC_LIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_02, 0, 5), + [MAX77705_OTG_ILIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_02, 6, 7), + [MAX77705_TO] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 0, 2), + [MAX77705_TO_TIME] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 3, 5), + [MAX77705_SYS_TRACK] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 7, 7), + [MAX77705_CHG_CV_PRM] = REG_FIELD(MAX77705_CHG_REG_CNFG_04, 0, 5), + [MAX77705_REG_B2SOVRC] = REG_FIELD(MAX77705_CHG_REG_CNFG_05, 0, 3), + [MAX77705_CHGPROT] = REG_FIELD(MAX77705_CHG_REG_CNFG_06, 2, 3), + [MAX77705_LX_SLOPE] = REG_FIELD(MAX77705_CHG_REG_CNFG_06, 5, 6), + [MAX77705_REG_FSW] = REG_FIELD(MAX77705_CHG_REG_CNFG_08, 0, 1), + [MAX77705_CHG_CHGIN_LIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_09, 0, 6), + [MAX77705_CHG_EN] = REG_FIELD(MAX77705_CHG_REG_CNFG_09, 7, 7), + [MAX77705_CHG_WCIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_10, 0, 5), + [MAX77705_VBYPSET] = REG_FIELD(MAX77705_CHG_REG_CNFG_11, 0, 6), + [MAX77705_REG_DISKIP] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 0, 0), + [MAX77705_WCIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 1, 2), + [MAX77705_VCHGIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 3, 4), +}; + struct max77705_charger_data { struct device *dev; struct regmap *regmap; + struct regmap_field *rfield[MAX77705_N_REGMAP_FIELDS]; struct power_supply_battery_info *bat_info; struct workqueue_struct *wqueue; struct work_struct chgin_work; -- cgit v1.2.3 From bc7d3a0f92dad811110f5602f58fe756cefce2b8 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:52 +0300 Subject: power: supply: max77705_charger: use REGMAP_IRQ_REG_LINE macro Refactor regmap_irq declarations with REGMAP_IRQ_REG_LINE saves a few lines on definitions. Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 16 ++++++------- include/linux/power/max77705_charger.h | 42 +++++++++++++-------------------- 2 files changed, 24 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index bf065693af45..b1a227bf72e2 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -50,14 +50,14 @@ static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data) } static const struct regmap_irq max77705_charger_irqs[] = { - { .mask = MAX77705_BYP_IM, }, - { .mask = MAX77705_INP_LIMIT_IM, }, - { .mask = MAX77705_BATP_IM, }, - { .mask = MAX77705_BAT_IM, }, - { .mask = MAX77705_CHG_IM, }, - { .mask = MAX77705_WCIN_IM, }, - { .mask = MAX77705_CHGIN_IM, }, - { .mask = MAX77705_AICL_IM, }, + REGMAP_IRQ_REG_LINE(MAX77705_BYP_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_INP_LIMIT_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_BATP_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_BAT_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_CHG_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_WCIN_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_CHGIN_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_AICL_I, BITS_PER_BYTE), }; static struct regmap_irq_chip max77705_charger_irq_chip = { diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h index a612795577b6..6653abfdf747 100644 --- a/include/linux/power/max77705_charger.h +++ b/include/linux/power/max77705_charger.h @@ -12,34 +12,24 @@ #include /* MAX77705_CHG_REG_CHG_INT */ -#define MAX77705_BYP_I BIT(0) -#define MAX77705_INP_LIMIT_I BIT(1) -#define MAX77705_BATP_I BIT(2) -#define MAX77705_BAT_I BIT(3) -#define MAX77705_CHG_I BIT(4) -#define MAX77705_WCIN_I BIT(5) -#define MAX77705_CHGIN_I BIT(6) -#define MAX77705_AICL_I BIT(7) - -/* MAX77705_CHG_REG_CHG_INT_MASK */ -#define MAX77705_BYP_IM BIT(0) -#define MAX77705_INP_LIMIT_IM BIT(1) -#define MAX77705_BATP_IM BIT(2) -#define MAX77705_BAT_IM BIT(3) -#define MAX77705_CHG_IM BIT(4) -#define MAX77705_WCIN_IM BIT(5) -#define MAX77705_CHGIN_IM BIT(6) -#define MAX77705_AICL_IM BIT(7) +#define MAX77705_BYP_I (0) +#define MAX77705_INP_LIMIT_I (1) +#define MAX77705_BATP_I (2) +#define MAX77705_BAT_I (3) +#define MAX77705_CHG_I (4) +#define MAX77705_WCIN_I (5) +#define MAX77705_CHGIN_I (6) +#define MAX77705_AICL_I (7) /* MAX77705_CHG_REG_CHG_INT_OK */ -#define MAX77705_BYP_OK BIT(0) -#define MAX77705_DISQBAT_OK BIT(1) -#define MAX77705_BATP_OK BIT(2) -#define MAX77705_BAT_OK BIT(3) -#define MAX77705_CHG_OK BIT(4) -#define MAX77705_WCIN_OK BIT(5) -#define MAX77705_CHGIN_OK BIT(6) -#define MAX77705_AICL_OK BIT(7) +#define MAX77705_BYP_OK BIT(MAX77705_BYP_I) +#define MAX77705_DISQBAT_OK BIT(MAX77705_INP_LIMIT_I) +#define MAX77705_BATP_OK BIT(MAX77705_BATP_I) +#define MAX77705_BAT_OK BIT(MAX77705_BAT_I) +#define MAX77705_CHG_OK BIT(MAX77705_CHG_I) +#define MAX77705_WCIN_OK BIT(MAX77705_WCIN_I) +#define MAX77705_CHGIN_OK BIT(MAX77705_CHGIN_I) +#define MAX77705_AICL_OK BIT(MAX77705_AICL_I) /* MAX77705_CHG_REG_DETAILS_00 */ #define MAX77705_BATP_DTLS BIT(0) -- cgit v1.2.3 From 603b4416232524dafde8e2cf859788dae786dea1 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:30 +0200 Subject: bpf: Update the bpf_prog_calc_tag to use SHA256 Exclusive maps restrict map access to specific programs using a hash. The current hash used for this is SHA1, which is prone to collisions. This patch uses SHA256, which is more resilient against collisions. This new hash is stored in bpf_prog and used by the verifier to determine if a program can access a given exclusive map. The original 64-bit tags are kept, as they are used by users as a short, possibly colliding program identifier for non-security purposes. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-2-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++++- kernel/bpf/Kconfig | 2 +- kernel/bpf/core.c | 5 ++--- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 41f776071ff5..d75902074bd1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -31,6 +31,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -1717,7 +1718,10 @@ struct bpf_prog { enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ - u8 tag[BPF_TAG_SIZE]; + union { + u8 digest[SHA256_DIGEST_SIZE]; + u8 tag[BPF_TAG_SIZE]; + }; struct bpf_prog_stats __percpu *stats; int __percpu *active; unsigned int (*bpf_func)(const void *ctx, diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 17067dcb4386..eb3de35734f0 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -3,7 +3,7 @@ # BPF interpreter that, for example, classic socket filters depend on. config BPF bool - select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 # Used by archs to tell that they support BPF JIT compiler plus which # flavour. Only one of the two can be selected for a specific arch since diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1cda2589d4b3..9b64674df16b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -296,7 +297,6 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { size_t size = bpf_prog_insn_size(fp); - u8 digest[SHA1_DIGEST_SIZE]; struct bpf_insn *dst; bool was_ld_map; u32 i; @@ -327,8 +327,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) was_ld_map = false; } } - sha1((const u8 *)dst, size, digest); - memcpy(fp->tag, digest, sizeof(fp->tag)); + sha256((u8 *)dst, size, fp->digest); vfree(dst); return 0; } -- cgit v1.2.3 From baefdbdf6812e120c9fba9cfb101d3656f478026 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:31 +0200 Subject: bpf: Implement exclusive map creation Exclusive maps allow maps to only be accessed by program with a program with a matching hash which is specified in the excl_prog_hash attr. For the signing use-case, this allows the trusted loader program to load the map and verify the integrity Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-3-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/syscall.c | 31 +++++++++++++++++++++++++++---- kernel/bpf/verifier.c | 6 ++++++ tools/include/uapi/linux/bpf.h | 6 ++++++ 5 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d75902074bd1..c6a6ee1b2938 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -329,6 +329,7 @@ struct bpf_map { atomic64_t sleepable_refcnt; s64 __percpu *elem_count; u64 cookie; /* write-once */ + char *excl_prog_sha; }; static inline const char *btf_field_type_name(enum btf_field_type type) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 233de8677382..57687b2e1c47 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. */ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. */ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3f178a0f8eb1..c8ef91acfe98 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -860,6 +860,7 @@ static void bpf_map_free(struct bpf_map *map) * the free of values or special fields allocated from bpf memory * allocator. */ + kfree(map->excl_prog_sha); migrate_disable(); map->ops->map_free(map); migrate_enable(); @@ -1338,9 +1339,9 @@ static bool bpf_net_capable(void) return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } -#define BPF_MAP_CREATE_LAST_FIELD map_token_fd +#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int map_create(union bpf_attr *attr, bool kernel) +static int map_create(union bpf_attr *attr, bpfptr_t uattr) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1534,7 +1535,29 @@ static int map_create(union bpf_attr *attr, bool kernel) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token, kernel); + if (attr->excl_prog_hash) { + bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); + + if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { + err = -EINVAL; + goto free_map; + } + + map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!map->excl_prog_sha) { + err = -ENOMEM; + goto free_map; + } + + if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) { + err = -EFAULT; + goto free_map; + } + } else if (attr->excl_prog_hash_size) { + return -EINVAL; + } + + err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; @@ -6008,7 +6031,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr, uattr.is_kernel); + err = map_create(&attr, uattr); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6625570ac23d..aef6b266f08d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20407,6 +20407,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); + if (map->excl_prog_sha && + memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) { + verbose(env, "program's hash doesn't match map's excl_prog_hash\n"); + return -EACCES; + } + if (btf_record_has_field(map->record, BPF_LIST_HEAD) || btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 233de8677382..57687b2e1c47 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. */ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. */ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ -- cgit v1.2.3 From ea2e6467ac36bf3d785defc89e58269b15d182f7 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:35 +0200 Subject: bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD Currently only array maps are supported, but the implementation can be extended for other maps and objects. The hash is memoized only for exclusive and frozen maps as their content is stable until the exclusive program modifies the map. This is required for BPF signing, enabling a trusted loader program to verify a map's integrity. The loader retrieves the map's runtime hash from the kernel and compares it against an expected hash computed at build time. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-7-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 13 ++++++++++++ kernel/bpf/syscall.c | 23 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ .../testing/selftests/bpf/progs/verifier_map_ptr.c | 7 +++++-- 6 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c6a6ee1b2938..e0c2c78a5faa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -110,6 +111,7 @@ struct bpf_map_ops { long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); + int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -289,6 +291,7 @@ struct bpf_map_owner { }; struct bpf_map { + u8 sha[SHA256_DIGEST_SIZE]; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 57687b2e1c47..0987b52d5648 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6672,6 +6672,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d080916faf9..26d5dda989bc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "map_in_map.h" @@ -174,6 +175,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } +static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, + void *hash_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + sha256(array->value, (u64)array->elem_size * array->map.max_entries, + hash_buf); + memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + return 0; +} + static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { @@ -800,6 +812,7 @@ const struct bpf_map_ops array_map_ops = { .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, + .map_get_hash = &array_map_get_hash, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c8ef91acfe98..cf7173b1bb83 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ +#include #include #include #include @@ -5184,6 +5185,9 @@ static int bpf_map_get_info_by_fd(struct file *file, info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -5208,6 +5212,25 @@ static int bpf_map_get_info_by_fd(struct file *file, return err; } + if (info.hash) { + char __user *uhash = u64_to_user_ptr(info.hash); + + if (!map->ops->map_get_hash) + return -EINVAL; + + if (info.hash_size != SHA256_DIGEST_SIZE) + return -EINVAL; + + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + if (err != 0) + return err; + + if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + return -EFAULT; + } else if (info.hash_size) { + return -EINVAL; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 57687b2e1c47..0987b52d5648 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6672,6 +6672,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c index 11a079145966..e2767d27d8aa 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c @@ -70,10 +70,13 @@ __naked void bpf_map_ptr_write_rejected(void) : __clobber_all); } +/* The first element of struct bpf_map is a SHA256 hash of 32 bytes, accessing + * into this array is valid. The opts field is now at offset 33. + */ SEC("socket") __description("bpf_map_ptr: read non-existent field rejected") __failure -__msg("cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4") +__msg("cannot access ptr member ops with moff 32 in struct bpf_map with off 33 size 4") __failure_unpriv __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") __flag(BPF_F_ANY_ALIGNMENT) @@ -82,7 +85,7 @@ __naked void read_non_existent_field_rejected(void) asm volatile (" \ r6 = 0; \ r1 = %[map_array_48b] ll; \ - r6 = *(u32*)(r1 + 1); \ + r6 = *(u32*)(r1 + 33); \ r0 = 1; \ exit; \ " : -- cgit v1.2.3 From 8cd189e414bb705312fbfff7f7b5605f6de2459a Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:36 +0200 Subject: bpf: Move the signature kfuncs to helpers.c No functional changes, except for the addition of the headers for the kfuncs so that they can be used for signature verification. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-8-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 32 +++++++++ kernel/bpf/helpers.c | 166 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 183 ----------------------------------------------- 3 files changed, 198 insertions(+), 183 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e0c2c78a5faa..dfc1a27b56d5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3424,6 +3424,38 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ +#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) + +struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); +struct bpf_key *bpf_lookup_system_key(u64 id); +void bpf_key_put(struct bpf_key *bkey); +int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring); + +#else +static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + return NULL; +} + +static inline struct bpf_key *bpf_lookup_system_key(u64 id) +{ + return NULL; +} + +static inline void bpf_key_put(struct bpf_key *bkey) +{ +} + +static inline int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ + return -EOPNOTSUPP; +} +#endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 51229aba5318..ef4ede8bb74f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -3747,6 +3748,163 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); } +#ifdef CONFIG_KEYS +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. + * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. + */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kerned image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_p: data to verify + * @sig_p: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION + struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; + const void *data, *sig; + u32 data_len, sig_len; + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). + */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + data_len = __bpf_dynptr_size(data_ptr); + data = __bpf_dynptr_data(data_ptr, data_len); + sig_len = __bpf_dynptr_size(sig_ptr); + sig = __bpf_dynptr_data(sig_ptr, sig_len); + + return verify_pkcs7_signature(data, data_len, sig, sig_len, + trusted_keyring->key, + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + NULL); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ +} +#endif /* CONFIG_KEYS */ __bpf_kfunc_end_defs(); @@ -3788,6 +3946,14 @@ BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) #endif +#ifdef CONFIG_KEYS +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif +#endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 606007c387c5..f2360579658e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -1241,188 +1240,6 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_KEYS -__bpf_kfunc_start_defs(); - -/** - * bpf_lookup_user_key - lookup a key by its serial - * @serial: key handle serial number - * @flags: lookup-specific flags - * - * Search a key with a given *serial* and the provided *flags*. - * If found, increment the reference count of the key by one, and - * return it in the bpf_key structure. - * - * The bpf_key structure must be passed to bpf_key_put() when done - * with it, so that the key reference count is decremented and the - * bpf_key structure is freed. - * - * Permission checks are deferred to the time the key is used by - * one of the available key-specific kfuncs. - * - * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested - * special keyring (e.g. session keyring), if it doesn't yet exist. - * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting - * for the key construction, and to retrieve uninstantiated keys (keys - * without data attached to them). - * - * Return: a bpf_key pointer with a valid key pointer if the key is found, a - * NULL pointer otherwise. - */ -__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) -{ - key_ref_t key_ref; - struct bpf_key *bkey; - - if (flags & ~KEY_LOOKUP_ALL) - return NULL; - - /* - * Permission check is deferred until the key is used, as the - * intent of the caller is unknown here. - */ - key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); - if (IS_ERR(key_ref)) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); - if (!bkey) { - key_put(key_ref_to_ptr(key_ref)); - return NULL; - } - - bkey->key = key_ref_to_ptr(key_ref); - bkey->has_ref = true; - - return bkey; -} - -/** - * bpf_lookup_system_key - lookup a key by a system-defined ID - * @id: key ID - * - * Obtain a bpf_key structure with a key pointer set to the passed key ID. - * The key pointer is marked as invalid, to prevent bpf_key_put() from - * attempting to decrement the key reference count on that pointer. The key - * pointer set in such way is currently understood only by - * verify_pkcs7_signature(). - * - * Set *id* to one of the values defined in include/linux/verification.h: - * 0 for the primary keyring (immutable keyring of system keys); - * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring - * (where keys can be added only if they are vouched for by existing keys - * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform - * keyring (primarily used by the integrity subsystem to verify a kexec'ed - * kerned image and, possibly, the initramfs signature). - * - * Return: a bpf_key pointer with an invalid key pointer set from the - * pre-determined ID on success, a NULL pointer otherwise - */ -__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) -{ - struct bpf_key *bkey; - - if (system_keyring_id_check(id) < 0) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); - if (!bkey) - return NULL; - - bkey->key = (struct key *)(unsigned long)id; - bkey->has_ref = false; - - return bkey; -} - -/** - * bpf_key_put - decrement key reference count if key is valid and free bpf_key - * @bkey: bpf_key structure - * - * Decrement the reference count of the key inside *bkey*, if the pointer - * is valid, and free *bkey*. - */ -__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) -{ - if (bkey->has_ref) - key_put(bkey->key); - - kfree(bkey); -} - -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -/** - * bpf_verify_pkcs7_signature - verify a PKCS#7 signature - * @data_p: data to verify - * @sig_p: signature of the data - * @trusted_keyring: keyring with keys trusted for signature verification - * - * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* - * with keys in a keyring referenced by *trusted_keyring*. - * - * Return: 0 on success, a negative value on error. - */ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, - struct bpf_key *trusted_keyring) -{ - struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; - struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; - const void *data, *sig; - u32 data_len, sig_len; - int ret; - - if (trusted_keyring->has_ref) { - /* - * Do the permission check deferred in bpf_lookup_user_key(). - * See bpf_lookup_user_key() for more details. - * - * A call to key_task_permission() here would be redundant, as - * it is already done by keyring_search() called by - * find_asymmetric_key(). - */ - ret = key_validate(trusted_keyring->key); - if (ret < 0) - return ret; - } - - data_len = __bpf_dynptr_size(data_ptr); - data = __bpf_dynptr_data(data_ptr, data_len); - sig_len = __bpf_dynptr_size(sig_ptr); - sig = __bpf_dynptr_data(sig_ptr, sig_len); - - return verify_pkcs7_signature(data, data_len, sig, sig_len, - trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, - NULL); -} -#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(key_sig_kfunc_set) -BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) -#endif -BTF_KFUNCS_END(key_sig_kfunc_set) - -static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { - .owner = THIS_MODULE, - .set = &key_sig_kfunc_set, -}; - -static int __init bpf_key_sig_kfuncs_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, - &bpf_key_sig_kfunc_set); -} - -late_initcall(bpf_key_sig_kfuncs_init); -#endif /* CONFIG_KEYS */ - static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { -- cgit v1.2.3 From eafedbc7c050c44744fbdf80bdf3315e860b7513 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Fri, 19 Sep 2025 06:42:07 +0000 Subject: rust_binder: add Rust Binder driver We're generally not proponents of rewrites (nasty uncomfortable things that make you late for dinner!). So why rewrite Binder? Binder has been evolving over the past 15+ years to meet the evolving needs of Android. Its responsibilities, expectations, and complexity have grown considerably during that time. While we expect Binder to continue to evolve along with Android, there are a number of factors that currently constrain our ability to develop/maintain it. Briefly those are: 1. Complexity: Binder is at the intersection of everything in Android and fulfills many responsibilities beyond IPC. It has become many things to many people, and due to its many features and their interactions with each other, its complexity is quite high. In just 6kLOC it must deliver transactions to the right threads. It must correctly parse and translate the contents of transactions, which can contain several objects of different types (e.g., pointers, fds) that can interact with each other. It controls the size of thread pools in userspace, and ensures that transactions are assigned to threads in ways that avoid deadlocks where the threadpool has run out of threads. It must track refcounts of objects that are shared by several processes by forwarding refcount changes between the processes correctly. It must handle numerous error scenarios and it combines/nests 13 different locks, 7 reference counters, and atomic variables. Finally, It must do all of this as fast and efficiently as possible. Minor performance regressions can cause a noticeably degraded user experience. 2. Things to improve: Thousand-line functions [1], error-prone error handling [2], and confusing structure can occur as a code base grows organically. After more than a decade of development, this codebase could use an overhaul. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/android/binder.c?h=v6.5#n2896 [2]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/android/binder.c?h=v6.5#n3658 3. Security critical: Binder is a critical part of Android's sandboxing strategy. Even Android's most de-privileged sandboxes (e.g. the Chrome renderer, or SW Codec) have direct access to Binder. More than just about any other component, it's important that Binder provide robust security, and itself be robust against security vulnerabilities. It's #1 (high complexity) that has made continuing to evolve Binder and resolving #2 (tech debt) exceptionally difficult without causing #3 (security issues). For Binder to continue to meet Android's needs, we need better ways to manage (and reduce!) complexity without increasing the risk. The biggest change is obviously the choice of programming language. We decided to use Rust because it directly addresses a number of the challenges within Binder that we have faced during the last years. It prevents mistakes with ref counting, locking, bounds checking, and also does a lot to reduce the complexity of error handling. Additionally, we've been able to use the more expressive type system to encode the ownership semantics of the various structs and pointers, which takes the complexity of managing object lifetimes out of the hands of the programmer, reducing the risk of use-after-frees and similar problems. Rust has many different pointer types that it uses to encode ownership semantics into the type system, and this is probably one of the most important aspects of how it helps in Binder. The Binder driver has a lot of different objects that have complex ownership semantics; some pointers own a refcount, some pointers have exclusive ownership, and some pointers just reference the object and it is kept alive in some other manner. With Rust, we can use a different pointer type for each kind of pointer, which enables the compiler to enforce that the ownership semantics are implemented correctly. Another useful feature is Rust's error handling. Rust allows for more simplified error handling with features such as destructors, and you get compilation failures if errors are not properly handled. This means that even though Rust requires you to spend more lines of code than C on things such as writing down invariants that are left implicit in C, the Rust driver is still slightly smaller than C binder: Rust is 5.5kLOC and C is 5.8kLOC. (These numbers are excluding blank lines, comments, binderfs, and any debugging facilities in C that are not yet implemented in the Rust driver. The numbers include abstractions in rust/kernel/ that are unlikely to be used by other drivers than Binder.) Although this rewrite completely rethinks how the code is structured and how assumptions are enforced, we do not fundamentally change *how* the driver does the things it does. A lot of careful thought has gone into the existing design. The rewrite is aimed rather at improving code health, structure, readability, robustness, security, maintainability and extensibility. We also include more inline documentation, and improve how assumptions in the code are enforced. Furthermore, all unsafe code is annotated with a SAFETY comment that explains why it is correct. We have left the binderfs filesystem component in C. Rewriting it in Rust would be a large amount of work and requires a lot of bindings to the file system interfaces. Binderfs has not historically had the same challenges with security and complexity, so rewriting binderfs seems to have lower value than the rest of Binder. Correctness and feature parity ------------------------------ Rust binder passes all tests that validate the correctness of Binder in the Android Open Source Project. We can boot a device, and run a variety of apps and functionality without issues. We have performed this both on the Cuttlefish Android emulator device, and on a Pixel 6 Pro. As for feature parity, Rust binder currently implements all features that C binder supports, with the exception of some debugging facilities. The missing debugging facilities will be added before we submit the Rust implementation upstream. Tracepoints ----------- I did not include all of the tracepoints as I felt that the mechansim for making C access fields of Rust structs should be discussed on list separately. I also did not include the support for building Rust Binder as a module since that requires exporting a bunch of additional symbols on the C side. Original RFC Link with old benchmark numbers: https://lore.kernel.org/r/20231101-rust-binder-v1-0-08ba9197f637@google.com Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Matt Gilbride Signed-off-by: Matt Gilbride Acked-by: Carlos Llamas Acked-by: Paul Moore Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20250919-rust-binder-v2-1-a384b09f28dd@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/Kconfig | 15 +- drivers/android/Makefile | 1 + drivers/android/binder/Makefile | 9 + drivers/android/binder/allocation.rs | 602 +++++++++ drivers/android/binder/context.rs | 180 +++ drivers/android/binder/deferred_close.rs | 204 +++ drivers/android/binder/defs.rs | 182 +++ drivers/android/binder/error.rs | 99 ++ drivers/android/binder/freeze.rs | 388 ++++++ drivers/android/binder/node.rs | 1131 +++++++++++++++++ drivers/android/binder/node/wrapper.rs | 78 ++ drivers/android/binder/page_range.rs | 734 +++++++++++ drivers/android/binder/page_range_helper.c | 24 + drivers/android/binder/page_range_helper.h | 15 + drivers/android/binder/process.rs | 1696 +++++++++++++++++++++++++ drivers/android/binder/range_alloc/array.rs | 251 ++++ drivers/android/binder/range_alloc/mod.rs | 329 +++++ drivers/android/binder/range_alloc/tree.rs | 488 +++++++ drivers/android/binder/rust_binder.h | 23 + drivers/android/binder/rust_binder_events.c | 59 + drivers/android/binder/rust_binder_events.h | 36 + drivers/android/binder/rust_binder_internal.h | 87 ++ drivers/android/binder/rust_binder_main.rs | 627 +++++++++ drivers/android/binder/rust_binderfs.c | 850 +++++++++++++ drivers/android/binder/stats.rs | 89 ++ drivers/android/binder/thread.rs | 1596 +++++++++++++++++++++++ drivers/android/binder/trace.rs | 16 + drivers/android/binder/transaction.rs | 456 +++++++ include/uapi/linux/android/binder.h | 2 +- rust/bindings/bindings_helper.h | 8 + rust/helpers/binder.c | 26 + rust/helpers/helpers.c | 1 + rust/helpers/page.c | 8 + rust/helpers/security.c | 24 + rust/kernel/cred.rs | 6 + rust/kernel/page.rs | 6 + rust/kernel/security.rs | 37 + rust/uapi/uapi_helper.h | 1 + 38 files changed, 10382 insertions(+), 2 deletions(-) create mode 100644 drivers/android/binder/Makefile create mode 100644 drivers/android/binder/allocation.rs create mode 100644 drivers/android/binder/context.rs create mode 100644 drivers/android/binder/deferred_close.rs create mode 100644 drivers/android/binder/defs.rs create mode 100644 drivers/android/binder/error.rs create mode 100644 drivers/android/binder/freeze.rs create mode 100644 drivers/android/binder/node.rs create mode 100644 drivers/android/binder/node/wrapper.rs create mode 100644 drivers/android/binder/page_range.rs create mode 100644 drivers/android/binder/page_range_helper.c create mode 100644 drivers/android/binder/page_range_helper.h create mode 100644 drivers/android/binder/process.rs create mode 100644 drivers/android/binder/range_alloc/array.rs create mode 100644 drivers/android/binder/range_alloc/mod.rs create mode 100644 drivers/android/binder/range_alloc/tree.rs create mode 100644 drivers/android/binder/rust_binder.h create mode 100644 drivers/android/binder/rust_binder_events.c create mode 100644 drivers/android/binder/rust_binder_events.h create mode 100644 drivers/android/binder/rust_binder_internal.h create mode 100644 drivers/android/binder/rust_binder_main.rs create mode 100644 drivers/android/binder/rust_binderfs.c create mode 100644 drivers/android/binder/stats.rs create mode 100644 drivers/android/binder/thread.rs create mode 100644 drivers/android/binder/trace.rs create mode 100644 drivers/android/binder/transaction.rs create mode 100644 rust/helpers/binder.c (limited to 'include') diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 75af3cf472c8..e2e402c9d175 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -14,6 +14,19 @@ config ANDROID_BINDER_IPC Android process, using Binder to identify, invoke and pass arguments between said processes. +config ANDROID_BINDER_IPC_RUST + bool "Rust version of Android Binder IPC Driver" + depends on RUST && MMU && !ANDROID_BINDER_IPC + help + This enables the Rust implementation of the Binder driver. + + Binder is used in Android for both communication between processes, + and remote method invocation. + + This means one Android process can call a method/routine in another + Android process, using Binder to identify, invoke and pass arguments + between said processes. + config ANDROID_BINDERFS bool "Android Binderfs filesystem" depends on ANDROID_BINDER_IPC @@ -28,7 +41,7 @@ config ANDROID_BINDERFS config ANDROID_BINDER_DEVICES string "Android Binder devices" - depends on ANDROID_BINDER_IPC + depends on ANDROID_BINDER_IPC || ANDROID_BINDER_IPC_RUST default "binder,hwbinder,vndbinder" help Default value for the binder.devices parameter. diff --git a/drivers/android/Makefile b/drivers/android/Makefile index f422f91e026b..e0c650d3898e 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -4,3 +4,4 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o binder_netlink.o obj-$(CONFIG_ANDROID_BINDER_ALLOC_KUNIT_TEST) += tests/ +obj-$(CONFIG_ANDROID_BINDER_IPC_RUST) += binder/ diff --git a/drivers/android/binder/Makefile b/drivers/android/binder/Makefile new file mode 100644 index 000000000000..09eabb527fa0 --- /dev/null +++ b/drivers/android/binder/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +ccflags-y += -I$(src) # needed for trace events + +obj-$(CONFIG_ANDROID_BINDER_IPC_RUST) += rust_binder.o +rust_binder-y := \ + rust_binder_main.o \ + rust_binderfs.o \ + rust_binder_events.o \ + page_range_helper.o diff --git a/drivers/android/binder/allocation.rs b/drivers/android/binder/allocation.rs new file mode 100644 index 000000000000..7f65a9c3a0e5 --- /dev/null +++ b/drivers/android/binder/allocation.rs @@ -0,0 +1,602 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use core::mem::{size_of, size_of_val, MaybeUninit}; +use core::ops::Range; + +use kernel::{ + bindings, + fs::file::{File, FileDescriptorReservation}, + prelude::*, + sync::{aref::ARef, Arc}, + transmute::{AsBytes, FromBytes}, + uaccess::UserSliceReader, + uapi, +}; + +use crate::{ + deferred_close::DeferredFdCloser, + defs::*, + node::{Node, NodeRef}, + process::Process, + DArc, +}; + +#[derive(Default)] +pub(crate) struct AllocationInfo { + /// Range within the allocation where we can find the offsets to the object descriptors. + pub(crate) offsets: Option>, + /// The target node of the transaction this allocation is associated to. + /// Not set for replies. + pub(crate) target_node: Option, + /// When this allocation is dropped, call `pending_oneway_finished` on the node. + /// + /// This is used to serialize oneway transaction on the same node. Binder guarantees that + /// oneway transactions to the same node are delivered sequentially in the order they are sent. + pub(crate) oneway_node: Option>, + /// Zero the data in the buffer on free. + pub(crate) clear_on_free: bool, + /// List of files embedded in this transaction. + file_list: FileList, +} + +/// Represents an allocation that the kernel is currently using. +/// +/// When allocations are idle, the range allocator holds the data related to them. +/// +/// # Invariants +/// +/// This allocation corresponds to an allocation in the range allocator, so the relevant pages are +/// marked in use in the page range. +pub(crate) struct Allocation { + pub(crate) offset: usize, + size: usize, + pub(crate) ptr: usize, + pub(crate) process: Arc, + allocation_info: Option, + free_on_drop: bool, + pub(crate) oneway_spam_detected: bool, + #[allow(dead_code)] + pub(crate) debug_id: usize, +} + +impl Allocation { + pub(crate) fn new( + process: Arc, + debug_id: usize, + offset: usize, + size: usize, + ptr: usize, + oneway_spam_detected: bool, + ) -> Self { + Self { + process, + offset, + size, + ptr, + debug_id, + oneway_spam_detected, + allocation_info: None, + free_on_drop: true, + } + } + + fn size_check(&self, offset: usize, size: usize) -> Result { + let overflow_fail = offset.checked_add(size).is_none(); + let cmp_size_fail = offset.wrapping_add(size) > self.size; + if overflow_fail || cmp_size_fail { + return Err(EFAULT); + } + Ok(()) + } + + pub(crate) fn copy_into( + &self, + reader: &mut UserSliceReader, + offset: usize, + size: usize, + ) -> Result { + self.size_check(offset, size)?; + + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. + unsafe { + self.process + .pages + .copy_from_user_slice(reader, self.offset + offset, size) + } + } + + pub(crate) fn read(&self, offset: usize) -> Result { + self.size_check(offset, size_of::())?; + + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. + unsafe { self.process.pages.read(self.offset + offset) } + } + + pub(crate) fn write(&self, offset: usize, obj: &T) -> Result { + self.size_check(offset, size_of_val::(obj))?; + + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. + unsafe { self.process.pages.write(self.offset + offset, obj) } + } + + pub(crate) fn fill_zero(&self) -> Result { + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. + unsafe { self.process.pages.fill_zero(self.offset, self.size) } + } + + pub(crate) fn keep_alive(mut self) { + self.process + .buffer_make_freeable(self.offset, self.allocation_info.take()); + self.free_on_drop = false; + } + + pub(crate) fn set_info(&mut self, info: AllocationInfo) { + self.allocation_info = Some(info); + } + + pub(crate) fn get_or_init_info(&mut self) -> &mut AllocationInfo { + self.allocation_info.get_or_insert_with(Default::default) + } + + pub(crate) fn set_info_offsets(&mut self, offsets: Range) { + self.get_or_init_info().offsets = Some(offsets); + } + + pub(crate) fn set_info_oneway_node(&mut self, oneway_node: DArc) { + self.get_or_init_info().oneway_node = Some(oneway_node); + } + + pub(crate) fn set_info_clear_on_drop(&mut self) { + self.get_or_init_info().clear_on_free = true; + } + + pub(crate) fn set_info_target_node(&mut self, target_node: NodeRef) { + self.get_or_init_info().target_node = Some(target_node); + } + + /// Reserve enough space to push at least `num_fds` fds. + pub(crate) fn info_add_fd_reserve(&mut self, num_fds: usize) -> Result { + self.get_or_init_info() + .file_list + .files_to_translate + .reserve(num_fds, GFP_KERNEL)?; + + Ok(()) + } + + pub(crate) fn info_add_fd( + &mut self, + file: ARef, + buffer_offset: usize, + close_on_free: bool, + ) -> Result { + self.get_or_init_info().file_list.files_to_translate.push( + FileEntry { + file, + buffer_offset, + close_on_free, + }, + GFP_KERNEL, + )?; + + Ok(()) + } + + pub(crate) fn set_info_close_on_free(&mut self, cof: FdsCloseOnFree) { + self.get_or_init_info().file_list.close_on_free = cof.0; + } + + pub(crate) fn translate_fds(&mut self) -> Result { + let file_list = match self.allocation_info.as_mut() { + Some(info) => &mut info.file_list, + None => return Ok(TranslatedFds::new()), + }; + + let files = core::mem::take(&mut file_list.files_to_translate); + + let num_close_on_free = files.iter().filter(|entry| entry.close_on_free).count(); + let mut close_on_free = KVec::with_capacity(num_close_on_free, GFP_KERNEL)?; + + let mut reservations = KVec::with_capacity(files.len(), GFP_KERNEL)?; + for file_info in files { + let res = FileDescriptorReservation::get_unused_fd_flags(bindings::O_CLOEXEC)?; + let fd = res.reserved_fd(); + self.write::(file_info.buffer_offset, &fd)?; + + reservations.push( + Reservation { + res, + file: file_info.file, + }, + GFP_KERNEL, + )?; + if file_info.close_on_free { + close_on_free.push(fd, GFP_KERNEL)?; + } + } + + Ok(TranslatedFds { + reservations, + close_on_free: FdsCloseOnFree(close_on_free), + }) + } + + /// Should the looper return to userspace when freeing this allocation? + pub(crate) fn looper_need_return_on_free(&self) -> bool { + // Closing fds involves pushing task_work for execution when we return to userspace. Hence, + // we should return to userspace asap if we are closing fds. + match self.allocation_info { + Some(ref info) => !info.file_list.close_on_free.is_empty(), + None => false, + } + } +} + +impl Drop for Allocation { + fn drop(&mut self) { + if !self.free_on_drop { + return; + } + + if let Some(mut info) = self.allocation_info.take() { + if let Some(oneway_node) = info.oneway_node.as_ref() { + oneway_node.pending_oneway_finished(); + } + + info.target_node = None; + + if let Some(offsets) = info.offsets.clone() { + let view = AllocationView::new(self, offsets.start); + for i in offsets.step_by(size_of::()) { + if view.cleanup_object(i).is_err() { + pr_warn!("Error cleaning up object at offset {}\n", i) + } + } + } + + for &fd in &info.file_list.close_on_free { + let closer = match DeferredFdCloser::new(GFP_KERNEL) { + Ok(closer) => closer, + Err(kernel::alloc::AllocError) => { + // Ignore allocation failures. + break; + } + }; + + // Here, we ignore errors. The operation can fail if the fd is not valid, or if the + // method is called from a kthread. However, this is always called from a syscall, + // so the latter case cannot happen, and we don't care about the first case. + let _ = closer.close_fd(fd); + } + + if info.clear_on_free { + if let Err(e) = self.fill_zero() { + pr_warn!("Failed to clear data on free: {:?}", e); + } + } + } + + self.process.buffer_raw_free(self.ptr); + } +} + +/// A wrapper around `Allocation` that is being created. +/// +/// If the allocation is destroyed while wrapped in this wrapper, then the allocation will be +/// considered to be part of a failed transaction. Successful transactions avoid that by calling +/// `success`, which skips the destructor. +#[repr(transparent)] +pub(crate) struct NewAllocation(pub(crate) Allocation); + +impl NewAllocation { + pub(crate) fn success(self) -> Allocation { + // This skips the destructor. + // + // SAFETY: This type is `#[repr(transparent)]`, so the layout matches. + unsafe { core::mem::transmute(self) } + } +} + +impl core::ops::Deref for NewAllocation { + type Target = Allocation; + fn deref(&self) -> &Allocation { + &self.0 + } +} + +impl core::ops::DerefMut for NewAllocation { + fn deref_mut(&mut self) -> &mut Allocation { + &mut self.0 + } +} + +/// A view into the beginning of an allocation. +/// +/// All attempts to read or write outside of the view will fail. To intentionally access outside of +/// this view, use the `alloc` field of this struct directly. +pub(crate) struct AllocationView<'a> { + pub(crate) alloc: &'a mut Allocation, + limit: usize, +} + +impl<'a> AllocationView<'a> { + pub(crate) fn new(alloc: &'a mut Allocation, limit: usize) -> Self { + AllocationView { alloc, limit } + } + + pub(crate) fn read(&self, offset: usize) -> Result { + if offset.checked_add(size_of::()).ok_or(EINVAL)? > self.limit { + return Err(EINVAL); + } + self.alloc.read(offset) + } + + pub(crate) fn write(&self, offset: usize, obj: &T) -> Result { + if offset.checked_add(size_of::()).ok_or(EINVAL)? > self.limit { + return Err(EINVAL); + } + self.alloc.write(offset, obj) + } + + pub(crate) fn copy_into( + &self, + reader: &mut UserSliceReader, + offset: usize, + size: usize, + ) -> Result { + if offset.checked_add(size).ok_or(EINVAL)? > self.limit { + return Err(EINVAL); + } + self.alloc.copy_into(reader, offset, size) + } + + pub(crate) fn transfer_binder_object( + &self, + offset: usize, + obj: &uapi::flat_binder_object, + strong: bool, + node_ref: NodeRef, + ) -> Result { + let mut newobj = FlatBinderObject::default(); + let node = node_ref.node.clone(); + if Arc::ptr_eq(&node_ref.node.owner, &self.alloc.process) { + // The receiving process is the owner of the node, so send it a binder object (instead + // of a handle). + let (ptr, cookie) = node.get_id(); + newobj.hdr.type_ = if strong { + BINDER_TYPE_BINDER + } else { + BINDER_TYPE_WEAK_BINDER + }; + newobj.flags = obj.flags; + newobj.__bindgen_anon_1.binder = ptr as _; + newobj.cookie = cookie as _; + self.write(offset, &newobj)?; + // Increment the user ref count on the node. It will be decremented as part of the + // destruction of the buffer, when we see a binder or weak-binder object. + node.update_refcount(true, 1, strong); + } else { + // The receiving process is different from the owner, so we need to insert a handle to + // the binder object. + let handle = self + .alloc + .process + .as_arc_borrow() + .insert_or_update_handle(node_ref, false)?; + newobj.hdr.type_ = if strong { + BINDER_TYPE_HANDLE + } else { + BINDER_TYPE_WEAK_HANDLE + }; + newobj.flags = obj.flags; + newobj.__bindgen_anon_1.handle = handle; + if self.write(offset, &newobj).is_err() { + // Decrement ref count on the handle we just created. + let _ = self + .alloc + .process + .as_arc_borrow() + .update_ref(handle, false, strong); + return Err(EINVAL); + } + } + + Ok(()) + } + + fn cleanup_object(&self, index_offset: usize) -> Result { + let offset = self.alloc.read(index_offset)?; + let header = self.read::(offset)?; + match header.type_ { + BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => { + let obj = self.read::(offset)?; + let strong = header.type_ == BINDER_TYPE_BINDER; + // SAFETY: The type is `BINDER_TYPE_{WEAK_}BINDER`, so the `binder` field is + // populated. + let ptr = unsafe { obj.__bindgen_anon_1.binder }; + let cookie = obj.cookie; + self.alloc.process.update_node(ptr, cookie, strong); + Ok(()) + } + BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => { + let obj = self.read::(offset)?; + let strong = header.type_ == BINDER_TYPE_HANDLE; + // SAFETY: The type is `BINDER_TYPE_{WEAK_}HANDLE`, so the `handle` field is + // populated. + let handle = unsafe { obj.__bindgen_anon_1.handle }; + self.alloc + .process + .as_arc_borrow() + .update_ref(handle, false, strong) + } + _ => Ok(()), + } + } +} + +/// A binder object as it is serialized. +/// +/// # Invariants +/// +/// All bytes must be initialized, and the value of `self.hdr.type_` must be one of the allowed +/// types. +#[repr(C)] +pub(crate) union BinderObject { + hdr: uapi::binder_object_header, + fbo: uapi::flat_binder_object, + fdo: uapi::binder_fd_object, + bbo: uapi::binder_buffer_object, + fdao: uapi::binder_fd_array_object, +} + +/// A view into a `BinderObject` that can be used in a match statement. +pub(crate) enum BinderObjectRef<'a> { + Binder(&'a mut uapi::flat_binder_object), + Handle(&'a mut uapi::flat_binder_object), + Fd(&'a mut uapi::binder_fd_object), + Ptr(&'a mut uapi::binder_buffer_object), + Fda(&'a mut uapi::binder_fd_array_object), +} + +impl BinderObject { + pub(crate) fn read_from(reader: &mut UserSliceReader) -> Result { + let object = Self::read_from_inner(|slice| { + let read_len = usize::min(slice.len(), reader.len()); + reader.clone_reader().read_slice(&mut slice[..read_len])?; + Ok(()) + })?; + + // If we used a object type smaller than the largest object size, then we've read more + // bytes than we needed to. However, we used `.clone_reader()` to avoid advancing the + // original reader. Now, we call `skip` so that the caller's reader is advanced by the + // right amount. + // + // The `skip` call fails if the reader doesn't have `size` bytes available. This could + // happen if the type header corresponds to an object type that is larger than the rest of + // the reader. + // + // Any extra bytes beyond the size of the object are inaccessible after this call, so + // reading them again from the `reader` later does not result in TOCTOU bugs. + reader.skip(object.size())?; + + Ok(object) + } + + /// Use the provided reader closure to construct a `BinderObject`. + /// + /// The closure should write the bytes for the object into the provided slice. + pub(crate) fn read_from_inner(reader: R) -> Result + where + R: FnOnce(&mut [u8; size_of::()]) -> Result<()>, + { + let mut obj = MaybeUninit::::zeroed(); + + // SAFETY: The lengths of `BinderObject` and `[u8; size_of::()]` are equal, + // and the byte array has an alignment requirement of one, so the pointer cast is okay. + // Additionally, `obj` was initialized to zeros, so the byte array will not be + // uninitialized. + (reader)(unsafe { &mut *obj.as_mut_ptr().cast() })?; + + // SAFETY: The entire object is initialized, so accessing this field is safe. + let type_ = unsafe { obj.assume_init_ref().hdr.type_ }; + if Self::type_to_size(type_).is_none() { + // The value of `obj.hdr_type_` was invalid. + return Err(EINVAL); + } + + // SAFETY: All bytes are initialized (since we zeroed them at the start) and we checked + // that `self.hdr.type_` is one of the allowed types, so the type invariants are satisfied. + unsafe { Ok(obj.assume_init()) } + } + + pub(crate) fn as_ref(&mut self) -> BinderObjectRef<'_> { + use BinderObjectRef::*; + // SAFETY: The constructor ensures that all bytes of `self` are initialized, and all + // variants of this union accept all initialized bit patterns. + unsafe { + match self.hdr.type_ { + BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => Binder(&mut self.fbo), + BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => Handle(&mut self.fbo), + BINDER_TYPE_FD => Fd(&mut self.fdo), + BINDER_TYPE_PTR => Ptr(&mut self.bbo), + BINDER_TYPE_FDA => Fda(&mut self.fdao), + // SAFETY: By the type invariant, the value of `self.hdr.type_` cannot have any + // other value than the ones checked above. + _ => core::hint::unreachable_unchecked(), + } + } + } + + pub(crate) fn size(&self) -> usize { + // SAFETY: The entire object is initialized, so accessing this field is safe. + let type_ = unsafe { self.hdr.type_ }; + + // SAFETY: The type invariants guarantee that the type field is correct. + unsafe { Self::type_to_size(type_).unwrap_unchecked() } + } + + fn type_to_size(type_: u32) -> Option { + match type_ { + BINDER_TYPE_WEAK_BINDER => Some(size_of::()), + BINDER_TYPE_BINDER => Some(size_of::()), + BINDER_TYPE_WEAK_HANDLE => Some(size_of::()), + BINDER_TYPE_HANDLE => Some(size_of::()), + BINDER_TYPE_FD => Some(size_of::()), + BINDER_TYPE_PTR => Some(size_of::()), + BINDER_TYPE_FDA => Some(size_of::()), + _ => None, + } + } +} + +#[derive(Default)] +struct FileList { + files_to_translate: KVec, + close_on_free: KVec, +} + +struct FileEntry { + /// The file for which a descriptor will be created in the recipient process. + file: ARef, + /// The offset in the buffer where the file descriptor is stored. + buffer_offset: usize, + /// Whether this fd should be closed when the allocation is freed. + close_on_free: bool, +} + +pub(crate) struct TranslatedFds { + reservations: KVec, + /// If commit is called, then these fds should be closed. (If commit is not called, then they + /// shouldn't be closed.) + close_on_free: FdsCloseOnFree, +} + +struct Reservation { + res: FileDescriptorReservation, + file: ARef, +} + +impl TranslatedFds { + pub(crate) fn new() -> Self { + Self { + reservations: KVec::new(), + close_on_free: FdsCloseOnFree(KVec::new()), + } + } + + pub(crate) fn commit(self) -> FdsCloseOnFree { + for entry in self.reservations { + entry.res.fd_install(entry.file); + } + + self.close_on_free + } +} + +pub(crate) struct FdsCloseOnFree(KVec); diff --git a/drivers/android/binder/context.rs b/drivers/android/binder/context.rs new file mode 100644 index 000000000000..3d135ec03ca7 --- /dev/null +++ b/drivers/android/binder/context.rs @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + error::Error, + list::{List, ListArc, ListLinks}, + prelude::*, + security, + str::{CStr, CString}, + sync::{Arc, Mutex}, + task::Kuid, +}; + +use crate::{error::BinderError, node::NodeRef, process::Process}; + +kernel::sync::global_lock! { + // SAFETY: We call `init` in the module initializer, so it's initialized before first use. + pub(crate) unsafe(uninit) static CONTEXTS: Mutex = ContextList { + list: List::new(), + }; +} + +pub(crate) struct ContextList { + list: List, +} + +pub(crate) fn get_all_contexts() -> Result>> { + let lock = CONTEXTS.lock(); + + let count = lock.list.iter().count(); + + let mut ctxs = KVec::with_capacity(count, GFP_KERNEL)?; + for ctx in &lock.list { + ctxs.push(Arc::from(ctx), GFP_KERNEL)?; + } + Ok(ctxs) +} + +/// This struct keeps track of the processes using this context, and which process is the context +/// manager. +struct Manager { + node: Option, + uid: Option, + all_procs: List, +} + +/// There is one context per binder file (/dev/binder, /dev/hwbinder, etc) +#[pin_data] +pub(crate) struct Context { + #[pin] + manager: Mutex, + pub(crate) name: CString, + #[pin] + links: ListLinks, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Context { untracked; } +} +kernel::list::impl_list_item! { + impl ListItem<0> for Context { + using ListLinks { self.links }; + } +} + +impl Context { + pub(crate) fn new(name: &CStr) -> Result> { + let name = CString::try_from(name)?; + let list_ctx = ListArc::pin_init::( + try_pin_init!(Context { + name, + links <- ListLinks::new(), + manager <- kernel::new_mutex!(Manager { + all_procs: List::new(), + node: None, + uid: None, + }, "Context::manager"), + }), + GFP_KERNEL, + )?; + + let ctx = list_ctx.clone_arc(); + CONTEXTS.lock().list.push_back(list_ctx); + + Ok(ctx) + } + + /// Called when the file for this context is unlinked. + /// + /// No-op if called twice. + pub(crate) fn deregister(&self) { + // SAFETY: We never add the context to any other linked list than this one, so it is either + // in this list, or not in any list. + unsafe { CONTEXTS.lock().list.remove(self) }; + } + + pub(crate) fn register_process(self: &Arc, proc: ListArc) { + if !Arc::ptr_eq(self, &proc.ctx) { + pr_err!("Context::register_process called on the wrong context."); + return; + } + self.manager.lock().all_procs.push_back(proc); + } + + pub(crate) fn deregister_process(self: &Arc, proc: &Process) { + if !Arc::ptr_eq(self, &proc.ctx) { + pr_err!("Context::deregister_process called on the wrong context."); + return; + } + // SAFETY: We just checked that this is the right list. + unsafe { self.manager.lock().all_procs.remove(proc) }; + } + + pub(crate) fn set_manager_node(&self, node_ref: NodeRef) -> Result { + let mut manager = self.manager.lock(); + if manager.node.is_some() { + pr_warn!("BINDER_SET_CONTEXT_MGR already set"); + return Err(EBUSY); + } + security::binder_set_context_mgr(&node_ref.node.owner.cred)?; + + // If the context manager has been set before, ensure that we use the same euid. + let caller_uid = Kuid::current_euid(); + if let Some(ref uid) = manager.uid { + if *uid != caller_uid { + return Err(EPERM); + } + } + + manager.node = Some(node_ref); + manager.uid = Some(caller_uid); + Ok(()) + } + + pub(crate) fn unset_manager_node(&self) { + let node_ref = self.manager.lock().node.take(); + drop(node_ref); + } + + pub(crate) fn get_manager_node(&self, strong: bool) -> Result { + self.manager + .lock() + .node + .as_ref() + .ok_or_else(BinderError::new_dead)? + .clone(strong) + .map_err(BinderError::from) + } + + pub(crate) fn for_each_proc(&self, mut func: F) + where + F: FnMut(&Process), + { + let lock = self.manager.lock(); + for proc in &lock.all_procs { + func(&proc); + } + } + + pub(crate) fn get_all_procs(&self) -> Result>> { + let lock = self.manager.lock(); + let count = lock.all_procs.iter().count(); + + let mut procs = KVec::with_capacity(count, GFP_KERNEL)?; + for proc in &lock.all_procs { + procs.push(Arc::from(proc), GFP_KERNEL)?; + } + Ok(procs) + } + + pub(crate) fn get_procs_with_pid(&self, pid: i32) -> Result>> { + let orig = self.get_all_procs()?; + let mut backing = KVec::with_capacity(orig.len(), GFP_KERNEL)?; + for proc in orig.into_iter().filter(|proc| proc.task.pid() == pid) { + backing.push(proc, GFP_KERNEL)?; + } + Ok(backing) + } +} diff --git a/drivers/android/binder/deferred_close.rs b/drivers/android/binder/deferred_close.rs new file mode 100644 index 000000000000..ac895c04d0cb --- /dev/null +++ b/drivers/android/binder/deferred_close.rs @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Logic for closing files in a deferred manner. +//! +//! This file could make sense to have in `kernel::fs`, but it was rejected for being too +//! Binder-specific. + +use core::mem::MaybeUninit; +use kernel::{ + alloc::{AllocError, Flags}, + bindings, + prelude::*, +}; + +/// Helper used for closing file descriptors in a way that is safe even if the file is currently +/// held using `fdget`. +/// +/// Additional motivation can be found in commit 80cd795630d6 ("binder: fix use-after-free due to +/// ksys_close() during fdget()") and in the comments on `binder_do_fd_close`. +pub(crate) struct DeferredFdCloser { + inner: KBox, +} + +/// SAFETY: This just holds an allocation with no real content, so there's no safety issue with +/// moving it across threads. +unsafe impl Send for DeferredFdCloser {} +/// SAFETY: This just holds an allocation with no real content, so there's no safety issue with +/// moving it across threads. +unsafe impl Sync for DeferredFdCloser {} + +/// # Invariants +/// +/// If the `file` pointer is non-null, then it points at a `struct file` and owns a refcount to +/// that file. +#[repr(C)] +struct DeferredFdCloserInner { + twork: MaybeUninit, + file: *mut bindings::file, +} + +impl DeferredFdCloser { + /// Create a new [`DeferredFdCloser`]. + pub(crate) fn new(flags: Flags) -> Result { + Ok(Self { + // INVARIANT: The `file` pointer is null, so the type invariant does not apply. + inner: KBox::new( + DeferredFdCloserInner { + twork: MaybeUninit::uninit(), + file: core::ptr::null_mut(), + }, + flags, + )?, + }) + } + + /// Schedule a task work that closes the file descriptor when this task returns to userspace. + /// + /// Fails if this is called from a context where we cannot run work when returning to + /// userspace. (E.g., from a kthread.) + pub(crate) fn close_fd(self, fd: u32) -> Result<(), DeferredFdCloseError> { + use bindings::task_work_notify_mode_TWA_RESUME as TWA_RESUME; + + // In this method, we schedule the task work before closing the file. This is because + // scheduling a task work is fallible, and we need to know whether it will fail before we + // attempt to close the file. + + // Task works are not available on kthreads. + let current = kernel::current!(); + + // Check if this is a kthread. + // SAFETY: Reading `flags` from a task is always okay. + if unsafe { ((*current.as_ptr()).flags & bindings::PF_KTHREAD) != 0 } { + return Err(DeferredFdCloseError::TaskWorkUnavailable); + } + + // Transfer ownership of the box's allocation to a raw pointer. This disables the + // destructor, so we must manually convert it back to a KBox to drop it. + // + // Until we convert it back to a `KBox`, there are no aliasing requirements on this + // pointer. + let inner = KBox::into_raw(self.inner); + + // The `callback_head` field is first in the struct, so this cast correctly gives us a + // pointer to the field. + let callback_head = inner.cast::(); + // SAFETY: This pointer offset operation does not go out-of-bounds. + let file_field = unsafe { core::ptr::addr_of_mut!((*inner).file) }; + + let current = current.as_ptr(); + + // SAFETY: This function currently has exclusive access to the `DeferredFdCloserInner`, so + // it is okay for us to perform unsynchronized writes to its `callback_head` field. + unsafe { bindings::init_task_work(callback_head, Some(Self::do_close_fd)) }; + + // SAFETY: This inserts the `DeferredFdCloserInner` into the task workqueue for the current + // task. If this operation is successful, then this transfers exclusive ownership of the + // `callback_head` field to the C side until it calls `do_close_fd`, and we don't touch or + // invalidate the field during that time. + // + // When the C side calls `do_close_fd`, the safety requirements of that method are + // satisfied because when a task work is executed, the callback is given ownership of the + // pointer. + // + // The file pointer is currently null. If it is changed to be non-null before `do_close_fd` + // is called, then that change happens due to the write at the end of this function, and + // that write has a safety comment that explains why the refcount can be dropped when + // `do_close_fd` runs. + let res = unsafe { bindings::task_work_add(current, callback_head, TWA_RESUME) }; + + if res != 0 { + // SAFETY: Scheduling the task work failed, so we still have ownership of the box, so + // we may destroy it. + unsafe { drop(KBox::from_raw(inner)) }; + + return Err(DeferredFdCloseError::TaskWorkUnavailable); + } + + // This removes the fd from the fd table in `current`. The file is not fully closed until + // `filp_close` is called. We are given ownership of one refcount to the file. + // + // SAFETY: This is safe no matter what `fd` is. If the `fd` is valid (that is, if the + // pointer is non-null), then we call `filp_close` on the returned pointer as required by + // `file_close_fd`. + let file = unsafe { bindings::file_close_fd(fd) }; + if file.is_null() { + // We don't clean up the task work since that might be expensive if the task work queue + // is long. Just let it execute and let it clean up for itself. + return Err(DeferredFdCloseError::BadFd); + } + + // Acquire a second refcount to the file. + // + // SAFETY: The `file` pointer points at a file with a non-zero refcount. + unsafe { bindings::get_file(file) }; + + // This method closes the fd, consuming one of our two refcounts. There could be active + // light refcounts created from that fd, so we must ensure that the file has a positive + // refcount for the duration of those active light refcounts. We do that by holding on to + // the second refcount until the current task returns to userspace. + // + // SAFETY: The `file` pointer is valid. Passing `current->files` as the file table to close + // it in is correct, since we just got the `fd` from `file_close_fd` which also uses + // `current->files`. + // + // Note: fl_owner_t is currently a void pointer. + unsafe { bindings::filp_close(file, (*current).files as bindings::fl_owner_t) }; + + // We update the file pointer that the task work is supposed to fput. This transfers + // ownership of our last refcount. + // + // INVARIANT: This changes the `file` field of a `DeferredFdCloserInner` from null to + // non-null. This doesn't break the type invariant for `DeferredFdCloserInner` because we + // still own a refcount to the file, so we can pass ownership of that refcount to the + // `DeferredFdCloserInner`. + // + // When `do_close_fd` runs, it must be safe for it to `fput` the refcount. However, this is + // the case because all light refcounts that are associated with the fd we closed + // previously must be dropped when `do_close_fd`, since light refcounts must be dropped + // before returning to userspace. + // + // SAFETY: Task works are executed on the current thread right before we return to + // userspace, so this write is guaranteed to happen before `do_close_fd` is called, which + // means that a race is not possible here. + unsafe { *file_field = file }; + + Ok(()) + } + + /// # Safety + /// + /// The provided pointer must point at the `twork` field of a `DeferredFdCloserInner` stored in + /// a `KBox`, and the caller must pass exclusive ownership of that `KBox`. Furthermore, if the + /// file pointer is non-null, then it must be okay to release the refcount by calling `fput`. + unsafe extern "C" fn do_close_fd(inner: *mut bindings::callback_head) { + // SAFETY: The caller just passed us ownership of this box. + let inner = unsafe { KBox::from_raw(inner.cast::()) }; + if !inner.file.is_null() { + // SAFETY: By the type invariants, we own a refcount to this file, and the caller + // guarantees that dropping the refcount now is okay. + unsafe { bindings::fput(inner.file) }; + } + // The allocation is freed when `inner` goes out of scope. + } +} + +/// Represents a failure to close an fd in a deferred manner. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub(crate) enum DeferredFdCloseError { + /// Closing the fd failed because we were unable to schedule a task work. + TaskWorkUnavailable, + /// Closing the fd failed because the fd does not exist. + BadFd, +} + +impl From for Error { + fn from(err: DeferredFdCloseError) -> Error { + match err { + DeferredFdCloseError::TaskWorkUnavailable => ESRCH, + DeferredFdCloseError::BadFd => EBADF, + } + } +} diff --git a/drivers/android/binder/defs.rs b/drivers/android/binder/defs.rs new file mode 100644 index 000000000000..33f51b4139c7 --- /dev/null +++ b/drivers/android/binder/defs.rs @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use core::mem::MaybeUninit; +use core::ops::{Deref, DerefMut}; +use kernel::{ + transmute::{AsBytes, FromBytes}, + uapi::{self, *}, +}; + +macro_rules! pub_no_prefix { + ($prefix:ident, $($newname:ident),+ $(,)?) => { + $(pub(crate) const $newname: u32 = kernel::macros::concat_idents!($prefix, $newname);)+ + }; +} + +pub_no_prefix!( + binder_driver_return_protocol_, + BR_TRANSACTION, + BR_TRANSACTION_SEC_CTX, + BR_REPLY, + BR_DEAD_REPLY, + BR_FAILED_REPLY, + BR_FROZEN_REPLY, + BR_NOOP, + BR_SPAWN_LOOPER, + BR_TRANSACTION_COMPLETE, + BR_TRANSACTION_PENDING_FROZEN, + BR_ONEWAY_SPAM_SUSPECT, + BR_OK, + BR_ERROR, + BR_INCREFS, + BR_ACQUIRE, + BR_RELEASE, + BR_DECREFS, + BR_DEAD_BINDER, + BR_CLEAR_DEATH_NOTIFICATION_DONE, + BR_FROZEN_BINDER, + BR_CLEAR_FREEZE_NOTIFICATION_DONE, +); + +pub_no_prefix!( + binder_driver_command_protocol_, + BC_TRANSACTION, + BC_TRANSACTION_SG, + BC_REPLY, + BC_REPLY_SG, + BC_FREE_BUFFER, + BC_ENTER_LOOPER, + BC_EXIT_LOOPER, + BC_REGISTER_LOOPER, + BC_INCREFS, + BC_ACQUIRE, + BC_RELEASE, + BC_DECREFS, + BC_INCREFS_DONE, + BC_ACQUIRE_DONE, + BC_REQUEST_DEATH_NOTIFICATION, + BC_CLEAR_DEATH_NOTIFICATION, + BC_DEAD_BINDER_DONE, + BC_REQUEST_FREEZE_NOTIFICATION, + BC_CLEAR_FREEZE_NOTIFICATION, + BC_FREEZE_NOTIFICATION_DONE, +); + +pub_no_prefix!( + flat_binder_object_flags_, + FLAT_BINDER_FLAG_ACCEPTS_FDS, + FLAT_BINDER_FLAG_TXN_SECURITY_CTX +); + +pub_no_prefix!( + transaction_flags_, + TF_ONE_WAY, + TF_ACCEPT_FDS, + TF_CLEAR_BUF, + TF_UPDATE_TXN +); + +pub(crate) use uapi::{ + BINDER_TYPE_BINDER, BINDER_TYPE_FD, BINDER_TYPE_FDA, BINDER_TYPE_HANDLE, BINDER_TYPE_PTR, + BINDER_TYPE_WEAK_BINDER, BINDER_TYPE_WEAK_HANDLE, +}; + +macro_rules! decl_wrapper { + ($newname:ident, $wrapped:ty) => { + // Define a wrapper around the C type. Use `MaybeUninit` to enforce that the value of + // padding bytes must be preserved. + #[derive(Copy, Clone)] + #[repr(transparent)] + pub(crate) struct $newname(MaybeUninit<$wrapped>); + + // SAFETY: This macro is only used with types where this is ok. + unsafe impl FromBytes for $newname {} + // SAFETY: This macro is only used with types where this is ok. + unsafe impl AsBytes for $newname {} + + impl Deref for $newname { + type Target = $wrapped; + fn deref(&self) -> &Self::Target { + // SAFETY: We use `MaybeUninit` only to preserve padding. The value must still + // always be valid. + unsafe { self.0.assume_init_ref() } + } + } + + impl DerefMut for $newname { + fn deref_mut(&mut self) -> &mut Self::Target { + // SAFETY: We use `MaybeUninit` only to preserve padding. The value must still + // always be valid. + unsafe { self.0.assume_init_mut() } + } + } + + impl Default for $newname { + fn default() -> Self { + // Create a new value of this type where all bytes (including padding) are zeroed. + Self(MaybeUninit::zeroed()) + } + } + }; +} + +decl_wrapper!(BinderNodeDebugInfo, uapi::binder_node_debug_info); +decl_wrapper!(BinderNodeInfoForRef, uapi::binder_node_info_for_ref); +decl_wrapper!(FlatBinderObject, uapi::flat_binder_object); +decl_wrapper!(BinderFdObject, uapi::binder_fd_object); +decl_wrapper!(BinderFdArrayObject, uapi::binder_fd_array_object); +decl_wrapper!(BinderObjectHeader, uapi::binder_object_header); +decl_wrapper!(BinderBufferObject, uapi::binder_buffer_object); +decl_wrapper!(BinderTransactionData, uapi::binder_transaction_data); +decl_wrapper!( + BinderTransactionDataSecctx, + uapi::binder_transaction_data_secctx +); +decl_wrapper!(BinderTransactionDataSg, uapi::binder_transaction_data_sg); +decl_wrapper!(BinderWriteRead, uapi::binder_write_read); +decl_wrapper!(BinderVersion, uapi::binder_version); +decl_wrapper!(BinderFrozenStatusInfo, uapi::binder_frozen_status_info); +decl_wrapper!(BinderFreezeInfo, uapi::binder_freeze_info); +decl_wrapper!(BinderFrozenStateInfo, uapi::binder_frozen_state_info); +decl_wrapper!(BinderHandleCookie, uapi::binder_handle_cookie); +decl_wrapper!(ExtendedError, uapi::binder_extended_error); + +impl BinderVersion { + pub(crate) fn current() -> Self { + Self(MaybeUninit::new(uapi::binder_version { + protocol_version: BINDER_CURRENT_PROTOCOL_VERSION as _, + })) + } +} + +impl BinderTransactionData { + pub(crate) fn with_buffers_size(self, buffers_size: u64) -> BinderTransactionDataSg { + BinderTransactionDataSg(MaybeUninit::new(uapi::binder_transaction_data_sg { + transaction_data: *self, + buffers_size, + })) + } +} + +impl BinderTransactionDataSecctx { + /// View the inner data as wrapped in `BinderTransactionData`. + pub(crate) fn tr_data(&mut self) -> &mut BinderTransactionData { + // SAFETY: Transparent wrapper is safe to transmute. + unsafe { + &mut *(&mut self.transaction_data as *mut uapi::binder_transaction_data + as *mut BinderTransactionData) + } + } +} + +impl ExtendedError { + pub(crate) fn new(id: u32, command: u32, param: i32) -> Self { + Self(MaybeUninit::new(uapi::binder_extended_error { + id, + command, + param, + })) + } +} diff --git a/drivers/android/binder/error.rs b/drivers/android/binder/error.rs new file mode 100644 index 000000000000..9921827267d0 --- /dev/null +++ b/drivers/android/binder/error.rs @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::prelude::*; + +use crate::defs::*; + +pub(crate) type BinderResult = core::result::Result; + +/// An error that will be returned to userspace via the `BINDER_WRITE_READ` ioctl rather than via +/// errno. +pub(crate) struct BinderError { + pub(crate) reply: u32, + source: Option, +} + +impl BinderError { + pub(crate) fn new_dead() -> Self { + Self { + reply: BR_DEAD_REPLY, + source: None, + } + } + + pub(crate) fn new_frozen() -> Self { + Self { + reply: BR_FROZEN_REPLY, + source: None, + } + } + + pub(crate) fn new_frozen_oneway() -> Self { + Self { + reply: BR_TRANSACTION_PENDING_FROZEN, + source: None, + } + } + + pub(crate) fn is_dead(&self) -> bool { + self.reply == BR_DEAD_REPLY + } + + pub(crate) fn as_errno(&self) -> kernel::ffi::c_int { + self.source.unwrap_or(EINVAL).to_errno() + } + + pub(crate) fn should_pr_warn(&self) -> bool { + self.source.is_some() + } +} + +/// Convert an errno into a `BinderError` and store the errno used to construct it. The errno +/// should be stored as the thread's extended error when given to userspace. +impl From for BinderError { + fn from(source: Error) -> Self { + Self { + reply: BR_FAILED_REPLY, + source: Some(source), + } + } +} + +impl From for BinderError { + fn from(source: kernel::fs::file::BadFdError) -> Self { + BinderError::from(Error::from(source)) + } +} + +impl From for BinderError { + fn from(_: kernel::alloc::AllocError) -> Self { + Self { + reply: BR_FAILED_REPLY, + source: Some(ENOMEM), + } + } +} + +impl core::fmt::Debug for BinderError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.reply { + BR_FAILED_REPLY => match self.source.as_ref() { + Some(source) => f + .debug_struct("BR_FAILED_REPLY") + .field("source", source) + .finish(), + None => f.pad("BR_FAILED_REPLY"), + }, + BR_DEAD_REPLY => f.pad("BR_DEAD_REPLY"), + BR_FROZEN_REPLY => f.pad("BR_FROZEN_REPLY"), + BR_TRANSACTION_PENDING_FROZEN => f.pad("BR_TRANSACTION_PENDING_FROZEN"), + BR_TRANSACTION_COMPLETE => f.pad("BR_TRANSACTION_COMPLETE"), + _ => f + .debug_struct("BinderError") + .field("reply", &self.reply) + .finish(), + } + } +} diff --git a/drivers/android/binder/freeze.rs b/drivers/android/binder/freeze.rs new file mode 100644 index 000000000000..e68c3c8bc55a --- /dev/null +++ b/drivers/android/binder/freeze.rs @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + alloc::AllocError, + list::ListArc, + prelude::*, + rbtree::{self, RBTreeNodeReservation}, + seq_file::SeqFile, + seq_print, + sync::{Arc, UniqueArc}, + uaccess::UserSliceReader, +}; + +use crate::{ + defs::*, node::Node, process::Process, thread::Thread, BinderReturnWriter, DArc, DLArc, + DTRWrap, DeliverToRead, +}; + +#[derive(Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] +pub(crate) struct FreezeCookie(u64); + +/// Represents a listener for changes to the frozen state of a process. +pub(crate) struct FreezeListener { + /// The node we are listening for. + pub(crate) node: DArc, + /// The cookie of this freeze listener. + cookie: FreezeCookie, + /// What value of `is_frozen` did we most recently tell userspace about? + last_is_frozen: Option, + /// We sent a `BR_FROZEN_BINDER` and we are waiting for `BC_FREEZE_NOTIFICATION_DONE` before + /// sending any other commands. + is_pending: bool, + /// Userspace sent `BC_CLEAR_FREEZE_NOTIFICATION` and we need to reply with + /// `BR_CLEAR_FREEZE_NOTIFICATION_DONE` as soon as possible. If `is_pending` is set, then we + /// must wait for it to be unset before we can reply. + is_clearing: bool, + /// Number of cleared duplicates that can't be deleted until userspace sends + /// `BC_FREEZE_NOTIFICATION_DONE`. + num_pending_duplicates: u64, + /// Number of cleared duplicates that can be deleted. + num_cleared_duplicates: u64, +} + +impl FreezeListener { + /// Is it okay to create a new listener with the same cookie as this one for the provided node? + /// + /// Under some scenarios, userspace may delete a freeze listener and immediately recreate it + /// with the same cookie. This results in duplicate listeners. To avoid issues with ambiguity, + /// we allow this only if the new listener is for the same node, and we also require that the + /// old listener has already been cleared. + fn allow_duplicate(&self, node: &DArc) -> bool { + Arc::ptr_eq(&self.node, node) && self.is_clearing + } +} + +type UninitFM = UniqueArc>>; + +/// Represents a notification that the freeze state has changed. +pub(crate) struct FreezeMessage { + cookie: FreezeCookie, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for FreezeMessage { + untracked; + } +} + +impl FreezeMessage { + fn new(flags: kernel::alloc::Flags) -> Result { + UniqueArc::new_uninit(flags) + } + + fn init(ua: UninitFM, cookie: FreezeCookie) -> DLArc { + match ua.pin_init_with(DTRWrap::new(FreezeMessage { cookie })) { + Ok(msg) => ListArc::from(msg), + Err(err) => match err {}, + } + } +} + +impl DeliverToRead for FreezeMessage { + fn do_work( + self: DArc, + thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let _removed_listener; + let mut node_refs = thread.process.node_refs.lock(); + let Some(mut freeze_entry) = node_refs.freeze_listeners.find_mut(&self.cookie) else { + return Ok(true); + }; + let freeze = freeze_entry.get_mut(); + + if freeze.num_cleared_duplicates > 0 { + freeze.num_cleared_duplicates -= 1; + drop(node_refs); + writer.write_code(BR_CLEAR_FREEZE_NOTIFICATION_DONE)?; + writer.write_payload(&self.cookie.0)?; + return Ok(true); + } + + if freeze.is_pending { + return Ok(true); + } + if freeze.is_clearing { + _removed_listener = freeze_entry.remove_node(); + drop(node_refs); + writer.write_code(BR_CLEAR_FREEZE_NOTIFICATION_DONE)?; + writer.write_payload(&self.cookie.0)?; + Ok(true) + } else { + let is_frozen = freeze.node.owner.inner.lock().is_frozen; + if freeze.last_is_frozen == Some(is_frozen) { + return Ok(true); + } + + let mut state_info = BinderFrozenStateInfo::default(); + state_info.is_frozen = is_frozen as u32; + state_info.cookie = freeze.cookie.0; + freeze.is_pending = true; + freeze.last_is_frozen = Some(is_frozen); + drop(node_refs); + + writer.write_code(BR_FROZEN_BINDER)?; + writer.write_payload(&state_info)?; + // BR_FROZEN_BINDER notifications can cause transactions + Ok(false) + } + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + #[inline(never)] + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!(m, "{}has frozen binder\n", prefix); + Ok(()) + } +} + +impl FreezeListener { + pub(crate) fn on_process_exit(&self, proc: &Arc) { + if !self.is_clearing { + self.node.remove_freeze_listener(proc); + } + } +} + +impl Process { + pub(crate) fn request_freeze_notif( + self: &Arc, + reader: &mut UserSliceReader, + ) -> Result<()> { + let hc = reader.read::()?; + let handle = hc.handle; + let cookie = FreezeCookie(hc.cookie); + + let msg = FreezeMessage::new(GFP_KERNEL)?; + let alloc = RBTreeNodeReservation::new(GFP_KERNEL)?; + + let mut node_refs_guard = self.node_refs.lock(); + let node_refs = &mut *node_refs_guard; + let Some(info) = node_refs.by_handle.get_mut(&handle) else { + pr_warn!("BC_REQUEST_FREEZE_NOTIFICATION invalid ref {}\n", handle); + return Err(EINVAL); + }; + if info.freeze().is_some() { + pr_warn!("BC_REQUEST_FREEZE_NOTIFICATION already set\n"); + return Err(EINVAL); + } + let node_ref = info.node_ref(); + let freeze_entry = node_refs.freeze_listeners.entry(cookie); + + if let rbtree::Entry::Occupied(ref dupe) = freeze_entry { + if !dupe.get().allow_duplicate(&node_ref.node) { + pr_warn!("BC_REQUEST_FREEZE_NOTIFICATION duplicate cookie\n"); + return Err(EINVAL); + } + } + + // All failure paths must come before this call, and all modifications must come after this + // call. + node_ref.node.add_freeze_listener(self, GFP_KERNEL)?; + + match freeze_entry { + rbtree::Entry::Vacant(entry) => { + entry.insert( + FreezeListener { + cookie, + node: node_ref.node.clone(), + last_is_frozen: None, + is_pending: false, + is_clearing: false, + num_pending_duplicates: 0, + num_cleared_duplicates: 0, + }, + alloc, + ); + } + rbtree::Entry::Occupied(mut dupe) => { + let dupe = dupe.get_mut(); + if dupe.is_pending { + dupe.num_pending_duplicates += 1; + } else { + dupe.num_cleared_duplicates += 1; + } + dupe.last_is_frozen = None; + dupe.is_pending = false; + dupe.is_clearing = false; + } + } + + *info.freeze() = Some(cookie); + let msg = FreezeMessage::init(msg, cookie); + drop(node_refs_guard); + let _ = self.push_work(msg); + Ok(()) + } + + pub(crate) fn freeze_notif_done(self: &Arc, reader: &mut UserSliceReader) -> Result<()> { + let cookie = FreezeCookie(reader.read()?); + let alloc = FreezeMessage::new(GFP_KERNEL)?; + let mut node_refs_guard = self.node_refs.lock(); + let node_refs = &mut *node_refs_guard; + let Some(freeze) = node_refs.freeze_listeners.get_mut(&cookie) else { + pr_warn!("BC_FREEZE_NOTIFICATION_DONE {:016x} not found\n", cookie.0); + return Err(EINVAL); + }; + let mut clear_msg = None; + if freeze.num_pending_duplicates > 0 { + clear_msg = Some(FreezeMessage::init(alloc, cookie)); + freeze.num_pending_duplicates -= 1; + freeze.num_cleared_duplicates += 1; + } else { + if !freeze.is_pending { + pr_warn!( + "BC_FREEZE_NOTIFICATION_DONE {:016x} not pending\n", + cookie.0 + ); + return Err(EINVAL); + } + if freeze.is_clearing { + // Immediately send another FreezeMessage for BR_CLEAR_FREEZE_NOTIFICATION_DONE. + clear_msg = Some(FreezeMessage::init(alloc, cookie)); + } + freeze.is_pending = false; + } + drop(node_refs_guard); + if let Some(clear_msg) = clear_msg { + let _ = self.push_work(clear_msg); + } + Ok(()) + } + + pub(crate) fn clear_freeze_notif(self: &Arc, reader: &mut UserSliceReader) -> Result<()> { + let hc = reader.read::()?; + let handle = hc.handle; + let cookie = FreezeCookie(hc.cookie); + + let alloc = FreezeMessage::new(GFP_KERNEL)?; + let mut node_refs_guard = self.node_refs.lock(); + let node_refs = &mut *node_refs_guard; + let Some(info) = node_refs.by_handle.get_mut(&handle) else { + pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION invalid ref {}\n", handle); + return Err(EINVAL); + }; + let Some(info_cookie) = info.freeze() else { + pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION freeze notification not active\n"); + return Err(EINVAL); + }; + if *info_cookie != cookie { + pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION freeze notification cookie mismatch\n"); + return Err(EINVAL); + } + let Some(listener) = node_refs.freeze_listeners.get_mut(&cookie) else { + pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION invalid cookie {}\n", handle); + return Err(EINVAL); + }; + listener.is_clearing = true; + listener.node.remove_freeze_listener(self); + *info.freeze() = None; + let mut msg = None; + if !listener.is_pending { + msg = Some(FreezeMessage::init(alloc, cookie)); + } + drop(node_refs_guard); + + if let Some(msg) = msg { + let _ = self.push_work(msg); + } + Ok(()) + } + + fn get_freeze_cookie(&self, node: &DArc) -> Option { + let node_refs = &mut *self.node_refs.lock(); + let handle = node_refs.by_node.get(&node.global_id())?; + let node_ref = node_refs.by_handle.get_mut(handle)?; + *node_ref.freeze() + } + + /// Creates a vector of every freeze listener on this process. + /// + /// Returns pairs of the remote process listening for notifications and the local node it is + /// listening on. + #[expect(clippy::type_complexity)] + fn find_freeze_recipients(&self) -> Result, Arc)>, AllocError> { + // Defined before `inner` to drop after releasing spinlock if `push_within_capacity` fails. + let mut node_proc_pair; + + // We pre-allocate space for up to 8 recipients before we take the spinlock. However, if + // the allocation fails, use a vector with a capacity of zero instead of failing. After + // all, there might not be any freeze listeners, in which case this operation could still + // succeed. + let mut recipients = + KVVec::with_capacity(8, GFP_KERNEL).unwrap_or_else(|_err| KVVec::new()); + + let mut inner = self.lock_with_nodes(); + let mut curr = inner.nodes.cursor_front(); + while let Some(cursor) = curr { + let (key, node) = cursor.current(); + let key = *key; + let list = node.freeze_list(&inner.inner); + let len = list.len(); + + if recipients.spare_capacity_mut().len() < len { + drop(inner); + recipients.reserve(len, GFP_KERNEL)?; + inner = self.lock_with_nodes(); + // Find the node we were looking at and try again. If the set of nodes was changed, + // then just proceed to the next node. This is ok because we don't guarantee the + // inclusion of nodes that are added or removed in parallel with this operation. + curr = inner.nodes.cursor_lower_bound(&key); + continue; + } + + for proc in list { + node_proc_pair = (node.clone(), proc.clone()); + recipients + .push_within_capacity(node_proc_pair) + .map_err(|_| { + pr_err!( + "push_within_capacity failed even though we checked the capacity\n" + ); + AllocError + })?; + } + + curr = cursor.move_next(); + } + Ok(recipients) + } + + /// Prepare allocations for sending freeze messages. + pub(crate) fn prepare_freeze_messages(&self) -> Result { + let recipients = self.find_freeze_recipients()?; + let mut batch = KVVec::with_capacity(recipients.len(), GFP_KERNEL)?; + for (node, proc) in recipients { + let Some(cookie) = proc.get_freeze_cookie(&node) else { + // If the freeze listener was removed in the meantime, just discard the + // notification. + continue; + }; + let msg_alloc = FreezeMessage::new(GFP_KERNEL)?; + let msg = FreezeMessage::init(msg_alloc, cookie); + batch.push((proc, msg), GFP_KERNEL)?; + } + + Ok(FreezeMessages { batch }) + } +} + +pub(crate) struct FreezeMessages { + batch: KVVec<(Arc, DLArc)>, +} + +impl FreezeMessages { + pub(crate) fn send_messages(self) { + for (proc, msg) in self.batch { + let _ = proc.push_work(msg); + } + } +} diff --git a/drivers/android/binder/node.rs b/drivers/android/binder/node.rs new file mode 100644 index 000000000000..ade895ef791e --- /dev/null +++ b/drivers/android/binder/node.rs @@ -0,0 +1,1131 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + list::{AtomicTracker, List, ListArc, ListLinks, TryNewListArc}, + prelude::*, + seq_file::SeqFile, + seq_print, + sync::lock::{spinlock::SpinLockBackend, Guard}, + sync::{Arc, LockedBy, SpinLock}, +}; + +use crate::{ + defs::*, + error::BinderError, + process::{NodeRefInfo, Process, ProcessInner}, + thread::Thread, + transaction::Transaction, + BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead, +}; + +use core::mem; + +mod wrapper; +pub(crate) use self::wrapper::CritIncrWrapper; + +#[derive(Debug)] +pub(crate) struct CouldNotDeliverCriticalIncrement; + +/// Keeps track of how this node is scheduled. +/// +/// There are two ways to schedule a node to a work list. Just schedule the node itself, or +/// allocate a wrapper that references the node and schedule the wrapper. These wrappers exists to +/// make it possible to "move" a node from one list to another - when `do_work` is called directly +/// on the `Node`, then it's a no-op if there's also a pending wrapper. +/// +/// Wrappers are generally only needed for zero-to-one refcount increments, and there are two cases +/// of this: weak increments and strong increments. We call such increments "critical" because it +/// is critical that they are delivered to the thread doing the increment. Some examples: +/// +/// * One thread makes a zero-to-one strong increment, and another thread makes a zero-to-one weak +/// increment. Delivering the node to the thread doing the weak increment is wrong, since the +/// thread doing the strong increment may have ended a long time ago when the command is actually +/// processed by userspace. +/// +/// * We have a weak reference and are about to drop it on one thread. But then another thread does +/// a zero-to-one strong increment. If the strong increment gets sent to the thread that was +/// about to drop the weak reference, then the strong increment could be processed after the +/// other thread has already exited, which would be too late. +/// +/// Note that trying to create a `ListArc` to the node can succeed even if `has_normal_push` is +/// set. This is because another thread might just have popped the node from a todo list, but not +/// yet called `do_work`. However, if `has_normal_push` is false, then creating a `ListArc` should +/// always succeed. +/// +/// Like the other fields in `NodeInner`, the delivery state is protected by the process lock. +struct DeliveryState { + /// Is the `Node` currently scheduled? + has_pushed_node: bool, + + /// Is a wrapper currently scheduled? + /// + /// The wrapper is used only for strong zero2one increments. + has_pushed_wrapper: bool, + + /// Is the currently scheduled `Node` scheduled due to a weak zero2one increment? + /// + /// Weak zero2one operations are always scheduled using the `Node`. + has_weak_zero2one: bool, + + /// Is the currently scheduled wrapper/`Node` scheduled due to a strong zero2one increment? + /// + /// If `has_pushed_wrapper` is set, then the strong zero2one increment was scheduled using the + /// wrapper. Otherwise, `has_pushed_node` must be set and it was scheduled using the `Node`. + has_strong_zero2one: bool, +} + +impl DeliveryState { + fn should_normal_push(&self) -> bool { + !self.has_pushed_node && !self.has_pushed_wrapper + } + + fn did_normal_push(&mut self) { + assert!(self.should_normal_push()); + self.has_pushed_node = true; + } + + fn should_push_weak_zero2one(&self) -> bool { + !self.has_weak_zero2one && !self.has_strong_zero2one + } + + fn can_push_weak_zero2one_normally(&self) -> bool { + !self.has_pushed_node + } + + fn did_push_weak_zero2one(&mut self) { + assert!(self.should_push_weak_zero2one()); + assert!(self.can_push_weak_zero2one_normally()); + self.has_pushed_node = true; + self.has_weak_zero2one = true; + } + + fn should_push_strong_zero2one(&self) -> bool { + !self.has_strong_zero2one + } + + fn can_push_strong_zero2one_normally(&self) -> bool { + !self.has_pushed_node + } + + fn did_push_strong_zero2one(&mut self) { + assert!(self.should_push_strong_zero2one()); + assert!(self.can_push_strong_zero2one_normally()); + self.has_pushed_node = true; + self.has_strong_zero2one = true; + } + + fn did_push_strong_zero2one_wrapper(&mut self) { + assert!(self.should_push_strong_zero2one()); + assert!(!self.can_push_strong_zero2one_normally()); + self.has_pushed_wrapper = true; + self.has_strong_zero2one = true; + } +} + +struct CountState { + /// The reference count. + count: usize, + /// Whether the process that owns this node thinks that we hold a refcount on it. (Note that + /// even if count is greater than one, we only increment it once in the owning process.) + has_count: bool, +} + +impl CountState { + fn new() -> Self { + Self { + count: 0, + has_count: false, + } + } +} + +struct NodeInner { + /// Strong refcounts held on this node by `NodeRef` objects. + strong: CountState, + /// Weak refcounts held on this node by `NodeRef` objects. + weak: CountState, + delivery_state: DeliveryState, + /// The binder driver guarantees that oneway transactions sent to the same node are serialized, + /// that is, userspace will not be given the next one until it has finished processing the + /// previous oneway transaction. This is done to avoid the case where two oneway transactions + /// arrive in opposite order from the order in which they were sent. (E.g., they could be + /// delivered to two different threads, which could appear as-if they were sent in opposite + /// order.) + /// + /// To fix that, we store pending oneway transactions in a separate list in the node, and don't + /// deliver the next oneway transaction until userspace signals that it has finished processing + /// the previous oneway transaction by calling the `BC_FREE_BUFFER` ioctl. + oneway_todo: List>, + /// Keeps track of whether this node has a pending oneway transaction. + /// + /// When this is true, incoming oneway transactions are stored in `oneway_todo`, instead of + /// being delivered directly to the process. + has_oneway_transaction: bool, + /// List of processes to deliver a notification to when this node is destroyed (usually due to + /// the process dying). + death_list: List, 1>, + /// List of processes to deliver freeze notifications to. + freeze_list: KVVec>, + /// The number of active BR_INCREFS or BR_ACQUIRE operations. (should be maximum two) + /// + /// If this is non-zero, then we postpone any BR_RELEASE or BR_DECREFS notifications until the + /// active operations have ended. This avoids the situation an increment and decrement get + /// reordered from userspace's perspective. + active_inc_refs: u8, + /// List of `NodeRefInfo` objects that reference this node. + refs: List, +} + +#[pin_data] +pub(crate) struct Node { + pub(crate) debug_id: usize, + ptr: u64, + pub(crate) cookie: u64, + pub(crate) flags: u32, + pub(crate) owner: Arc, + inner: LockedBy, + #[pin] + links_track: AtomicTracker, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Node { + tracked_by links_track: AtomicTracker; + } +} + +// Make `oneway_todo` work. +kernel::list::impl_list_item! { + impl ListItem<0> for DTRWrap { + using ListLinks { self.links.inner }; + } +} + +impl Node { + pub(crate) fn new( + ptr: u64, + cookie: u64, + flags: u32, + owner: Arc, + ) -> impl PinInit { + pin_init!(Self { + inner: LockedBy::new( + &owner.inner, + NodeInner { + strong: CountState::new(), + weak: CountState::new(), + delivery_state: DeliveryState { + has_pushed_node: false, + has_pushed_wrapper: false, + has_weak_zero2one: false, + has_strong_zero2one: false, + }, + death_list: List::new(), + oneway_todo: List::new(), + freeze_list: KVVec::new(), + has_oneway_transaction: false, + active_inc_refs: 0, + refs: List::new(), + }, + ), + debug_id: super::next_debug_id(), + ptr, + cookie, + flags, + owner, + links_track <- AtomicTracker::new(), + }) + } + + pub(crate) fn has_oneway_transaction(&self, owner_inner: &mut ProcessInner) -> bool { + let inner = self.inner.access_mut(owner_inner); + inner.has_oneway_transaction + } + + #[inline(never)] + pub(crate) fn full_debug_print( + &self, + m: &SeqFile, + owner_inner: &mut ProcessInner, + ) -> Result<()> { + let inner = self.inner.access_mut(owner_inner); + seq_print!( + m, + " node {}: u{:016x} c{:016x} hs {} hw {} cs {} cw {}", + self.debug_id, + self.ptr, + self.cookie, + inner.strong.has_count, + inner.weak.has_count, + inner.strong.count, + inner.weak.count, + ); + if !inner.refs.is_empty() { + seq_print!(m, " proc"); + for node_ref in &inner.refs { + seq_print!(m, " {}", node_ref.process.task.pid()); + } + } + seq_print!(m, "\n"); + for t in &inner.oneway_todo { + t.debug_print_inner(m, " pending async transaction "); + } + Ok(()) + } + + /// Insert the `NodeRef` into this `refs` list. + /// + /// # Safety + /// + /// It must be the case that `info.node_ref.node` is this node. + pub(crate) unsafe fn insert_node_info( + &self, + info: ListArc, + ) { + self.inner + .access_mut(&mut self.owner.inner.lock()) + .refs + .push_front(info); + } + + /// Insert the `NodeRef` into this `refs` list. + /// + /// # Safety + /// + /// It must be the case that `info.node_ref.node` is this node. + pub(crate) unsafe fn remove_node_info( + &self, + info: &NodeRefInfo, + ) -> Option> { + // SAFETY: We always insert `NodeRefInfo` objects into the `refs` list of the node that it + // references in `info.node_ref.node`. That is this node, so `info` cannot possibly be in + // the `refs` list of another node. + unsafe { + self.inner + .access_mut(&mut self.owner.inner.lock()) + .refs + .remove(info) + } + } + + /// An id that is unique across all binder nodes on the system. Used as the key in the + /// `by_node` map. + pub(crate) fn global_id(&self) -> usize { + self as *const Node as usize + } + + pub(crate) fn get_id(&self) -> (u64, u64) { + (self.ptr, self.cookie) + } + + pub(crate) fn add_death( + &self, + death: ListArc, 1>, + guard: &mut Guard<'_, ProcessInner, SpinLockBackend>, + ) { + self.inner.access_mut(guard).death_list.push_back(death); + } + + pub(crate) fn inc_ref_done_locked( + self: &DArc, + _strong: bool, + owner_inner: &mut ProcessInner, + ) -> Option> { + let inner = self.inner.access_mut(owner_inner); + if inner.active_inc_refs == 0 { + pr_err!("inc_ref_done called when no active inc_refs"); + return None; + } + + inner.active_inc_refs -= 1; + if inner.active_inc_refs == 0 { + // Having active inc_refs can inhibit dropping of ref-counts. Calculate whether we + // would send a refcount decrement, and if so, tell the caller to schedule us. + let strong = inner.strong.count > 0; + let has_strong = inner.strong.has_count; + let weak = strong || inner.weak.count > 0; + let has_weak = inner.weak.has_count; + + let should_drop_weak = !weak && has_weak; + let should_drop_strong = !strong && has_strong; + + // If we want to drop the ref-count again, tell the caller to schedule a work node for + // that. + let need_push = should_drop_weak || should_drop_strong; + + if need_push && inner.delivery_state.should_normal_push() { + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_normal_push(); + Some(list_arc) + } else { + None + } + } else { + None + } + } + + pub(crate) fn update_refcount_locked( + self: &DArc, + inc: bool, + strong: bool, + count: usize, + owner_inner: &mut ProcessInner, + ) -> Option> { + let is_dead = owner_inner.is_dead; + let inner = self.inner.access_mut(owner_inner); + + // Get a reference to the state we'll update. + let state = if strong { + &mut inner.strong + } else { + &mut inner.weak + }; + + // Update the count and determine whether we need to push work. + let need_push = if inc { + state.count += count; + // TODO: This method shouldn't be used for zero-to-one increments. + !is_dead && !state.has_count + } else { + if state.count < count { + pr_err!("Failure: refcount underflow!"); + return None; + } + state.count -= count; + !is_dead && state.count == 0 && state.has_count + }; + + if need_push && inner.delivery_state.should_normal_push() { + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_normal_push(); + Some(list_arc) + } else { + None + } + } + + pub(crate) fn incr_refcount_allow_zero2one( + self: &DArc, + strong: bool, + owner_inner: &mut ProcessInner, + ) -> Result>, CouldNotDeliverCriticalIncrement> { + let is_dead = owner_inner.is_dead; + let inner = self.inner.access_mut(owner_inner); + + // Get a reference to the state we'll update. + let state = if strong { + &mut inner.strong + } else { + &mut inner.weak + }; + + // Update the count and determine whether we need to push work. + state.count += 1; + if is_dead || state.has_count { + return Ok(None); + } + + // Userspace needs to be notified of this. + if !strong && inner.delivery_state.should_push_weak_zero2one() { + assert!(inner.delivery_state.can_push_weak_zero2one_normally()); + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_push_weak_zero2one(); + Ok(Some(list_arc)) + } else if strong && inner.delivery_state.should_push_strong_zero2one() { + if inner.delivery_state.can_push_strong_zero2one_normally() { + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_push_strong_zero2one(); + Ok(Some(list_arc)) + } else { + state.count -= 1; + Err(CouldNotDeliverCriticalIncrement) + } + } else { + // Work is already pushed, and we don't need to push again. + Ok(None) + } + } + + pub(crate) fn incr_refcount_allow_zero2one_with_wrapper( + self: &DArc, + strong: bool, + wrapper: CritIncrWrapper, + owner_inner: &mut ProcessInner, + ) -> Option> { + match self.incr_refcount_allow_zero2one(strong, owner_inner) { + Ok(Some(node)) => Some(node as _), + Ok(None) => None, + Err(CouldNotDeliverCriticalIncrement) => { + assert!(strong); + let inner = self.inner.access_mut(owner_inner); + inner.strong.count += 1; + inner.delivery_state.did_push_strong_zero2one_wrapper(); + Some(wrapper.init(self.clone())) + } + } + } + + pub(crate) fn update_refcount(self: &DArc, inc: bool, count: usize, strong: bool) { + self.owner + .inner + .lock() + .update_node_refcount(self, inc, strong, count, None); + } + + pub(crate) fn populate_counts( + &self, + out: &mut BinderNodeInfoForRef, + guard: &Guard<'_, ProcessInner, SpinLockBackend>, + ) { + let inner = self.inner.access(guard); + out.strong_count = inner.strong.count as _; + out.weak_count = inner.weak.count as _; + } + + pub(crate) fn populate_debug_info( + &self, + out: &mut BinderNodeDebugInfo, + guard: &Guard<'_, ProcessInner, SpinLockBackend>, + ) { + out.ptr = self.ptr as _; + out.cookie = self.cookie as _; + let inner = self.inner.access(guard); + if inner.strong.has_count { + out.has_strong_ref = 1; + } + if inner.weak.has_count { + out.has_weak_ref = 1; + } + } + + pub(crate) fn force_has_count(&self, guard: &mut Guard<'_, ProcessInner, SpinLockBackend>) { + let inner = self.inner.access_mut(guard); + inner.strong.has_count = true; + inner.weak.has_count = true; + } + + fn write(&self, writer: &mut BinderReturnWriter<'_>, code: u32) -> Result { + writer.write_code(code)?; + writer.write_payload(&self.ptr)?; + writer.write_payload(&self.cookie)?; + Ok(()) + } + + pub(crate) fn submit_oneway( + &self, + transaction: DLArc, + guard: &mut Guard<'_, ProcessInner, SpinLockBackend>, + ) -> Result<(), (BinderError, DLArc)> { + if guard.is_dead { + return Err((BinderError::new_dead(), transaction)); + } + + let inner = self.inner.access_mut(guard); + if inner.has_oneway_transaction { + inner.oneway_todo.push_back(transaction); + } else { + inner.has_oneway_transaction = true; + guard.push_work(transaction)?; + } + Ok(()) + } + + pub(crate) fn release(&self) { + let mut guard = self.owner.inner.lock(); + while let Some(work) = self.inner.access_mut(&mut guard).oneway_todo.pop_front() { + drop(guard); + work.into_arc().cancel(); + guard = self.owner.inner.lock(); + } + + let death_list = core::mem::take(&mut self.inner.access_mut(&mut guard).death_list); + drop(guard); + for death in death_list { + death.into_arc().set_dead(); + } + } + + pub(crate) fn pending_oneway_finished(&self) { + let mut guard = self.owner.inner.lock(); + if guard.is_dead { + // Cleanup will happen in `Process::deferred_release`. + return; + } + + let inner = self.inner.access_mut(&mut guard); + + let transaction = inner.oneway_todo.pop_front(); + inner.has_oneway_transaction = transaction.is_some(); + if let Some(transaction) = transaction { + match guard.push_work(transaction) { + Ok(()) => {} + Err((_err, work)) => { + // Process is dead. + // This shouldn't happen due to the `is_dead` check, but if it does, just drop + // the transaction and return. + drop(guard); + drop(work); + } + } + } + } + + /// Finds an outdated transaction that the given transaction can replace. + /// + /// If one is found, it is removed from the list and returned. + pub(crate) fn take_outdated_transaction( + &self, + new: &Transaction, + guard: &mut Guard<'_, ProcessInner, SpinLockBackend>, + ) -> Option> { + let inner = self.inner.access_mut(guard); + let mut cursor = inner.oneway_todo.cursor_front(); + while let Some(next) = cursor.peek_next() { + if new.can_replace(&next) { + return Some(next.remove()); + } + cursor.move_next(); + } + None + } + + /// This is split into a separate function since it's called by both `Node::do_work` and + /// `NodeWrapper::do_work`. + fn do_work_locked( + &self, + writer: &mut BinderReturnWriter<'_>, + mut guard: Guard<'_, ProcessInner, SpinLockBackend>, + ) -> Result { + let inner = self.inner.access_mut(&mut guard); + let strong = inner.strong.count > 0; + let has_strong = inner.strong.has_count; + let weak = strong || inner.weak.count > 0; + let has_weak = inner.weak.has_count; + + if weak && !has_weak { + inner.weak.has_count = true; + inner.active_inc_refs += 1; + } + + if strong && !has_strong { + inner.strong.has_count = true; + inner.active_inc_refs += 1; + } + + let no_active_inc_refs = inner.active_inc_refs == 0; + let should_drop_weak = no_active_inc_refs && (!weak && has_weak); + let should_drop_strong = no_active_inc_refs && (!strong && has_strong); + if should_drop_weak { + inner.weak.has_count = false; + } + if should_drop_strong { + inner.strong.has_count = false; + } + if no_active_inc_refs && !weak { + // Remove the node if there are no references to it. + guard.remove_node(self.ptr); + } + drop(guard); + + if weak && !has_weak { + self.write(writer, BR_INCREFS)?; + } + if strong && !has_strong { + self.write(writer, BR_ACQUIRE)?; + } + if should_drop_strong { + self.write(writer, BR_RELEASE)?; + } + if should_drop_weak { + self.write(writer, BR_DECREFS)?; + } + + Ok(true) + } + + pub(crate) fn add_freeze_listener( + &self, + process: &Arc, + flags: kernel::alloc::Flags, + ) -> Result { + let mut vec_alloc = KVVec::>::new(); + loop { + let mut guard = self.owner.inner.lock(); + // Do not check for `guard.dead`. The `dead` flag that matters here is the owner of the + // listener, no the target. + let inner = self.inner.access_mut(&mut guard); + let len = inner.freeze_list.len(); + if len >= inner.freeze_list.capacity() { + if len >= vec_alloc.capacity() { + drop(guard); + vec_alloc = KVVec::with_capacity((1 + len).next_power_of_two(), flags)?; + continue; + } + mem::swap(&mut inner.freeze_list, &mut vec_alloc); + for elem in vec_alloc.drain_all() { + inner.freeze_list.push_within_capacity(elem)?; + } + } + inner.freeze_list.push_within_capacity(process.clone())?; + return Ok(()); + } + } + + pub(crate) fn remove_freeze_listener(&self, p: &Arc) { + let _unused_capacity; + let mut guard = self.owner.inner.lock(); + let inner = self.inner.access_mut(&mut guard); + let len = inner.freeze_list.len(); + inner.freeze_list.retain(|proc| !Arc::ptr_eq(proc, p)); + if len == inner.freeze_list.len() { + pr_warn!( + "Could not remove freeze listener for {}\n", + p.pid_in_current_ns() + ); + } + if inner.freeze_list.is_empty() { + _unused_capacity = mem::replace(&mut inner.freeze_list, KVVec::new()); + } + } + + pub(crate) fn freeze_list<'a>(&'a self, guard: &'a ProcessInner) -> &'a [Arc] { + &self.inner.access(guard).freeze_list + } +} + +impl DeliverToRead for Node { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let mut owner_inner = self.owner.inner.lock(); + let inner = self.inner.access_mut(&mut owner_inner); + + assert!(inner.delivery_state.has_pushed_node); + if inner.delivery_state.has_pushed_wrapper { + // If the wrapper is scheduled, then we are either a normal push or weak zero2one + // increment, and the wrapper is a strong zero2one increment, so the wrapper always + // takes precedence over us. + assert!(inner.delivery_state.has_strong_zero2one); + inner.delivery_state.has_pushed_node = false; + inner.delivery_state.has_weak_zero2one = false; + return Ok(true); + } + + inner.delivery_state.has_pushed_node = false; + inner.delivery_state.has_weak_zero2one = false; + inner.delivery_state.has_strong_zero2one = false; + + self.do_work_locked(writer, owner_inner) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + #[inline(never)] + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!( + m, + "{}node work {}: u{:016x} c{:016x}\n", + prefix, + self.debug_id, + self.ptr, + self.cookie, + ); + Ok(()) + } +} + +/// Represents something that holds one or more ref-counts to a `Node`. +/// +/// Whenever process A holds a refcount to a node owned by a different process B, then process A +/// will store a `NodeRef` that refers to the `Node` in process B. When process A releases the +/// refcount, we destroy the NodeRef, which decrements the ref-count in process A. +/// +/// This type is also used for some other cases. For example, a transaction allocation holds a +/// refcount on the target node, and this is implemented by storing a `NodeRef` in the allocation +/// so that the destructor of the allocation will drop a refcount of the `Node`. +pub(crate) struct NodeRef { + pub(crate) node: DArc, + /// How many times does this NodeRef hold a refcount on the Node? + strong_node_count: usize, + weak_node_count: usize, + /// How many times does userspace hold a refcount on this NodeRef? + strong_count: usize, + weak_count: usize, +} + +impl NodeRef { + pub(crate) fn new(node: DArc, strong_count: usize, weak_count: usize) -> Self { + Self { + node, + strong_node_count: strong_count, + weak_node_count: weak_count, + strong_count, + weak_count, + } + } + + pub(crate) fn absorb(&mut self, mut other: Self) { + assert!( + Arc::ptr_eq(&self.node, &other.node), + "absorb called with differing nodes" + ); + self.strong_node_count += other.strong_node_count; + self.weak_node_count += other.weak_node_count; + self.strong_count += other.strong_count; + self.weak_count += other.weak_count; + other.strong_count = 0; + other.weak_count = 0; + other.strong_node_count = 0; + other.weak_node_count = 0; + + if self.strong_node_count >= 2 || self.weak_node_count >= 2 { + let mut guard = self.node.owner.inner.lock(); + let inner = self.node.inner.access_mut(&mut guard); + + if self.strong_node_count >= 2 { + inner.strong.count -= self.strong_node_count - 1; + self.strong_node_count = 1; + assert_ne!(inner.strong.count, 0); + } + if self.weak_node_count >= 2 { + inner.weak.count -= self.weak_node_count - 1; + self.weak_node_count = 1; + assert_ne!(inner.weak.count, 0); + } + } + } + + pub(crate) fn get_count(&self) -> (usize, usize) { + (self.strong_count, self.weak_count) + } + + pub(crate) fn clone(&self, strong: bool) -> Result { + if strong && self.strong_count == 0 { + return Err(EINVAL); + } + Ok(self + .node + .owner + .inner + .lock() + .new_node_ref(self.node.clone(), strong, None)) + } + + /// Updates (increments or decrements) the number of references held against the node. If the + /// count being updated transitions from 0 to 1 or from 1 to 0, the node is notified by having + /// its `update_refcount` function called. + /// + /// Returns whether `self` should be removed (when both counts are zero). + pub(crate) fn update(&mut self, inc: bool, strong: bool) -> bool { + if strong && self.strong_count == 0 { + return false; + } + let (count, node_count, other_count) = if strong { + ( + &mut self.strong_count, + &mut self.strong_node_count, + self.weak_count, + ) + } else { + ( + &mut self.weak_count, + &mut self.weak_node_count, + self.strong_count, + ) + }; + if inc { + if *count == 0 { + *node_count = 1; + self.node.update_refcount(true, 1, strong); + } + *count += 1; + } else { + if *count == 0 { + pr_warn!( + "pid {} performed invalid decrement on ref\n", + kernel::current!().pid() + ); + return false; + } + *count -= 1; + if *count == 0 { + self.node.update_refcount(false, *node_count, strong); + *node_count = 0; + return other_count == 0; + } + } + false + } +} + +impl Drop for NodeRef { + // This destructor is called conditionally from `Allocation::drop`. That branch is often + // mispredicted. Inlining this method call reduces the cost of those branch mispredictions. + #[inline(always)] + fn drop(&mut self) { + if self.strong_node_count > 0 { + self.node + .update_refcount(false, self.strong_node_count, true); + } + if self.weak_node_count > 0 { + self.node + .update_refcount(false, self.weak_node_count, false); + } + } +} + +struct NodeDeathInner { + dead: bool, + cleared: bool, + notification_done: bool, + /// Indicates whether the normal flow was interrupted by removing the handle. In this case, we + /// need behave as if the death notification didn't exist (i.e., we don't deliver anything to + /// the user. + aborted: bool, +} + +/// Used to deliver notifications when a process dies. +/// +/// A process can request to be notified when a process dies using `BC_REQUEST_DEATH_NOTIFICATION`. +/// This will make the driver send a `BR_DEAD_BINDER` to userspace when the process dies (or +/// immediately if it is already dead). Userspace is supposed to respond with `BC_DEAD_BINDER_DONE` +/// once it has processed the notification. +/// +/// Userspace can unregister from death notifications using the `BC_CLEAR_DEATH_NOTIFICATION` +/// command. In this case, the kernel will respond with `BR_CLEAR_DEATH_NOTIFICATION_DONE` once the +/// notification has been removed. Note that if the remote process dies before the kernel has +/// responded with `BR_CLEAR_DEATH_NOTIFICATION_DONE`, then the kernel will still send a +/// `BR_DEAD_BINDER`, which userspace must be able to process. In this case, the kernel will wait +/// for the `BC_DEAD_BINDER_DONE` command before it sends `BR_CLEAR_DEATH_NOTIFICATION_DONE`. +/// +/// Note that even if the kernel sends a `BR_DEAD_BINDER`, this does not remove the death +/// notification. Userspace must still remove it manually using `BC_CLEAR_DEATH_NOTIFICATION`. +/// +/// If a process uses `BC_RELEASE` to destroy its last refcount on a node that has an active death +/// registration, then the death registration is immediately deleted (we implement this using the +/// `aborted` field). However, userspace is not supposed to delete a `NodeRef` without first +/// deregistering death notifications, so this codepath is not executed under normal circumstances. +#[pin_data] +pub(crate) struct NodeDeath { + node: DArc, + process: Arc, + pub(crate) cookie: u64, + #[pin] + links_track: AtomicTracker<0>, + /// Used by the owner `Node` to store a list of registered death notifications. + /// + /// # Invariants + /// + /// Only ever used with the `death_list` list of `self.node`. + #[pin] + death_links: ListLinks<1>, + /// Used by the process to keep track of the death notifications for which we have sent a + /// `BR_DEAD_BINDER` but not yet received a `BC_DEAD_BINDER_DONE`. + /// + /// # Invariants + /// + /// Only ever used with the `delivered_deaths` list of `self.process`. + #[pin] + delivered_links: ListLinks<2>, + #[pin] + delivered_links_track: AtomicTracker<2>, + #[pin] + inner: SpinLock, +} + +impl NodeDeath { + /// Constructs a new node death notification object. + pub(crate) fn new( + node: DArc, + process: Arc, + cookie: u64, + ) -> impl PinInit> { + DTRWrap::new(pin_init!( + Self { + node, + process, + cookie, + links_track <- AtomicTracker::new(), + death_links <- ListLinks::new(), + delivered_links <- ListLinks::new(), + delivered_links_track <- AtomicTracker::new(), + inner <- kernel::new_spinlock!(NodeDeathInner { + dead: false, + cleared: false, + notification_done: false, + aborted: false, + }, "NodeDeath::inner"), + } + )) + } + + /// Sets the cleared flag to `true`. + /// + /// It removes `self` from the node's death notification list if needed. + /// + /// Returns whether it needs to be queued. + pub(crate) fn set_cleared(self: &DArc, abort: bool) -> bool { + let (needs_removal, needs_queueing) = { + // Update state and determine if we need to queue a work item. We only need to do it + // when the node is not dead or if the user already completed the death notification. + let mut inner = self.inner.lock(); + if abort { + inner.aborted = true; + } + if inner.cleared { + // Already cleared. + return false; + } + inner.cleared = true; + (!inner.dead, !inner.dead || inner.notification_done) + }; + + // Remove death notification from node. + if needs_removal { + let mut owner_inner = self.node.owner.inner.lock(); + let node_inner = self.node.inner.access_mut(&mut owner_inner); + // SAFETY: A `NodeDeath` is never inserted into the death list of any node other than + // its owner, so it is either in this death list or in no death list. + unsafe { node_inner.death_list.remove(self) }; + } + needs_queueing + } + + /// Sets the 'notification done' flag to `true`. + pub(crate) fn set_notification_done(self: DArc, thread: &Thread) { + let needs_queueing = { + let mut inner = self.inner.lock(); + inner.notification_done = true; + inner.cleared + }; + if needs_queueing { + if let Some(death) = ListArc::try_from_arc_or_drop(self) { + let _ = thread.push_work_if_looper(death); + } + } + } + + /// Sets the 'dead' flag to `true` and queues work item if needed. + pub(crate) fn set_dead(self: DArc) { + let needs_queueing = { + let mut inner = self.inner.lock(); + if inner.cleared { + false + } else { + inner.dead = true; + true + } + }; + if needs_queueing { + // Push the death notification to the target process. There is nothing else to do if + // it's already dead. + if let Some(death) = ListArc::try_from_arc_or_drop(self) { + let process = death.process.clone(); + let _ = process.push_work(death); + } + } + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for NodeDeath { + tracked_by links_track: AtomicTracker; + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<1> for DTRWrap { untracked; } +} +kernel::list::impl_list_item! { + impl ListItem<1> for DTRWrap { + using ListLinks { self.wrapped.death_links }; + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<2> for DTRWrap { + tracked_by wrapped: NodeDeath; + } +} +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<2> for NodeDeath { + tracked_by delivered_links_track: AtomicTracker<2>; + } +} +kernel::list::impl_list_item! { + impl ListItem<2> for DTRWrap { + using ListLinks { self.wrapped.delivered_links }; + } +} + +impl DeliverToRead for NodeDeath { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let done = { + let inner = self.inner.lock(); + if inner.aborted { + return Ok(true); + } + inner.cleared && (!inner.dead || inner.notification_done) + }; + + let cookie = self.cookie; + let cmd = if done { + BR_CLEAR_DEATH_NOTIFICATION_DONE + } else { + let process = self.process.clone(); + let mut process_inner = process.inner.lock(); + let inner = self.inner.lock(); + if inner.aborted { + return Ok(true); + } + // We're still holding the inner lock, so it cannot be aborted while we insert it into + // the delivered list. + process_inner.death_delivered(self.clone()); + BR_DEAD_BINDER + }; + + writer.write_code(cmd)?; + writer.write_payload(&cookie)?; + // DEAD_BINDER notifications can cause transactions, so stop processing work items when we + // get to a death notification. + Ok(cmd != BR_DEAD_BINDER) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + #[inline(never)] + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + let inner = self.inner.lock(); + + let dead_binder = inner.dead && !inner.notification_done; + + if dead_binder { + if inner.cleared { + seq_print!(m, "{}has cleared dead binder\n", prefix); + } else { + seq_print!(m, "{}has dead binder\n", prefix); + } + } else { + seq_print!(m, "{}has cleared death notification\n", prefix); + } + + Ok(()) + } +} diff --git a/drivers/android/binder/node/wrapper.rs b/drivers/android/binder/node/wrapper.rs new file mode 100644 index 000000000000..43294c050502 --- /dev/null +++ b/drivers/android/binder/node/wrapper.rs @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{list::ListArc, prelude::*, seq_file::SeqFile, seq_print, sync::UniqueArc}; + +use crate::{node::Node, thread::Thread, BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead}; + +use core::mem::MaybeUninit; + +pub(crate) struct CritIncrWrapper { + inner: UniqueArc>>, +} + +impl CritIncrWrapper { + pub(crate) fn new() -> Result { + Ok(CritIncrWrapper { + inner: UniqueArc::new_uninit(GFP_KERNEL)?, + }) + } + + pub(super) fn init(self, node: DArc) -> DLArc { + match self.inner.pin_init_with(DTRWrap::new(NodeWrapper { node })) { + Ok(initialized) => ListArc::from(initialized) as _, + Err(err) => match err {}, + } + } +} + +struct NodeWrapper { + node: DArc, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for NodeWrapper { + untracked; + } +} + +impl DeliverToRead for NodeWrapper { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let node = &self.node; + let mut owner_inner = node.owner.inner.lock(); + let inner = node.inner.access_mut(&mut owner_inner); + + let ds = &mut inner.delivery_state; + + assert!(ds.has_pushed_wrapper); + assert!(ds.has_strong_zero2one); + ds.has_pushed_wrapper = false; + ds.has_strong_zero2one = false; + + node.do_work_locked(writer, owner_inner) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + #[inline(never)] + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!( + m, + "{}node work {}: u{:016x} c{:016x}\n", + prefix, + self.node.debug_id, + self.node.ptr, + self.node.cookie, + ); + Ok(()) + } +} diff --git a/drivers/android/binder/page_range.rs b/drivers/android/binder/page_range.rs new file mode 100644 index 000000000000..9379038f61f5 --- /dev/null +++ b/drivers/android/binder/page_range.rs @@ -0,0 +1,734 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! This module has utilities for managing a page range where unused pages may be reclaimed by a +//! vma shrinker. + +// To avoid deadlocks, locks are taken in the order: +// +// 1. mmap lock +// 2. spinlock +// 3. lru spinlock +// +// The shrinker will use trylock methods because it locks them in a different order. + +use core::{ + marker::PhantomPinned, + mem::{size_of, size_of_val, MaybeUninit}, + ptr, +}; + +use kernel::{ + bindings, + error::Result, + ffi::{c_ulong, c_void}, + mm::{virt, Mm, MmWithUser}, + new_mutex, new_spinlock, + page::{Page, PAGE_SHIFT, PAGE_SIZE}, + prelude::*, + str::CStr, + sync::{aref::ARef, Mutex, SpinLock}, + task::Pid, + transmute::FromBytes, + types::Opaque, + uaccess::UserSliceReader, +}; + +/// Represents a shrinker that can be registered with the kernel. +/// +/// Each shrinker can be used by many `ShrinkablePageRange` objects. +#[repr(C)] +pub(crate) struct Shrinker { + inner: Opaque<*mut bindings::shrinker>, + list_lru: Opaque, +} + +// SAFETY: The shrinker and list_lru are thread safe. +unsafe impl Send for Shrinker {} +// SAFETY: The shrinker and list_lru are thread safe. +unsafe impl Sync for Shrinker {} + +impl Shrinker { + /// Create a new shrinker. + /// + /// # Safety + /// + /// Before using this shrinker with a `ShrinkablePageRange`, the `register` method must have + /// been called exactly once, and it must not have returned an error. + pub(crate) const unsafe fn new() -> Self { + Self { + inner: Opaque::uninit(), + list_lru: Opaque::uninit(), + } + } + + /// Register this shrinker with the kernel. + pub(crate) fn register(&'static self, name: &CStr) -> Result<()> { + // SAFETY: These fields are not yet used, so it's okay to zero them. + unsafe { + self.inner.get().write(ptr::null_mut()); + self.list_lru.get().write_bytes(0, 1); + } + + // SAFETY: The field is not yet used, so we can initialize it. + let ret = unsafe { bindings::__list_lru_init(self.list_lru.get(), false, ptr::null_mut()) }; + if ret != 0 { + return Err(Error::from_errno(ret)); + } + + // SAFETY: The `name` points at a valid c string. + let shrinker = unsafe { bindings::shrinker_alloc(0, name.as_char_ptr()) }; + if shrinker.is_null() { + // SAFETY: We initialized it, so its okay to destroy it. + unsafe { bindings::list_lru_destroy(self.list_lru.get()) }; + return Err(Error::from_errno(ret)); + } + + // SAFETY: We're about to register the shrinker, and these are the fields we need to + // initialize. (All other fields are already zeroed.) + unsafe { + (&raw mut (*shrinker).count_objects).write(Some(rust_shrink_count)); + (&raw mut (*shrinker).scan_objects).write(Some(rust_shrink_scan)); + (&raw mut (*shrinker).private_data).write(self.list_lru.get().cast()); + } + + // SAFETY: The new shrinker has been fully initialized, so we can register it. + unsafe { bindings::shrinker_register(shrinker) }; + + // SAFETY: This initializes the pointer to the shrinker so that we can use it. + unsafe { self.inner.get().write(shrinker) }; + + Ok(()) + } +} + +/// A container that manages a page range in a vma. +/// +/// The pages can be thought of as an array of booleans of whether the pages are usable. The +/// methods `use_range` and `stop_using_range` set all booleans in a range to true or false +/// respectively. Initially, no pages are allocated. When a page is not used, it is not freed +/// immediately. Instead, it is made available to the memory shrinker to free it if the device is +/// under memory pressure. +/// +/// It's okay for `use_range` and `stop_using_range` to race with each other, although there's no +/// way to know whether an index ends up with true or false if a call to `use_range` races with +/// another call to `stop_using_range` on a given index. +/// +/// It's also okay for the two methods to race with themselves, e.g. if two threads call +/// `use_range` on the same index, then that's fine and neither call will return until the page is +/// allocated and mapped. +/// +/// The methods that read or write to a range require that the page is marked as in use. So it is +/// _not_ okay to call `stop_using_range` on a page that is in use by the methods that read or +/// write to the page. +#[pin_data(PinnedDrop)] +pub(crate) struct ShrinkablePageRange { + /// Shrinker object registered with the kernel. + shrinker: &'static Shrinker, + /// Pid using this page range. Only used as debugging information. + pid: Pid, + /// The mm for the relevant process. + mm: ARef, + /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`. + #[pin] + mm_lock: Mutex<()>, + /// Spinlock protecting changes to pages. + #[pin] + lock: SpinLock, + + /// Must not move, since page info has pointers back. + #[pin] + _pin: PhantomPinned, +} + +struct Inner { + /// Array of pages. + /// + /// Since this is also accessed by the shrinker, we can't use a `Box`, which asserts exclusive + /// ownership. To deal with that, we manage it using raw pointers. + pages: *mut PageInfo, + /// Length of the `pages` array. + size: usize, + /// The address of the vma to insert the pages into. + vma_addr: usize, +} + +// SAFETY: proper locking is in place for `Inner` +unsafe impl Send for Inner {} + +type StableMmGuard = + kernel::sync::lock::Guard<'static, (), kernel::sync::lock::mutex::MutexBackend>; + +/// An array element that describes the current state of a page. +/// +/// There are three states: +/// +/// * Free. The page is None. The `lru` element is not queued. +/// * Available. The page is Some. The `lru` element is queued to the shrinker's lru. +/// * Used. The page is Some. The `lru` element is not queued. +/// +/// When an element is available, the shrinker is able to free the page. +#[repr(C)] +struct PageInfo { + lru: bindings::list_head, + page: Option, + range: *const ShrinkablePageRange, +} + +impl PageInfo { + /// # Safety + /// + /// The caller ensures that writing to `me.page` is ok, and that the page is not currently set. + unsafe fn set_page(me: *mut PageInfo, page: Page) { + // SAFETY: This pointer offset is in bounds. + let ptr = unsafe { &raw mut (*me).page }; + + // SAFETY: The pointer is valid for writing, so also valid for reading. + if unsafe { (*ptr).is_some() } { + pr_err!("set_page called when there is already a page"); + // SAFETY: We will initialize the page again below. + unsafe { ptr::drop_in_place(ptr) }; + } + + // SAFETY: The pointer is valid for writing. + unsafe { ptr::write(ptr, Some(page)) }; + } + + /// # Safety + /// + /// The caller ensures that reading from `me.page` is ok for the duration of 'a. + unsafe fn get_page<'a>(me: *const PageInfo) -> Option<&'a Page> { + // SAFETY: This pointer offset is in bounds. + let ptr = unsafe { &raw const (*me).page }; + + // SAFETY: The pointer is valid for reading. + unsafe { (*ptr).as_ref() } + } + + /// # Safety + /// + /// The caller ensures that writing to `me.page` is ok for the duration of 'a. + unsafe fn take_page(me: *mut PageInfo) -> Option { + // SAFETY: This pointer offset is in bounds. + let ptr = unsafe { &raw mut (*me).page }; + + // SAFETY: The pointer is valid for reading. + unsafe { (*ptr).take() } + } + + /// Add this page to the lru list, if not already in the list. + /// + /// # Safety + /// + /// The pointer must be valid, and it must be the right shrinker and nid. + unsafe fn list_lru_add(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) { + // SAFETY: This pointer offset is in bounds. + let lru_ptr = unsafe { &raw mut (*me).lru }; + // SAFETY: The lru pointer is valid, and we're not using it with any other lru list. + unsafe { bindings::list_lru_add(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) }; + } + + /// Remove this page from the lru list, if it is in the list. + /// + /// # Safety + /// + /// The pointer must be valid, and it must be the right shrinker and nid. + unsafe fn list_lru_del(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) { + // SAFETY: This pointer offset is in bounds. + let lru_ptr = unsafe { &raw mut (*me).lru }; + // SAFETY: The lru pointer is valid, and we're not using it with any other lru list. + unsafe { bindings::list_lru_del(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) }; + } +} + +impl ShrinkablePageRange { + /// Create a new `ShrinkablePageRange` using the given shrinker. + pub(crate) fn new(shrinker: &'static Shrinker) -> impl PinInit { + try_pin_init!(Self { + shrinker, + pid: kernel::current!().pid(), + mm: ARef::from(&**kernel::current!().mm().ok_or(ESRCH)?), + mm_lock <- new_mutex!((), "ShrinkablePageRange::mm"), + lock <- new_spinlock!(Inner { + pages: ptr::null_mut(), + size: 0, + vma_addr: 0, + }, "ShrinkablePageRange"), + _pin: PhantomPinned, + }) + } + + pub(crate) fn stable_trylock_mm(&self) -> Option { + // SAFETY: This extends the duration of the reference. Since this call happens before + // `mm_lock` is taken in the destructor of `ShrinkablePageRange`, the destructor will block + // until the returned guard is dropped. This ensures that the guard is valid until dropped. + let mm_lock = unsafe { &*ptr::from_ref(&self.mm_lock) }; + + mm_lock.try_lock() + } + + /// Register a vma with this page range. Returns the size of the region. + pub(crate) fn register_with_vma(&self, vma: &virt::VmaNew) -> Result { + let num_bytes = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize); + let num_pages = num_bytes >> PAGE_SHIFT; + + if !ptr::eq::(&*self.mm, &**vma.mm()) { + pr_debug!("Failed to register with vma: invalid vma->vm_mm"); + return Err(EINVAL); + } + if num_pages == 0 { + pr_debug!("Failed to register with vma: size zero"); + return Err(EINVAL); + } + + let mut pages = KVVec::::with_capacity(num_pages, GFP_KERNEL)?; + + // SAFETY: This just initializes the pages array. + unsafe { + let self_ptr = self as *const ShrinkablePageRange; + for i in 0..num_pages { + let info = pages.as_mut_ptr().add(i); + (&raw mut (*info).range).write(self_ptr); + (&raw mut (*info).page).write(None); + let lru = &raw mut (*info).lru; + (&raw mut (*lru).next).write(lru); + (&raw mut (*lru).prev).write(lru); + } + } + + let mut inner = self.lock.lock(); + if inner.size > 0 { + pr_debug!("Failed to register with vma: already registered"); + drop(inner); + return Err(EBUSY); + } + + inner.pages = pages.into_raw_parts().0; + inner.size = num_pages; + inner.vma_addr = vma.start(); + + Ok(num_pages) + } + + /// Make sure that the given pages are allocated and mapped. + /// + /// Must not be called from an atomic context. + pub(crate) fn use_range(&self, start: usize, end: usize) -> Result<()> { + if start >= end { + return Ok(()); + } + let mut inner = self.lock.lock(); + assert!(end <= inner.size); + + for i in start..end { + // SAFETY: This pointer offset is in bounds. + let page_info = unsafe { inner.pages.add(i) }; + + // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay. + if let Some(page) = unsafe { PageInfo::get_page(page_info) } { + // Since we're going to use the page, we should remove it from the lru list so that + // the shrinker will not free it. + // + // SAFETY: The pointer is valid, and this is the right shrinker. + // + // The shrinker can't free the page between the check and this call to + // `list_lru_del` because we hold the lock. + unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) }; + } else { + // We have to allocate a new page. Use the slow path. + drop(inner); + // SAFETY: `i < end <= inner.size` so `i` is in bounds. + match unsafe { self.use_page_slow(i) } { + Ok(()) => {} + Err(err) => { + pr_warn!("Error in use_page_slow: {:?}", err); + return Err(err); + } + } + inner = self.lock.lock(); + } + } + Ok(()) + } + + /// Mark the given page as in use, slow path. + /// + /// Must not be called from an atomic context. + /// + /// # Safety + /// + /// Assumes that `i` is in bounds. + #[cold] + unsafe fn use_page_slow(&self, i: usize) -> Result<()> { + let new_page = Page::alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO)?; + + let mm_mutex = self.mm_lock.lock(); + let inner = self.lock.lock(); + + // SAFETY: This pointer offset is in bounds. + let page_info = unsafe { inner.pages.add(i) }; + + // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay. + if let Some(page) = unsafe { PageInfo::get_page(page_info) } { + // The page was already there, or someone else added the page while we didn't hold the + // spinlock. + // + // SAFETY: The pointer is valid, and this is the right shrinker. + // + // The shrinker can't free the page between the check and this call to + // `list_lru_del` because we hold the lock. + unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) }; + return Ok(()); + } + + let vma_addr = inner.vma_addr; + // Release the spinlock while we insert the page into the vma. + drop(inner); + + // No overflow since we stay in bounds of the vma. + let user_page_addr = vma_addr + (i << PAGE_SHIFT); + + // We use `mmput_async` when dropping the `mm` because `use_page_slow` is usually used from + // a remote process. If the call to `mmput` races with the process shutting down, then the + // caller of `use_page_slow` becomes responsible for cleaning up the `mm`, which doesn't + // happen until it returns to userspace. However, the caller might instead go to sleep and + // wait for the owner of the `mm` to wake it up, which doesn't happen because it's in the + // middle of a shutdown process that won't complete until the `mm` is dropped. This can + // amount to a deadlock. + // + // Using `mmput_async` avoids this, because then the `mm` cleanup is instead queued to a + // workqueue. + MmWithUser::into_mmput_async(self.mm.mmget_not_zero().ok_or(ESRCH)?) + .mmap_read_lock() + .vma_lookup(vma_addr) + .ok_or(ESRCH)? + .as_mixedmap_vma() + .ok_or(ESRCH)? + .vm_insert_page(user_page_addr, &new_page) + .inspect_err(|err| { + pr_warn!( + "Failed to vm_insert_page({}): vma_addr:{} i:{} err:{:?}", + user_page_addr, + vma_addr, + i, + err + ) + })?; + + let inner = self.lock.lock(); + + // SAFETY: The `page_info` pointer is valid and currently does not have a page. The page + // can be written to since we hold the lock. + // + // We released and reacquired the spinlock since we checked that the page is null, but we + // always hold the mm_lock mutex when setting the page to a non-null value, so it's not + // possible for someone else to have changed it since our check. + unsafe { PageInfo::set_page(page_info, new_page) }; + + drop(inner); + drop(mm_mutex); + + Ok(()) + } + + /// If the given page is in use, then mark it as available so that the shrinker can free it. + /// + /// May be called from an atomic context. + pub(crate) fn stop_using_range(&self, start: usize, end: usize) { + if start >= end { + return; + } + let inner = self.lock.lock(); + assert!(end <= inner.size); + + for i in (start..end).rev() { + // SAFETY: The pointer is in bounds. + let page_info = unsafe { inner.pages.add(i) }; + + // SAFETY: Okay for reading since we have the lock. + if let Some(page) = unsafe { PageInfo::get_page(page_info) } { + // SAFETY: The pointer is valid, and it's the right shrinker. + unsafe { PageInfo::list_lru_add(page_info, page.nid(), self.shrinker) }; + } + } + } + + /// Helper for reading or writing to a range of bytes that may overlap with several pages. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + unsafe fn iterate(&self, mut offset: usize, mut size: usize, mut cb: T) -> Result + where + T: FnMut(&Page, usize, usize) -> Result, + { + if size == 0 { + return Ok(()); + } + + let (pages, num_pages) = { + let inner = self.lock.lock(); + (inner.pages, inner.size) + }; + let num_bytes = num_pages << PAGE_SHIFT; + + // Check that the request is within the buffer. + if offset.checked_add(size).ok_or(EFAULT)? > num_bytes { + return Err(EFAULT); + } + + let mut page_index = offset >> PAGE_SHIFT; + offset &= PAGE_SIZE - 1; + while size > 0 { + let available = usize::min(size, PAGE_SIZE - offset); + // SAFETY: The pointer is in bounds. + let page_info = unsafe { pages.add(page_index) }; + // SAFETY: The caller guarantees that this page is in the "in use" state for the + // duration of this call to `iterate`, so nobody will change the page. + let page = unsafe { PageInfo::get_page(page_info) }; + if page.is_none() { + pr_warn!("Page is null!"); + } + let page = page.ok_or(EFAULT)?; + cb(page, offset, available)?; + size -= available; + page_index += 1; + offset = 0; + } + Ok(()) + } + + /// Copy from userspace into this page range. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn copy_from_user_slice( + &self, + reader: &mut UserSliceReader, + offset: usize, + size: usize, + ) -> Result { + // SAFETY: `self.iterate` has the same safety requirements as `copy_from_user_slice`. + unsafe { + self.iterate(offset, size, |page, offset, to_copy| { + page.copy_from_user_slice_raw(reader, offset, to_copy) + }) + } + } + + /// Copy from this page range into kernel space. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn read(&self, offset: usize) -> Result { + let mut out = MaybeUninit::::uninit(); + let mut out_offset = 0; + // SAFETY: `self.iterate` has the same safety requirements as `read`. + unsafe { + self.iterate(offset, size_of::(), |page, offset, to_copy| { + // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T. + let obj_ptr = (out.as_mut_ptr() as *mut u8).add(out_offset); + // SAFETY: The pointer points is in-bounds of the `out` variable, so it is valid. + page.read_raw(obj_ptr, offset, to_copy)?; + out_offset += to_copy; + Ok(()) + })?; + } + // SAFETY: We just initialised the data. + Ok(unsafe { out.assume_init() }) + } + + /// Copy from kernel space into this page range. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn write(&self, offset: usize, obj: &T) -> Result { + let mut obj_offset = 0; + // SAFETY: `self.iterate` has the same safety requirements as `write`. + unsafe { + self.iterate(offset, size_of_val(obj), |page, offset, to_copy| { + // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T. + let obj_ptr = (obj as *const T as *const u8).add(obj_offset); + // SAFETY: We have a reference to the object, so the pointer is valid. + page.write_raw(obj_ptr, offset, to_copy)?; + obj_offset += to_copy; + Ok(()) + }) + } + } + + /// Write zeroes to the given range. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn fill_zero(&self, offset: usize, size: usize) -> Result { + // SAFETY: `self.iterate` has the same safety requirements as `copy_into`. + unsafe { + self.iterate(offset, size, |page, offset, len| { + page.fill_zero_raw(offset, len) + }) + } + } +} + +#[pinned_drop] +impl PinnedDrop for ShrinkablePageRange { + fn drop(self: Pin<&mut Self>) { + let (pages, size) = { + let lock = self.lock.lock(); + (lock.pages, lock.size) + }; + + if size == 0 { + return; + } + + // Note: This call is also necessary for the safety of `stable_trylock_mm`. + let mm_lock = self.mm_lock.lock(); + + // This is the destructor, so unlike the other methods, we only need to worry about races + // with the shrinker here. Since we hold the `mm_lock`, we also can't race with the + // shrinker, and after this loop, the shrinker will not access any of our pages since we + // removed them from the lru list. + for i in 0..size { + // SAFETY: Loop is in-bounds of the size. + let p_ptr = unsafe { pages.add(i) }; + // SAFETY: No other readers, so we can read. + if let Some(p) = unsafe { PageInfo::get_page(p_ptr) } { + // SAFETY: The pointer is valid and it's the right shrinker. + unsafe { PageInfo::list_lru_del(p_ptr, p.nid(), self.shrinker) }; + } + } + + drop(mm_lock); + + // SAFETY: `pages` was allocated as an `KVVec` with capacity `size`. Furthermore, + // all `size` elements are initialized. Also, the array is no longer shared with the + // shrinker due to the above loop. + drop(unsafe { KVVec::from_raw_parts(pages, size, size) }); + } +} + +/// # Safety +/// Called by the shrinker. +#[no_mangle] +unsafe extern "C" fn rust_shrink_count( + shrink: *mut bindings::shrinker, + _sc: *mut bindings::shrink_control, +) -> c_ulong { + // SAFETY: We can access our own private data. + let list_lru = unsafe { (*shrink).private_data.cast::() }; + // SAFETY: Accessing the lru list is okay. Just an FFI call. + unsafe { bindings::list_lru_count(list_lru) } +} + +/// # Safety +/// Called by the shrinker. +#[no_mangle] +unsafe extern "C" fn rust_shrink_scan( + shrink: *mut bindings::shrinker, + sc: *mut bindings::shrink_control, +) -> c_ulong { + // SAFETY: We can access our own private data. + let list_lru = unsafe { (*shrink).private_data.cast::() }; + // SAFETY: Caller guarantees that it is safe to read this field. + let nr_to_scan = unsafe { (*sc).nr_to_scan }; + // SAFETY: Accessing the lru list is okay. Just an FFI call. + unsafe { + bindings::list_lru_walk( + list_lru, + Some(bindings::rust_shrink_free_page_wrap), + ptr::null_mut(), + nr_to_scan, + ) + } +} + +const LRU_SKIP: bindings::lru_status = bindings::lru_status_LRU_SKIP; +const LRU_REMOVED_ENTRY: bindings::lru_status = bindings::lru_status_LRU_REMOVED_RETRY; + +/// # Safety +/// Called by the shrinker. +#[no_mangle] +unsafe extern "C" fn rust_shrink_free_page( + item: *mut bindings::list_head, + lru: *mut bindings::list_lru_one, + _cb_arg: *mut c_void, +) -> bindings::lru_status { + // Fields that should survive after unlocking the lru lock. + let page; + let page_index; + let mm; + let mmap_read; + let mm_mutex; + let vma_addr; + + { + // CAST: The `list_head` field is first in `PageInfo`. + let info = item as *mut PageInfo; + // SAFETY: The `range` field of `PageInfo` is immutable. + let range = unsafe { &*((*info).range) }; + + mm = match range.mm.mmget_not_zero() { + Some(mm) => MmWithUser::into_mmput_async(mm), + None => return LRU_SKIP, + }; + + mm_mutex = match range.stable_trylock_mm() { + Some(guard) => guard, + None => return LRU_SKIP, + }; + + mmap_read = match mm.mmap_read_trylock() { + Some(guard) => guard, + None => return LRU_SKIP, + }; + + // We can't lock it normally here, since we hold the lru lock. + let inner = match range.lock.try_lock() { + Some(inner) => inner, + None => return LRU_SKIP, + }; + + // SAFETY: The item is in this lru list, so it's okay to remove it. + unsafe { bindings::list_lru_isolate(lru, item) }; + + // SAFETY: Both pointers are in bounds of the same allocation. + page_index = unsafe { info.offset_from(inner.pages) } as usize; + + // SAFETY: We hold the spinlock, so we can take the page. + // + // This sets the page pointer to zero before we unmap it from the vma. However, we call + // `zap_page_range` before we release the mmap lock, so `use_page_slow` will not be able to + // insert a new page until after our call to `zap_page_range`. + page = unsafe { PageInfo::take_page(info) }; + vma_addr = inner.vma_addr; + + // From this point on, we don't access this PageInfo or ShrinkablePageRange again, because + // they can be freed at any point after we unlock `lru_lock`. This is with the exception of + // `mm_mutex` which is kept alive by holding the lock. + } + + // SAFETY: The lru lock is locked when this method is called. + unsafe { bindings::spin_unlock(&raw mut (*lru).lock) }; + + if let Some(vma) = mmap_read.vma_lookup(vma_addr) { + let user_page_addr = vma_addr + (page_index << PAGE_SHIFT); + vma.zap_page_range_single(user_page_addr, PAGE_SIZE); + } + + drop(mmap_read); + drop(mm_mutex); + drop(mm); + drop(page); + + // SAFETY: We just unlocked the lru lock, but it should be locked when we return. + unsafe { bindings::spin_lock(&raw mut (*lru).lock) }; + + LRU_REMOVED_ENTRY +} diff --git a/drivers/android/binder/page_range_helper.c b/drivers/android/binder/page_range_helper.c new file mode 100644 index 000000000000..496887723ee0 --- /dev/null +++ b/drivers/android/binder/page_range_helper.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* C helper for page_range.rs to work around a CFI violation. + * + * Bindgen currently pretends that `enum lru_status` is the same as an integer. + * This assumption is fine ABI-wise, but once you add CFI to the mix, it + * triggers a CFI violation because `enum lru_status` gets a different CFI tag. + * + * This file contains a workaround until bindgen can be fixed. + * + * Copyright (C) 2025 Google LLC. + */ +#include "page_range_helper.h" + +unsigned int rust_shrink_free_page(struct list_head *item, + struct list_lru_one *list, + void *cb_arg); + +enum lru_status +rust_shrink_free_page_wrap(struct list_head *item, struct list_lru_one *list, + void *cb_arg) +{ + return rust_shrink_free_page(item, list, cb_arg); +} diff --git a/drivers/android/binder/page_range_helper.h b/drivers/android/binder/page_range_helper.h new file mode 100644 index 000000000000..18dd2dd117b2 --- /dev/null +++ b/drivers/android/binder/page_range_helper.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Google, Inc. + */ + +#ifndef _LINUX_PAGE_RANGE_HELPER_H +#define _LINUX_PAGE_RANGE_HELPER_H + +#include + +enum lru_status +rust_shrink_free_page_wrap(struct list_head *item, struct list_lru_one *list, + void *cb_arg); + +#endif /* _LINUX_PAGE_RANGE_HELPER_H */ diff --git a/drivers/android/binder/process.rs b/drivers/android/binder/process.rs new file mode 100644 index 000000000000..f13a747e784c --- /dev/null +++ b/drivers/android/binder/process.rs @@ -0,0 +1,1696 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! This module defines the `Process` type, which represents a process using a particular binder +//! context. +//! +//! The `Process` object keeps track of all of the resources that this process owns in the binder +//! context. +//! +//! There is one `Process` object for each binder fd that a process has opened, so processes using +//! several binder contexts have several `Process` objects. This ensures that the contexts are +//! fully separated. + +use core::mem::take; + +use kernel::{ + bindings, + cred::Credential, + error::Error, + fs::file::{self, File}, + list::{List, ListArc, ListArcField, ListLinks}, + mm, + prelude::*, + rbtree::{self, RBTree, RBTreeNode, RBTreeNodeReservation}, + seq_file::SeqFile, + seq_print, + sync::poll::PollTable, + sync::{ + lock::{spinlock::SpinLockBackend, Guard}, + Arc, ArcBorrow, CondVar, CondVarTimeoutResult, Mutex, SpinLock, UniqueArc, + }, + task::Task, + types::ARef, + uaccess::{UserSlice, UserSliceReader}, + uapi, + workqueue::{self, Work}, +}; + +use crate::{ + allocation::{Allocation, AllocationInfo, NewAllocation}, + context::Context, + defs::*, + error::{BinderError, BinderResult}, + node::{CouldNotDeliverCriticalIncrement, CritIncrWrapper, Node, NodeDeath, NodeRef}, + page_range::ShrinkablePageRange, + range_alloc::{RangeAllocator, ReserveNew, ReserveNewArgs}, + stats::BinderStats, + thread::{PushWorkRes, Thread}, + BinderfsProcFile, DArc, DLArc, DTRWrap, DeliverToRead, +}; + +#[path = "freeze.rs"] +mod freeze; +use self::freeze::{FreezeCookie, FreezeListener}; + +struct Mapping { + address: usize, + alloc: RangeAllocator, +} + +impl Mapping { + fn new(address: usize, size: usize) -> Self { + Self { + address, + alloc: RangeAllocator::new(size), + } + } +} + +// bitflags for defer_work. +const PROC_DEFER_FLUSH: u8 = 1; +const PROC_DEFER_RELEASE: u8 = 2; + +/// The fields of `Process` protected by the spinlock. +pub(crate) struct ProcessInner { + is_manager: bool, + pub(crate) is_dead: bool, + threads: RBTree>, + /// INVARIANT: Threads pushed to this list must be owned by this process. + ready_threads: List, + nodes: RBTree>, + mapping: Option, + work: List>, + delivered_deaths: List, 2>, + + /// The number of requested threads that haven't registered yet. + requested_thread_count: u32, + /// The maximum number of threads used by the process thread pool. + max_threads: u32, + /// The number of threads the started and registered with the thread pool. + started_thread_count: u32, + + /// Bitmap of deferred work to do. + defer_work: u8, + + /// Number of transactions to be transmitted before processes in freeze_wait + /// are woken up. + outstanding_txns: u32, + /// Process is frozen and unable to service binder transactions. + pub(crate) is_frozen: bool, + /// Process received sync transactions since last frozen. + pub(crate) sync_recv: bool, + /// Process received async transactions since last frozen. + pub(crate) async_recv: bool, + pub(crate) binderfs_file: Option, + /// Check for oneway spam + oneway_spam_detection_enabled: bool, +} + +impl ProcessInner { + fn new() -> Self { + Self { + is_manager: false, + is_dead: false, + threads: RBTree::new(), + ready_threads: List::new(), + mapping: None, + nodes: RBTree::new(), + work: List::new(), + delivered_deaths: List::new(), + requested_thread_count: 0, + max_threads: 0, + started_thread_count: 0, + defer_work: 0, + outstanding_txns: 0, + is_frozen: false, + sync_recv: false, + async_recv: false, + binderfs_file: None, + oneway_spam_detection_enabled: false, + } + } + + /// Schedule the work item for execution on this process. + /// + /// If any threads are ready for work, then the work item is given directly to that thread and + /// it is woken up. Otherwise, it is pushed to the process work list. + /// + /// This call can fail only if the process is dead. In this case, the work item is returned to + /// the caller so that the caller can drop it after releasing the inner process lock. This is + /// necessary since the destructor of `Transaction` will take locks that can't necessarily be + /// taken while holding the inner process lock. + pub(crate) fn push_work( + &mut self, + work: DLArc, + ) -> Result<(), (BinderError, DLArc)> { + // Try to find a ready thread to which to push the work. + if let Some(thread) = self.ready_threads.pop_front() { + // Push to thread while holding state lock. This prevents the thread from giving up + // (for example, because of a signal) when we're about to deliver work. + match thread.push_work(work) { + PushWorkRes::Ok => Ok(()), + PushWorkRes::FailedDead(work) => Err((BinderError::new_dead(), work)), + } + } else if self.is_dead { + Err((BinderError::new_dead(), work)) + } else { + let sync = work.should_sync_wakeup(); + + // Didn't find a thread waiting for proc work; this can happen + // in two scenarios: + // 1. All threads are busy handling transactions + // In that case, one of those threads should call back into + // the kernel driver soon and pick up this work. + // 2. Threads are using the (e)poll interface, in which case + // they may be blocked on the waitqueue without having been + // added to waiting_threads. For this case, we just iterate + // over all threads not handling transaction work, and + // wake them all up. We wake all because we don't know whether + // a thread that called into (e)poll is handling non-binder + // work currently. + self.work.push_back(work); + + // Wake up polling threads, if any. + for thread in self.threads.values() { + thread.notify_if_poll_ready(sync); + } + + Ok(()) + } + } + + pub(crate) fn remove_node(&mut self, ptr: u64) { + self.nodes.remove(&ptr); + } + + /// Updates the reference count on the given node. + pub(crate) fn update_node_refcount( + &mut self, + node: &DArc, + inc: bool, + strong: bool, + count: usize, + othread: Option<&Thread>, + ) { + let push = node.update_refcount_locked(inc, strong, count, self); + + // If we decided that we need to push work, push either to the process or to a thread if + // one is specified. + if let Some(node) = push { + if let Some(thread) = othread { + thread.push_work_deferred(node); + } else { + let _ = self.push_work(node); + // Nothing to do: `push_work` may fail if the process is dead, but that's ok as in + // that case, it doesn't care about the notification. + } + } + } + + pub(crate) fn new_node_ref( + &mut self, + node: DArc, + strong: bool, + thread: Option<&Thread>, + ) -> NodeRef { + self.update_node_refcount(&node, true, strong, 1, thread); + let strong_count = if strong { 1 } else { 0 }; + NodeRef::new(node, strong_count, 1 - strong_count) + } + + pub(crate) fn new_node_ref_with_thread( + &mut self, + node: DArc, + strong: bool, + thread: &Thread, + wrapper: Option, + ) -> Result { + let push = match wrapper { + None => node + .incr_refcount_allow_zero2one(strong, self)? + .map(|node| node as _), + Some(wrapper) => node.incr_refcount_allow_zero2one_with_wrapper(strong, wrapper, self), + }; + if let Some(node) = push { + thread.push_work_deferred(node); + } + let strong_count = if strong { 1 } else { 0 }; + Ok(NodeRef::new(node, strong_count, 1 - strong_count)) + } + + /// Returns an existing node with the given pointer and cookie, if one exists. + /// + /// Returns an error if a node with the given pointer but a different cookie exists. + fn get_existing_node(&self, ptr: u64, cookie: u64) -> Result>> { + match self.nodes.get(&ptr) { + None => Ok(None), + Some(node) => { + let (_, node_cookie) = node.get_id(); + if node_cookie == cookie { + Ok(Some(node.clone())) + } else { + Err(EINVAL) + } + } + } + } + + fn register_thread(&mut self) -> bool { + if self.requested_thread_count == 0 { + return false; + } + + self.requested_thread_count -= 1; + self.started_thread_count += 1; + true + } + + /// Finds a delivered death notification with the given cookie, removes it from the thread's + /// delivered list, and returns it. + fn pull_delivered_death(&mut self, cookie: u64) -> Option> { + let mut cursor = self.delivered_deaths.cursor_front(); + while let Some(next) = cursor.peek_next() { + if next.cookie == cookie { + return Some(next.remove().into_arc()); + } + cursor.move_next(); + } + None + } + + pub(crate) fn death_delivered(&mut self, death: DArc) { + if let Some(death) = ListArc::try_from_arc_or_drop(death) { + self.delivered_deaths.push_back(death); + } else { + pr_warn!("Notification added to `delivered_deaths` twice."); + } + } + + pub(crate) fn add_outstanding_txn(&mut self) { + self.outstanding_txns += 1; + } + + fn txns_pending_locked(&self) -> bool { + if self.outstanding_txns > 0 { + return true; + } + for thread in self.threads.values() { + if thread.has_current_transaction() { + return true; + } + } + false + } +} + +/// Used to keep track of a node that this process has a handle to. +#[pin_data] +pub(crate) struct NodeRefInfo { + debug_id: usize, + /// The refcount that this process owns to the node. + node_ref: ListArcField, + death: ListArcField>, { Self::LIST_PROC }>, + /// Cookie of the active freeze listener for this node. + freeze: ListArcField, { Self::LIST_PROC }>, + /// Used to store this `NodeRefInfo` in the node's `refs` list. + #[pin] + links: ListLinks<{ Self::LIST_NODE }>, + /// The handle for this `NodeRefInfo`. + handle: u32, + /// The process that has a handle to the node. + pub(crate) process: Arc, +} + +impl NodeRefInfo { + /// The id used for the `Node::refs` list. + pub(crate) const LIST_NODE: u64 = 0x2da16350fb724a10; + /// The id used for the `ListArc` in `ProcessNodeRefs`. + const LIST_PROC: u64 = 0xd703a5263dcc8650; + + fn new(node_ref: NodeRef, handle: u32, process: Arc) -> impl PinInit { + pin_init!(Self { + debug_id: super::next_debug_id(), + node_ref: ListArcField::new(node_ref), + death: ListArcField::new(None), + freeze: ListArcField::new(None), + links <- ListLinks::new(), + handle, + process, + }) + } + + kernel::list::define_list_arc_field_getter! { + pub(crate) fn death(&mut self<{Self::LIST_PROC}>) -> &mut Option> { death } + pub(crate) fn freeze(&mut self<{Self::LIST_PROC}>) -> &mut Option { freeze } + pub(crate) fn node_ref(&mut self<{Self::LIST_PROC}>) -> &mut NodeRef { node_ref } + pub(crate) fn node_ref2(&self<{Self::LIST_PROC}>) -> &NodeRef { node_ref } + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<{Self::LIST_NODE}> for NodeRefInfo { untracked; } + impl ListArcSafe<{Self::LIST_PROC}> for NodeRefInfo { untracked; } +} +kernel::list::impl_list_item! { + impl ListItem<{Self::LIST_NODE}> for NodeRefInfo { + using ListLinks { self.links }; + } +} + +/// Keeps track of references this process has to nodes owned by other processes. +/// +/// TODO: Currently, the rbtree requires two allocations per node reference, and two tree +/// traversals to look up a node by `Node::global_id`. Once the rbtree is more powerful, these +/// extra costs should be eliminated. +struct ProcessNodeRefs { + /// Used to look up nodes using the 32-bit id that this process knows it by. + by_handle: RBTree>, + /// Used to look up nodes without knowing their local 32-bit id. The usize is the address of + /// the underlying `Node` struct as returned by `Node::global_id`. + by_node: RBTree, + /// Used to look up a `FreezeListener` by cookie. + /// + /// There might be multiple freeze listeners for the same node, but at most one of them is + /// active. + freeze_listeners: RBTree, +} + +impl ProcessNodeRefs { + fn new() -> Self { + Self { + by_handle: RBTree::new(), + by_node: RBTree::new(), + freeze_listeners: RBTree::new(), + } + } +} + +/// A process using binder. +/// +/// Strictly speaking, there can be multiple of these per process. There is one for each binder fd +/// that a process has opened, so processes using several binder contexts have several `Process` +/// objects. This ensures that the contexts are fully separated. +#[pin_data] +pub(crate) struct Process { + pub(crate) ctx: Arc, + + // The task leader (process). + pub(crate) task: ARef, + + // Credential associated with file when `Process` is created. + pub(crate) cred: ARef, + + #[pin] + pub(crate) inner: SpinLock, + + #[pin] + pub(crate) pages: ShrinkablePageRange, + + // Waitqueue of processes waiting for all outstanding transactions to be + // processed. + #[pin] + freeze_wait: CondVar, + + // Node references are in a different lock to avoid recursive acquisition when + // incrementing/decrementing a node in another process. + #[pin] + node_refs: Mutex, + + // Work node for deferred work item. + #[pin] + defer_work: Work, + + // Links for process list in Context. + #[pin] + links: ListLinks, + + pub(crate) stats: BinderStats, +} + +kernel::impl_has_work! { + impl HasWork for Process { self.defer_work } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Process { untracked; } +} +kernel::list::impl_list_item! { + impl ListItem<0> for Process { + using ListLinks { self.links }; + } +} + +impl workqueue::WorkItem for Process { + type Pointer = Arc; + + fn run(me: Arc) { + let defer; + { + let mut inner = me.inner.lock(); + defer = inner.defer_work; + inner.defer_work = 0; + } + + if defer & PROC_DEFER_FLUSH != 0 { + me.deferred_flush(); + } + if defer & PROC_DEFER_RELEASE != 0 { + me.deferred_release(); + } + } +} + +impl Process { + fn new(ctx: Arc, cred: ARef) -> Result> { + let current = kernel::current!(); + let list_process = ListArc::pin_init::( + try_pin_init!(Process { + ctx, + cred, + inner <- kernel::new_spinlock!(ProcessInner::new(), "Process::inner"), + pages <- ShrinkablePageRange::new(&super::BINDER_SHRINKER), + node_refs <- kernel::new_mutex!(ProcessNodeRefs::new(), "Process::node_refs"), + freeze_wait <- kernel::new_condvar!("Process::freeze_wait"), + task: current.group_leader().into(), + defer_work <- kernel::new_work!("Process::defer_work"), + links <- ListLinks::new(), + stats: BinderStats::new(), + }), + GFP_KERNEL, + )?; + + let process = list_process.clone_arc(); + process.ctx.register_process(list_process); + + Ok(process) + } + + pub(crate) fn pid_in_current_ns(&self) -> kernel::task::Pid { + self.task.tgid_nr_ns(None) + } + + #[inline(never)] + pub(crate) fn debug_print_stats(&self, m: &SeqFile, ctx: &Context) -> Result<()> { + seq_print!(m, "proc {}\n", self.pid_in_current_ns()); + seq_print!(m, "context {}\n", &*ctx.name); + + let inner = self.inner.lock(); + seq_print!(m, " threads: {}\n", inner.threads.iter().count()); + seq_print!( + m, + " requested threads: {}+{}/{}\n", + inner.requested_thread_count, + inner.started_thread_count, + inner.max_threads, + ); + if let Some(mapping) = &inner.mapping { + seq_print!( + m, + " free oneway space: {}\n", + mapping.alloc.free_oneway_space() + ); + seq_print!(m, " buffers: {}\n", mapping.alloc.count_buffers()); + } + seq_print!( + m, + " outstanding transactions: {}\n", + inner.outstanding_txns + ); + seq_print!(m, " nodes: {}\n", inner.nodes.iter().count()); + drop(inner); + + { + let mut refs = self.node_refs.lock(); + let (mut count, mut weak, mut strong) = (0, 0, 0); + for r in refs.by_handle.values_mut() { + let node_ref = r.node_ref(); + let (nstrong, nweak) = node_ref.get_count(); + count += 1; + weak += nweak; + strong += nstrong; + } + seq_print!(m, " refs: {count} s {strong} w {weak}\n"); + } + + self.stats.debug_print(" ", m); + + Ok(()) + } + + #[inline(never)] + pub(crate) fn debug_print(&self, m: &SeqFile, ctx: &Context, print_all: bool) -> Result<()> { + seq_print!(m, "proc {}\n", self.pid_in_current_ns()); + seq_print!(m, "context {}\n", &*ctx.name); + + let mut all_threads = KVec::new(); + let mut all_nodes = KVec::new(); + loop { + let inner = self.inner.lock(); + let num_threads = inner.threads.iter().count(); + let num_nodes = inner.nodes.iter().count(); + + if all_threads.capacity() < num_threads || all_nodes.capacity() < num_nodes { + drop(inner); + all_threads.reserve(num_threads, GFP_KERNEL)?; + all_nodes.reserve(num_nodes, GFP_KERNEL)?; + continue; + } + + for thread in inner.threads.values() { + assert!(all_threads.len() < all_threads.capacity()); + let _ = all_threads.push(thread.clone(), GFP_ATOMIC); + } + + for node in inner.nodes.values() { + assert!(all_nodes.len() < all_nodes.capacity()); + let _ = all_nodes.push(node.clone(), GFP_ATOMIC); + } + + break; + } + + for thread in all_threads { + thread.debug_print(m, print_all)?; + } + + let mut inner = self.inner.lock(); + for node in all_nodes { + if print_all || node.has_oneway_transaction(&mut inner) { + node.full_debug_print(m, &mut inner)?; + } + } + drop(inner); + + if print_all { + let mut refs = self.node_refs.lock(); + for r in refs.by_handle.values_mut() { + let node_ref = r.node_ref(); + let dead = node_ref.node.owner.inner.lock().is_dead; + let (strong, weak) = node_ref.get_count(); + let debug_id = node_ref.node.debug_id; + + seq_print!( + m, + " ref {}: desc {} {}node {debug_id} s {strong} w {weak}", + r.debug_id, + r.handle, + if dead { "dead " } else { "" }, + ); + } + } + + let inner = self.inner.lock(); + for work in &inner.work { + work.debug_print(m, " ", " pending transaction ")?; + } + for _death in &inner.delivered_deaths { + seq_print!(m, " has delivered dead binder\n"); + } + if let Some(mapping) = &inner.mapping { + mapping.alloc.debug_print(m)?; + } + drop(inner); + + Ok(()) + } + + /// Attempts to fetch a work item from the process queue. + pub(crate) fn get_work(&self) -> Option> { + self.inner.lock().work.pop_front() + } + + /// Attempts to fetch a work item from the process queue. If none is available, it registers the + /// given thread as ready to receive work directly. + /// + /// This must only be called when the thread is not participating in a transaction chain; when + /// it is, work will always be delivered directly to the thread (and not through the process + /// queue). + pub(crate) fn get_work_or_register<'a>( + &'a self, + thread: &'a Arc, + ) -> GetWorkOrRegister<'a> { + let mut inner = self.inner.lock(); + // Try to get work from the process queue. + if let Some(work) = inner.work.pop_front() { + return GetWorkOrRegister::Work(work); + } + + // Register the thread as ready. + GetWorkOrRegister::Register(Registration::new(thread, &mut inner)) + } + + fn get_current_thread(self: ArcBorrow<'_, Self>) -> Result> { + let id = { + let current = kernel::current!(); + if !core::ptr::eq(current.group_leader(), &*self.task) { + pr_err!("get_current_thread was called from the wrong process."); + return Err(EINVAL); + } + current.pid() + }; + + { + let inner = self.inner.lock(); + if let Some(thread) = inner.threads.get(&id) { + return Ok(thread.clone()); + } + } + + // Allocate a new `Thread` without holding any locks. + let reservation = RBTreeNodeReservation::new(GFP_KERNEL)?; + let ta: Arc = Thread::new(id, self.into())?; + + let mut inner = self.inner.lock(); + match inner.threads.entry(id) { + rbtree::Entry::Vacant(entry) => { + entry.insert(ta.clone(), reservation); + Ok(ta) + } + rbtree::Entry::Occupied(_entry) => { + pr_err!("Cannot create two threads with the same id."); + Err(EINVAL) + } + } + } + + pub(crate) fn push_work(&self, work: DLArc) -> BinderResult { + // If push_work fails, drop the work item outside the lock. + let res = self.inner.lock().push_work(work); + match res { + Ok(()) => Ok(()), + Err((err, work)) => { + drop(work); + Err(err) + } + } + } + + fn set_as_manager( + self: ArcBorrow<'_, Self>, + info: Option, + thread: &Thread, + ) -> Result { + let (ptr, cookie, flags) = if let Some(obj) = info { + ( + // SAFETY: The object type for this ioctl is implicitly `BINDER_TYPE_BINDER`, so it + // is safe to access the `binder` field. + unsafe { obj.__bindgen_anon_1.binder }, + obj.cookie, + obj.flags, + ) + } else { + (0, 0, 0) + }; + let node_ref = self.get_node(ptr, cookie, flags as _, true, thread)?; + let node = node_ref.node.clone(); + self.ctx.set_manager_node(node_ref)?; + self.inner.lock().is_manager = true; + + // Force the state of the node to prevent the delivery of acquire/increfs. + let mut owner_inner = node.owner.inner.lock(); + node.force_has_count(&mut owner_inner); + Ok(()) + } + + fn get_node_inner( + self: ArcBorrow<'_, Self>, + ptr: u64, + cookie: u64, + flags: u32, + strong: bool, + thread: &Thread, + wrapper: Option, + ) -> Result> { + // Try to find an existing node. + { + let mut inner = self.inner.lock(); + if let Some(node) = inner.get_existing_node(ptr, cookie)? { + return Ok(inner.new_node_ref_with_thread(node, strong, thread, wrapper)); + } + } + + // Allocate the node before reacquiring the lock. + let node = DTRWrap::arc_pin_init(Node::new(ptr, cookie, flags, self.into()))?.into_arc(); + let rbnode = RBTreeNode::new(ptr, node.clone(), GFP_KERNEL)?; + let mut inner = self.inner.lock(); + if let Some(node) = inner.get_existing_node(ptr, cookie)? { + return Ok(inner.new_node_ref_with_thread(node, strong, thread, wrapper)); + } + + inner.nodes.insert(rbnode); + // This can only fail if someone has already pushed the node to a list, but we just created + // it and still hold the lock, so it can't fail right now. + let node_ref = inner + .new_node_ref_with_thread(node, strong, thread, wrapper) + .unwrap(); + + Ok(Ok(node_ref)) + } + + pub(crate) fn get_node( + self: ArcBorrow<'_, Self>, + ptr: u64, + cookie: u64, + flags: u32, + strong: bool, + thread: &Thread, + ) -> Result { + let mut wrapper = None; + for _ in 0..2 { + match self.get_node_inner(ptr, cookie, flags, strong, thread, wrapper) { + Err(err) => return Err(err), + Ok(Ok(node_ref)) => return Ok(node_ref), + Ok(Err(CouldNotDeliverCriticalIncrement)) => { + wrapper = Some(CritIncrWrapper::new()?); + } + } + } + // We only get a `CouldNotDeliverCriticalIncrement` error if `wrapper` is `None`, so the + // loop should run at most twice. + unreachable!() + } + + pub(crate) fn insert_or_update_handle( + self: ArcBorrow<'_, Process>, + node_ref: NodeRef, + is_mananger: bool, + ) -> Result { + { + let mut refs = self.node_refs.lock(); + + // Do a lookup before inserting. + if let Some(handle_ref) = refs.by_node.get(&node_ref.node.global_id()) { + let handle = *handle_ref; + let info = refs.by_handle.get_mut(&handle).unwrap(); + info.node_ref().absorb(node_ref); + return Ok(handle); + } + } + + // Reserve memory for tree nodes. + let reserve1 = RBTreeNodeReservation::new(GFP_KERNEL)?; + let reserve2 = RBTreeNodeReservation::new(GFP_KERNEL)?; + let info = UniqueArc::new_uninit(GFP_KERNEL)?; + + let mut refs = self.node_refs.lock(); + + // Do a lookup again as node may have been inserted before the lock was reacquired. + if let Some(handle_ref) = refs.by_node.get(&node_ref.node.global_id()) { + let handle = *handle_ref; + let info = refs.by_handle.get_mut(&handle).unwrap(); + info.node_ref().absorb(node_ref); + return Ok(handle); + } + + // Find id. + let mut target: u32 = if is_mananger { 0 } else { 1 }; + for handle in refs.by_handle.keys() { + if *handle > target { + break; + } + if *handle == target { + target = target.checked_add(1).ok_or(ENOMEM)?; + } + } + + let gid = node_ref.node.global_id(); + let (info_proc, info_node) = { + let info_init = NodeRefInfo::new(node_ref, target, self.into()); + match info.pin_init_with(info_init) { + Ok(info) => ListArc::pair_from_pin_unique(info), + // error is infallible + Err(err) => match err {}, + } + }; + + // Ensure the process is still alive while we insert a new reference. + // + // This releases the lock before inserting the nodes, but since `is_dead` is set as the + // first thing in `deferred_release`, process cleanup will not miss the items inserted into + // `refs` below. + if self.inner.lock().is_dead { + return Err(ESRCH); + } + + // SAFETY: `info_proc` and `info_node` reference the same node, so we are inserting + // `info_node` into the right node's `refs` list. + unsafe { info_proc.node_ref2().node.insert_node_info(info_node) }; + + refs.by_node.insert(reserve1.into_node(gid, target)); + refs.by_handle.insert(reserve2.into_node(target, info_proc)); + Ok(target) + } + + pub(crate) fn get_transaction_node(&self, handle: u32) -> BinderResult { + // When handle is zero, try to get the context manager. + if handle == 0 { + Ok(self.ctx.get_manager_node(true)?) + } else { + Ok(self.get_node_from_handle(handle, true)?) + } + } + + pub(crate) fn get_node_from_handle(&self, handle: u32, strong: bool) -> Result { + self.node_refs + .lock() + .by_handle + .get_mut(&handle) + .ok_or(ENOENT)? + .node_ref() + .clone(strong) + } + + pub(crate) fn remove_from_delivered_deaths(&self, death: &DArc) { + let mut inner = self.inner.lock(); + // SAFETY: By the invariant on the `delivered_links` field, this is the right linked list. + let removed = unsafe { inner.delivered_deaths.remove(death) }; + drop(inner); + drop(removed); + } + + pub(crate) fn update_ref( + self: ArcBorrow<'_, Process>, + handle: u32, + inc: bool, + strong: bool, + ) -> Result { + if inc && handle == 0 { + if let Ok(node_ref) = self.ctx.get_manager_node(strong) { + if core::ptr::eq(&*self, &*node_ref.node.owner) { + return Err(EINVAL); + } + let _ = self.insert_or_update_handle(node_ref, true); + return Ok(()); + } + } + + // To preserve original binder behaviour, we only fail requests where the manager tries to + // increment references on itself. + let mut refs = self.node_refs.lock(); + if let Some(info) = refs.by_handle.get_mut(&handle) { + if info.node_ref().update(inc, strong) { + // Clean up death if there is one attached to this node reference. + if let Some(death) = info.death().take() { + death.set_cleared(true); + self.remove_from_delivered_deaths(&death); + } + + // Remove reference from process tables, and from the node's `refs` list. + + // SAFETY: We are removing the `NodeRefInfo` from the right node. + unsafe { info.node_ref2().node.remove_node_info(info) }; + + let id = info.node_ref().node.global_id(); + refs.by_handle.remove(&handle); + refs.by_node.remove(&id); + } + } else { + // All refs are cleared in process exit, so this warning is expected in that case. + if !self.inner.lock().is_dead { + pr_warn!("{}: no such ref {handle}\n", self.pid_in_current_ns()); + } + } + Ok(()) + } + + /// Decrements the refcount of the given node, if one exists. + pub(crate) fn update_node(&self, ptr: u64, cookie: u64, strong: bool) { + let mut inner = self.inner.lock(); + if let Ok(Some(node)) = inner.get_existing_node(ptr, cookie) { + inner.update_node_refcount(&node, false, strong, 1, None); + } + } + + pub(crate) fn inc_ref_done(&self, reader: &mut UserSliceReader, strong: bool) -> Result { + let ptr = reader.read::()?; + let cookie = reader.read::()?; + let mut inner = self.inner.lock(); + if let Ok(Some(node)) = inner.get_existing_node(ptr, cookie) { + if let Some(node) = node.inc_ref_done_locked(strong, &mut inner) { + // This only fails if the process is dead. + let _ = inner.push_work(node); + } + } + Ok(()) + } + + pub(crate) fn buffer_alloc( + self: &Arc, + debug_id: usize, + size: usize, + is_oneway: bool, + from_pid: i32, + ) -> BinderResult { + use kernel::page::PAGE_SIZE; + + let mut reserve_new_args = ReserveNewArgs { + debug_id, + size, + is_oneway, + pid: from_pid, + ..ReserveNewArgs::default() + }; + + let (new_alloc, addr) = loop { + let mut inner = self.inner.lock(); + let mapping = inner.mapping.as_mut().ok_or_else(BinderError::new_dead)?; + let alloc_request = match mapping.alloc.reserve_new(reserve_new_args)? { + ReserveNew::Success(new_alloc) => break (new_alloc, mapping.address), + ReserveNew::NeedAlloc(request) => request, + }; + drop(inner); + // We need to allocate memory and then call `reserve_new` again. + reserve_new_args = alloc_request.make_alloc()?; + }; + + let res = Allocation::new( + self.clone(), + debug_id, + new_alloc.offset, + size, + addr + new_alloc.offset, + new_alloc.oneway_spam_detected, + ); + + // This allocation will be marked as in use until the `Allocation` is used to free it. + // + // This method can't be called while holding a lock, so we release the lock first. It's + // okay for several threads to use the method on the same index at the same time. In that + // case, one of the calls will allocate the given page (if missing), and the other call + // will wait for the other call to finish allocating the page. + // + // We will not call `stop_using_range` in parallel with this on the same page, because the + // allocation can only be removed via the destructor of the `Allocation` object that we + // currently own. + match self.pages.use_range( + new_alloc.offset / PAGE_SIZE, + (new_alloc.offset + size).div_ceil(PAGE_SIZE), + ) { + Ok(()) => {} + Err(err) => { + pr_warn!("use_range failure {:?}", err); + return Err(err.into()); + } + } + + Ok(NewAllocation(res)) + } + + pub(crate) fn buffer_get(self: &Arc, ptr: usize) -> Option { + let mut inner = self.inner.lock(); + let mapping = inner.mapping.as_mut()?; + let offset = ptr.checked_sub(mapping.address)?; + let (size, debug_id, odata) = mapping.alloc.reserve_existing(offset).ok()?; + let mut alloc = Allocation::new(self.clone(), debug_id, offset, size, ptr, false); + if let Some(data) = odata { + alloc.set_info(data); + } + Some(alloc) + } + + pub(crate) fn buffer_raw_free(&self, ptr: usize) { + let mut inner = self.inner.lock(); + if let Some(ref mut mapping) = &mut inner.mapping { + let offset = match ptr.checked_sub(mapping.address) { + Some(offset) => offset, + None => return, + }; + + let freed_range = match mapping.alloc.reservation_abort(offset) { + Ok(freed_range) => freed_range, + Err(_) => { + pr_warn!( + "Pointer {:x} failed to free, base = {:x}\n", + ptr, + mapping.address + ); + return; + } + }; + + // No more allocations in this range. Mark them as not in use. + // + // Must be done before we release the lock so that `use_range` is not used on these + // indices until `stop_using_range` returns. + self.pages + .stop_using_range(freed_range.start_page_idx, freed_range.end_page_idx); + } + } + + pub(crate) fn buffer_make_freeable(&self, offset: usize, mut data: Option) { + let mut inner = self.inner.lock(); + if let Some(ref mut mapping) = &mut inner.mapping { + if mapping.alloc.reservation_commit(offset, &mut data).is_err() { + pr_warn!("Offset {} failed to be marked freeable\n", offset); + } + } + } + + fn create_mapping(&self, vma: &mm::virt::VmaNew) -> Result { + use kernel::page::PAGE_SIZE; + let size = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize); + let mapping = Mapping::new(vma.start(), size); + let page_count = self.pages.register_with_vma(vma)?; + if page_count * PAGE_SIZE != size { + return Err(EINVAL); + } + + // Save range allocator for later. + self.inner.lock().mapping = Some(mapping); + + Ok(()) + } + + fn version(&self, data: UserSlice) -> Result { + data.writer().write(&BinderVersion::current()) + } + + pub(crate) fn register_thread(&self) -> bool { + self.inner.lock().register_thread() + } + + fn remove_thread(&self, thread: Arc) { + self.inner.lock().threads.remove(&thread.id); + thread.release(); + } + + fn set_max_threads(&self, max: u32) { + self.inner.lock().max_threads = max; + } + + fn set_oneway_spam_detection_enabled(&self, enabled: u32) { + self.inner.lock().oneway_spam_detection_enabled = enabled != 0; + } + + pub(crate) fn is_oneway_spam_detection_enabled(&self) -> bool { + self.inner.lock().oneway_spam_detection_enabled + } + + fn get_node_debug_info(&self, data: UserSlice) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + + // Read the starting point. + let ptr = reader.read::()?.ptr; + let mut out = BinderNodeDebugInfo::default(); + + { + let inner = self.inner.lock(); + for (node_ptr, node) in &inner.nodes { + if *node_ptr > ptr { + node.populate_debug_info(&mut out, &inner); + break; + } + } + } + + writer.write(&out) + } + + fn get_node_info_from_ref(&self, data: UserSlice) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + let mut out = reader.read::()?; + + if out.strong_count != 0 + || out.weak_count != 0 + || out.reserved1 != 0 + || out.reserved2 != 0 + || out.reserved3 != 0 + { + return Err(EINVAL); + } + + // Only the context manager is allowed to use this ioctl. + if !self.inner.lock().is_manager { + return Err(EPERM); + } + + { + let mut node_refs = self.node_refs.lock(); + let node_info = node_refs.by_handle.get_mut(&out.handle).ok_or(ENOENT)?; + let node_ref = node_info.node_ref(); + let owner_inner = node_ref.node.owner.inner.lock(); + node_ref.node.populate_counts(&mut out, &owner_inner); + } + + // Write the result back. + writer.write(&out) + } + + pub(crate) fn needs_thread(&self) -> bool { + let mut inner = self.inner.lock(); + let ret = inner.requested_thread_count == 0 + && inner.ready_threads.is_empty() + && inner.started_thread_count < inner.max_threads; + if ret { + inner.requested_thread_count += 1 + } + ret + } + + pub(crate) fn request_death( + self: &Arc, + reader: &mut UserSliceReader, + thread: &Thread, + ) -> Result { + let handle: u32 = reader.read()?; + let cookie: u64 = reader.read()?; + + // Queue BR_ERROR if we can't allocate memory for the death notification. + let death = UniqueArc::new_uninit(GFP_KERNEL).inspect_err(|_| { + thread.push_return_work(BR_ERROR); + })?; + let mut refs = self.node_refs.lock(); + let Some(info) = refs.by_handle.get_mut(&handle) else { + pr_warn!("BC_REQUEST_DEATH_NOTIFICATION invalid ref {handle}\n"); + return Ok(()); + }; + + // Nothing to do if there is already a death notification request for this handle. + if info.death().is_some() { + pr_warn!("BC_REQUEST_DEATH_NOTIFICATION death notification already set\n"); + return Ok(()); + } + + let death = { + let death_init = NodeDeath::new(info.node_ref().node.clone(), self.clone(), cookie); + match death.pin_init_with(death_init) { + Ok(death) => death, + // error is infallible + Err(err) => match err {}, + } + }; + + // Register the death notification. + { + let owner = info.node_ref2().node.owner.clone(); + let mut owner_inner = owner.inner.lock(); + if owner_inner.is_dead { + let death = Arc::from(death); + *info.death() = Some(death.clone()); + drop(owner_inner); + death.set_dead(); + } else { + let death = ListArc::from(death); + *info.death() = Some(death.clone_arc()); + info.node_ref().node.add_death(death, &mut owner_inner); + } + } + Ok(()) + } + + pub(crate) fn clear_death(&self, reader: &mut UserSliceReader, thread: &Thread) -> Result { + let handle: u32 = reader.read()?; + let cookie: u64 = reader.read()?; + + let mut refs = self.node_refs.lock(); + let Some(info) = refs.by_handle.get_mut(&handle) else { + pr_warn!("BC_CLEAR_DEATH_NOTIFICATION invalid ref {handle}\n"); + return Ok(()); + }; + + let Some(death) = info.death().take() else { + pr_warn!("BC_CLEAR_DEATH_NOTIFICATION death notification not active\n"); + return Ok(()); + }; + if death.cookie != cookie { + *info.death() = Some(death); + pr_warn!("BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch\n"); + return Ok(()); + } + + // Update state and determine if we need to queue a work item. We only need to do it when + // the node is not dead or if the user already completed the death notification. + if death.set_cleared(false) { + if let Some(death) = ListArc::try_from_arc_or_drop(death) { + let _ = thread.push_work_if_looper(death); + } + } + + Ok(()) + } + + pub(crate) fn dead_binder_done(&self, cookie: u64, thread: &Thread) { + if let Some(death) = self.inner.lock().pull_delivered_death(cookie) { + death.set_notification_done(thread); + } + } + + /// Locks the spinlock and move the `nodes` rbtree out. + /// + /// This allows you to iterate through `nodes` while also allowing you to give other parts of + /// the codebase exclusive access to `ProcessInner`. + pub(crate) fn lock_with_nodes(&self) -> WithNodes<'_> { + let mut inner = self.inner.lock(); + WithNodes { + nodes: take(&mut inner.nodes), + inner, + } + } + + fn deferred_flush(&self) { + let inner = self.inner.lock(); + for thread in inner.threads.values() { + thread.exit_looper(); + } + } + + fn deferred_release(self: Arc) { + let is_manager = { + let mut inner = self.inner.lock(); + inner.is_dead = true; + inner.is_frozen = false; + inner.sync_recv = false; + inner.async_recv = false; + inner.is_manager + }; + + if is_manager { + self.ctx.unset_manager_node(); + } + + self.ctx.deregister_process(&self); + + let binderfs_file = self.inner.lock().binderfs_file.take(); + drop(binderfs_file); + + // Release threads. + let threads = { + let mut inner = self.inner.lock(); + let threads = take(&mut inner.threads); + let ready = take(&mut inner.ready_threads); + drop(inner); + drop(ready); + + for thread in threads.values() { + thread.release(); + } + threads + }; + + // Release nodes. + { + while let Some(node) = { + let mut lock = self.inner.lock(); + lock.nodes.cursor_front().map(|c| c.remove_current().1) + } { + node.to_key_value().1.release(); + } + } + + // Clean up death listeners and remove nodes from external node info lists. + for info in self.node_refs.lock().by_handle.values_mut() { + // SAFETY: We are removing the `NodeRefInfo` from the right node. + unsafe { info.node_ref2().node.remove_node_info(info) }; + + // Remove all death notifications from the nodes (that belong to a different process). + let death = if let Some(existing) = info.death().take() { + existing + } else { + continue; + }; + death.set_cleared(false); + } + + // Clean up freeze listeners. + let freeze_listeners = take(&mut self.node_refs.lock().freeze_listeners); + for listener in freeze_listeners.values() { + listener.on_process_exit(&self); + } + drop(freeze_listeners); + + // Release refs on foreign nodes. + { + let mut refs = self.node_refs.lock(); + let by_handle = take(&mut refs.by_handle); + let by_node = take(&mut refs.by_node); + drop(refs); + drop(by_node); + drop(by_handle); + } + + // Cancel all pending work items. + while let Some(work) = self.get_work() { + work.into_arc().cancel(); + } + + let delivered_deaths = take(&mut self.inner.lock().delivered_deaths); + drop(delivered_deaths); + + // Free any resources kept alive by allocated buffers. + let omapping = self.inner.lock().mapping.take(); + if let Some(mut mapping) = omapping { + let address = mapping.address; + mapping + .alloc + .take_for_each(|offset, size, debug_id, odata| { + let ptr = offset + address; + pr_warn!( + "{}: removing orphan mapping {offset}:{size}\n", + self.pid_in_current_ns() + ); + let mut alloc = + Allocation::new(self.clone(), debug_id, offset, size, ptr, false); + if let Some(data) = odata { + alloc.set_info(data); + } + drop(alloc) + }); + } + + // calls to synchronize_rcu() in thread drop will happen here + drop(threads); + } + + pub(crate) fn drop_outstanding_txn(&self) { + let wake = { + let mut inner = self.inner.lock(); + if inner.outstanding_txns == 0 { + pr_err!("outstanding_txns underflow"); + return; + } + inner.outstanding_txns -= 1; + inner.is_frozen && inner.outstanding_txns == 0 + }; + + if wake { + self.freeze_wait.notify_all(); + } + } + + pub(crate) fn ioctl_freeze(&self, info: &BinderFreezeInfo) -> Result { + if info.enable == 0 { + let msgs = self.prepare_freeze_messages()?; + let mut inner = self.inner.lock(); + inner.sync_recv = false; + inner.async_recv = false; + inner.is_frozen = false; + drop(inner); + msgs.send_messages(); + return Ok(()); + } + + let mut inner = self.inner.lock(); + inner.sync_recv = false; + inner.async_recv = false; + inner.is_frozen = true; + + if info.timeout_ms > 0 { + let mut jiffies = kernel::time::msecs_to_jiffies(info.timeout_ms); + while jiffies > 0 { + if inner.outstanding_txns == 0 { + break; + } + + match self + .freeze_wait + .wait_interruptible_timeout(&mut inner, jiffies) + { + CondVarTimeoutResult::Signal { .. } => { + inner.is_frozen = false; + return Err(ERESTARTSYS); + } + CondVarTimeoutResult::Woken { jiffies: remaining } => { + jiffies = remaining; + } + CondVarTimeoutResult::Timeout => { + jiffies = 0; + } + } + } + } + + if inner.txns_pending_locked() { + inner.is_frozen = false; + Err(EAGAIN) + } else { + drop(inner); + match self.prepare_freeze_messages() { + Ok(batch) => { + batch.send_messages(); + Ok(()) + } + Err(kernel::alloc::AllocError) => { + self.inner.lock().is_frozen = false; + Err(ENOMEM) + } + } + } + } +} + +fn get_frozen_status(data: UserSlice) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + + let mut info = reader.read::()?; + info.sync_recv = 0; + info.async_recv = 0; + let mut found = false; + + for ctx in crate::context::get_all_contexts()? { + ctx.for_each_proc(|proc| { + if proc.task.pid() == info.pid as _ { + found = true; + let inner = proc.inner.lock(); + let txns_pending = inner.txns_pending_locked(); + info.async_recv |= inner.async_recv as u32; + info.sync_recv |= inner.sync_recv as u32; + info.sync_recv |= (txns_pending as u32) << 1; + } + }); + } + + if found { + writer.write(&info)?; + Ok(()) + } else { + Err(EINVAL) + } +} + +fn ioctl_freeze(reader: &mut UserSliceReader) -> Result { + let info = reader.read::()?; + + // Very unlikely for there to be more than 3, since a process normally uses at most binder and + // hwbinder. + let mut procs = KVec::with_capacity(3, GFP_KERNEL)?; + + let ctxs = crate::context::get_all_contexts()?; + for ctx in ctxs { + for proc in ctx.get_procs_with_pid(info.pid as i32)? { + procs.push(proc, GFP_KERNEL)?; + } + } + + for proc in procs { + proc.ioctl_freeze(&info)?; + } + Ok(()) +} + +/// The ioctl handler. +impl Process { + /// Ioctls that are write-only from the perspective of userspace. + /// + /// The kernel will only read from the pointer that userspace provided to us. + fn ioctl_write_only( + this: ArcBorrow<'_, Process>, + _file: &File, + cmd: u32, + reader: &mut UserSliceReader, + ) -> Result { + let thread = this.get_current_thread()?; + match cmd { + uapi::BINDER_SET_MAX_THREADS => this.set_max_threads(reader.read()?), + uapi::BINDER_THREAD_EXIT => this.remove_thread(thread), + uapi::BINDER_SET_CONTEXT_MGR => this.set_as_manager(None, &thread)?, + uapi::BINDER_SET_CONTEXT_MGR_EXT => { + this.set_as_manager(Some(reader.read()?), &thread)? + } + uapi::BINDER_ENABLE_ONEWAY_SPAM_DETECTION => { + this.set_oneway_spam_detection_enabled(reader.read()?) + } + uapi::BINDER_FREEZE => ioctl_freeze(reader)?, + _ => return Err(EINVAL), + } + Ok(()) + } + + /// Ioctls that are read/write from the perspective of userspace. + /// + /// The kernel will both read from and write to the pointer that userspace provided to us. + fn ioctl_write_read( + this: ArcBorrow<'_, Process>, + file: &File, + cmd: u32, + data: UserSlice, + ) -> Result { + let thread = this.get_current_thread()?; + let blocking = (file.flags() & file::flags::O_NONBLOCK) == 0; + match cmd { + uapi::BINDER_WRITE_READ => thread.write_read(data, blocking)?, + uapi::BINDER_GET_NODE_DEBUG_INFO => this.get_node_debug_info(data)?, + uapi::BINDER_GET_NODE_INFO_FOR_REF => this.get_node_info_from_ref(data)?, + uapi::BINDER_VERSION => this.version(data)?, + uapi::BINDER_GET_FROZEN_INFO => get_frozen_status(data)?, + uapi::BINDER_GET_EXTENDED_ERROR => thread.get_extended_error(data)?, + _ => return Err(EINVAL), + } + Ok(()) + } +} + +/// The file operations supported by `Process`. +impl Process { + pub(crate) fn open(ctx: ArcBorrow<'_, Context>, file: &File) -> Result> { + Self::new(ctx.into(), ARef::from(file.cred())) + } + + pub(crate) fn release(this: Arc, _file: &File) { + let binderfs_file; + let should_schedule; + { + let mut inner = this.inner.lock(); + should_schedule = inner.defer_work == 0; + inner.defer_work |= PROC_DEFER_RELEASE; + binderfs_file = inner.binderfs_file.take(); + } + + if should_schedule { + // Ignore failures to schedule to the workqueue. Those just mean that we're already + // scheduled for execution. + let _ = workqueue::system().enqueue(this); + } + + drop(binderfs_file); + } + + pub(crate) fn flush(this: ArcBorrow<'_, Process>) -> Result { + let should_schedule; + { + let mut inner = this.inner.lock(); + should_schedule = inner.defer_work == 0; + inner.defer_work |= PROC_DEFER_FLUSH; + } + + if should_schedule { + // Ignore failures to schedule to the workqueue. Those just mean that we're already + // scheduled for execution. + let _ = workqueue::system().enqueue(Arc::from(this)); + } + Ok(()) + } + + pub(crate) fn ioctl(this: ArcBorrow<'_, Process>, file: &File, cmd: u32, arg: usize) -> Result { + use kernel::ioctl::{_IOC_DIR, _IOC_SIZE}; + use kernel::uapi::{_IOC_READ, _IOC_WRITE}; + + crate::trace::trace_ioctl(cmd, arg); + + let user_slice = UserSlice::new(UserPtr::from_addr(arg), _IOC_SIZE(cmd)); + + const _IOC_READ_WRITE: u32 = _IOC_READ | _IOC_WRITE; + + match _IOC_DIR(cmd) { + _IOC_WRITE => Self::ioctl_write_only(this, file, cmd, &mut user_slice.reader()), + _IOC_READ_WRITE => Self::ioctl_write_read(this, file, cmd, user_slice), + _ => Err(EINVAL), + } + } + + pub(crate) fn compat_ioctl( + this: ArcBorrow<'_, Process>, + file: &File, + cmd: u32, + arg: usize, + ) -> Result { + Self::ioctl(this, file, cmd, arg) + } + + pub(crate) fn mmap( + this: ArcBorrow<'_, Process>, + _file: &File, + vma: &mm::virt::VmaNew, + ) -> Result { + // We don't allow mmap to be used in a different process. + if !core::ptr::eq(kernel::current!().group_leader(), &*this.task) { + return Err(EINVAL); + } + if vma.start() == 0 { + return Err(EINVAL); + } + + vma.try_clear_maywrite().map_err(|_| EPERM)?; + vma.set_dontcopy(); + vma.set_mixedmap(); + + // TODO: Set ops. We need to learn when the user unmaps so that we can stop using it. + this.create_mapping(vma) + } + + pub(crate) fn poll( + this: ArcBorrow<'_, Process>, + file: &File, + table: PollTable<'_>, + ) -> Result { + let thread = this.get_current_thread()?; + let (from_proc, mut mask) = thread.poll(file, table); + if mask == 0 && from_proc && !this.inner.lock().work.is_empty() { + mask |= bindings::POLLIN; + } + Ok(mask) + } +} + +/// Represents that a thread has registered with the `ready_threads` list of its process. +/// +/// The destructor of this type will unregister the thread from the list of ready threads. +pub(crate) struct Registration<'a> { + thread: &'a Arc, +} + +impl<'a> Registration<'a> { + fn new(thread: &'a Arc, guard: &mut Guard<'_, ProcessInner, SpinLockBackend>) -> Self { + assert!(core::ptr::eq(&thread.process.inner, guard.lock_ref())); + // INVARIANT: We are pushing this thread to the right `ready_threads` list. + if let Ok(list_arc) = ListArc::try_from_arc(thread.clone()) { + guard.ready_threads.push_front(list_arc); + } else { + // It is an error to hit this branch, and it should not be reachable. We try to do + // something reasonable when the failure path happens. Most likely, the thread in + // question will sleep forever. + pr_err!("Same thread registered with `ready_threads` twice."); + } + Self { thread } + } +} + +impl Drop for Registration<'_> { + fn drop(&mut self) { + let mut inner = self.thread.process.inner.lock(); + // SAFETY: The thread has the invariant that we never push it to any other linked list than + // the `ready_threads` list of its parent process. Therefore, the thread is either in that + // list, or in no list. + unsafe { inner.ready_threads.remove(self.thread) }; + } +} + +pub(crate) struct WithNodes<'a> { + pub(crate) inner: Guard<'a, ProcessInner, SpinLockBackend>, + pub(crate) nodes: RBTree>, +} + +impl Drop for WithNodes<'_> { + fn drop(&mut self) { + core::mem::swap(&mut self.nodes, &mut self.inner.nodes); + if self.nodes.iter().next().is_some() { + pr_err!("nodes array was modified while using lock_with_nodes\n"); + } + } +} + +pub(crate) enum GetWorkOrRegister<'a> { + Work(DLArc), + Register(Registration<'a>), +} diff --git a/drivers/android/binder/range_alloc/array.rs b/drivers/android/binder/range_alloc/array.rs new file mode 100644 index 000000000000..07e1dec2ce63 --- /dev/null +++ b/drivers/android/binder/range_alloc/array.rs @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + page::{PAGE_MASK, PAGE_SIZE}, + prelude::*, + seq_file::SeqFile, + seq_print, + task::Pid, +}; + +use crate::range_alloc::{DescriptorState, FreedRange, Range}; + +/// Keeps track of allocations in a process' mmap. +/// +/// Each process has an mmap where the data for incoming transactions will be placed. This struct +/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that +/// has metadata related to the allocation. We also keep track of available free space. +pub(super) struct ArrayRangeAllocator { + /// This stores all ranges that are allocated. Unlike the tree based allocator, we do *not* + /// store the free ranges. + /// + /// Sorted by offset. + pub(super) ranges: KVec>, + size: usize, + free_oneway_space: usize, +} + +struct FindEmptyRes { + /// Which index in `ranges` should we insert the new range at? + /// + /// Inserting the new range at this index keeps `ranges` sorted. + insert_at_idx: usize, + /// Which offset should we insert the new range at? + insert_at_offset: usize, +} + +impl ArrayRangeAllocator { + pub(crate) fn new(size: usize, alloc: EmptyArrayAlloc) -> Self { + Self { + ranges: alloc.ranges, + size, + free_oneway_space: size / 2, + } + } + + pub(crate) fn free_oneway_space(&self) -> usize { + self.free_oneway_space + } + + pub(crate) fn count_buffers(&self) -> usize { + self.ranges.len() + } + + pub(crate) fn total_size(&self) -> usize { + self.size + } + + pub(crate) fn is_full(&self) -> bool { + self.ranges.len() == self.ranges.capacity() + } + + pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> { + for range in &self.ranges { + seq_print!( + m, + " buffer {}: {} size {} pid {} oneway {}", + 0, + range.offset, + range.size, + range.state.pid(), + range.state.is_oneway(), + ); + if let DescriptorState::Reserved(_) = range.state { + seq_print!(m, " reserved\n"); + } else { + seq_print!(m, " allocated\n"); + } + } + Ok(()) + } + + /// Find somewhere to put a new range. + /// + /// Unlike the tree implementation, we do not bother to find the smallest gap. The idea is that + /// fragmentation isn't a big issue when we don't have many ranges. + /// + /// Returns the index that the new range should have in `self.ranges` after insertion. + fn find_empty_range(&self, size: usize) -> Option { + let after_last_range = self.ranges.last().map(Range::endpoint).unwrap_or(0); + + if size <= self.total_size() - after_last_range { + // We can put the range at the end, so just do that. + Some(FindEmptyRes { + insert_at_idx: self.ranges.len(), + insert_at_offset: after_last_range, + }) + } else { + let mut end_of_prev = 0; + for (i, range) in self.ranges.iter().enumerate() { + // Does it fit before the i'th range? + if size <= range.offset - end_of_prev { + return Some(FindEmptyRes { + insert_at_idx: i, + insert_at_offset: end_of_prev, + }); + } + end_of_prev = range.endpoint(); + } + None + } + } + + pub(crate) fn reserve_new( + &mut self, + debug_id: usize, + size: usize, + is_oneway: bool, + pid: Pid, + ) -> Result { + // Compute new value of free_oneway_space, which is set only on success. + let new_oneway_space = if is_oneway { + match self.free_oneway_space.checked_sub(size) { + Some(new_oneway_space) => new_oneway_space, + None => return Err(ENOSPC), + } + } else { + self.free_oneway_space + }; + + let FindEmptyRes { + insert_at_idx, + insert_at_offset, + } = self.find_empty_range(size).ok_or(ENOSPC)?; + self.free_oneway_space = new_oneway_space; + + let new_range = Range { + offset: insert_at_offset, + size, + state: DescriptorState::new(is_oneway, debug_id, pid), + }; + // Insert the value at the given index to keep the array sorted. + self.ranges + .insert_within_capacity(insert_at_idx, new_range) + .ok() + .unwrap(); + + Ok(insert_at_offset) + } + + pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result { + // This could use a binary search, but linear scans are usually faster for small arrays. + let i = self + .ranges + .iter() + .position(|range| range.offset == offset) + .ok_or(EINVAL)?; + let range = &self.ranges[i]; + + if let DescriptorState::Allocated(_) = range.state { + return Err(EPERM); + } + + let size = range.size; + let offset = range.offset; + + if range.state.is_oneway() { + self.free_oneway_space += size; + } + + // This computes the range of pages that are no longer used by *any* allocated range. The + // caller will mark them as unused, which means that they can be freed if the system comes + // under memory pressure. + let mut freed_range = FreedRange::interior_pages(offset, size); + #[expect(clippy::collapsible_if)] // reads better like this + if offset % PAGE_SIZE != 0 { + if i == 0 || self.ranges[i - 1].endpoint() <= (offset & PAGE_MASK) { + freed_range.start_page_idx -= 1; + } + } + if range.endpoint() % PAGE_SIZE != 0 { + let page_after = (range.endpoint() & PAGE_MASK) + PAGE_SIZE; + if i + 1 == self.ranges.len() || page_after <= self.ranges[i + 1].offset { + freed_range.end_page_idx += 1; + } + } + + self.ranges.remove(i)?; + Ok(freed_range) + } + + pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option) -> Result { + // This could use a binary search, but linear scans are usually faster for small arrays. + let range = self + .ranges + .iter_mut() + .find(|range| range.offset == offset) + .ok_or(ENOENT)?; + + let DescriptorState::Reserved(reservation) = &range.state else { + return Err(ENOENT); + }; + + range.state = DescriptorState::Allocated(reservation.clone().allocate(data.take())); + Ok(()) + } + + pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option)> { + // This could use a binary search, but linear scans are usually faster for small arrays. + let range = self + .ranges + .iter_mut() + .find(|range| range.offset == offset) + .ok_or(ENOENT)?; + + let DescriptorState::Allocated(allocation) = &mut range.state else { + return Err(ENOENT); + }; + + let data = allocation.take(); + let debug_id = allocation.reservation.debug_id; + range.state = DescriptorState::Reserved(allocation.reservation.clone()); + Ok((range.size, debug_id, data)) + } + + pub(crate) fn take_for_each)>(&mut self, callback: F) { + for range in self.ranges.iter_mut() { + if let DescriptorState::Allocated(allocation) = &mut range.state { + callback( + range.offset, + range.size, + allocation.reservation.debug_id, + allocation.data.take(), + ); + } + } + } +} + +pub(crate) struct EmptyArrayAlloc { + ranges: KVec>, +} + +impl EmptyArrayAlloc { + pub(crate) fn try_new(capacity: usize) -> Result { + Ok(Self { + ranges: KVec::with_capacity(capacity, GFP_KERNEL)?, + }) + } +} diff --git a/drivers/android/binder/range_alloc/mod.rs b/drivers/android/binder/range_alloc/mod.rs new file mode 100644 index 000000000000..2301e2bc1a1f --- /dev/null +++ b/drivers/android/binder/range_alloc/mod.rs @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{page::PAGE_SIZE, prelude::*, seq_file::SeqFile, task::Pid}; + +mod tree; +use self::tree::{FromArrayAllocs, ReserveNewTreeAlloc, TreeRangeAllocator}; + +mod array; +use self::array::{ArrayRangeAllocator, EmptyArrayAlloc}; + +enum DescriptorState { + Reserved(Reservation), + Allocated(Allocation), +} + +impl DescriptorState { + fn new(is_oneway: bool, debug_id: usize, pid: Pid) -> Self { + DescriptorState::Reserved(Reservation { + debug_id, + is_oneway, + pid, + }) + } + + fn pid(&self) -> Pid { + match self { + DescriptorState::Reserved(inner) => inner.pid, + DescriptorState::Allocated(inner) => inner.reservation.pid, + } + } + + fn is_oneway(&self) -> bool { + match self { + DescriptorState::Reserved(inner) => inner.is_oneway, + DescriptorState::Allocated(inner) => inner.reservation.is_oneway, + } + } +} + +#[derive(Clone)] +struct Reservation { + debug_id: usize, + is_oneway: bool, + pid: Pid, +} + +impl Reservation { + fn allocate(self, data: Option) -> Allocation { + Allocation { + data, + reservation: self, + } + } +} + +struct Allocation { + reservation: Reservation, + data: Option, +} + +impl Allocation { + fn deallocate(self) -> (Reservation, Option) { + (self.reservation, self.data) + } + + fn debug_id(&self) -> usize { + self.reservation.debug_id + } + + fn take(&mut self) -> Option { + self.data.take() + } +} + +/// The array implementation must switch to the tree if it wants to go beyond this number of +/// ranges. +const TREE_THRESHOLD: usize = 8; + +/// Represents a range of pages that have just become completely free. +#[derive(Copy, Clone)] +pub(crate) struct FreedRange { + pub(crate) start_page_idx: usize, + pub(crate) end_page_idx: usize, +} + +impl FreedRange { + fn interior_pages(offset: usize, size: usize) -> FreedRange { + FreedRange { + // Divide round up + start_page_idx: offset.div_ceil(PAGE_SIZE), + // Divide round down + end_page_idx: (offset + size) / PAGE_SIZE, + } + } +} + +struct Range { + offset: usize, + size: usize, + state: DescriptorState, +} + +impl Range { + fn endpoint(&self) -> usize { + self.offset + self.size + } +} + +pub(crate) struct RangeAllocator { + inner: Impl, +} + +enum Impl { + Empty(usize), + Array(ArrayRangeAllocator), + Tree(TreeRangeAllocator), +} + +impl RangeAllocator { + pub(crate) fn new(size: usize) -> Self { + Self { + inner: Impl::Empty(size), + } + } + + pub(crate) fn free_oneway_space(&self) -> usize { + match &self.inner { + Impl::Empty(size) => size / 2, + Impl::Array(array) => array.free_oneway_space(), + Impl::Tree(tree) => tree.free_oneway_space(), + } + } + + pub(crate) fn count_buffers(&self) -> usize { + match &self.inner { + Impl::Empty(_size) => 0, + Impl::Array(array) => array.count_buffers(), + Impl::Tree(tree) => tree.count_buffers(), + } + } + + pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> { + match &self.inner { + Impl::Empty(_size) => Ok(()), + Impl::Array(array) => array.debug_print(m), + Impl::Tree(tree) => tree.debug_print(m), + } + } + + /// Try to reserve a new buffer, using the provided allocation if necessary. + pub(crate) fn reserve_new(&mut self, mut args: ReserveNewArgs) -> Result> { + match &mut self.inner { + Impl::Empty(size) => { + let empty_array = match args.empty_array_alloc.take() { + Some(empty_array) => ArrayRangeAllocator::new(*size, empty_array), + None => { + return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc { + args, + need_empty_array_alloc: true, + need_new_tree_alloc: false, + need_tree_alloc: false, + })) + } + }; + + self.inner = Impl::Array(empty_array); + self.reserve_new(args) + } + Impl::Array(array) if array.is_full() => { + let allocs = match args.new_tree_alloc { + Some(ref mut allocs) => allocs, + None => { + return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc { + args, + need_empty_array_alloc: false, + need_new_tree_alloc: true, + need_tree_alloc: true, + })) + } + }; + + let new_tree = + TreeRangeAllocator::from_array(array.total_size(), &mut array.ranges, allocs); + + self.inner = Impl::Tree(new_tree); + self.reserve_new(args) + } + Impl::Array(array) => { + let offset = + array.reserve_new(args.debug_id, args.size, args.is_oneway, args.pid)?; + Ok(ReserveNew::Success(ReserveNewSuccess { + offset, + oneway_spam_detected: false, + _empty_array_alloc: args.empty_array_alloc, + _new_tree_alloc: args.new_tree_alloc, + _tree_alloc: args.tree_alloc, + })) + } + Impl::Tree(tree) => { + let alloc = match args.tree_alloc { + Some(alloc) => alloc, + None => { + return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc { + args, + need_empty_array_alloc: false, + need_new_tree_alloc: false, + need_tree_alloc: true, + })); + } + }; + let (offset, oneway_spam_detected) = + tree.reserve_new(args.debug_id, args.size, args.is_oneway, args.pid, alloc)?; + Ok(ReserveNew::Success(ReserveNewSuccess { + offset, + oneway_spam_detected, + _empty_array_alloc: args.empty_array_alloc, + _new_tree_alloc: args.new_tree_alloc, + _tree_alloc: None, + })) + } + } + } + + /// Deletes the allocations at `offset`. + pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result { + match &mut self.inner { + Impl::Empty(_size) => Err(EINVAL), + Impl::Array(array) => array.reservation_abort(offset), + Impl::Tree(tree) => { + let freed_range = tree.reservation_abort(offset)?; + if tree.is_empty() { + self.inner = Impl::Empty(tree.total_size()); + } + Ok(freed_range) + } + } + } + + /// Called when an allocation is no longer in use by the kernel. + /// + /// The value in `data` will be stored, if any. A mutable reference is used to avoid dropping + /// the `T` when an error is returned. + pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option) -> Result { + match &mut self.inner { + Impl::Empty(_size) => Err(EINVAL), + Impl::Array(array) => array.reservation_commit(offset, data), + Impl::Tree(tree) => tree.reservation_commit(offset, data), + } + } + + /// Called when the kernel starts using an allocation. + /// + /// Returns the size of the existing entry and the data associated with it. + pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option)> { + match &mut self.inner { + Impl::Empty(_size) => Err(EINVAL), + Impl::Array(array) => array.reserve_existing(offset), + Impl::Tree(tree) => tree.reserve_existing(offset), + } + } + + /// Call the provided callback at every allocated region. + /// + /// This destroys the range allocator. Used only during shutdown. + pub(crate) fn take_for_each)>(&mut self, callback: F) { + match &mut self.inner { + Impl::Empty(_size) => {} + Impl::Array(array) => array.take_for_each(callback), + Impl::Tree(tree) => tree.take_for_each(callback), + } + } +} + +/// The arguments for `reserve_new`. +#[derive(Default)] +pub(crate) struct ReserveNewArgs { + pub(crate) size: usize, + pub(crate) is_oneway: bool, + pub(crate) debug_id: usize, + pub(crate) pid: Pid, + pub(crate) empty_array_alloc: Option>, + pub(crate) new_tree_alloc: Option>, + pub(crate) tree_alloc: Option>, +} + +/// The return type of `ReserveNew`. +pub(crate) enum ReserveNew { + Success(ReserveNewSuccess), + NeedAlloc(ReserveNewNeedAlloc), +} + +/// Returned by `reserve_new` when the reservation was successul. +pub(crate) struct ReserveNewSuccess { + pub(crate) offset: usize, + pub(crate) oneway_spam_detected: bool, + + // If the user supplied an allocation that we did not end up using, then we return it here. + // The caller will kfree it outside of the lock. + _empty_array_alloc: Option>, + _new_tree_alloc: Option>, + _tree_alloc: Option>, +} + +/// Returned by `reserve_new` to request the caller to make an allocation before calling the method +/// again. +pub(crate) struct ReserveNewNeedAlloc { + args: ReserveNewArgs, + need_empty_array_alloc: bool, + need_new_tree_alloc: bool, + need_tree_alloc: bool, +} + +impl ReserveNewNeedAlloc { + /// Make the necessary allocations for another call to `reserve_new`. + pub(crate) fn make_alloc(mut self) -> Result> { + if self.need_empty_array_alloc && self.args.empty_array_alloc.is_none() { + self.args.empty_array_alloc = Some(EmptyArrayAlloc::try_new(TREE_THRESHOLD)?); + } + if self.need_new_tree_alloc && self.args.new_tree_alloc.is_none() { + self.args.new_tree_alloc = Some(FromArrayAllocs::try_new(TREE_THRESHOLD)?); + } + if self.need_tree_alloc && self.args.tree_alloc.is_none() { + self.args.tree_alloc = Some(ReserveNewTreeAlloc::try_new()?); + } + Ok(self.args) + } +} diff --git a/drivers/android/binder/range_alloc/tree.rs b/drivers/android/binder/range_alloc/tree.rs new file mode 100644 index 000000000000..7b1a248fcb02 --- /dev/null +++ b/drivers/android/binder/range_alloc/tree.rs @@ -0,0 +1,488 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + page::PAGE_SIZE, + prelude::*, + rbtree::{RBTree, RBTreeNode, RBTreeNodeReservation}, + seq_file::SeqFile, + seq_print, + task::Pid, +}; + +use crate::range_alloc::{DescriptorState, FreedRange, Range}; + +/// Keeps track of allocations in a process' mmap. +/// +/// Each process has an mmap where the data for incoming transactions will be placed. This struct +/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that +/// has metadata related to the allocation. We also keep track of available free space. +pub(super) struct TreeRangeAllocator { + /// This collection contains descriptors for *both* ranges containing an allocation, *and* free + /// ranges between allocations. The free ranges get merged, so there are never two free ranges + /// next to each other. + tree: RBTree>, + /// Contains an entry for every free range in `self.tree`. This tree sorts the ranges by size, + /// letting us look up the smallest range whose size is at least some lower bound. + free_tree: RBTree, + size: usize, + free_oneway_space: usize, +} + +impl TreeRangeAllocator { + pub(crate) fn from_array( + size: usize, + ranges: &mut KVec>, + alloc: &mut FromArrayAllocs, + ) -> Self { + let mut tree = TreeRangeAllocator { + tree: RBTree::new(), + free_tree: RBTree::new(), + size, + free_oneway_space: size / 2, + }; + + let mut free_offset = 0; + for range in ranges.drain_all() { + let free_size = range.offset - free_offset; + if free_size > 0 { + let free_node = alloc.free_tree.pop().unwrap(); + tree.free_tree + .insert(free_node.into_node((free_size, free_offset), ())); + let tree_node = alloc.tree.pop().unwrap(); + tree.tree.insert( + tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)), + ); + } + free_offset = range.endpoint(); + + if range.state.is_oneway() { + tree.free_oneway_space = tree.free_oneway_space.saturating_sub(range.size); + } + + let free_res = alloc.free_tree.pop().unwrap(); + let tree_node = alloc.tree.pop().unwrap(); + let mut desc = Descriptor::new(range.offset, range.size); + desc.state = Some((range.state, free_res)); + tree.tree.insert(tree_node.into_node(range.offset, desc)); + } + + // After the last range, we may need a free range. + if free_offset < size { + let free_size = size - free_offset; + let free_node = alloc.free_tree.pop().unwrap(); + tree.free_tree + .insert(free_node.into_node((free_size, free_offset), ())); + let tree_node = alloc.tree.pop().unwrap(); + tree.tree + .insert(tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size))); + } + + tree + } + + pub(crate) fn is_empty(&self) -> bool { + let mut tree_iter = self.tree.values(); + // There's always at least one range, because index zero is either the start of a free or + // allocated range. + let first_value = tree_iter.next().unwrap(); + if tree_iter.next().is_some() { + // There are never two free ranges next to each other, so if there is more than one + // descriptor, then at least one of them must hold an allocated range. + return false; + } + // There is only one descriptor. Return true if it is for a free range. + first_value.state.is_none() + } + + pub(crate) fn total_size(&self) -> usize { + self.size + } + + pub(crate) fn free_oneway_space(&self) -> usize { + self.free_oneway_space + } + + pub(crate) fn count_buffers(&self) -> usize { + self.tree + .values() + .filter(|desc| desc.state.is_some()) + .count() + } + + pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> { + for desc in self.tree.values() { + let state = match &desc.state { + Some(state) => &state.0, + None => continue, + }; + seq_print!( + m, + " buffer: {} size {} pid {}", + desc.offset, + desc.size, + state.pid(), + ); + if state.is_oneway() { + seq_print!(m, " oneway"); + } + match state { + DescriptorState::Reserved(_res) => { + seq_print!(m, " reserved\n"); + } + DescriptorState::Allocated(_alloc) => { + seq_print!(m, " allocated\n"); + } + } + } + Ok(()) + } + + fn find_best_match(&mut self, size: usize) -> Option<&mut Descriptor> { + let free_cursor = self.free_tree.cursor_lower_bound(&(size, 0))?; + let ((_, offset), ()) = free_cursor.current(); + self.tree.get_mut(offset) + } + + /// Try to reserve a new buffer, using the provided allocation if necessary. + pub(crate) fn reserve_new( + &mut self, + debug_id: usize, + size: usize, + is_oneway: bool, + pid: Pid, + alloc: ReserveNewTreeAlloc, + ) -> Result<(usize, bool)> { + // Compute new value of free_oneway_space, which is set only on success. + let new_oneway_space = if is_oneway { + match self.free_oneway_space.checked_sub(size) { + Some(new_oneway_space) => new_oneway_space, + None => return Err(ENOSPC), + } + } else { + self.free_oneway_space + }; + + // Start detecting spammers once we have less than 20% + // of async space left (which is less than 10% of total + // buffer size). + // + // (This will short-circut, so `low_oneway_space` is + // only called when necessary.) + let oneway_spam_detected = + is_oneway && new_oneway_space < self.size / 10 && self.low_oneway_space(pid); + + let (found_size, found_off, tree_node, free_tree_node) = match self.find_best_match(size) { + None => { + pr_warn!("ENOSPC from range_alloc.reserve_new - size: {}", size); + return Err(ENOSPC); + } + Some(desc) => { + let found_size = desc.size; + let found_offset = desc.offset; + + // In case we need to break up the descriptor + let new_desc = Descriptor::new(found_offset + size, found_size - size); + let (tree_node, free_tree_node, desc_node_res) = alloc.initialize(new_desc); + + desc.state = Some(( + DescriptorState::new(is_oneway, debug_id, pid), + desc_node_res, + )); + desc.size = size; + + (found_size, found_offset, tree_node, free_tree_node) + } + }; + self.free_oneway_space = new_oneway_space; + self.free_tree.remove(&(found_size, found_off)); + + if found_size != size { + self.tree.insert(tree_node); + self.free_tree.insert(free_tree_node); + } + + Ok((found_off, oneway_spam_detected)) + } + + pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result { + let mut cursor = self.tree.cursor_lower_bound(&offset).ok_or_else(|| { + pr_warn!( + "EINVAL from range_alloc.reservation_abort - offset: {}", + offset + ); + EINVAL + })?; + + let (_, desc) = cursor.current_mut(); + + if desc.offset != offset { + pr_warn!( + "EINVAL from range_alloc.reservation_abort - offset: {}", + offset + ); + return Err(EINVAL); + } + + let (reservation, free_node_res) = desc.try_change_state(|state| match state { + Some((DescriptorState::Reserved(reservation), free_node_res)) => { + (None, Ok((reservation, free_node_res))) + } + None => { + pr_warn!( + "EINVAL from range_alloc.reservation_abort - offset: {}", + offset + ); + (None, Err(EINVAL)) + } + allocated => { + pr_warn!( + "EPERM from range_alloc.reservation_abort - offset: {}", + offset + ); + (allocated, Err(EPERM)) + } + })?; + + let mut size = desc.size; + let mut offset = desc.offset; + let free_oneway_space_add = if reservation.is_oneway { size } else { 0 }; + + self.free_oneway_space += free_oneway_space_add; + + let mut freed_range = FreedRange::interior_pages(offset, size); + // Compute how large the next free region needs to be to include one more page in + // the newly freed range. + let add_next_page_needed = match (offset + size) % PAGE_SIZE { + 0 => usize::MAX, + unalign => PAGE_SIZE - unalign, + }; + // Compute how large the previous free region needs to be to include one more page + // in the newly freed range. + let add_prev_page_needed = match offset % PAGE_SIZE { + 0 => usize::MAX, + unalign => unalign, + }; + + // Merge next into current if next is free + let remove_next = match cursor.peek_next() { + Some((_, next)) if next.state.is_none() => { + if next.size >= add_next_page_needed { + freed_range.end_page_idx += 1; + } + self.free_tree.remove(&(next.size, next.offset)); + size += next.size; + true + } + _ => false, + }; + + if remove_next { + let (_, desc) = cursor.current_mut(); + desc.size = size; + cursor.remove_next(); + } + + // Merge current into prev if prev is free + match cursor.peek_prev_mut() { + Some((_, prev)) if prev.state.is_none() => { + if prev.size >= add_prev_page_needed { + freed_range.start_page_idx -= 1; + } + // merge previous with current, remove current + self.free_tree.remove(&(prev.size, prev.offset)); + offset = prev.offset; + size += prev.size; + prev.size = size; + cursor.remove_current(); + } + _ => {} + }; + + self.free_tree + .insert(free_node_res.into_node((size, offset), ())); + + Ok(freed_range) + } + + pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option) -> Result { + let desc = self.tree.get_mut(&offset).ok_or(ENOENT)?; + + desc.try_change_state(|state| match state { + Some((DescriptorState::Reserved(reservation), free_node_res)) => ( + Some(( + DescriptorState::Allocated(reservation.allocate(data.take())), + free_node_res, + )), + Ok(()), + ), + other => (other, Err(ENOENT)), + }) + } + + /// Takes an entry at the given offset from [`DescriptorState::Allocated`] to + /// [`DescriptorState::Reserved`]. + /// + /// Returns the size of the existing entry and the data associated with it. + pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option)> { + let desc = self.tree.get_mut(&offset).ok_or_else(|| { + pr_warn!( + "ENOENT from range_alloc.reserve_existing - offset: {}", + offset + ); + ENOENT + })?; + + let (debug_id, data) = desc.try_change_state(|state| match state { + Some((DescriptorState::Allocated(allocation), free_node_res)) => { + let (reservation, data) = allocation.deallocate(); + let debug_id = reservation.debug_id; + ( + Some((DescriptorState::Reserved(reservation), free_node_res)), + Ok((debug_id, data)), + ) + } + other => { + pr_warn!( + "ENOENT from range_alloc.reserve_existing - offset: {}", + offset + ); + (other, Err(ENOENT)) + } + })?; + + Ok((desc.size, debug_id, data)) + } + + /// Call the provided callback at every allocated region. + /// + /// This destroys the range allocator. Used only during shutdown. + pub(crate) fn take_for_each)>(&mut self, callback: F) { + for (_, desc) in self.tree.iter_mut() { + if let Some((DescriptorState::Allocated(allocation), _)) = &mut desc.state { + callback( + desc.offset, + desc.size, + allocation.debug_id(), + allocation.take(), + ); + } + } + } + + /// Find the amount and size of buffers allocated by the current caller. + /// + /// The idea is that once we cross the threshold, whoever is responsible + /// for the low async space is likely to try to send another async transaction, + /// and at some point we'll catch them in the act. This is more efficient + /// than keeping a map per pid. + fn low_oneway_space(&self, calling_pid: Pid) -> bool { + let mut total_alloc_size = 0; + let mut num_buffers = 0; + for (_, desc) in self.tree.iter() { + if let Some((state, _)) = &desc.state { + if state.is_oneway() && state.pid() == calling_pid { + total_alloc_size += desc.size; + num_buffers += 1; + } + } + } + + // Warn if this pid has more than 50 transactions, or more than 50% of + // async space (which is 25% of total buffer size). Oneway spam is only + // detected when the threshold is exceeded. + num_buffers > 50 || total_alloc_size > self.size / 4 + } +} + +type TreeDescriptorState = (DescriptorState, FreeNodeRes); +struct Descriptor { + size: usize, + offset: usize, + state: Option>, +} + +impl Descriptor { + fn new(offset: usize, size: usize) -> Self { + Self { + size, + offset, + state: None, + } + } + + fn try_change_state(&mut self, f: F) -> Result + where + F: FnOnce(Option>) -> (Option>, Result), + { + let (new_state, result) = f(self.state.take()); + self.state = new_state; + result + } +} + +// (Descriptor.size, Descriptor.offset) +type FreeKey = (usize, usize); +type FreeNodeRes = RBTreeNodeReservation; + +/// An allocation for use by `reserve_new`. +pub(crate) struct ReserveNewTreeAlloc { + tree_node_res: RBTreeNodeReservation>, + free_tree_node_res: FreeNodeRes, + desc_node_res: FreeNodeRes, +} + +impl ReserveNewTreeAlloc { + pub(crate) fn try_new() -> Result { + let tree_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?; + let free_tree_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?; + let desc_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?; + Ok(Self { + tree_node_res, + free_tree_node_res, + desc_node_res, + }) + } + + fn initialize( + self, + desc: Descriptor, + ) -> ( + RBTreeNode>, + RBTreeNode, + FreeNodeRes, + ) { + let size = desc.size; + let offset = desc.offset; + ( + self.tree_node_res.into_node(offset, desc), + self.free_tree_node_res.into_node((size, offset), ()), + self.desc_node_res, + ) + } +} + +/// An allocation for creating a tree from an `ArrayRangeAllocator`. +pub(crate) struct FromArrayAllocs { + tree: KVec>>, + free_tree: KVec>, +} + +impl FromArrayAllocs { + pub(crate) fn try_new(len: usize) -> Result { + let num_descriptors = 2 * len + 1; + + let mut tree = KVec::with_capacity(num_descriptors, GFP_KERNEL)?; + for _ in 0..num_descriptors { + tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?; + } + + let mut free_tree = KVec::with_capacity(num_descriptors, GFP_KERNEL)?; + for _ in 0..num_descriptors { + free_tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?; + } + + Ok(Self { tree, free_tree }) + } +} diff --git a/drivers/android/binder/rust_binder.h b/drivers/android/binder/rust_binder.h new file mode 100644 index 000000000000..31806890ed1a --- /dev/null +++ b/drivers/android/binder/rust_binder.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Google, Inc. + */ + +#ifndef _LINUX_RUST_BINDER_H +#define _LINUX_RUST_BINDER_H + +#include +#include + +/* + * These symbols are exposed by `rust_binderfs.c` and exist here so that Rust + * Binder can call them. + */ +int init_rust_binderfs(void); + +struct dentry; +struct inode; +struct dentry *rust_binderfs_create_proc_file(struct inode *nodp, int pid); +void rust_binderfs_remove_file(struct dentry *dentry); + +#endif diff --git a/drivers/android/binder/rust_binder_events.c b/drivers/android/binder/rust_binder_events.c new file mode 100644 index 000000000000..488b1470060c --- /dev/null +++ b/drivers/android/binder/rust_binder_events.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* rust_binder_events.c + * + * Rust Binder tracepoints. + * + * Copyright 2025 Google LLC + */ + +#include "rust_binder.h" + +const char * const binder_command_strings[] = { + "BC_TRANSACTION", + "BC_REPLY", + "BC_ACQUIRE_RESULT", + "BC_FREE_BUFFER", + "BC_INCREFS", + "BC_ACQUIRE", + "BC_RELEASE", + "BC_DECREFS", + "BC_INCREFS_DONE", + "BC_ACQUIRE_DONE", + "BC_ATTEMPT_ACQUIRE", + "BC_REGISTER_LOOPER", + "BC_ENTER_LOOPER", + "BC_EXIT_LOOPER", + "BC_REQUEST_DEATH_NOTIFICATION", + "BC_CLEAR_DEATH_NOTIFICATION", + "BC_DEAD_BINDER_DONE", + "BC_TRANSACTION_SG", + "BC_REPLY_SG", +}; + +const char * const binder_return_strings[] = { + "BR_ERROR", + "BR_OK", + "BR_TRANSACTION", + "BR_REPLY", + "BR_ACQUIRE_RESULT", + "BR_DEAD_REPLY", + "BR_TRANSACTION_COMPLETE", + "BR_INCREFS", + "BR_ACQUIRE", + "BR_RELEASE", + "BR_DECREFS", + "BR_ATTEMPT_ACQUIRE", + "BR_NOOP", + "BR_SPAWN_LOOPER", + "BR_FINISHED", + "BR_DEAD_BINDER", + "BR_CLEAR_DEATH_NOTIFICATION_DONE", + "BR_FAILED_REPLY", + "BR_FROZEN_REPLY", + "BR_ONEWAY_SPAM_SUSPECT", + "BR_TRANSACTION_PENDING_FROZEN" +}; + +#define CREATE_TRACE_POINTS +#define CREATE_RUST_TRACE_POINTS +#include "rust_binder_events.h" diff --git a/drivers/android/binder/rust_binder_events.h b/drivers/android/binder/rust_binder_events.h new file mode 100644 index 000000000000..2f3efbf9dba6 --- /dev/null +++ b/drivers/android/binder/rust_binder_events.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Google, Inc. + */ + +#undef TRACE_SYSTEM +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_SYSTEM rust_binder +#define TRACE_INCLUDE_FILE rust_binder_events +#define TRACE_INCLUDE_PATH ../drivers/android/binder + +#if !defined(_RUST_BINDER_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _RUST_BINDER_TRACE_H + +#include + +TRACE_EVENT(rust_binder_ioctl, + TP_PROTO(unsigned int cmd, unsigned long arg), + TP_ARGS(cmd, arg), + + TP_STRUCT__entry( + __field(unsigned int, cmd) + __field(unsigned long, arg) + ), + TP_fast_assign( + __entry->cmd = cmd; + __entry->arg = arg; + ), + TP_printk("cmd=0x%x arg=0x%lx", __entry->cmd, __entry->arg) +); + +#endif /* _RUST_BINDER_TRACE_H */ + +/* This part must be outside protection */ +#include diff --git a/drivers/android/binder/rust_binder_internal.h b/drivers/android/binder/rust_binder_internal.h new file mode 100644 index 000000000000..78288fe7964d --- /dev/null +++ b/drivers/android/binder/rust_binder_internal.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* rust_binder_internal.h + * + * This file contains internal data structures used by Rust Binder. Mostly, + * these are type definitions used only by binderfs or things that Rust Binder + * define and export to binderfs. + * + * It does not include things exported by binderfs to Rust Binder since this + * file is not included as input to bindgen. + * + * Copyright (C) 2025 Google LLC. + */ + +#ifndef _LINUX_RUST_BINDER_INTERNAL_H +#define _LINUX_RUST_BINDER_INTERNAL_H + +#define RUST_BINDERFS_SUPER_MAGIC 0x6c6f6f71 + +#include +#include +#include + +/* + * The internal data types in the Rust Binder driver are opaque to C, so we use + * void pointer typedefs for these types. + */ +typedef void *rust_binder_context; + +/** + * struct binder_device - information about a binder device node + * @minor: the minor number used by this device + * @ctx: the Rust Context used by this device, or null for binder-control + * + * This is used as the private data for files directly in binderfs, but not + * files in the binder_logs subdirectory. This struct owns a refcount on `ctx` + * and the entry for `minor` in `binderfs_minors`. For binder-control `ctx` is + * null. + */ +struct binder_device { + int minor; + rust_binder_context ctx; +}; + +int rust_binder_stats_show(struct seq_file *m, void *unused); +int rust_binder_state_show(struct seq_file *m, void *unused); +int rust_binder_transactions_show(struct seq_file *m, void *unused); +int rust_binder_proc_show(struct seq_file *m, void *pid); + +extern const struct file_operations rust_binder_fops; +rust_binder_context rust_binder_new_context(char *name); +void rust_binder_remove_context(rust_binder_context device); + +/** + * binderfs_mount_opts - mount options for binderfs + * @max: maximum number of allocatable binderfs binder devices + * @stats_mode: enable binder stats in binderfs. + */ +struct binderfs_mount_opts { + int max; + int stats_mode; +}; + +/** + * binderfs_info - information about a binderfs mount + * @ipc_ns: The ipc namespace the binderfs mount belongs to. + * @control_dentry: This records the dentry of this binderfs mount + * binder-control device. + * @root_uid: uid that needs to be used when a new binder device is + * created. + * @root_gid: gid that needs to be used when a new binder device is + * created. + * @mount_opts: The mount options in use. + * @device_count: The current number of allocated binder devices. + * @proc_log_dir: Pointer to the directory dentry containing process-specific + * logs. + */ +struct binderfs_info { + struct ipc_namespace *ipc_ns; + struct dentry *control_dentry; + kuid_t root_uid; + kgid_t root_gid; + struct binderfs_mount_opts mount_opts; + int device_count; + struct dentry *proc_log_dir; +}; + +#endif /* _LINUX_RUST_BINDER_INTERNAL_H */ diff --git a/drivers/android/binder/rust_binder_main.rs b/drivers/android/binder/rust_binder_main.rs new file mode 100644 index 000000000000..6773b7c273ec --- /dev/null +++ b/drivers/android/binder/rust_binder_main.rs @@ -0,0 +1,627 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Binder -- the Android IPC mechanism. +#![recursion_limit = "256"] +#![allow( + clippy::as_underscore, + clippy::ref_as_ptr, + clippy::ptr_as_ptr, + clippy::cast_lossless +)] + +use kernel::{ + bindings::{self, seq_file}, + fs::File, + list::{ListArc, ListArcSafe, ListLinksSelfPtr, TryNewListArc}, + prelude::*, + seq_file::SeqFile, + seq_print, + sync::poll::PollTable, + sync::Arc, + task::Pid, + transmute::AsBytes, + types::ForeignOwnable, + uaccess::UserSliceWriter, +}; + +use crate::{context::Context, page_range::Shrinker, process::Process, thread::Thread}; + +use core::{ + ptr::NonNull, + sync::atomic::{AtomicBool, AtomicUsize, Ordering}, +}; + +mod allocation; +mod context; +mod deferred_close; +mod defs; +mod error; +mod node; +mod page_range; +mod process; +mod range_alloc; +mod stats; +mod thread; +mod trace; +mod transaction; + +#[allow(warnings)] // generated bindgen code +mod binderfs { + use kernel::bindings::{dentry, inode}; + + extern "C" { + pub fn init_rust_binderfs() -> kernel::ffi::c_int; + } + extern "C" { + pub fn rust_binderfs_create_proc_file( + nodp: *mut inode, + pid: kernel::ffi::c_int, + ) -> *mut dentry; + } + extern "C" { + pub fn rust_binderfs_remove_file(dentry: *mut dentry); + } + pub type rust_binder_context = *mut kernel::ffi::c_void; + #[repr(C)] + #[derive(Copy, Clone)] + pub struct binder_device { + pub minor: kernel::ffi::c_int, + pub ctx: rust_binder_context, + } + impl Default for binder_device { + fn default() -> Self { + let mut s = ::core::mem::MaybeUninit::::uninit(); + unsafe { + ::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } + } + } +} + +module! { + type: BinderModule, + name: "rust_binder", + authors: ["Wedson Almeida Filho", "Alice Ryhl"], + description: "Android Binder", + license: "GPL", +} + +fn next_debug_id() -> usize { + static NEXT_DEBUG_ID: AtomicUsize = AtomicUsize::new(0); + + NEXT_DEBUG_ID.fetch_add(1, Ordering::Relaxed) +} + +/// Provides a single place to write Binder return values via the +/// supplied `UserSliceWriter`. +pub(crate) struct BinderReturnWriter<'a> { + writer: UserSliceWriter, + thread: &'a Thread, +} + +impl<'a> BinderReturnWriter<'a> { + fn new(writer: UserSliceWriter, thread: &'a Thread) -> Self { + BinderReturnWriter { writer, thread } + } + + /// Write a return code back to user space. + /// Should be a `BR_` constant from [`defs`] e.g. [`defs::BR_TRANSACTION_COMPLETE`]. + fn write_code(&mut self, code: u32) -> Result { + stats::GLOBAL_STATS.inc_br(code); + self.thread.process.stats.inc_br(code); + self.writer.write(&code) + } + + /// Write something *other than* a return code to user space. + fn write_payload(&mut self, payload: &T) -> Result { + self.writer.write(payload) + } + + fn len(&self) -> usize { + self.writer.len() + } +} + +/// Specifies how a type should be delivered to the read part of a BINDER_WRITE_READ ioctl. +/// +/// When a value is pushed to the todo list for a process or thread, it is stored as a trait object +/// with the type `Arc`. Trait objects are a Rust feature that lets you +/// implement dynamic dispatch over many different types. This lets us store many different types +/// in the todo list. +trait DeliverToRead: ListArcSafe + Send + Sync { + /// Performs work. Returns true if remaining work items in the queue should be processed + /// immediately, or false if it should return to caller before processing additional work + /// items. + fn do_work( + self: DArc, + thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result; + + /// Cancels the given work item. This is called instead of [`DeliverToRead::do_work`] when work + /// won't be delivered. + fn cancel(self: DArc); + + /// Should we use `wake_up_interruptible_sync` or `wake_up_interruptible` when scheduling this + /// work item? + /// + /// Generally only set to true for non-oneway transactions. + fn should_sync_wakeup(&self) -> bool; + + fn debug_print(&self, m: &SeqFile, prefix: &str, transaction_prefix: &str) -> Result<()>; +} + +// Wrapper around a `DeliverToRead` with linked list links. +#[pin_data] +struct DTRWrap { + #[pin] + links: ListLinksSelfPtr>, + #[pin] + wrapped: T, +} +kernel::list::impl_list_arc_safe! { + impl{T: ListArcSafe + ?Sized} ListArcSafe<0> for DTRWrap { + tracked_by wrapped: T; + } +} +kernel::list::impl_list_item! { + impl ListItem<0> for DTRWrap { + using ListLinksSelfPtr { self.links }; + } +} + +impl core::ops::Deref for DTRWrap { + type Target = T; + fn deref(&self) -> &T { + &self.wrapped + } +} + +type DArc = kernel::sync::Arc>; +type DLArc = kernel::list::ListArc>; + +impl DTRWrap { + fn new(val: impl PinInit) -> impl PinInit { + pin_init!(Self { + links <- ListLinksSelfPtr::new(), + wrapped <- val, + }) + } + + fn arc_try_new(val: T) -> Result, kernel::alloc::AllocError> { + ListArc::pin_init( + try_pin_init!(Self { + links <- ListLinksSelfPtr::new(), + wrapped: val, + }), + GFP_KERNEL, + ) + .map_err(|_| kernel::alloc::AllocError) + } + + fn arc_pin_init(init: impl PinInit) -> Result, kernel::error::Error> { + ListArc::pin_init( + try_pin_init!(Self { + links <- ListLinksSelfPtr::new(), + wrapped <- init, + }), + GFP_KERNEL, + ) + } +} + +struct DeliverCode { + code: u32, + skip: AtomicBool, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for DeliverCode { untracked; } +} + +impl DeliverCode { + fn new(code: u32) -> Self { + Self { + code, + skip: AtomicBool::new(false), + } + } + + /// Disable this DeliverCode and make it do nothing. + /// + /// This is used instead of removing it from the work list, since `LinkedList::remove` is + /// unsafe, whereas this method is not. + fn skip(&self) { + self.skip.store(true, Ordering::Relaxed); + } +} + +impl DeliverToRead for DeliverCode { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + if !self.skip.load(Ordering::Relaxed) { + writer.write_code(self.code)?; + } + Ok(true) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!(m, "{}", prefix); + if self.skip.load(Ordering::Relaxed) { + seq_print!(m, "(skipped) "); + } + if self.code == defs::BR_TRANSACTION_COMPLETE { + seq_print!(m, "transaction complete\n"); + } else { + seq_print!(m, "transaction error: {}\n", self.code); + } + Ok(()) + } +} + +fn ptr_align(value: usize) -> Option { + let size = core::mem::size_of::() - 1; + Some(value.checked_add(size)? & !size) +} + +// SAFETY: We call register in `init`. +static BINDER_SHRINKER: Shrinker = unsafe { Shrinker::new() }; + +struct BinderModule {} + +impl kernel::Module for BinderModule { + fn init(_module: &'static kernel::ThisModule) -> Result { + // SAFETY: The module initializer never runs twice, so we only call this once. + unsafe { crate::context::CONTEXTS.init() }; + + pr_warn!("Loaded Rust Binder."); + + BINDER_SHRINKER.register(kernel::c_str!("android-binder"))?; + + // SAFETY: The module is being loaded, so we can initialize binderfs. + unsafe { kernel::error::to_result(binderfs::init_rust_binderfs())? }; + + Ok(Self {}) + } +} + +/// Makes the inner type Sync. +#[repr(transparent)] +pub struct AssertSync(T); +// SAFETY: Used only to insert `file_operations` into a global, which is safe. +unsafe impl Sync for AssertSync {} + +/// File operations that rust_binderfs.c can use. +#[no_mangle] +#[used] +pub static rust_binder_fops: AssertSync = { + // SAFETY: All zeroes is safe for the `file_operations` type. + let zeroed_ops = unsafe { core::mem::MaybeUninit::zeroed().assume_init() }; + + let ops = kernel::bindings::file_operations { + owner: THIS_MODULE.as_ptr(), + poll: Some(rust_binder_poll), + unlocked_ioctl: Some(rust_binder_unlocked_ioctl), + compat_ioctl: Some(rust_binder_compat_ioctl), + mmap: Some(rust_binder_mmap), + open: Some(rust_binder_open), + release: Some(rust_binder_release), + flush: Some(rust_binder_flush), + ..zeroed_ops + }; + AssertSync(ops) +}; + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_new_context( + name: *const kernel::ffi::c_char, +) -> *mut kernel::ffi::c_void { + // SAFETY: The caller will always provide a valid c string here. + let name = unsafe { kernel::str::CStr::from_char_ptr(name) }; + match Context::new(name) { + Ok(ctx) => Arc::into_foreign(ctx), + Err(_err) => core::ptr::null_mut(), + } +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_remove_context(device: *mut kernel::ffi::c_void) { + if !device.is_null() { + // SAFETY: The caller ensures that the `device` pointer came from a previous call to + // `rust_binder_new_device`. + let ctx = unsafe { Arc::::from_foreign(device) }; + ctx.deregister(); + drop(ctx); + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_open( + inode: *mut bindings::inode, + file_ptr: *mut bindings::file, +) -> kernel::ffi::c_int { + // SAFETY: The `rust_binderfs.c` file ensures that `i_private` is set to a + // `struct binder_device`. + let device = unsafe { (*inode).i_private } as *const binderfs::binder_device; + + assert!(!device.is_null()); + + // SAFETY: The `rust_binderfs.c` file ensures that `device->ctx` holds a binder context when + // using the rust binder fops. + let ctx = unsafe { Arc::::borrow((*device).ctx) }; + + // SAFETY: The caller provides a valid file pointer to a new `struct file`. + let file = unsafe { File::from_raw_file(file_ptr) }; + let process = match Process::open(ctx, file) { + Ok(process) => process, + Err(err) => return err.to_errno(), + }; + + // SAFETY: This is an `inode` for a newly created binder file. + match unsafe { BinderfsProcFile::new(inode, process.task.pid()) } { + Ok(Some(file)) => process.inner.lock().binderfs_file = Some(file), + Ok(None) => { /* pid already exists */ } + Err(err) => return err.to_errno(), + } + + // SAFETY: This file is associated with Rust binder, so we own the `private_data` field. + unsafe { (*file_ptr).private_data = process.into_foreign() }; + 0 +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_release( + _inode: *mut bindings::inode, + file: *mut bindings::file, +) -> kernel::ffi::c_int { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let process = unsafe { Arc::::from_foreign((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + let file = unsafe { File::from_raw_file(file) }; + Process::release(process, file); + 0 +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_compat_ioctl( + file: *mut bindings::file, + cmd: kernel::ffi::c_uint, + arg: kernel::ffi::c_ulong, +) -> kernel::ffi::c_long { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + match Process::compat_ioctl(f, unsafe { File::from_raw_file(file) }, cmd as _, arg as _) { + Ok(()) => 0, + Err(err) => err.to_errno() as isize, + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_unlocked_ioctl( + file: *mut bindings::file, + cmd: kernel::ffi::c_uint, + arg: kernel::ffi::c_ulong, +) -> kernel::ffi::c_long { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + match Process::ioctl(f, unsafe { File::from_raw_file(file) }, cmd as _, arg as _) { + Ok(()) => 0, + Err(err) => err.to_errno() as isize, + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_mmap( + file: *mut bindings::file, + vma: *mut bindings::vm_area_struct, +) -> kernel::ffi::c_int { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the vma is valid. + let area = unsafe { kernel::mm::virt::VmaNew::from_raw(vma) }; + // SAFETY: The caller ensures that the file is valid. + match Process::mmap(f, unsafe { File::from_raw_file(file) }, area) { + Ok(()) => 0, + Err(err) => err.to_errno(), + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_poll( + file: *mut bindings::file, + wait: *mut bindings::poll_table_struct, +) -> bindings::__poll_t { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + let fileref = unsafe { File::from_raw_file(file) }; + // SAFETY: The caller ensures that the `PollTable` is valid. + match Process::poll(f, fileref, unsafe { PollTable::from_raw(wait) }) { + Ok(v) => v, + Err(_) => bindings::POLLERR, + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_flush( + file: *mut bindings::file, + _id: bindings::fl_owner_t, +) -> kernel::ffi::c_int { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + match Process::flush(f) { + Ok(()) => 0, + Err(err) => err.to_errno(), + } +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_stats_show( + ptr: *mut seq_file, + _: *mut kernel::ffi::c_void, +) -> kernel::ffi::c_int { + // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which + // this method is called. + let m = unsafe { SeqFile::from_raw(ptr) }; + if let Err(err) = rust_binder_stats_show_impl(m) { + seq_print!(m, "failed to generate state: {:?}\n", err); + } + 0 +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_state_show( + ptr: *mut seq_file, + _: *mut kernel::ffi::c_void, +) -> kernel::ffi::c_int { + // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which + // this method is called. + let m = unsafe { SeqFile::from_raw(ptr) }; + if let Err(err) = rust_binder_state_show_impl(m) { + seq_print!(m, "failed to generate state: {:?}\n", err); + } + 0 +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_proc_show( + ptr: *mut seq_file, + _: *mut kernel::ffi::c_void, +) -> kernel::ffi::c_int { + // SAFETY: Accessing the private field of `seq_file` is okay. + let pid = (unsafe { (*ptr).private }) as usize as Pid; + // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which + // this method is called. + let m = unsafe { SeqFile::from_raw(ptr) }; + if let Err(err) = rust_binder_proc_show_impl(m, pid) { + seq_print!(m, "failed to generate state: {:?}\n", err); + } + 0 +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_transactions_show( + ptr: *mut seq_file, + _: *mut kernel::ffi::c_void, +) -> kernel::ffi::c_int { + // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which + // this method is called. + let m = unsafe { SeqFile::from_raw(ptr) }; + if let Err(err) = rust_binder_transactions_show_impl(m) { + seq_print!(m, "failed to generate state: {:?}\n", err); + } + 0 +} + +fn rust_binder_transactions_show_impl(m: &SeqFile) -> Result<()> { + seq_print!(m, "binder transactions:\n"); + let contexts = context::get_all_contexts()?; + for ctx in contexts { + let procs = ctx.get_all_procs()?; + for proc in procs { + proc.debug_print(m, &ctx, false)?; + seq_print!(m, "\n"); + } + } + Ok(()) +} + +fn rust_binder_stats_show_impl(m: &SeqFile) -> Result<()> { + seq_print!(m, "binder stats:\n"); + stats::GLOBAL_STATS.debug_print("", m); + let contexts = context::get_all_contexts()?; + for ctx in contexts { + let procs = ctx.get_all_procs()?; + for proc in procs { + proc.debug_print_stats(m, &ctx)?; + seq_print!(m, "\n"); + } + } + Ok(()) +} + +fn rust_binder_state_show_impl(m: &SeqFile) -> Result<()> { + seq_print!(m, "binder state:\n"); + let contexts = context::get_all_contexts()?; + for ctx in contexts { + let procs = ctx.get_all_procs()?; + for proc in procs { + proc.debug_print(m, &ctx, true)?; + seq_print!(m, "\n"); + } + } + Ok(()) +} + +fn rust_binder_proc_show_impl(m: &SeqFile, pid: Pid) -> Result<()> { + seq_print!(m, "binder proc state:\n"); + let contexts = context::get_all_contexts()?; + for ctx in contexts { + let procs = ctx.get_procs_with_pid(pid)?; + for proc in procs { + proc.debug_print(m, &ctx, true)?; + seq_print!(m, "\n"); + } + } + Ok(()) +} + +struct BinderfsProcFile(NonNull); + +// SAFETY: Safe to drop any thread. +unsafe impl Send for BinderfsProcFile {} + +impl BinderfsProcFile { + /// # Safety + /// + /// Takes an inode from a newly created binder file. + unsafe fn new(nodp: *mut bindings::inode, pid: i32) -> Result> { + // SAFETY: The caller passes an `inode` for a newly created binder file. + let dentry = unsafe { binderfs::rust_binderfs_create_proc_file(nodp, pid) }; + match kernel::error::from_err_ptr(dentry) { + Ok(dentry) => Ok(NonNull::new(dentry).map(Self)), + Err(err) if err == EEXIST => Ok(None), + Err(err) => Err(err), + } + } +} + +impl Drop for BinderfsProcFile { + fn drop(&mut self) { + // SAFETY: This is a dentry from `rust_binderfs_remove_file` that has not been deleted yet. + unsafe { binderfs::rust_binderfs_remove_file(self.0.as_ptr()) }; + } +} diff --git a/drivers/android/binder/rust_binderfs.c b/drivers/android/binder/rust_binderfs.c new file mode 100644 index 000000000000..6b497146b698 --- /dev/null +++ b/drivers/android/binder/rust_binderfs.c @@ -0,0 +1,850 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rust_binder.h" +#include "rust_binder_internal.h" + +#define FIRST_INODE 1 +#define SECOND_INODE 2 +#define INODE_OFFSET 3 +#define BINDERFS_MAX_MINOR (1U << MINORBITS) +/* Ensure that the initial ipc namespace always has devices available. */ +#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 4) + +DEFINE_SHOW_ATTRIBUTE(rust_binder_stats); +DEFINE_SHOW_ATTRIBUTE(rust_binder_state); +DEFINE_SHOW_ATTRIBUTE(rust_binder_transactions); +DEFINE_SHOW_ATTRIBUTE(rust_binder_proc); + +char *rust_binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; +module_param_named(rust_devices, rust_binder_devices_param, charp, 0444); + +static dev_t binderfs_dev; +static DEFINE_MUTEX(binderfs_minors_mutex); +static DEFINE_IDA(binderfs_minors); + +enum binderfs_param { + Opt_max, + Opt_stats_mode, +}; + +enum binderfs_stats_mode { + binderfs_stats_mode_unset, + binderfs_stats_mode_global, +}; + +struct binder_features { + bool oneway_spam_detection; + bool extended_error; + bool freeze_notification; +}; + +static const struct constant_table binderfs_param_stats[] = { + { "global", binderfs_stats_mode_global }, + {} +}; + +static const struct fs_parameter_spec binderfs_fs_parameters[] = { + fsparam_u32("max", Opt_max), + fsparam_enum("stats", Opt_stats_mode, binderfs_param_stats), + {} +}; + +static struct binder_features binder_features = { + .oneway_spam_detection = true, + .extended_error = true, + .freeze_notification = true, +}; + +static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +/** + * binderfs_binder_device_create - allocate inode from super block of a + * binderfs mount + * @ref_inode: inode from wich the super block will be taken + * @userp: buffer to copy information about new device for userspace to + * @req: struct binderfs_device as copied from userspace + * + * This function allocates a new binder_device and reserves a new minor + * number for it. + * Minor numbers are limited and tracked globally in binderfs_minors. The + * function will stash a struct binder_device for the specific binder + * device in i_private of the inode. + * It will go on to allocate a new inode from the super block of the + * filesystem mount, stash a struct binder_device in its i_private field + * and attach a dentry to that inode. + * + * Return: 0 on success, negative errno on failure + */ +static int binderfs_binder_device_create(struct inode *ref_inode, + struct binderfs_device __user *userp, + struct binderfs_device *req) +{ + int minor, ret; + struct dentry *dentry, *root; + struct binder_device *device = NULL; + rust_binder_context ctx = NULL; + struct inode *inode = NULL; + struct super_block *sb = ref_inode->i_sb; + struct binderfs_info *info = sb->s_fs_info; +#if defined(CONFIG_IPC_NS) + bool use_reserve = (info->ipc_ns == &init_ipc_ns); +#else + bool use_reserve = true; +#endif + + /* Reserve new minor number for the new device. */ + mutex_lock(&binderfs_minors_mutex); + if (++info->device_count <= info->mount_opts.max) + minor = ida_alloc_max(&binderfs_minors, + use_reserve ? BINDERFS_MAX_MINOR : + BINDERFS_MAX_MINOR_CAPPED, + GFP_KERNEL); + else + minor = -ENOSPC; + if (minor < 0) { + --info->device_count; + mutex_unlock(&binderfs_minors_mutex); + return minor; + } + mutex_unlock(&binderfs_minors_mutex); + + ret = -ENOMEM; + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) + goto err; + + req->name[BINDERFS_MAX_NAME] = '\0'; /* NUL-terminate */ + + ctx = rust_binder_new_context(req->name); + if (!ctx) + goto err; + + inode = new_inode(sb); + if (!inode) + goto err; + + inode->i_ino = minor + INODE_OFFSET; + simple_inode_init_ts(inode); + init_special_inode(inode, S_IFCHR | 0600, + MKDEV(MAJOR(binderfs_dev), minor)); + inode->i_fop = &rust_binder_fops; + inode->i_uid = info->root_uid; + inode->i_gid = info->root_gid; + + req->major = MAJOR(binderfs_dev); + req->minor = minor; + device->ctx = ctx; + device->minor = minor; + + if (userp && copy_to_user(userp, req, sizeof(*req))) { + ret = -EFAULT; + goto err; + } + + root = sb->s_root; + inode_lock(d_inode(root)); + + /* look it up */ + dentry = lookup_noperm(&QSTR(req->name), root); + if (IS_ERR(dentry)) { + inode_unlock(d_inode(root)); + ret = PTR_ERR(dentry); + goto err; + } + + if (d_really_is_positive(dentry)) { + /* already exists */ + dput(dentry); + inode_unlock(d_inode(root)); + ret = -EEXIST; + goto err; + } + + inode->i_private = device; + d_instantiate(dentry, inode); + fsnotify_create(root->d_inode, dentry); + inode_unlock(d_inode(root)); + + return 0; + +err: + kfree(device); + rust_binder_remove_context(ctx); + mutex_lock(&binderfs_minors_mutex); + --info->device_count; + ida_free(&binderfs_minors, minor); + mutex_unlock(&binderfs_minors_mutex); + iput(inode); + + return ret; +} + +/** + * binder_ctl_ioctl - handle binder device node allocation requests + * + * The request handler for the binder-control device. All requests operate on + * the binderfs mount the binder-control device resides in: + * - BINDER_CTL_ADD + * Allocate a new binder device. + * + * Return: %0 on success, negative errno on failure. + */ +static long binder_ctl_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret = -EINVAL; + struct inode *inode = file_inode(file); + struct binderfs_device __user *device = (struct binderfs_device __user *)arg; + struct binderfs_device device_req; + + switch (cmd) { + case BINDER_CTL_ADD: + ret = copy_from_user(&device_req, device, sizeof(device_req)); + if (ret) { + ret = -EFAULT; + break; + } + + ret = binderfs_binder_device_create(inode, device, &device_req); + break; + default: + break; + } + + return ret; +} + +static void binderfs_evict_inode(struct inode *inode) +{ + struct binder_device *device = inode->i_private; + struct binderfs_info *info = BINDERFS_SB(inode->i_sb); + + clear_inode(inode); + + if (!S_ISCHR(inode->i_mode) || !device) + return; + + mutex_lock(&binderfs_minors_mutex); + --info->device_count; + ida_free(&binderfs_minors, device->minor); + mutex_unlock(&binderfs_minors_mutex); + + /* ctx is null for binder-control, but this function ignores null pointers */ + rust_binder_remove_context(device->ctx); + + kfree(device); +} + +static int binderfs_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + int opt; + struct binderfs_mount_opts *ctx = fc->fs_private; + struct fs_parse_result result; + + opt = fs_parse(fc, binderfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_max: + if (result.uint_32 > BINDERFS_MAX_MINOR) + return invalfc(fc, "Bad value for '%s'", param->key); + + ctx->max = result.uint_32; + break; + case Opt_stats_mode: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ctx->stats_mode = result.uint_32; + break; + default: + return invalfc(fc, "Unsupported parameter '%s'", param->key); + } + + return 0; +} + +static int binderfs_fs_context_reconfigure(struct fs_context *fc) +{ + struct binderfs_mount_opts *ctx = fc->fs_private; + struct binderfs_info *info = BINDERFS_SB(fc->root->d_sb); + + if (info->mount_opts.stats_mode != ctx->stats_mode) + return invalfc(fc, "Binderfs stats mode cannot be changed during a remount"); + + info->mount_opts.stats_mode = ctx->stats_mode; + info->mount_opts.max = ctx->max; + return 0; +} + +static int binderfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct binderfs_info *info = BINDERFS_SB(root->d_sb); + + if (info->mount_opts.max <= BINDERFS_MAX_MINOR) + seq_printf(seq, ",max=%d", info->mount_opts.max); + + switch (info->mount_opts.stats_mode) { + case binderfs_stats_mode_unset: + break; + case binderfs_stats_mode_global: + seq_puts(seq, ",stats=global"); + break; + } + + return 0; +} + +static const struct super_operations binderfs_super_ops = { + .evict_inode = binderfs_evict_inode, + .show_options = binderfs_show_options, + .statfs = simple_statfs, +}; + +static inline bool is_binderfs_control_device(const struct dentry *dentry) +{ + struct binderfs_info *info = dentry->d_sb->s_fs_info; + + return info->control_dentry == dentry; +} + +static int binderfs_rename(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (is_binderfs_control_device(old_dentry) || + is_binderfs_control_device(new_dentry)) + return -EPERM; + + return simple_rename(idmap, old_dir, old_dentry, new_dir, + new_dentry, flags); +} + +static int binderfs_unlink(struct inode *dir, struct dentry *dentry) +{ + if (is_binderfs_control_device(dentry)) + return -EPERM; + + return simple_unlink(dir, dentry); +} + +static const struct file_operations binder_ctl_fops = { + .owner = THIS_MODULE, + .open = nonseekable_open, + .unlocked_ioctl = binder_ctl_ioctl, + .compat_ioctl = binder_ctl_ioctl, + .llseek = noop_llseek, +}; + +/** + * binderfs_binder_ctl_create - create a new binder-control device + * @sb: super block of the binderfs mount + * + * This function creates a new binder-control device node in the binderfs mount + * referred to by @sb. + * + * Return: 0 on success, negative errno on failure + */ +static int binderfs_binder_ctl_create(struct super_block *sb) +{ + int minor, ret; + struct dentry *dentry; + struct binder_device *device; + struct inode *inode = NULL; + struct dentry *root = sb->s_root; + struct binderfs_info *info = sb->s_fs_info; +#if defined(CONFIG_IPC_NS) + bool use_reserve = (info->ipc_ns == &init_ipc_ns); +#else + bool use_reserve = true; +#endif + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) + return -ENOMEM; + + /* If we have already created a binder-control node, return. */ + if (info->control_dentry) { + ret = 0; + goto out; + } + + ret = -ENOMEM; + inode = new_inode(sb); + if (!inode) + goto out; + + /* Reserve a new minor number for the new device. */ + mutex_lock(&binderfs_minors_mutex); + minor = ida_alloc_max(&binderfs_minors, + use_reserve ? BINDERFS_MAX_MINOR : + BINDERFS_MAX_MINOR_CAPPED, + GFP_KERNEL); + mutex_unlock(&binderfs_minors_mutex); + if (minor < 0) { + ret = minor; + goto out; + } + + inode->i_ino = SECOND_INODE; + simple_inode_init_ts(inode); + init_special_inode(inode, S_IFCHR | 0600, + MKDEV(MAJOR(binderfs_dev), minor)); + inode->i_fop = &binder_ctl_fops; + inode->i_uid = info->root_uid; + inode->i_gid = info->root_gid; + + device->minor = minor; + device->ctx = NULL; + + dentry = d_alloc_name(root, "binder-control"); + if (!dentry) + goto out; + + inode->i_private = device; + info->control_dentry = dentry; + d_add(dentry, inode); + + return 0; + +out: + kfree(device); + iput(inode); + + return ret; +} + +static const struct inode_operations binderfs_dir_inode_operations = { + .lookup = simple_lookup, + .rename = binderfs_rename, + .unlink = binderfs_unlink, +}; + +static struct inode *binderfs_make_inode(struct super_block *sb, int mode) +{ + struct inode *ret; + + ret = new_inode(sb); + if (ret) { + ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET); + ret->i_mode = mode; + simple_inode_init_ts(ret); + } + return ret; +} + +static struct dentry *binderfs_create_dentry(struct dentry *parent, + const char *name) +{ + struct dentry *dentry; + + dentry = lookup_noperm(&QSTR(name), parent); + if (IS_ERR(dentry)) + return dentry; + + /* Return error if the file/dir already exists. */ + if (d_really_is_positive(dentry)) { + dput(dentry); + return ERR_PTR(-EEXIST); + } + + return dentry; +} + +void rust_binderfs_remove_file(struct dentry *dentry) +{ + struct inode *parent_inode; + + parent_inode = d_inode(dentry->d_parent); + inode_lock(parent_inode); + if (simple_positive(dentry)) { + dget(dentry); + simple_unlink(parent_inode, dentry); + d_delete(dentry); + dput(dentry); + } + inode_unlock(parent_inode); +} + +static struct dentry *rust_binderfs_create_file(struct dentry *parent, const char *name, + const struct file_operations *fops, + void *data) +{ + struct dentry *dentry; + struct inode *new_inode, *parent_inode; + struct super_block *sb; + + parent_inode = d_inode(parent); + inode_lock(parent_inode); + + dentry = binderfs_create_dentry(parent, name); + if (IS_ERR(dentry)) + goto out; + + sb = parent_inode->i_sb; + new_inode = binderfs_make_inode(sb, S_IFREG | 0444); + if (!new_inode) { + dput(dentry); + dentry = ERR_PTR(-ENOMEM); + goto out; + } + + new_inode->i_fop = fops; + new_inode->i_private = data; + d_instantiate(dentry, new_inode); + fsnotify_create(parent_inode, dentry); + +out: + inode_unlock(parent_inode); + return dentry; +} + +struct dentry *rust_binderfs_create_proc_file(struct inode *nodp, int pid) +{ + struct binderfs_info *info = nodp->i_sb->s_fs_info; + struct dentry *dir = info->proc_log_dir; + char strbuf[20 + 1]; + void *data = (void *)(unsigned long) pid; + + if (!dir) + return NULL; + + snprintf(strbuf, sizeof(strbuf), "%u", pid); + return rust_binderfs_create_file(dir, strbuf, &rust_binder_proc_fops, data); +} + +static struct dentry *binderfs_create_dir(struct dentry *parent, + const char *name) +{ + struct dentry *dentry; + struct inode *new_inode, *parent_inode; + struct super_block *sb; + + parent_inode = d_inode(parent); + inode_lock(parent_inode); + + dentry = binderfs_create_dentry(parent, name); + if (IS_ERR(dentry)) + goto out; + + sb = parent_inode->i_sb; + new_inode = binderfs_make_inode(sb, S_IFDIR | 0755); + if (!new_inode) { + dput(dentry); + dentry = ERR_PTR(-ENOMEM); + goto out; + } + + new_inode->i_fop = &simple_dir_operations; + new_inode->i_op = &simple_dir_inode_operations; + + set_nlink(new_inode, 2); + d_instantiate(dentry, new_inode); + inc_nlink(parent_inode); + fsnotify_mkdir(parent_inode, dentry); + +out: + inode_unlock(parent_inode); + return dentry; +} + +static int binder_features_show(struct seq_file *m, void *unused) +{ + bool *feature = m->private; + + seq_printf(m, "%d\n", *feature); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(binder_features); + +static int init_binder_features(struct super_block *sb) +{ + struct dentry *dentry, *dir; + + dir = binderfs_create_dir(sb->s_root, "features"); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + dentry = rust_binderfs_create_file(dir, "oneway_spam_detection", + &binder_features_fops, + &binder_features.oneway_spam_detection); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + dentry = rust_binderfs_create_file(dir, "extended_error", + &binder_features_fops, + &binder_features.extended_error); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + dentry = rust_binderfs_create_file(dir, "freeze_notification", + &binder_features_fops, + &binder_features.freeze_notification); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + return 0; +} + +static int init_binder_logs(struct super_block *sb) +{ + struct dentry *binder_logs_root_dir, *dentry, *proc_log_dir; + struct binderfs_info *info; + int ret = 0; + + binder_logs_root_dir = binderfs_create_dir(sb->s_root, + "binder_logs"); + if (IS_ERR(binder_logs_root_dir)) { + ret = PTR_ERR(binder_logs_root_dir); + goto out; + } + + dentry = rust_binderfs_create_file(binder_logs_root_dir, "stats", + &rust_binder_stats_fops, NULL); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + dentry = rust_binderfs_create_file(binder_logs_root_dir, "state", + &rust_binder_state_fops, NULL); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + dentry = rust_binderfs_create_file(binder_logs_root_dir, "transactions", + &rust_binder_transactions_fops, NULL); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + proc_log_dir = binderfs_create_dir(binder_logs_root_dir, "proc"); + if (IS_ERR(proc_log_dir)) { + ret = PTR_ERR(proc_log_dir); + goto out; + } + info = sb->s_fs_info; + info->proc_log_dir = proc_log_dir; + +out: + return ret; +} + +static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc) +{ + int ret; + struct binderfs_info *info; + struct binderfs_mount_opts *ctx = fc->fs_private; + struct inode *inode = NULL; + struct binderfs_device device_info = {}; + const char *name; + size_t len; + + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + + /* + * The binderfs filesystem can be mounted by userns root in a + * non-initial userns. By default such mounts have the SB_I_NODEV flag + * set in s_iflags to prevent security issues where userns root can + * just create random device nodes via mknod() since it owns the + * filesystem mount. But binderfs does not allow to create any files + * including devices nodes. The only way to create binder devices nodes + * is through the binder-control device which userns root is explicitly + * allowed to do. So removing the SB_I_NODEV flag from s_iflags is both + * necessary and safe. + */ + sb->s_iflags &= ~SB_I_NODEV; + sb->s_iflags |= SB_I_NOEXEC; + sb->s_magic = RUST_BINDERFS_SUPER_MAGIC; + sb->s_op = &binderfs_super_ops; + sb->s_time_gran = 1; + + sb->s_fs_info = kzalloc(sizeof(struct binderfs_info), GFP_KERNEL); + if (!sb->s_fs_info) + return -ENOMEM; + info = sb->s_fs_info; + + info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); + + info->root_gid = make_kgid(sb->s_user_ns, 0); + if (!gid_valid(info->root_gid)) + info->root_gid = GLOBAL_ROOT_GID; + info->root_uid = make_kuid(sb->s_user_ns, 0); + if (!uid_valid(info->root_uid)) + info->root_uid = GLOBAL_ROOT_UID; + info->mount_opts.max = ctx->max; + info->mount_opts.stats_mode = ctx->stats_mode; + + inode = new_inode(sb); + if (!inode) + return -ENOMEM; + + inode->i_ino = FIRST_INODE; + inode->i_fop = &simple_dir_operations; + inode->i_mode = S_IFDIR | 0755; + simple_inode_init_ts(inode); + inode->i_op = &binderfs_dir_inode_operations; + set_nlink(inode, 2); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + ret = binderfs_binder_ctl_create(sb); + if (ret) + return ret; + + name = rust_binder_devices_param; + for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) { + strscpy(device_info.name, name, len + 1); + ret = binderfs_binder_device_create(inode, NULL, &device_info); + if (ret) + return ret; + name += len; + if (*name == ',') + name++; + } + + ret = init_binder_features(sb); + if (ret) + return ret; + + if (info->mount_opts.stats_mode == binderfs_stats_mode_global) + return init_binder_logs(sb); + + return 0; +} + +static int binderfs_fs_context_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, binderfs_fill_super); +} + +static void binderfs_fs_context_free(struct fs_context *fc) +{ + struct binderfs_mount_opts *ctx = fc->fs_private; + + kfree(ctx); +} + +static const struct fs_context_operations binderfs_fs_context_ops = { + .free = binderfs_fs_context_free, + .get_tree = binderfs_fs_context_get_tree, + .parse_param = binderfs_fs_context_parse_param, + .reconfigure = binderfs_fs_context_reconfigure, +}; + +static int binderfs_init_fs_context(struct fs_context *fc) +{ + struct binderfs_mount_opts *ctx; + + ctx = kzalloc(sizeof(struct binderfs_mount_opts), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max = BINDERFS_MAX_MINOR; + ctx->stats_mode = binderfs_stats_mode_unset; + + fc->fs_private = ctx; + fc->ops = &binderfs_fs_context_ops; + + return 0; +} + +static void binderfs_kill_super(struct super_block *sb) +{ + struct binderfs_info *info = sb->s_fs_info; + + /* + * During inode eviction struct binderfs_info is needed. + * So first wipe the super_block then free struct binderfs_info. + */ + kill_litter_super(sb); + + if (info && info->ipc_ns) + put_ipc_ns(info->ipc_ns); + + kfree(info); +} + +static struct file_system_type binder_fs_type = { + .name = "binder", + .init_fs_context = binderfs_init_fs_context, + .parameters = binderfs_fs_parameters, + .kill_sb = binderfs_kill_super, + .fs_flags = FS_USERNS_MOUNT, +}; + +int init_rust_binderfs(void) +{ + int ret; + const char *name; + size_t len; + + /* Verify that the default binderfs device names are valid. */ + name = rust_binder_devices_param; + for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) { + if (len > BINDERFS_MAX_NAME) + return -E2BIG; + name += len; + if (*name == ',') + name++; + } + + /* Allocate new major number for binderfs. */ + ret = alloc_chrdev_region(&binderfs_dev, 0, BINDERFS_MAX_MINOR, + "rust_binder"); + if (ret) + return ret; + + ret = register_filesystem(&binder_fs_type); + if (ret) { + unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR); + return ret; + } + + return ret; +} diff --git a/drivers/android/binder/stats.rs b/drivers/android/binder/stats.rs new file mode 100644 index 000000000000..a83ec111d2cb --- /dev/null +++ b/drivers/android/binder/stats.rs @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Keep track of statistics for binder_logs. + +use crate::defs::*; +use core::sync::atomic::{AtomicU32, Ordering::Relaxed}; +use kernel::{ioctl::_IOC_NR, seq_file::SeqFile, seq_print}; + +const BC_COUNT: usize = _IOC_NR(BC_REPLY_SG) as usize + 1; +const BR_COUNT: usize = _IOC_NR(BR_TRANSACTION_PENDING_FROZEN) as usize + 1; + +pub(crate) static GLOBAL_STATS: BinderStats = BinderStats::new(); + +pub(crate) struct BinderStats { + bc: [AtomicU32; BC_COUNT], + br: [AtomicU32; BR_COUNT], +} + +impl BinderStats { + pub(crate) const fn new() -> Self { + #[expect(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU32 = AtomicU32::new(0); + + Self { + bc: [ZERO; BC_COUNT], + br: [ZERO; BR_COUNT], + } + } + + pub(crate) fn inc_bc(&self, bc: u32) { + let idx = _IOC_NR(bc) as usize; + if let Some(bc_ref) = self.bc.get(idx) { + bc_ref.fetch_add(1, Relaxed); + } + } + + pub(crate) fn inc_br(&self, br: u32) { + let idx = _IOC_NR(br) as usize; + if let Some(br_ref) = self.br.get(idx) { + br_ref.fetch_add(1, Relaxed); + } + } + + pub(crate) fn debug_print(&self, prefix: &str, m: &SeqFile) { + for (i, cnt) in self.bc.iter().enumerate() { + let cnt = cnt.load(Relaxed); + if cnt > 0 { + seq_print!(m, "{}{}: {}\n", prefix, command_string(i), cnt); + } + } + for (i, cnt) in self.br.iter().enumerate() { + let cnt = cnt.load(Relaxed); + if cnt > 0 { + seq_print!(m, "{}{}: {}\n", prefix, return_string(i), cnt); + } + } + } +} + +mod strings { + use core::str::from_utf8_unchecked; + use kernel::str::CStr; + + extern "C" { + static binder_command_strings: [*const u8; super::BC_COUNT]; + static binder_return_strings: [*const u8; super::BR_COUNT]; + } + + pub(super) fn command_string(i: usize) -> &'static str { + // SAFETY: Accessing `binder_command_strings` is always safe. + let c_str_ptr = unsafe { binder_command_strings[i] }; + // SAFETY: The `binder_command_strings` array only contains nul-terminated strings. + let bytes = unsafe { CStr::from_char_ptr(c_str_ptr) }.as_bytes(); + // SAFETY: The `binder_command_strings` array only contains strings with ascii-chars. + unsafe { from_utf8_unchecked(bytes) } + } + + pub(super) fn return_string(i: usize) -> &'static str { + // SAFETY: Accessing `binder_return_strings` is always safe. + let c_str_ptr = unsafe { binder_return_strings[i] }; + // SAFETY: The `binder_command_strings` array only contains nul-terminated strings. + let bytes = unsafe { CStr::from_char_ptr(c_str_ptr) }.as_bytes(); + // SAFETY: The `binder_command_strings` array only contains strings with ascii-chars. + unsafe { from_utf8_unchecked(bytes) } + } +} +use strings::{command_string, return_string}; diff --git a/drivers/android/binder/thread.rs b/drivers/android/binder/thread.rs new file mode 100644 index 000000000000..7e34ccd394f8 --- /dev/null +++ b/drivers/android/binder/thread.rs @@ -0,0 +1,1596 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! This module defines the `Thread` type, which represents a userspace thread that is using +//! binder. +//! +//! The `Process` object stores all of the threads in an rb tree. + +use kernel::{ + bindings, + fs::{File, LocalFile}, + list::{AtomicTracker, List, ListArc, ListLinks, TryNewListArc}, + prelude::*, + security, + seq_file::SeqFile, + seq_print, + sync::poll::{PollCondVar, PollTable}, + sync::{Arc, SpinLock}, + task::Task, + types::ARef, + uaccess::UserSlice, + uapi, +}; + +use crate::{ + allocation::{Allocation, AllocationView, BinderObject, BinderObjectRef, NewAllocation}, + defs::*, + error::BinderResult, + process::{GetWorkOrRegister, Process}, + ptr_align, + stats::GLOBAL_STATS, + transaction::Transaction, + BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverCode, DeliverToRead, +}; + +use core::{ + mem::size_of, + sync::atomic::{AtomicU32, Ordering}, +}; + +/// Stores the layout of the scatter-gather entries. This is used during the `translate_objects` +/// call and is discarded when it returns. +struct ScatterGatherState { + /// A struct that tracks the amount of unused buffer space. + unused_buffer_space: UnusedBufferSpace, + /// Scatter-gather entries to copy. + sg_entries: KVec, + /// Indexes into `sg_entries` corresponding to the last binder_buffer_object that + /// was processed and all of its ancestors. The array is in sorted order. + ancestors: KVec, +} + +/// This entry specifies an additional buffer that should be copied using the scatter-gather +/// mechanism. +struct ScatterGatherEntry { + /// The index in the offset array of the BINDER_TYPE_PTR that this entry originates from. + obj_index: usize, + /// Offset in target buffer. + offset: usize, + /// User address in source buffer. + sender_uaddr: usize, + /// Number of bytes to copy. + length: usize, + /// The minimum offset of the next fixup in this buffer. + fixup_min_offset: usize, + /// The offsets within this buffer that contain pointers which should be translated. + pointer_fixups: KVec, +} + +/// This entry specifies that a fixup should happen at `target_offset` of the +/// buffer. If `skip` is nonzero, then the fixup is a `binder_fd_array_object` +/// and is applied later. Otherwise if `skip` is zero, then the size of the +/// fixup is `sizeof::()` and `pointer_value` is written to the buffer. +struct PointerFixupEntry { + /// The number of bytes to skip, or zero for a `binder_buffer_object` fixup. + skip: usize, + /// The translated pointer to write when `skip` is zero. + pointer_value: u64, + /// The offset at which the value should be written. The offset is relative + /// to the original buffer. + target_offset: usize, +} + +/// Return type of `apply_and_validate_fixup_in_parent`. +struct ParentFixupInfo { + /// The index of the parent buffer in `sg_entries`. + parent_sg_index: usize, + /// The number of ancestors of the buffer. + /// + /// The buffer is considered an ancestor of itself, so this is always at + /// least one. + num_ancestors: usize, + /// New value of `fixup_min_offset` if this fixup is applied. + new_min_offset: usize, + /// The offset of the fixup in the target buffer. + target_offset: usize, +} + +impl ScatterGatherState { + /// Called when a `binder_buffer_object` or `binder_fd_array_object` tries + /// to access a region in its parent buffer. These accesses have various + /// restrictions, which this method verifies. + /// + /// The `parent_offset` and `length` arguments describe the offset and + /// length of the access in the parent buffer. + /// + /// # Detailed restrictions + /// + /// Obviously the fixup must be in-bounds for the parent buffer. + /// + /// For safety reasons, we only allow fixups inside a buffer to happen + /// at increasing offsets; additionally, we only allow fixup on the last + /// buffer object that was verified, or one of its parents. + /// + /// Example of what is allowed: + /// + /// A + /// B (parent = A, offset = 0) + /// C (parent = A, offset = 16) + /// D (parent = C, offset = 0) + /// E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset) + /// + /// Examples of what is not allowed: + /// + /// Decreasing offsets within the same parent: + /// A + /// C (parent = A, offset = 16) + /// B (parent = A, offset = 0) // decreasing offset within A + /// + /// Arcerring to a parent that wasn't the last object or any of its parents: + /// A + /// B (parent = A, offset = 0) + /// C (parent = A, offset = 0) + /// C (parent = A, offset = 16) + /// D (parent = B, offset = 0) // B is not A or any of A's parents + fn validate_parent_fixup( + &self, + parent: usize, + parent_offset: usize, + length: usize, + ) -> Result { + // Using `position` would also be correct, but `rposition` avoids + // quadratic running times. + let ancestors_i = self + .ancestors + .iter() + .copied() + .rposition(|sg_idx| self.sg_entries[sg_idx].obj_index == parent) + .ok_or(EINVAL)?; + let sg_idx = self.ancestors[ancestors_i]; + let sg_entry = match self.sg_entries.get(sg_idx) { + Some(sg_entry) => sg_entry, + None => { + pr_err!( + "self.ancestors[{}] is {}, but self.sg_entries.len() is {}", + ancestors_i, + sg_idx, + self.sg_entries.len() + ); + return Err(EINVAL); + } + }; + if sg_entry.fixup_min_offset > parent_offset { + pr_warn!( + "validate_parent_fixup: fixup_min_offset={}, parent_offset={}", + sg_entry.fixup_min_offset, + parent_offset + ); + return Err(EINVAL); + } + let new_min_offset = parent_offset.checked_add(length).ok_or(EINVAL)?; + if new_min_offset > sg_entry.length { + pr_warn!( + "validate_parent_fixup: new_min_offset={}, sg_entry.length={}", + new_min_offset, + sg_entry.length + ); + return Err(EINVAL); + } + let target_offset = sg_entry.offset.checked_add(parent_offset).ok_or(EINVAL)?; + // The `ancestors_i + 1` operation can't overflow since the output of the addition is at + // most `self.ancestors.len()`, which also fits in a usize. + Ok(ParentFixupInfo { + parent_sg_index: sg_idx, + num_ancestors: ancestors_i + 1, + new_min_offset, + target_offset, + }) + } +} + +/// Keeps track of how much unused buffer space is left. The initial amount is the number of bytes +/// requested by the user using the `buffers_size` field of `binder_transaction_data_sg`. Each time +/// we translate an object of type `BINDER_TYPE_PTR`, some of the unused buffer space is consumed. +struct UnusedBufferSpace { + /// The start of the remaining space. + offset: usize, + /// The end of the remaining space. + limit: usize, +} +impl UnusedBufferSpace { + /// Claim the next `size` bytes from the unused buffer space. The offset for the claimed chunk + /// into the buffer is returned. + fn claim_next(&mut self, size: usize) -> Result { + // We require every chunk to be aligned. + let size = ptr_align(size).ok_or(EINVAL)?; + let new_offset = self.offset.checked_add(size).ok_or(EINVAL)?; + + if new_offset <= self.limit { + let offset = self.offset; + self.offset = new_offset; + Ok(offset) + } else { + Err(EINVAL) + } + } +} + +pub(crate) enum PushWorkRes { + Ok, + FailedDead(DLArc), +} + +impl PushWorkRes { + fn is_ok(&self) -> bool { + match self { + PushWorkRes::Ok => true, + PushWorkRes::FailedDead(_) => false, + } + } +} + +/// The fields of `Thread` protected by the spinlock. +struct InnerThread { + /// Determines the looper state of the thread. It is a bit-wise combination of the constants + /// prefixed with `LOOPER_`. + looper_flags: u32, + + /// Determines whether the looper should return. + looper_need_return: bool, + + /// Determines if thread is dead. + is_dead: bool, + + /// Work item used to deliver error codes to the thread that started a transaction. Stored here + /// so that it can be reused. + reply_work: DArc, + + /// Work item used to deliver error codes to the current thread. Stored here so that it can be + /// reused. + return_work: DArc, + + /// Determines whether the work list below should be processed. When set to false, `work_list` + /// is treated as if it were empty. + process_work_list: bool, + /// List of work items to deliver to userspace. + work_list: List>, + current_transaction: Option>, + + /// Extended error information for this thread. + extended_error: ExtendedError, +} + +const LOOPER_REGISTERED: u32 = 0x01; +const LOOPER_ENTERED: u32 = 0x02; +const LOOPER_EXITED: u32 = 0x04; +const LOOPER_INVALID: u32 = 0x08; +const LOOPER_WAITING: u32 = 0x10; +const LOOPER_WAITING_PROC: u32 = 0x20; +const LOOPER_POLL: u32 = 0x40; + +impl InnerThread { + fn new() -> Result { + fn next_err_id() -> u32 { + static EE_ID: AtomicU32 = AtomicU32::new(0); + EE_ID.fetch_add(1, Ordering::Relaxed) + } + + Ok(Self { + looper_flags: 0, + looper_need_return: false, + is_dead: false, + process_work_list: false, + reply_work: ThreadError::try_new()?, + return_work: ThreadError::try_new()?, + work_list: List::new(), + current_transaction: None, + extended_error: ExtendedError::new(next_err_id(), BR_OK, 0), + }) + } + + fn pop_work(&mut self) -> Option> { + if !self.process_work_list { + return None; + } + + let ret = self.work_list.pop_front(); + self.process_work_list = !self.work_list.is_empty(); + ret + } + + fn push_work(&mut self, work: DLArc) -> PushWorkRes { + if self.is_dead { + PushWorkRes::FailedDead(work) + } else { + self.work_list.push_back(work); + self.process_work_list = true; + PushWorkRes::Ok + } + } + + fn push_reply_work(&mut self, code: u32) { + if let Ok(work) = ListArc::try_from_arc(self.reply_work.clone()) { + work.set_error_code(code); + self.push_work(work); + } else { + pr_warn!("Thread reply work is already in use."); + } + } + + fn push_return_work(&mut self, reply: u32) { + if let Ok(work) = ListArc::try_from_arc(self.return_work.clone()) { + work.set_error_code(reply); + self.push_work(work); + } else { + pr_warn!("Thread return work is already in use."); + } + } + + /// Used to push work items that do not need to be processed immediately and can wait until the + /// thread gets another work item. + fn push_work_deferred(&mut self, work: DLArc) { + self.work_list.push_back(work); + } + + /// Fetches the transaction this thread can reply to. If the thread has a pending transaction + /// (that it could respond to) but it has also issued a transaction, it must first wait for the + /// previously-issued transaction to complete. + /// + /// The `thread` parameter should be the thread containing this `ThreadInner`. + fn pop_transaction_to_reply(&mut self, thread: &Thread) -> Result> { + let transaction = self.current_transaction.take().ok_or(EINVAL)?; + if core::ptr::eq(thread, transaction.from.as_ref()) { + self.current_transaction = Some(transaction); + return Err(EINVAL); + } + // Find a new current transaction for this thread. + self.current_transaction = transaction.find_from(thread).cloned(); + Ok(transaction) + } + + fn pop_transaction_replied(&mut self, transaction: &DArc) -> bool { + match self.current_transaction.take() { + None => false, + Some(old) => { + if !Arc::ptr_eq(transaction, &old) { + self.current_transaction = Some(old); + return false; + } + self.current_transaction = old.clone_next(); + true + } + } + } + + fn looper_enter(&mut self) { + self.looper_flags |= LOOPER_ENTERED; + if self.looper_flags & LOOPER_REGISTERED != 0 { + self.looper_flags |= LOOPER_INVALID; + } + } + + fn looper_register(&mut self, valid: bool) { + self.looper_flags |= LOOPER_REGISTERED; + if !valid || self.looper_flags & LOOPER_ENTERED != 0 { + self.looper_flags |= LOOPER_INVALID; + } + } + + fn looper_exit(&mut self) { + self.looper_flags |= LOOPER_EXITED; + } + + /// Determines whether the thread is part of a pool, i.e., if it is a looper. + fn is_looper(&self) -> bool { + self.looper_flags & (LOOPER_ENTERED | LOOPER_REGISTERED) != 0 + } + + /// Determines whether the thread should attempt to fetch work items from the process queue. + /// This is generally case when the thread is registered as a looper and not part of a + /// transaction stack. But if there is local work, we want to return to userspace before we + /// deliver any remote work. + fn should_use_process_work_queue(&self) -> bool { + self.current_transaction.is_none() && !self.process_work_list && self.is_looper() + } + + fn poll(&mut self) -> u32 { + self.looper_flags |= LOOPER_POLL; + if self.process_work_list || self.looper_need_return { + bindings::POLLIN + } else { + 0 + } + } +} + +/// This represents a thread that's used with binder. +#[pin_data] +pub(crate) struct Thread { + pub(crate) id: i32, + pub(crate) process: Arc, + pub(crate) task: ARef, + #[pin] + inner: SpinLock, + #[pin] + work_condvar: PollCondVar, + /// Used to insert this thread into the process' `ready_threads` list. + /// + /// INVARIANT: May never be used for any other list than the `self.process.ready_threads`. + #[pin] + links: ListLinks, + #[pin] + links_track: AtomicTracker, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Thread { + tracked_by links_track: AtomicTracker; + } +} +kernel::list::impl_list_item! { + impl ListItem<0> for Thread { + using ListLinks { self.links }; + } +} + +impl Thread { + pub(crate) fn new(id: i32, process: Arc) -> Result> { + let inner = InnerThread::new()?; + + Arc::pin_init( + try_pin_init!(Thread { + id, + process, + task: ARef::from(&**kernel::current!()), + inner <- kernel::new_spinlock!(inner, "Thread::inner"), + work_condvar <- kernel::new_poll_condvar!("Thread::work_condvar"), + links <- ListLinks::new(), + links_track <- AtomicTracker::new(), + }), + GFP_KERNEL, + ) + } + + #[inline(never)] + pub(crate) fn debug_print(self: &Arc, m: &SeqFile, print_all: bool) -> Result<()> { + let inner = self.inner.lock(); + + if print_all || inner.current_transaction.is_some() || !inner.work_list.is_empty() { + seq_print!( + m, + " thread {}: l {:02x} need_return {}\n", + self.id, + inner.looper_flags, + inner.looper_need_return, + ); + } + + let mut t_opt = inner.current_transaction.as_ref(); + while let Some(t) = t_opt { + if Arc::ptr_eq(&t.from, self) { + t.debug_print_inner(m, " outgoing transaction "); + t_opt = t.from_parent.as_ref(); + } else if Arc::ptr_eq(&t.to, &self.process) { + t.debug_print_inner(m, " incoming transaction "); + t_opt = t.find_from(self); + } else { + t.debug_print_inner(m, " bad transaction "); + t_opt = None; + } + } + + for work in &inner.work_list { + work.debug_print(m, " ", " pending transaction ")?; + } + Ok(()) + } + + pub(crate) fn get_extended_error(&self, data: UserSlice) -> Result { + let mut writer = data.writer(); + let ee = self.inner.lock().extended_error; + writer.write(&ee)?; + Ok(()) + } + + pub(crate) fn set_current_transaction(&self, transaction: DArc) { + self.inner.lock().current_transaction = Some(transaction); + } + + pub(crate) fn has_current_transaction(&self) -> bool { + self.inner.lock().current_transaction.is_some() + } + + /// Attempts to fetch a work item from the thread-local queue. The behaviour if the queue is + /// empty depends on `wait`: if it is true, the function waits for some work to be queued (or a + /// signal); otherwise it returns indicating that none is available. + fn get_work_local(self: &Arc, wait: bool) -> Result>> { + { + let mut inner = self.inner.lock(); + if inner.looper_need_return { + return Ok(inner.pop_work()); + } + } + + // Try once if the caller does not want to wait. + if !wait { + return self.inner.lock().pop_work().ok_or(EAGAIN).map(Some); + } + + // Loop waiting only on the local queue (i.e., not registering with the process queue). + let mut inner = self.inner.lock(); + loop { + if let Some(work) = inner.pop_work() { + return Ok(Some(work)); + } + + inner.looper_flags |= LOOPER_WAITING; + let signal_pending = self.work_condvar.wait_interruptible_freezable(&mut inner); + inner.looper_flags &= !LOOPER_WAITING; + + if signal_pending { + return Err(EINTR); + } + if inner.looper_need_return { + return Ok(None); + } + } + } + + /// Attempts to fetch a work item from the thread-local queue, falling back to the process-wide + /// queue if none is available locally. + /// + /// This must only be called when the thread is not participating in a transaction chain. If it + /// is, the local version (`get_work_local`) should be used instead. + fn get_work(self: &Arc, wait: bool) -> Result>> { + // Try to get work from the thread's work queue, using only a local lock. + { + let mut inner = self.inner.lock(); + if let Some(work) = inner.pop_work() { + return Ok(Some(work)); + } + if inner.looper_need_return { + drop(inner); + return Ok(self.process.get_work()); + } + } + + // If the caller doesn't want to wait, try to grab work from the process queue. + // + // We know nothing will have been queued directly to the thread queue because it is not in + // a transaction and it is not in the process' ready list. + if !wait { + return self.process.get_work().ok_or(EAGAIN).map(Some); + } + + // Get work from the process queue. If none is available, atomically register as ready. + let reg = match self.process.get_work_or_register(self) { + GetWorkOrRegister::Work(work) => return Ok(Some(work)), + GetWorkOrRegister::Register(reg) => reg, + }; + + let mut inner = self.inner.lock(); + loop { + if let Some(work) = inner.pop_work() { + return Ok(Some(work)); + } + + inner.looper_flags |= LOOPER_WAITING | LOOPER_WAITING_PROC; + let signal_pending = self.work_condvar.wait_interruptible_freezable(&mut inner); + inner.looper_flags &= !(LOOPER_WAITING | LOOPER_WAITING_PROC); + + if signal_pending || inner.looper_need_return { + // We need to return now. We need to pull the thread off the list of ready threads + // (by dropping `reg`), then check the state again after it's off the list to + // ensure that something was not queued in the meantime. If something has been + // queued, we just return it (instead of the error). + drop(inner); + drop(reg); + + let res = match self.inner.lock().pop_work() { + Some(work) => Ok(Some(work)), + None if signal_pending => Err(EINTR), + None => Ok(None), + }; + return res; + } + } + } + + /// Push the provided work item to be delivered to user space via this thread. + /// + /// Returns whether the item was successfully pushed. This can only fail if the thread is dead. + pub(crate) fn push_work(&self, work: DLArc) -> PushWorkRes { + let sync = work.should_sync_wakeup(); + + let res = self.inner.lock().push_work(work); + + if res.is_ok() { + if sync { + self.work_condvar.notify_sync(); + } else { + self.work_condvar.notify_one(); + } + } + + res + } + + /// Attempts to push to given work item to the thread if it's a looper thread (i.e., if it's + /// part of a thread pool) and is alive. Otherwise, push the work item to the process instead. + pub(crate) fn push_work_if_looper(&self, work: DLArc) -> BinderResult { + let mut inner = self.inner.lock(); + if inner.is_looper() && !inner.is_dead { + inner.push_work(work); + Ok(()) + } else { + drop(inner); + self.process.push_work(work) + } + } + + pub(crate) fn push_work_deferred(&self, work: DLArc) { + self.inner.lock().push_work_deferred(work); + } + + pub(crate) fn push_return_work(&self, reply: u32) { + self.inner.lock().push_return_work(reply); + } + + fn translate_object( + &self, + obj_index: usize, + offset: usize, + object: BinderObjectRef<'_>, + view: &mut AllocationView<'_>, + allow_fds: bool, + sg_state: &mut ScatterGatherState, + ) -> BinderResult { + match object { + BinderObjectRef::Binder(obj) => { + let strong = obj.hdr.type_ == BINDER_TYPE_BINDER; + // SAFETY: `binder` is a `binder_uintptr_t`; any bit pattern is a valid + // representation. + let ptr = unsafe { obj.__bindgen_anon_1.binder } as _; + let cookie = obj.cookie as _; + let flags = obj.flags as _; + let node = self + .process + .as_arc_borrow() + .get_node(ptr, cookie, flags, strong, self)?; + security::binder_transfer_binder(&self.process.cred, &view.alloc.process.cred)?; + view.transfer_binder_object(offset, obj, strong, node)?; + } + BinderObjectRef::Handle(obj) => { + let strong = obj.hdr.type_ == BINDER_TYPE_HANDLE; + // SAFETY: `handle` is a `u32`; any bit pattern is a valid representation. + let handle = unsafe { obj.__bindgen_anon_1.handle } as _; + let node = self.process.get_node_from_handle(handle, strong)?; + security::binder_transfer_binder(&self.process.cred, &view.alloc.process.cred)?; + view.transfer_binder_object(offset, obj, strong, node)?; + } + BinderObjectRef::Fd(obj) => { + if !allow_fds { + return Err(EPERM.into()); + } + + // SAFETY: `fd` is a `u32`; any bit pattern is a valid representation. + let fd = unsafe { obj.__bindgen_anon_1.fd }; + let file = LocalFile::fget(fd)?; + // SAFETY: The binder driver never calls `fdget_pos` and this code runs from an + // ioctl, so there are no active calls to `fdget_pos` on this thread. + let file = unsafe { LocalFile::assume_no_fdget_pos(file) }; + security::binder_transfer_file( + &self.process.cred, + &view.alloc.process.cred, + &file, + )?; + + let mut obj_write = BinderFdObject::default(); + obj_write.hdr.type_ = BINDER_TYPE_FD; + // This will be overwritten with the actual fd when the transaction is received. + obj_write.__bindgen_anon_1.fd = u32::MAX; + obj_write.cookie = obj.cookie; + view.write::(offset, &obj_write)?; + + const FD_FIELD_OFFSET: usize = + core::mem::offset_of!(uapi::binder_fd_object, __bindgen_anon_1.fd); + + let field_offset = offset + FD_FIELD_OFFSET; + + view.alloc.info_add_fd(file, field_offset, false)?; + } + BinderObjectRef::Ptr(obj) => { + let obj_length = obj.length.try_into().map_err(|_| EINVAL)?; + let alloc_offset = match sg_state.unused_buffer_space.claim_next(obj_length) { + Ok(alloc_offset) => alloc_offset, + Err(err) => { + pr_warn!( + "Failed to claim space for a BINDER_TYPE_PTR. (offset: {}, limit: {}, size: {})", + sg_state.unused_buffer_space.offset, + sg_state.unused_buffer_space.limit, + obj_length, + ); + return Err(err.into()); + } + }; + + let sg_state_idx = sg_state.sg_entries.len(); + sg_state.sg_entries.push( + ScatterGatherEntry { + obj_index, + offset: alloc_offset, + sender_uaddr: obj.buffer as _, + length: obj_length, + pointer_fixups: KVec::new(), + fixup_min_offset: 0, + }, + GFP_KERNEL, + )?; + + let buffer_ptr_in_user_space = (view.alloc.ptr + alloc_offset) as u64; + + if obj.flags & uapi::BINDER_BUFFER_FLAG_HAS_PARENT == 0 { + sg_state.ancestors.clear(); + sg_state.ancestors.push(sg_state_idx, GFP_KERNEL)?; + } else { + // Another buffer also has a pointer to this buffer, and we need to fixup that + // pointer too. + + let parent_index = usize::try_from(obj.parent).map_err(|_| EINVAL)?; + let parent_offset = usize::try_from(obj.parent_offset).map_err(|_| EINVAL)?; + + let info = sg_state.validate_parent_fixup( + parent_index, + parent_offset, + size_of::(), + )?; + + sg_state.ancestors.truncate(info.num_ancestors); + sg_state.ancestors.push(sg_state_idx, GFP_KERNEL)?; + + let parent_entry = match sg_state.sg_entries.get_mut(info.parent_sg_index) { + Some(parent_entry) => parent_entry, + None => { + pr_err!( + "validate_parent_fixup returned index out of bounds for sg.entries" + ); + return Err(EINVAL.into()); + } + }; + + parent_entry.fixup_min_offset = info.new_min_offset; + parent_entry.pointer_fixups.push( + PointerFixupEntry { + skip: 0, + pointer_value: buffer_ptr_in_user_space, + target_offset: info.target_offset, + }, + GFP_KERNEL, + )?; + } + + let mut obj_write = BinderBufferObject::default(); + obj_write.hdr.type_ = BINDER_TYPE_PTR; + obj_write.flags = obj.flags; + obj_write.buffer = buffer_ptr_in_user_space; + obj_write.length = obj.length; + obj_write.parent = obj.parent; + obj_write.parent_offset = obj.parent_offset; + view.write::(offset, &obj_write)?; + } + BinderObjectRef::Fda(obj) => { + if !allow_fds { + return Err(EPERM.into()); + } + let parent_index = usize::try_from(obj.parent).map_err(|_| EINVAL)?; + let parent_offset = usize::try_from(obj.parent_offset).map_err(|_| EINVAL)?; + let num_fds = usize::try_from(obj.num_fds).map_err(|_| EINVAL)?; + let fds_len = num_fds.checked_mul(size_of::()).ok_or(EINVAL)?; + + let info = sg_state.validate_parent_fixup(parent_index, parent_offset, fds_len)?; + view.alloc.info_add_fd_reserve(num_fds)?; + + sg_state.ancestors.truncate(info.num_ancestors); + let parent_entry = match sg_state.sg_entries.get_mut(info.parent_sg_index) { + Some(parent_entry) => parent_entry, + None => { + pr_err!( + "validate_parent_fixup returned index out of bounds for sg.entries" + ); + return Err(EINVAL.into()); + } + }; + + parent_entry.fixup_min_offset = info.new_min_offset; + parent_entry + .pointer_fixups + .push( + PointerFixupEntry { + skip: fds_len, + pointer_value: 0, + target_offset: info.target_offset, + }, + GFP_KERNEL, + ) + .map_err(|_| ENOMEM)?; + + let fda_uaddr = parent_entry + .sender_uaddr + .checked_add(parent_offset) + .ok_or(EINVAL)?; + let mut fda_bytes = KVec::new(); + UserSlice::new(UserPtr::from_addr(fda_uaddr as _), fds_len) + .read_all(&mut fda_bytes, GFP_KERNEL)?; + + if fds_len != fda_bytes.len() { + pr_err!("UserSlice::read_all returned wrong length in BINDER_TYPE_FDA"); + return Err(EINVAL.into()); + } + + for i in (0..fds_len).step_by(size_of::()) { + let fd = { + let mut fd_bytes = [0u8; size_of::()]; + fd_bytes.copy_from_slice(&fda_bytes[i..i + size_of::()]); + u32::from_ne_bytes(fd_bytes) + }; + + let file = LocalFile::fget(fd)?; + // SAFETY: The binder driver never calls `fdget_pos` and this code runs from an + // ioctl, so there are no active calls to `fdget_pos` on this thread. + let file = unsafe { LocalFile::assume_no_fdget_pos(file) }; + security::binder_transfer_file( + &self.process.cred, + &view.alloc.process.cred, + &file, + )?; + + // The `validate_parent_fixup` call ensuers that this addition will not + // overflow. + view.alloc.info_add_fd(file, info.target_offset + i, true)?; + } + drop(fda_bytes); + + let mut obj_write = BinderFdArrayObject::default(); + obj_write.hdr.type_ = BINDER_TYPE_FDA; + obj_write.num_fds = obj.num_fds; + obj_write.parent = obj.parent; + obj_write.parent_offset = obj.parent_offset; + view.write::(offset, &obj_write)?; + } + } + Ok(()) + } + + fn apply_sg(&self, alloc: &mut Allocation, sg_state: &mut ScatterGatherState) -> BinderResult { + for sg_entry in &mut sg_state.sg_entries { + let mut end_of_previous_fixup = sg_entry.offset; + let offset_end = sg_entry.offset.checked_add(sg_entry.length).ok_or(EINVAL)?; + + let mut reader = + UserSlice::new(UserPtr::from_addr(sg_entry.sender_uaddr), sg_entry.length).reader(); + for fixup in &mut sg_entry.pointer_fixups { + let fixup_len = if fixup.skip == 0 { + size_of::() + } else { + fixup.skip + }; + + let target_offset_end = fixup.target_offset.checked_add(fixup_len).ok_or(EINVAL)?; + if fixup.target_offset < end_of_previous_fixup || offset_end < target_offset_end { + pr_warn!( + "Fixups oob {} {} {} {}", + fixup.target_offset, + end_of_previous_fixup, + offset_end, + target_offset_end + ); + return Err(EINVAL.into()); + } + + let copy_off = end_of_previous_fixup; + let copy_len = fixup.target_offset - end_of_previous_fixup; + if let Err(err) = alloc.copy_into(&mut reader, copy_off, copy_len) { + pr_warn!("Failed copying into alloc: {:?}", err); + return Err(err.into()); + } + if fixup.skip == 0 { + let res = alloc.write::(fixup.target_offset, &fixup.pointer_value); + if let Err(err) = res { + pr_warn!("Failed copying ptr into alloc: {:?}", err); + return Err(err.into()); + } + } + if let Err(err) = reader.skip(fixup_len) { + pr_warn!("Failed skipping {} from reader: {:?}", fixup_len, err); + return Err(err.into()); + } + end_of_previous_fixup = target_offset_end; + } + let copy_off = end_of_previous_fixup; + let copy_len = offset_end - end_of_previous_fixup; + if let Err(err) = alloc.copy_into(&mut reader, copy_off, copy_len) { + pr_warn!("Failed copying remainder into alloc: {:?}", err); + return Err(err.into()); + } + } + Ok(()) + } + + /// This method copies the payload of a transaction into the target process. + /// + /// The resulting payload will have several different components, which will be stored next to + /// each other in the allocation. Furthermore, various objects can be embedded in the payload, + /// and those objects have to be translated so that they make sense to the target transaction. + pub(crate) fn copy_transaction_data( + &self, + to_process: Arc, + tr: &BinderTransactionDataSg, + debug_id: usize, + allow_fds: bool, + txn_security_ctx_offset: Option<&mut usize>, + ) -> BinderResult { + let trd = &tr.transaction_data; + let is_oneway = trd.flags & TF_ONE_WAY != 0; + let mut secctx = if let Some(offset) = txn_security_ctx_offset { + let secid = self.process.cred.get_secid(); + let ctx = match security::SecurityCtx::from_secid(secid) { + Ok(ctx) => ctx, + Err(err) => { + pr_warn!("Failed to get security ctx for id {}: {:?}", secid, err); + return Err(err.into()); + } + }; + Some((offset, ctx)) + } else { + None + }; + + let data_size = trd.data_size.try_into().map_err(|_| EINVAL)?; + let aligned_data_size = ptr_align(data_size).ok_or(EINVAL)?; + let offsets_size = trd.offsets_size.try_into().map_err(|_| EINVAL)?; + let aligned_offsets_size = ptr_align(offsets_size).ok_or(EINVAL)?; + let buffers_size = tr.buffers_size.try_into().map_err(|_| EINVAL)?; + let aligned_buffers_size = ptr_align(buffers_size).ok_or(EINVAL)?; + let aligned_secctx_size = match secctx.as_ref() { + Some((_offset, ctx)) => ptr_align(ctx.len()).ok_or(EINVAL)?, + None => 0, + }; + + // This guarantees that at least `sizeof(usize)` bytes will be allocated. + let len = usize::max( + aligned_data_size + .checked_add(aligned_offsets_size) + .and_then(|sum| sum.checked_add(aligned_buffers_size)) + .and_then(|sum| sum.checked_add(aligned_secctx_size)) + .ok_or(ENOMEM)?, + size_of::(), + ); + let secctx_off = aligned_data_size + aligned_offsets_size + aligned_buffers_size; + let mut alloc = + match to_process.buffer_alloc(debug_id, len, is_oneway, self.process.task.pid()) { + Ok(alloc) => alloc, + Err(err) => { + pr_warn!( + "Failed to allocate buffer. len:{}, is_oneway:{}", + len, + is_oneway + ); + return Err(err); + } + }; + + // SAFETY: This accesses a union field, but it's okay because the field's type is valid for + // all bit-patterns. + let trd_data_ptr = unsafe { &trd.data.ptr }; + let mut buffer_reader = + UserSlice::new(UserPtr::from_addr(trd_data_ptr.buffer as _), data_size).reader(); + let mut end_of_previous_object = 0; + let mut sg_state = None; + + // Copy offsets if there are any. + if offsets_size > 0 { + { + let mut reader = + UserSlice::new(UserPtr::from_addr(trd_data_ptr.offsets as _), offsets_size) + .reader(); + alloc.copy_into(&mut reader, aligned_data_size, offsets_size)?; + } + + let offsets_start = aligned_data_size; + let offsets_end = aligned_data_size + aligned_offsets_size; + + // This state is used for BINDER_TYPE_PTR objects. + let sg_state = sg_state.insert(ScatterGatherState { + unused_buffer_space: UnusedBufferSpace { + offset: offsets_end, + limit: len, + }, + sg_entries: KVec::new(), + ancestors: KVec::new(), + }); + + // Traverse the objects specified. + let mut view = AllocationView::new(&mut alloc, data_size); + for (index, index_offset) in (offsets_start..offsets_end) + .step_by(size_of::()) + .enumerate() + { + let offset = view.alloc.read(index_offset)?; + + if offset < end_of_previous_object { + pr_warn!("Got transaction with invalid offset."); + return Err(EINVAL.into()); + } + + // Copy data between two objects. + if end_of_previous_object < offset { + view.copy_into( + &mut buffer_reader, + end_of_previous_object, + offset - end_of_previous_object, + )?; + } + + let mut object = BinderObject::read_from(&mut buffer_reader)?; + + match self.translate_object( + index, + offset, + object.as_ref(), + &mut view, + allow_fds, + sg_state, + ) { + Ok(()) => end_of_previous_object = offset + object.size(), + Err(err) => { + pr_warn!("Error while translating object."); + return Err(err); + } + } + + // Update the indexes containing objects to clean up. + let offset_after_object = index_offset + size_of::(); + view.alloc + .set_info_offsets(offsets_start..offset_after_object); + } + } + + // Copy remaining raw data. + alloc.copy_into( + &mut buffer_reader, + end_of_previous_object, + data_size - end_of_previous_object, + )?; + + if let Some(sg_state) = sg_state.as_mut() { + if let Err(err) = self.apply_sg(&mut alloc, sg_state) { + pr_warn!("Failure in apply_sg: {:?}", err); + return Err(err); + } + } + + if let Some((off_out, secctx)) = secctx.as_mut() { + if let Err(err) = alloc.write(secctx_off, secctx.as_bytes()) { + pr_warn!("Failed to write security context: {:?}", err); + return Err(err.into()); + } + **off_out = secctx_off; + } + Ok(alloc) + } + + fn unwind_transaction_stack(self: &Arc) { + let mut thread = self.clone(); + while let Ok(transaction) = { + let mut inner = thread.inner.lock(); + inner.pop_transaction_to_reply(thread.as_ref()) + } { + let reply = Err(BR_DEAD_REPLY); + if !transaction.from.deliver_single_reply(reply, &transaction) { + break; + } + + thread = transaction.from.clone(); + } + } + + pub(crate) fn deliver_reply( + &self, + reply: Result, u32>, + transaction: &DArc, + ) { + if self.deliver_single_reply(reply, transaction) { + transaction.from.unwind_transaction_stack(); + } + } + + /// Delivers a reply to the thread that started a transaction. The reply can either be a + /// reply-transaction or an error code to be delivered instead. + /// + /// Returns whether the thread is dead. If it is, the caller is expected to unwind the + /// transaction stack by completing transactions for threads that are dead. + fn deliver_single_reply( + &self, + reply: Result, u32>, + transaction: &DArc, + ) -> bool { + if let Ok(transaction) = &reply { + transaction.set_outstanding(&mut self.process.inner.lock()); + } + + { + let mut inner = self.inner.lock(); + if !inner.pop_transaction_replied(transaction) { + return false; + } + + if inner.is_dead { + return true; + } + + match reply { + Ok(work) => { + inner.push_work(work); + } + Err(code) => inner.push_reply_work(code), + } + } + + // Notify the thread now that we've released the inner lock. + self.work_condvar.notify_sync(); + false + } + + /// Determines if the given transaction is the current transaction for this thread. + fn is_current_transaction(&self, transaction: &DArc) -> bool { + let inner = self.inner.lock(); + match &inner.current_transaction { + None => false, + Some(current) => Arc::ptr_eq(current, transaction), + } + } + + /// Determines the current top of the transaction stack. It fails if the top is in another + /// thread (i.e., this thread belongs to a stack but it has called another thread). The top is + /// [`None`] if the thread is not currently participating in a transaction stack. + fn top_of_transaction_stack(&self) -> Result>> { + let inner = self.inner.lock(); + if let Some(cur) = &inner.current_transaction { + if core::ptr::eq(self, cur.from.as_ref()) { + pr_warn!("got new transaction with bad transaction stack"); + return Err(EINVAL); + } + Ok(Some(cur.clone())) + } else { + Ok(None) + } + } + + fn transaction(self: &Arc, tr: &BinderTransactionDataSg, inner: T) + where + T: FnOnce(&Arc, &BinderTransactionDataSg) -> BinderResult, + { + if let Err(err) = inner(self, tr) { + if err.should_pr_warn() { + let mut ee = self.inner.lock().extended_error; + ee.command = err.reply; + ee.param = err.as_errno(); + pr_warn!( + "Transaction failed: {:?} my_pid:{}", + err, + self.process.pid_in_current_ns() + ); + } + + self.push_return_work(err.reply); + } + } + + fn transaction_inner(self: &Arc, tr: &BinderTransactionDataSg) -> BinderResult { + // SAFETY: Handle's type has no invalid bit patterns. + let handle = unsafe { tr.transaction_data.target.handle }; + let node_ref = self.process.get_transaction_node(handle)?; + security::binder_transaction(&self.process.cred, &node_ref.node.owner.cred)?; + // TODO: We need to ensure that there isn't a pending transaction in the work queue. How + // could this happen? + let top = self.top_of_transaction_stack()?; + let list_completion = DTRWrap::arc_try_new(DeliverCode::new(BR_TRANSACTION_COMPLETE))?; + let completion = list_completion.clone_arc(); + let transaction = Transaction::new(node_ref, top, self, tr)?; + + // Check that the transaction stack hasn't changed while the lock was released, then update + // it with the new transaction. + { + let mut inner = self.inner.lock(); + if !transaction.is_stacked_on(&inner.current_transaction) { + pr_warn!("Transaction stack changed during transaction!"); + return Err(EINVAL.into()); + } + inner.current_transaction = Some(transaction.clone_arc()); + // We push the completion as a deferred work so that we wait for the reply before + // returning to userland. + inner.push_work_deferred(list_completion); + } + + if let Err(e) = transaction.submit() { + completion.skip(); + // Define `transaction` first to drop it after `inner`. + let transaction; + let mut inner = self.inner.lock(); + transaction = inner.current_transaction.take().unwrap(); + inner.current_transaction = transaction.clone_next(); + Err(e) + } else { + Ok(()) + } + } + + fn reply_inner(self: &Arc, tr: &BinderTransactionDataSg) -> BinderResult { + let orig = self.inner.lock().pop_transaction_to_reply(self)?; + if !orig.from.is_current_transaction(&orig) { + return Err(EINVAL.into()); + } + + // We need to complete the transaction even if we cannot complete building the reply. + let out = (|| -> BinderResult<_> { + let completion = DTRWrap::arc_try_new(DeliverCode::new(BR_TRANSACTION_COMPLETE))?; + let process = orig.from.process.clone(); + let allow_fds = orig.flags & TF_ACCEPT_FDS != 0; + let reply = Transaction::new_reply(self, process, tr, allow_fds)?; + self.inner.lock().push_work(completion); + orig.from.deliver_reply(Ok(reply), &orig); + Ok(()) + })() + .map_err(|mut err| { + // At this point we only return `BR_TRANSACTION_COMPLETE` to the caller, and we must let + // the sender know that the transaction has completed (with an error in this case). + pr_warn!( + "Failure {:?} during reply - delivering BR_FAILED_REPLY to sender.", + err + ); + let reply = Err(BR_FAILED_REPLY); + orig.from.deliver_reply(reply, &orig); + err.reply = BR_TRANSACTION_COMPLETE; + err + }); + + out + } + + fn oneway_transaction_inner(self: &Arc, tr: &BinderTransactionDataSg) -> BinderResult { + // SAFETY: The `handle` field is valid for all possible byte values, so reading from the + // union is okay. + let handle = unsafe { tr.transaction_data.target.handle }; + let node_ref = self.process.get_transaction_node(handle)?; + security::binder_transaction(&self.process.cred, &node_ref.node.owner.cred)?; + let transaction = Transaction::new(node_ref, None, self, tr)?; + let code = if self.process.is_oneway_spam_detection_enabled() + && transaction.oneway_spam_detected + { + BR_ONEWAY_SPAM_SUSPECT + } else { + BR_TRANSACTION_COMPLETE + }; + let list_completion = DTRWrap::arc_try_new(DeliverCode::new(code))?; + let completion = list_completion.clone_arc(); + self.inner.lock().push_work(list_completion); + match transaction.submit() { + Ok(()) => Ok(()), + Err(err) => { + completion.skip(); + Err(err) + } + } + } + + fn write(self: &Arc, req: &mut BinderWriteRead) -> Result { + let write_start = req.write_buffer.wrapping_add(req.write_consumed); + let write_len = req.write_size.saturating_sub(req.write_consumed); + let mut reader = + UserSlice::new(UserPtr::from_addr(write_start as _), write_len as _).reader(); + + while reader.len() >= size_of::() && self.inner.lock().return_work.is_unused() { + let before = reader.len(); + let cmd = reader.read::()?; + GLOBAL_STATS.inc_bc(cmd); + self.process.stats.inc_bc(cmd); + match cmd { + BC_TRANSACTION => { + let tr = reader.read::()?.with_buffers_size(0); + if tr.transaction_data.flags & TF_ONE_WAY != 0 { + self.transaction(&tr, Self::oneway_transaction_inner); + } else { + self.transaction(&tr, Self::transaction_inner); + } + } + BC_TRANSACTION_SG => { + let tr = reader.read::()?; + if tr.transaction_data.flags & TF_ONE_WAY != 0 { + self.transaction(&tr, Self::oneway_transaction_inner); + } else { + self.transaction(&tr, Self::transaction_inner); + } + } + BC_REPLY => { + let tr = reader.read::()?.with_buffers_size(0); + self.transaction(&tr, Self::reply_inner) + } + BC_REPLY_SG => { + let tr = reader.read::()?; + self.transaction(&tr, Self::reply_inner) + } + BC_FREE_BUFFER => { + let buffer = self.process.buffer_get(reader.read()?); + if let Some(buffer) = &buffer { + if buffer.looper_need_return_on_free() { + self.inner.lock().looper_need_return = true; + } + } + drop(buffer); + } + BC_INCREFS => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, true, false)? + } + BC_ACQUIRE => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, true, true)? + } + BC_RELEASE => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, false, true)? + } + BC_DECREFS => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, false, false)? + } + BC_INCREFS_DONE => self.process.inc_ref_done(&mut reader, false)?, + BC_ACQUIRE_DONE => self.process.inc_ref_done(&mut reader, true)?, + BC_REQUEST_DEATH_NOTIFICATION => self.process.request_death(&mut reader, self)?, + BC_CLEAR_DEATH_NOTIFICATION => self.process.clear_death(&mut reader, self)?, + BC_DEAD_BINDER_DONE => self.process.dead_binder_done(reader.read()?, self), + BC_REGISTER_LOOPER => { + let valid = self.process.register_thread(); + self.inner.lock().looper_register(valid); + } + BC_ENTER_LOOPER => self.inner.lock().looper_enter(), + BC_EXIT_LOOPER => self.inner.lock().looper_exit(), + BC_REQUEST_FREEZE_NOTIFICATION => self.process.request_freeze_notif(&mut reader)?, + BC_CLEAR_FREEZE_NOTIFICATION => self.process.clear_freeze_notif(&mut reader)?, + BC_FREEZE_NOTIFICATION_DONE => self.process.freeze_notif_done(&mut reader)?, + + // Fail if given an unknown error code. + // BC_ATTEMPT_ACQUIRE and BC_ACQUIRE_RESULT are no longer supported. + _ => return Err(EINVAL), + } + // Update the number of write bytes consumed. + req.write_consumed += (before - reader.len()) as u64; + } + + Ok(()) + } + + fn read(self: &Arc, req: &mut BinderWriteRead, wait: bool) -> Result { + let read_start = req.read_buffer.wrapping_add(req.read_consumed); + let read_len = req.read_size.saturating_sub(req.read_consumed); + let mut writer = BinderReturnWriter::new( + UserSlice::new(UserPtr::from_addr(read_start as _), read_len as _).writer(), + self, + ); + let (in_pool, use_proc_queue) = { + let inner = self.inner.lock(); + (inner.is_looper(), inner.should_use_process_work_queue()) + }; + + let getter = if use_proc_queue { + Self::get_work + } else { + Self::get_work_local + }; + + // Reserve some room at the beginning of the read buffer so that we can send a + // BR_SPAWN_LOOPER if we need to. + let mut has_noop_placeholder = false; + if req.read_consumed == 0 { + if let Err(err) = writer.write_code(BR_NOOP) { + pr_warn!("Failure when writing BR_NOOP at beginning of buffer."); + return Err(err); + } + has_noop_placeholder = true; + } + + // Loop doing work while there is room in the buffer. + let initial_len = writer.len(); + while writer.len() >= size_of::() + 4 { + match getter(self, wait && initial_len == writer.len()) { + Ok(Some(work)) => match work.into_arc().do_work(self, &mut writer) { + Ok(true) => {} + Ok(false) => break, + Err(err) => { + return Err(err); + } + }, + Ok(None) => { + break; + } + Err(err) => { + // Propagate the error if we haven't written anything else. + if err != EINTR && err != EAGAIN { + pr_warn!("Failure in work getter: {:?}", err); + } + if initial_len == writer.len() { + return Err(err); + } else { + break; + } + } + } + } + + req.read_consumed += read_len - writer.len() as u64; + + // Write BR_SPAWN_LOOPER if the process needs more threads for its pool. + if has_noop_placeholder && in_pool && self.process.needs_thread() { + let mut writer = + UserSlice::new(UserPtr::from_addr(req.read_buffer as _), req.read_size as _) + .writer(); + writer.write(&BR_SPAWN_LOOPER)?; + } + Ok(()) + } + + pub(crate) fn write_read(self: &Arc, data: UserSlice, wait: bool) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + let mut req = reader.read::()?; + + // Go through the write buffer. + let mut ret = Ok(()); + if req.write_size > 0 { + ret = self.write(&mut req); + if let Err(err) = ret { + pr_warn!( + "Write failure {:?} in pid:{}", + err, + self.process.pid_in_current_ns() + ); + req.read_consumed = 0; + writer.write(&req)?; + self.inner.lock().looper_need_return = false; + return ret; + } + } + + // Go through the work queue. + if req.read_size > 0 { + ret = self.read(&mut req, wait); + if ret.is_err() && ret != Err(EINTR) { + pr_warn!( + "Read failure {:?} in pid:{}", + ret, + self.process.pid_in_current_ns() + ); + } + } + + // Write the request back so that the consumed fields are visible to the caller. + writer.write(&req)?; + + self.inner.lock().looper_need_return = false; + + ret + } + + pub(crate) fn poll(&self, file: &File, table: PollTable<'_>) -> (bool, u32) { + table.register_wait(file, &self.work_condvar); + let mut inner = self.inner.lock(); + (inner.should_use_process_work_queue(), inner.poll()) + } + + /// Make the call to `get_work` or `get_work_local` return immediately, if any. + pub(crate) fn exit_looper(&self) { + let mut inner = self.inner.lock(); + let should_notify = inner.looper_flags & LOOPER_WAITING != 0; + if should_notify { + inner.looper_need_return = true; + } + drop(inner); + + if should_notify { + self.work_condvar.notify_one(); + } + } + + pub(crate) fn notify_if_poll_ready(&self, sync: bool) { + // Determine if we need to notify. This requires the lock. + let inner = self.inner.lock(); + let notify = inner.looper_flags & LOOPER_POLL != 0 && inner.should_use_process_work_queue(); + drop(inner); + + // Now that the lock is no longer held, notify the waiters if we have to. + if notify { + if sync { + self.work_condvar.notify_sync(); + } else { + self.work_condvar.notify_one(); + } + } + } + + pub(crate) fn release(self: &Arc) { + self.inner.lock().is_dead = true; + + //self.work_condvar.clear(); + self.unwind_transaction_stack(); + + // Cancel all pending work items. + while let Ok(Some(work)) = self.get_work_local(false) { + work.into_arc().cancel(); + } + } +} + +#[pin_data] +struct ThreadError { + error_code: AtomicU32, + #[pin] + links_track: AtomicTracker, +} + +impl ThreadError { + fn try_new() -> Result> { + DTRWrap::arc_pin_init(pin_init!(Self { + error_code: AtomicU32::new(BR_OK), + links_track <- AtomicTracker::new(), + })) + .map(ListArc::into_arc) + } + + fn set_error_code(&self, code: u32) { + self.error_code.store(code, Ordering::Relaxed); + } + + fn is_unused(&self) -> bool { + self.error_code.load(Ordering::Relaxed) == BR_OK + } +} + +impl DeliverToRead for ThreadError { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let code = self.error_code.load(Ordering::Relaxed); + self.error_code.store(BR_OK, Ordering::Relaxed); + writer.write_code(code)?; + Ok(true) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!( + m, + "{}transaction error: {}\n", + prefix, + self.error_code.load(Ordering::Relaxed) + ); + Ok(()) + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for ThreadError { + tracked_by links_track: AtomicTracker; + } +} diff --git a/drivers/android/binder/trace.rs b/drivers/android/binder/trace.rs new file mode 100644 index 000000000000..af0e4392805e --- /dev/null +++ b/drivers/android/binder/trace.rs @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::ffi::{c_uint, c_ulong}; +use kernel::tracepoint::declare_trace; + +declare_trace! { + unsafe fn rust_binder_ioctl(cmd: c_uint, arg: c_ulong); +} + +#[inline] +pub(crate) fn trace_ioctl(cmd: u32, arg: usize) { + // SAFETY: Always safe to call. + unsafe { rust_binder_ioctl(cmd, arg as c_ulong) } +} diff --git a/drivers/android/binder/transaction.rs b/drivers/android/binder/transaction.rs new file mode 100644 index 000000000000..02512175d622 --- /dev/null +++ b/drivers/android/binder/transaction.rs @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use core::sync::atomic::{AtomicBool, Ordering}; +use kernel::{ + prelude::*, + seq_file::SeqFile, + seq_print, + sync::{Arc, SpinLock}, + task::Kuid, + time::{Instant, Monotonic}, + types::ScopeGuard, +}; + +use crate::{ + allocation::{Allocation, TranslatedFds}, + defs::*, + error::{BinderError, BinderResult}, + node::{Node, NodeRef}, + process::{Process, ProcessInner}, + ptr_align, + thread::{PushWorkRes, Thread}, + BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead, +}; + +#[pin_data(PinnedDrop)] +pub(crate) struct Transaction { + pub(crate) debug_id: usize, + target_node: Option>, + pub(crate) from_parent: Option>, + pub(crate) from: Arc, + pub(crate) to: Arc, + #[pin] + allocation: SpinLock>, + is_outstanding: AtomicBool, + code: u32, + pub(crate) flags: u32, + data_size: usize, + offsets_size: usize, + data_address: usize, + sender_euid: Kuid, + txn_security_ctx_off: Option, + pub(crate) oneway_spam_detected: bool, + start_time: Instant, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Transaction { untracked; } +} + +impl Transaction { + pub(crate) fn new( + node_ref: NodeRef, + from_parent: Option>, + from: &Arc, + tr: &BinderTransactionDataSg, + ) -> BinderResult> { + let debug_id = super::next_debug_id(); + let trd = &tr.transaction_data; + let allow_fds = node_ref.node.flags & FLAT_BINDER_FLAG_ACCEPTS_FDS != 0; + let txn_security_ctx = node_ref.node.flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX != 0; + let mut txn_security_ctx_off = if txn_security_ctx { Some(0) } else { None }; + let to = node_ref.node.owner.clone(); + let mut alloc = match from.copy_transaction_data( + to.clone(), + tr, + debug_id, + allow_fds, + txn_security_ctx_off.as_mut(), + ) { + Ok(alloc) => alloc, + Err(err) => { + if !err.is_dead() { + pr_warn!("Failure in copy_transaction_data: {:?}", err); + } + return Err(err); + } + }; + let oneway_spam_detected = alloc.oneway_spam_detected; + if trd.flags & TF_ONE_WAY != 0 { + if from_parent.is_some() { + pr_warn!("Oneway transaction should not be in a transaction stack."); + return Err(EINVAL.into()); + } + alloc.set_info_oneway_node(node_ref.node.clone()); + } + if trd.flags & TF_CLEAR_BUF != 0 { + alloc.set_info_clear_on_drop(); + } + let target_node = node_ref.node.clone(); + alloc.set_info_target_node(node_ref); + let data_address = alloc.ptr; + + Ok(DTRWrap::arc_pin_init(pin_init!(Transaction { + debug_id, + target_node: Some(target_node), + from_parent, + sender_euid: from.process.task.euid(), + from: from.clone(), + to, + code: trd.code, + flags: trd.flags, + data_size: trd.data_size as _, + offsets_size: trd.offsets_size as _, + data_address, + allocation <- kernel::new_spinlock!(Some(alloc.success()), "Transaction::new"), + is_outstanding: AtomicBool::new(false), + txn_security_ctx_off, + oneway_spam_detected, + start_time: Instant::now(), + }))?) + } + + pub(crate) fn new_reply( + from: &Arc, + to: Arc, + tr: &BinderTransactionDataSg, + allow_fds: bool, + ) -> BinderResult> { + let debug_id = super::next_debug_id(); + let trd = &tr.transaction_data; + let mut alloc = match from.copy_transaction_data(to.clone(), tr, debug_id, allow_fds, None) + { + Ok(alloc) => alloc, + Err(err) => { + pr_warn!("Failure in copy_transaction_data: {:?}", err); + return Err(err); + } + }; + let oneway_spam_detected = alloc.oneway_spam_detected; + if trd.flags & TF_CLEAR_BUF != 0 { + alloc.set_info_clear_on_drop(); + } + Ok(DTRWrap::arc_pin_init(pin_init!(Transaction { + debug_id, + target_node: None, + from_parent: None, + sender_euid: from.process.task.euid(), + from: from.clone(), + to, + code: trd.code, + flags: trd.flags, + data_size: trd.data_size as _, + offsets_size: trd.offsets_size as _, + data_address: alloc.ptr, + allocation <- kernel::new_spinlock!(Some(alloc.success()), "Transaction::new"), + is_outstanding: AtomicBool::new(false), + txn_security_ctx_off: None, + oneway_spam_detected, + start_time: Instant::now(), + }))?) + } + + #[inline(never)] + pub(crate) fn debug_print_inner(&self, m: &SeqFile, prefix: &str) { + seq_print!( + m, + "{}{}: from {}:{} to {} code {:x} flags {:x} elapsed {}ms", + prefix, + self.debug_id, + self.from.process.task.pid(), + self.from.id, + self.to.task.pid(), + self.code, + self.flags, + self.start_time.elapsed().as_millis(), + ); + if let Some(target_node) = &self.target_node { + seq_print!(m, " node {}", target_node.debug_id); + } + seq_print!(m, " size {}:{}\n", self.data_size, self.offsets_size); + } + + /// Determines if the transaction is stacked on top of the given transaction. + pub(crate) fn is_stacked_on(&self, onext: &Option>) -> bool { + match (&self.from_parent, onext) { + (None, None) => true, + (Some(from_parent), Some(next)) => Arc::ptr_eq(from_parent, next), + _ => false, + } + } + + /// Returns a pointer to the next transaction on the transaction stack, if there is one. + pub(crate) fn clone_next(&self) -> Option> { + Some(self.from_parent.as_ref()?.clone()) + } + + /// Searches in the transaction stack for a thread that belongs to the target process. This is + /// useful when finding a target for a new transaction: if the node belongs to a process that + /// is already part of the transaction stack, we reuse the thread. + fn find_target_thread(&self) -> Option> { + let mut it = &self.from_parent; + while let Some(transaction) = it { + if Arc::ptr_eq(&transaction.from.process, &self.to) { + return Some(transaction.from.clone()); + } + it = &transaction.from_parent; + } + None + } + + /// Searches in the transaction stack for a transaction originating at the given thread. + pub(crate) fn find_from(&self, thread: &Thread) -> Option<&DArc> { + let mut it = &self.from_parent; + while let Some(transaction) = it { + if core::ptr::eq(thread, transaction.from.as_ref()) { + return Some(transaction); + } + + it = &transaction.from_parent; + } + None + } + + pub(crate) fn set_outstanding(&self, to_process: &mut ProcessInner) { + // No race because this method is only called once. + if !self.is_outstanding.load(Ordering::Relaxed) { + self.is_outstanding.store(true, Ordering::Relaxed); + to_process.add_outstanding_txn(); + } + } + + /// Decrement `outstanding_txns` in `to` if it hasn't already been decremented. + fn drop_outstanding_txn(&self) { + // No race because this is called at most twice, and one of the calls are in the + // destructor, which is guaranteed to not race with any other operations on the + // transaction. It also cannot race with `set_outstanding`, since submission happens + // before delivery. + if self.is_outstanding.load(Ordering::Relaxed) { + self.is_outstanding.store(false, Ordering::Relaxed); + self.to.drop_outstanding_txn(); + } + } + + /// Submits the transaction to a work queue. Uses a thread if there is one in the transaction + /// stack, otherwise uses the destination process. + /// + /// Not used for replies. + pub(crate) fn submit(self: DLArc) -> BinderResult { + // Defined before `process_inner` so that the destructor runs after releasing the lock. + let mut _t_outdated; + + let oneway = self.flags & TF_ONE_WAY != 0; + let process = self.to.clone(); + let mut process_inner = process.inner.lock(); + + self.set_outstanding(&mut process_inner); + + if oneway { + if let Some(target_node) = self.target_node.clone() { + if process_inner.is_frozen { + process_inner.async_recv = true; + if self.flags & TF_UPDATE_TXN != 0 { + if let Some(t_outdated) = + target_node.take_outdated_transaction(&self, &mut process_inner) + { + // Save the transaction to be dropped after locks are released. + _t_outdated = t_outdated; + } + } + } + match target_node.submit_oneway(self, &mut process_inner) { + Ok(()) => {} + Err((err, work)) => { + drop(process_inner); + // Drop work after releasing process lock. + drop(work); + return Err(err); + } + } + + if process_inner.is_frozen { + return Err(BinderError::new_frozen_oneway()); + } else { + return Ok(()); + } + } else { + pr_err!("Failed to submit oneway transaction to node."); + } + } + + if process_inner.is_frozen { + process_inner.sync_recv = true; + return Err(BinderError::new_frozen()); + } + + let res = if let Some(thread) = self.find_target_thread() { + match thread.push_work(self) { + PushWorkRes::Ok => Ok(()), + PushWorkRes::FailedDead(me) => Err((BinderError::new_dead(), me)), + } + } else { + process_inner.push_work(self) + }; + drop(process_inner); + + match res { + Ok(()) => Ok(()), + Err((err, work)) => { + // Drop work after releasing process lock. + drop(work); + Err(err) + } + } + } + + /// Check whether one oneway transaction can supersede another. + pub(crate) fn can_replace(&self, old: &Transaction) -> bool { + if self.from.process.task.pid() != old.from.process.task.pid() { + return false; + } + + if self.flags & old.flags & (TF_ONE_WAY | TF_UPDATE_TXN) != (TF_ONE_WAY | TF_UPDATE_TXN) { + return false; + } + + let target_node_match = match (self.target_node.as_ref(), old.target_node.as_ref()) { + (None, None) => true, + (Some(tn1), Some(tn2)) => Arc::ptr_eq(tn1, tn2), + _ => false, + }; + + self.code == old.code && self.flags == old.flags && target_node_match + } + + fn prepare_file_list(&self) -> Result { + let mut alloc = self.allocation.lock().take().ok_or(ESRCH)?; + + match alloc.translate_fds() { + Ok(translated) => { + *self.allocation.lock() = Some(alloc); + Ok(translated) + } + Err(err) => { + // Free the allocation eagerly. + drop(alloc); + Err(err) + } + } + } +} + +impl DeliverToRead for Transaction { + fn do_work( + self: DArc, + thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let send_failed_reply = ScopeGuard::new(|| { + if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 { + let reply = Err(BR_FAILED_REPLY); + self.from.deliver_reply(reply, &self); + } + self.drop_outstanding_txn(); + }); + + let files = if let Ok(list) = self.prepare_file_list() { + list + } else { + // On failure to process the list, we send a reply back to the sender and ignore the + // transaction on the recipient. + return Ok(true); + }; + + let mut tr_sec = BinderTransactionDataSecctx::default(); + let tr = tr_sec.tr_data(); + if let Some(target_node) = &self.target_node { + let (ptr, cookie) = target_node.get_id(); + tr.target.ptr = ptr as _; + tr.cookie = cookie as _; + }; + tr.code = self.code; + tr.flags = self.flags; + tr.data_size = self.data_size as _; + tr.data.ptr.buffer = self.data_address as _; + tr.offsets_size = self.offsets_size as _; + if tr.offsets_size > 0 { + tr.data.ptr.offsets = (self.data_address + ptr_align(self.data_size).unwrap()) as _; + } + tr.sender_euid = self.sender_euid.into_uid_in_current_ns(); + tr.sender_pid = 0; + if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 { + // Not a reply and not one-way. + tr.sender_pid = self.from.process.pid_in_current_ns(); + } + let code = if self.target_node.is_none() { + BR_REPLY + } else if self.txn_security_ctx_off.is_some() { + BR_TRANSACTION_SEC_CTX + } else { + BR_TRANSACTION + }; + + // Write the transaction code and data to the user buffer. + writer.write_code(code)?; + if let Some(off) = self.txn_security_ctx_off { + tr_sec.secctx = (self.data_address + off) as u64; + writer.write_payload(&tr_sec)?; + } else { + writer.write_payload(&*tr)?; + } + + let mut alloc = self.allocation.lock().take().ok_or(ESRCH)?; + + // Dismiss the completion of transaction with a failure. No failure paths are allowed from + // here on out. + send_failed_reply.dismiss(); + + // Commit files, and set FDs in FDA to be closed on buffer free. + let close_on_free = files.commit(); + alloc.set_info_close_on_free(close_on_free); + + // It is now the user's responsibility to clear the allocation. + alloc.keep_alive(); + + self.drop_outstanding_txn(); + + // When this is not a reply and not a oneway transaction, update `current_transaction`. If + // it's a reply, `current_transaction` has already been updated appropriately. + if self.target_node.is_some() && tr_sec.transaction_data.flags & TF_ONE_WAY == 0 { + thread.set_current_transaction(self); + } + + Ok(false) + } + + fn cancel(self: DArc) { + let allocation = self.allocation.lock().take(); + drop(allocation); + + // If this is not a reply or oneway transaction, then send a dead reply. + if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 { + let reply = Err(BR_DEAD_REPLY); + self.from.deliver_reply(reply, &self); + } + + self.drop_outstanding_txn(); + } + + fn should_sync_wakeup(&self) -> bool { + self.flags & TF_ONE_WAY == 0 + } + + fn debug_print(&self, m: &SeqFile, _prefix: &str, tprefix: &str) -> Result<()> { + self.debug_print_inner(m, tprefix); + Ok(()) + } +} + +#[pinned_drop] +impl PinnedDrop for Transaction { + fn drop(self: Pin<&mut Self>) { + self.drop_outstanding_txn(); + } +} diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 1fd92021a573..03ee4c7010d7 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -38,7 +38,7 @@ enum { BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; -enum { +enum flat_binder_object_flags { FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 84d60635e8a9..9b3a4ab95818 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -71,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -99,3 +101,9 @@ const xa_mark_t RUST_CONST_HELPER_XA_PRESENT = XA_PRESENT; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC = XA_FLAGS_ALLOC; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC1 = XA_FLAGS_ALLOC1; + +#if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_RUST) +#include "../../drivers/android/binder/rust_binder.h" +#include "../../drivers/android/binder/rust_binder_events.h" +#include "../../drivers/android/binder/page_range_helper.h" +#endif diff --git a/rust/helpers/binder.c b/rust/helpers/binder.c new file mode 100644 index 000000000000..224d38a92f1d --- /dev/null +++ b/rust/helpers/binder.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2025 Google LLC. + */ + +#include +#include + +unsigned long rust_helper_list_lru_count(struct list_lru *lru) +{ + return list_lru_count(lru); +} + +unsigned long rust_helper_list_lru_walk(struct list_lru *lru, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long nr_to_walk) +{ + return list_lru_walk(lru, isolate, cb_arg, nr_to_walk); +} + +void rust_helper_init_task_work(struct callback_head *twork, + task_work_func_t func) +{ + init_task_work(twork, func); +} diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 7cf7fe95e41d..8e8277bdddca 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -8,6 +8,7 @@ */ #include "auxiliary.c" +#include "binder.c" #include "blk.c" #include "bug.c" #include "build_assert.c" diff --git a/rust/helpers/page.c b/rust/helpers/page.c index b3f2b8fbf87f..7144de5a61db 100644 --- a/rust/helpers/page.c +++ b/rust/helpers/page.c @@ -2,6 +2,7 @@ #include #include +#include struct page *rust_helper_alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -17,3 +18,10 @@ void rust_helper_kunmap_local(const void *addr) { kunmap_local(addr); } + +#ifndef NODE_NOT_IN_PAGE_FLAGS +int rust_helper_page_to_nid(const struct page *page) +{ + return page_to_nid(page); +} +#endif diff --git a/rust/helpers/security.c b/rust/helpers/security.c index 0c4c2065df28..ca22da09548d 100644 --- a/rust/helpers/security.c +++ b/rust/helpers/security.c @@ -17,4 +17,28 @@ void rust_helper_security_release_secctx(struct lsm_context *cp) { security_release_secctx(cp); } + +int rust_helper_security_binder_set_context_mgr(const struct cred *mgr) +{ + return security_binder_set_context_mgr(mgr); +} + +int rust_helper_security_binder_transaction(const struct cred *from, + const struct cred *to) +{ + return security_binder_transaction(from, to); +} + +int rust_helper_security_binder_transfer_binder(const struct cred *from, + const struct cred *to) +{ + return security_binder_transfer_binder(from, to); +} + +int rust_helper_security_binder_transfer_file(const struct cred *from, + const struct cred *to, + const struct file *file) +{ + return security_binder_transfer_file(from, to, file); +} #endif diff --git a/rust/kernel/cred.rs b/rust/kernel/cred.rs index 2599f01e8b28..3aa2e4c4a50c 100644 --- a/rust/kernel/cred.rs +++ b/rust/kernel/cred.rs @@ -54,6 +54,12 @@ impl Credential { unsafe { &*ptr.cast() } } + /// Returns a raw pointer to the inner credential. + #[inline] + pub fn as_ptr(&self) -> *const bindings::cred { + self.0.get() + } + /// Get the id for this security context. #[inline] pub fn get_secid(&self) -> u32 { diff --git a/rust/kernel/page.rs b/rust/kernel/page.rs index 7c1b17246ed5..811fe30e8e6f 100644 --- a/rust/kernel/page.rs +++ b/rust/kernel/page.rs @@ -85,6 +85,12 @@ impl Page { self.page.as_ptr() } + /// Get the node id containing this page. + pub fn nid(&self) -> i32 { + // SAFETY: Always safe to call with a valid page. + unsafe { bindings::page_to_nid(self.as_ptr()) } + } + /// Runs a piece of code with this page mapped to an address. /// /// The page is unmapped when this call returns. diff --git a/rust/kernel/security.rs b/rust/kernel/security.rs index 0c63e9e7e564..9d271695265f 100644 --- a/rust/kernel/security.rs +++ b/rust/kernel/security.rs @@ -8,9 +8,46 @@ use crate::{ bindings, + cred::Credential, error::{to_result, Result}, + fs::File, }; +/// Calls the security modules to determine if the given task can become the manager of a binder +/// context. +#[inline] +pub fn binder_set_context_mgr(mgr: &Credential) -> Result { + // SAFETY: `mrg.0` is valid because the shared reference guarantees a nonzero refcount. + to_result(unsafe { bindings::security_binder_set_context_mgr(mgr.as_ptr()) }) +} + +/// Calls the security modules to determine if binder transactions are allowed from task `from` to +/// task `to`. +#[inline] +pub fn binder_transaction(from: &Credential, to: &Credential) -> Result { + // SAFETY: `from` and `to` are valid because the shared references guarantee nonzero refcounts. + to_result(unsafe { bindings::security_binder_transaction(from.as_ptr(), to.as_ptr()) }) +} + +/// Calls the security modules to determine if task `from` is allowed to send binder objects +/// (owned by itself or other processes) to task `to` through a binder transaction. +#[inline] +pub fn binder_transfer_binder(from: &Credential, to: &Credential) -> Result { + // SAFETY: `from` and `to` are valid because the shared references guarantee nonzero refcounts. + to_result(unsafe { bindings::security_binder_transfer_binder(from.as_ptr(), to.as_ptr()) }) +} + +/// Calls the security modules to determine if task `from` is allowed to send the given file to +/// task `to` (which would get its own file descriptor) through a binder transaction. +#[inline] +pub fn binder_transfer_file(from: &Credential, to: &Credential, file: &File) -> Result { + // SAFETY: `from`, `to` and `file` are valid because the shared references guarantee nonzero + // refcounts. + to_result(unsafe { + bindings::security_binder_transfer_file(from.as_ptr(), to.as_ptr(), file.as_ptr()) + }) +} + /// A security context string. /// /// # Invariants diff --git a/rust/uapi/uapi_helper.h b/rust/uapi/uapi_helper.h index 1409441359f5..de3562b08d0c 100644 --- a/rust/uapi/uapi_helper.h +++ b/rust/uapi/uapi_helper.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 01b4a3061b1d4ded108e1a700b4414c00662954c Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:55 +0300 Subject: wifi: nl80211: Add more configuration options for NAN commands Current NAN APIs have only basic configuration for master preference and operating bands. Add and parse additional parameters which provide more control over NAN synchronization. The newly added attributes allow to publish additional NAN attributes and vendor elements in NAN beacons, control scan and discovery beacons periodicity, enable/disable DW notifications etc. Signed-off-by: Andrei Otcheretianski tested: Miriam Rachel Korenblit Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.a4779492bf8e.I375feb919bd72358173766b9fe10010c40796b33@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 60 +++++++++ include/uapi/linux/nl80211.h | 110 +++++++++++++++- net/wireless/nl80211.c | 298 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 431 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4072a67c9cc9..e2f4ca500ea3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3912,6 +3912,38 @@ struct cfg80211_qos_map { struct cfg80211_dscp_range up[8]; }; +/** + * struct cfg80211_nan_band_config - NAN band specific configuration + * + * @chan: Pointer to the IEEE 802.11 channel structure. The channel to be used + * for NAN operations on this band. For 2.4 GHz band, this is always + * channel 6. For 5 GHz band, the channel is either 44 or 149, according + * to the regulatory constraints. If chan pointer is NULL the entire band + * configuration entry is considered invalid and should not be used. + * @rssi_close: RSSI close threshold used for NAN state transition algorithm + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should + * be greater than -60 dBm. + * @rssi_middle: RSSI middle threshold used for NAN state transition algorithm. + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should be + * greater than -75 dBm and less than rssi_close. + * @awake_dw_interval: Committed DW interval. Valid values range: 0-5. 0 + * indicates no wakeup for DW and can't be used on 2.4GHz band, otherwise + * 2^(n-1). + * @disable_scan: If true, the device will not scan this band for cluster + * merge. Disabling scan on 2.4 GHz band is not allowed. + */ +struct cfg80211_nan_band_config { + struct ieee80211_channel *chan; + s8 rssi_close; + s8 rssi_middle; + u8 awake_dw_interval; + bool disable_scan; +}; + /** * struct cfg80211_nan_conf - NAN configuration * @@ -3921,10 +3953,31 @@ struct cfg80211_qos_map { * @bands: operating bands, a bitmap of &enum nl80211_band values. * For instance, for NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @cluster_id: cluster ID used for NAN synchronization. This is a MAC address + * that can take a value from 50-6F-9A-01-00-00 to 50-6F-9A-01-FF-FF. + * If NULL, the device will pick a random Cluster ID. + * @scan_period: period (in seconds) between NAN scans. + * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. + * @discovery_beacon_interval: interval (in TUs) for discovery beacons. + * @band_cfgs: array of band specific configurations, indexed by + * &enum nl80211_band values. + * @extra_nan_attrs: pointer to additional NAN attributes. + * @extra_nan_attrs_len: length of the additional NAN attributes. + * @vendor_elems: pointer to vendor-specific elements. + * @vendor_elems_len: length of the vendor-specific elements. */ struct cfg80211_nan_conf { u8 master_pref; u8 bands; + const u8 *cluster_id; + u16 scan_period; + u16 scan_dwell_time; + u8 discovery_beacon_interval; + struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS]; + const u8 *extra_nan_attrs; + u16 extra_nan_attrs_len; + const u8 *vendor_elems; + u16 vendor_elems_len; }; /** @@ -3933,10 +3986,17 @@ struct cfg80211_nan_conf { * * @CFG80211_NAN_CONF_CHANGED_PREF: master preference * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands + * @CFG80211_NAN_CONF_CHANGED_CONFIG: changed additional configuration. + * When this flag is set, it indicates that some additional attribute(s) + * (other then master_pref and bands) have been changed. In this case, + * all the unchanged attributes will be properly configured to their + * previous values. The driver doesn't need to store any + * previous configuration besides master_pref and bands. */ enum cfg80211_nan_conf_changes { CFG80211_NAN_CONF_CHANGED_PREF = BIT(0), CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1), + CFG80211_NAN_CONF_CHANGED_CONFIG = BIT(2), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index aed0b4c5d5e8..20b8202a3d58 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1085,8 +1085,9 @@ * %NL80211_ATTR_NAN_MASTER_PREF attribute and optional * %NL80211_ATTR_BANDS attributes. If %NL80211_ATTR_BANDS is * omitted or set to 0, it means don't-care and the device will - * decide what to use. After this command NAN functions can be - * added. + * decide what to use. Additional cluster configuration may be + * optionally provided with %NL80211_ATTR_NAN_CONFIG. + * After this command NAN functions can be added. * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by * its %NL80211_ATTR_WDEV interface. * @NL80211_CMD_ADD_NAN_FUNCTION: Add a NAN function. The function is defined @@ -1115,6 +1116,10 @@ * current configuration is not changed. If it is present but * set to zero, the configuration is changed to don't-care * (i.e. the device can decide what to do). + * Additional parameters may be provided with + * %NL80211_ATTR_NAN_CONFIG. User space should provide all previously + * configured nested attributes under %NL80211_ATTR_NAN_CONFIG, even if + * only a subset was changed. * @NL80211_CMD_NAN_MATCH: Notification sent when a match is reported. * This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and * %NL80211_ATTR_COOKIE. @@ -2936,6 +2941,12 @@ enum nl80211_commands { * indicate that it wants strict checking on the BSS parameters to be * modified. * + * @NL80211_ATTR_NAN_CONFIG: Nested attribute for + * extended NAN cluster configuration. This is used with + * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG. + * See &enum nl80211_nan_conf_attributes for details. + * This attribute is optional. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3498,6 +3509,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_LONG_BEACON_PERIOD, NL80211_ATTR_S1G_SHORT_BEACON, NL80211_ATTR_BSS_PARAM, + NL80211_ATTR_NAN_CONFIG, /* add attributes here, update the policy in nl80211.c */ @@ -7323,6 +7335,100 @@ enum nl80211_nan_match_attributes { NL80211_NAN_MATCH_ATTR_MAX = NUM_NL80211_NAN_MATCH_ATTR - 1 }; +/** + * enum nl80211_nan_band_conf_attributes - NAN band configuration attributes + * @__NL80211_NAN_BAND_CONF_INVALID: Invalid. + * @NL80211_NAN_BAND_CONF_BAND: Band for which the configuration is + * being set. The value is according to &enum nl80211_band (u8). + * @NL80211_NAN_BAND_CONF_FREQ: Discovery frequency. This attribute shall not + * be present on 2.4 GHZ band. On 5 GHz band its presence is optional. + * The allowed values are 5220 (channel 44) or 5745 (channel 149). + * If not present, channel 149 is used if allowed, otherwise channel 44 + * will be selected. The value is in MHz (u16). + * @NL80211_NAN_BAND_CONF_RSSI_CLOSE: RSSI close threshold used for NAN state + * transition algorithm as described in chapters 3.3.6 and 3.3.7 "NAN + * Device Role and State Transition" of Wi-Fi Aware (TM) Specification + * v4.0. If not specified, default device value is used. The value should + * be greater than -60 dBm (s8). + * @NL80211_NAN_BAND_CONF_RSSI_MIDDLE: RSSI middle threshold used for NAN state + * transition algorithm as described in chapters 3.3.6 and 3.3.7 "NAN + * Device Role and State Transition" of Wi-Fi Aware (TM) Specification + * v4.0. If not present, default device value is used. The value should be + * greater than -75 dBm and less than %NL80211_NAN_BAND_CONF_RSSI_CLOSE + * (s8). + * @NL80211_NAN_BAND_CONF_WAKE_DW: Committed DW information (values 0-5). + * Value 0 means that the device will not wake up during the + * discovery window. Values 1-5 mean that the device will wake up + * during each 2^(n - 1) discovery window, where n is the value of + * this attribute. Setting this attribute to 0 is not allowed on + * 2.4 GHz band (u8). This is an optional parameter (default is 1). + * @NL80211_NAN_BAND_CONF_DISABLE_SCAN: Optional flag attribute to disable + * scanning (for cluster merge) on the band. If set, the device will not + * scan on this band anymore. Disabling scanning on 2.4 GHz band is not + * allowed. + * @NUM_NL80211_NAN_BAND_CONF_ATTR: Internal. + * @NL80211_NAN_BAND_CONF_ATTR_MAX: Highest NAN band configuration attribute. + * + * These attributes are used to configure NAN band-specific parameters. Note, + * that both RSSI attributes should be configured (or both left unset). + */ +enum nl80211_nan_band_conf_attributes { + __NL80211_NAN_BAND_CONF_INVALID, + NL80211_NAN_BAND_CONF_BAND, + NL80211_NAN_BAND_CONF_FREQ, + NL80211_NAN_BAND_CONF_RSSI_CLOSE, + NL80211_NAN_BAND_CONF_RSSI_MIDDLE, + NL80211_NAN_BAND_CONF_WAKE_DW, + NL80211_NAN_BAND_CONF_DISABLE_SCAN, + + /* keep last */ + NUM_NL80211_NAN_BAND_CONF_ATTR, + NL80211_NAN_BAND_CONF_ATTR_MAX = NUM_NL80211_NAN_BAND_CONF_ATTR - 1, +}; + +/** + * enum nl80211_nan_conf_attributes - NAN configuration attributes + * @__NL80211_NAN_CONF_INVALID: Invalid attribute, used for validation. + * @NL80211_NAN_CONF_CLUSTER_ID: ID for the NAN cluster. This is a MAC + * address that can take values from 50-6F-9A-01-00-00 to + * 50-6F-9A-01-FF-FF. This attribute is optional. If not present, + * a random Cluster ID will be chosen. + * @NL80211_NAN_CONF_EXTRA_ATTRS: Additional NAN attributes to be + * published in the beacons. This is an optional byte array. + * @NL80211_NAN_CONF_VENDOR_ELEMS: Vendor-specific elements that will + * be published in the beacons. This is an optional byte array. + * @NL80211_NAN_CONF_BAND_CONFIGS: This is a nested array attribute, + * containing multiple entries for each supported band. Each band + * configuration consists of &enum nl80211_nan_band_conf_attributes. + * @NL80211_NAN_CONF_SCAN_PERIOD: Scan period in seconds. If not configured, + * device default is used. Zero value will disable scanning. + * This is u16 (optional). + * @NL80211_NAN_CONF_SCAN_DWELL_TIME: Scan dwell time in TUs per channel. + * Only non-zero values are valid. If not configured the device default + * value is used. This is u16 (optional) + * @NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL: Discovery beacon interval + * in TUs. Valid range is 50-200 TUs. If not configured the device default + * value is used. This is u8 (optional) + * @NUM_NL80211_NAN_CONF_ATTR: Internal. + * @NL80211_NAN_CONF_ATTR_MAX: Highest NAN configuration attribute. + * + * These attributes are used to configure NAN-specific parameters. + */ +enum nl80211_nan_conf_attributes { + __NL80211_NAN_CONF_INVALID, + NL80211_NAN_CONF_CLUSTER_ID, + NL80211_NAN_CONF_EXTRA_ATTRS, + NL80211_NAN_CONF_VENDOR_ELEMS, + NL80211_NAN_CONF_BAND_CONFIGS, + NL80211_NAN_CONF_SCAN_PERIOD, + NL80211_NAN_CONF_SCAN_DWELL_TIME, + NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, + + /* keep last */ + NUM_NL80211_NAN_CONF_ATTR, + NL80211_NAN_CONF_ATTR_MAX = NUM_NL80211_NAN_CONF_ATTR - 1, +}; + /** * enum nl80211_external_auth_action - Action to perform with external * authentication request. Used by NL80211_ATTR_EXTERNAL_AUTH_ACTION. diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b7bc7e5e81dd..04679acc8135 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -312,6 +312,26 @@ static int validate_supported_selectors(const struct nlattr *attr, return 0; } +static int validate_nan_cluster_id(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + static const u8 cluster_id_prefix[4] = {0x50, 0x6f, 0x9a, 0x1}; + + if (len != ETH_ALEN) { + NL_SET_ERR_MSG_ATTR(extack, attr, "bad cluster id length"); + return -EINVAL; + } + + if (memcmp(data, cluster_id_prefix, sizeof(cluster_id_prefix))) { + NL_SET_ERR_MSG_ATTR(extack, attr, "invalid cluster id prefix"); + return -EINVAL; + } + + return 0; +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -500,6 +520,35 @@ nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = { IEEE80211_MAX_DATA_LEN), }; +static const struct nla_policy +nl80211_nan_band_conf_policy[NL80211_NAN_BAND_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_BAND_CONF_BAND] = NLA_POLICY_MAX(NLA_U8, + NUM_NL80211_BANDS - 1), + [NL80211_NAN_BAND_CONF_FREQ] = { .type = NLA_U16 }, + [NL80211_NAN_BAND_CONF_RSSI_CLOSE] = NLA_POLICY_MIN(NLA_S8, -59), + [NL80211_NAN_BAND_CONF_RSSI_MIDDLE] = NLA_POLICY_MIN(NLA_S8, -74), + [NL80211_NAN_BAND_CONF_WAKE_DW] = NLA_POLICY_MAX(NLA_U8, 5), + [NL80211_NAN_BAND_CONF_DISABLE_SCAN] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy +nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_CONF_CLUSTER_ID] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_nan_cluster_id, + ETH_ALEN), + [NL80211_NAN_CONF_EXTRA_ATTRS] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN}, + [NL80211_NAN_CONF_VENDOR_ELEMS] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), + [NL80211_NAN_CONF_BAND_CONFIGS] = + NLA_POLICY_NESTED_ARRAY(nl80211_nan_band_conf_policy), + [NL80211_NAN_CONF_SCAN_PERIOD] = { .type = NLA_U16 }, + [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), + [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = + NLA_POLICY_RANGE(NLA_U8, 50, 200), +}; + static const struct netlink_range_validation nl80211_punct_bitmap_range = { .min = 0, .max = 0xffff, @@ -769,6 +818,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, + [NL80211_ATTR_NAN_CONFIG] = NLA_POLICY_NESTED(nl80211_nan_conf_policy), [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, @@ -15398,6 +15448,211 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info) return 0; } +static struct ieee80211_channel *nl80211_get_nan_channel(struct wiphy *wiphy, + int freq) +{ + struct ieee80211_channel *chan; + struct cfg80211_chan_def def; + + /* Check if the frequency is valid for NAN */ + if (freq != 5220 && freq != 5745 && freq != 2437) + return NULL; + + chan = ieee80211_get_channel(wiphy, freq); + if (!chan) + return NULL; + + cfg80211_chandef_create(&def, chan, NL80211_CHAN_NO_HT); + + /* Check if the channel is allowed */ + if (cfg80211_reg_can_beacon(wiphy, &def, NL80211_IFTYPE_NAN)) + return chan; + + return NULL; +} + +static int nl80211_parse_nan_band_config(struct wiphy *wiphy, + struct nlattr **tb, + struct cfg80211_nan_band_config *cfg, + enum nl80211_band band) +{ + if (BIT(band) & ~(u32)wiphy->nan_supported_bands) + return -EINVAL; + + if (tb[NL80211_NAN_BAND_CONF_FREQ]) { + u16 freq = nla_get_u16(tb[NL80211_NAN_BAND_CONF_FREQ]); + + if (band != NL80211_BAND_5GHZ) + return -EINVAL; + + cfg->chan = nl80211_get_nan_channel(wiphy, freq); + if (!cfg->chan) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]) { + cfg->rssi_close = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]); + if (!tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) { + cfg->rssi_middle = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]); + if (!cfg->rssi_close || cfg->rssi_middle >= cfg->rssi_close) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_WAKE_DW]) { + cfg->awake_dw_interval = + nla_get_u8(tb[NL80211_NAN_BAND_CONF_WAKE_DW]); + + if (band == NL80211_BAND_2GHZ && cfg->awake_dw_interval == 0) + return -EINVAL; + } + + cfg->disable_scan = + nla_get_flag(tb[NL80211_NAN_BAND_CONF_DISABLE_SCAN]); + return 0; +} + +static int nl80211_parse_nan_conf(struct wiphy *wiphy, + struct genl_info *info, + struct cfg80211_nan_conf *conf, + u32 *changed_flags) +{ + struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1]; + int err, rem; + u32 changed = 0; + struct nlattr *band_config; + + if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { + conf->master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + + changed |= CFG80211_NAN_CONF_CHANGED_PREF; + } + + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf->bands = bands; + changed |= CFG80211_NAN_CONF_CHANGED_BANDS; + } + + conf->band_cfgs[NL80211_BAND_2GHZ].awake_dw_interval = 1; + if (conf->bands & BIT(NL80211_BAND_5GHZ) || !conf->bands) + conf->band_cfgs[NL80211_BAND_5GHZ].awake_dw_interval = 1; + + /* On 2.4 GHz band use channel 6 */ + conf->band_cfgs[NL80211_BAND_2GHZ].chan = + nl80211_get_nan_channel(wiphy, 2437); + if (!conf->band_cfgs[NL80211_BAND_2GHZ].chan) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_NAN_CONFIG]) + goto out; + + err = nla_parse_nested(attrs, NL80211_NAN_CONF_ATTR_MAX, + info->attrs[NL80211_ATTR_NAN_CONFIG], NULL, + info->extack); + if (err) + return err; + + changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; + if (attrs[NL80211_NAN_CONF_CLUSTER_ID]) + conf->cluster_id = + nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); + + if (attrs[NL80211_NAN_CONF_EXTRA_ATTRS]) { + conf->extra_nan_attrs = + nla_data(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + conf->extra_nan_attrs_len = + nla_len(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + } + + if (attrs[NL80211_NAN_CONF_VENDOR_ELEMS]) { + conf->vendor_elems = + nla_data(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + conf->vendor_elems_len = + nla_len(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + } + + if (attrs[NL80211_NAN_CONF_BAND_CONFIGS]) { + nla_for_each_nested(band_config, + attrs[NL80211_NAN_CONF_BAND_CONFIGS], + rem) { + enum nl80211_band band; + struct cfg80211_nan_band_config *cfg; + struct nlattr *tb[NL80211_NAN_BAND_CONF_ATTR_MAX + 1]; + + err = nla_parse_nested(tb, + NL80211_NAN_BAND_CONF_ATTR_MAX, + band_config, NULL, + info->extack); + if (err) + return err; + + if (!tb[NL80211_NAN_BAND_CONF_BAND]) + return -EINVAL; + + band = nla_get_u8(tb[NL80211_NAN_BAND_CONF_BAND]); + if (conf->bands && !(conf->bands & BIT(band))) + return -EINVAL; + + cfg = &conf->band_cfgs[band]; + + err = nl80211_parse_nan_band_config(wiphy, tb, cfg, + band); + if (err) + return err; + } + } + + if (attrs[NL80211_NAN_CONF_SCAN_PERIOD]) + conf->scan_period = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_PERIOD]); + + if (attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]) + conf->scan_dwell_time = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]); + + if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) + conf->discovery_beacon_interval = + nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); +out: + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { + /* If no 5GHz channel is specified use default, if possible */ + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5745); + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan) + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5220); + + /* Return error if user space asked explicitly for 5 GHz */ + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + conf->bands & BIT(NL80211_BAND_5GHZ)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[NL80211_ATTR_BANDS], + "5 GHz band operation is not allowed"); + return -EINVAL; + } + } + + if (changed_flags) + *changed_flags = changed; + + return 0; +} + static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -15414,23 +15669,13 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; + /* Master preference is mandatory for START_NAN */ if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL); + if (err) + return err; err = rdev_start_nan(rdev, wdev, &conf); if (err) @@ -15786,6 +16031,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb, struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_nan_conf conf = {}; u32 changed = 0; + int err; if (wdev->iftype != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; @@ -15793,27 +16039,9 @@ static int nl80211_nan_change_config(struct sk_buff *skb, if (!wdev_running(wdev)) return -ENOTCONN; - if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - if (conf.master_pref <= 1 || conf.master_pref == 255) - return -EINVAL; - - changed |= CFG80211_NAN_CONF_CHANGED_PREF; - } - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - changed |= CFG80211_NAN_CONF_CHANGED_BANDS; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed); + if (err) + return err; if (!changed) return -EINVAL; -- cgit v1.2.3 From ba9b2ceaa2558a38a5da59fd654b641610a8568e Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:56 +0300 Subject: wifi: nl80211: Add NAN Discovery Window (DW) notification This notification will be used by the device to inform user space about upcoming DW. When received, user space will be able to prepare multicast Service Discovery Frames (SDFs) to be transmitted during the next DW using %NL80211_CMD_FRAME command on the NAN management interface. The device/driver will take care to transmit the frames in the correct timing. This allows to implement a synchronized Discovery Engine (DE) in user space, if the device doesn't support DE offload. Note that this notification can be sent before the actual DW starts as long as the driver/device handles the actual timing of the SDF transmission. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.0e1d15031bab.I5b1721e61b63910452b3c5cdcdc1e94cb094d4c9@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++++++++ include/uapi/linux/nl80211.h | 16 ++++++++++++++++ net/wireless/nl80211.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 16 ++++++++++++++++ 4 files changed, 89 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e2f4ca500ea3..0c1311d254be 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3959,6 +3959,8 @@ struct cfg80211_nan_band_config { * @scan_period: period (in seconds) between NAN scans. * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. * @discovery_beacon_interval: interval (in TUs) for discovery beacons. + * @enable_dw_notification: flag to enable/disable discovery window + * notifications. * @band_cfgs: array of band specific configurations, indexed by * &enum nl80211_band values. * @extra_nan_attrs: pointer to additional NAN attributes. @@ -3973,6 +3975,7 @@ struct cfg80211_nan_conf { u16 scan_period; u16 scan_dwell_time; u8 discovery_beacon_interval; + bool enable_dw_notification; struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS]; const u8 *extra_nan_attrs; u16 extra_nan_attrs_len; @@ -10062,6 +10065,15 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev); */ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); +/** + * cfg80211_next_nan_dw_notif - Notify about the next NAN Discovery Window (DW) + * @wdev: Pointer to the wireless device structure + * @chan: DW channel (6, 44 or 149) + * @gfp: Memory allocation flags + */ +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 20b8202a3d58..d674608e2635 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1349,6 +1349,15 @@ * control EPCS configuration. Used to notify userland on the current state * of EPCS. * + * @NL80211_CMD_NAN_NEXT_DW_NOTIFICATION: This command is used to notify + * user space about the next NAN Discovery Window (DW). User space may use + * it to prepare frames to be sent in the next DW. + * %NL80211_ATTR_WIPHY_FREQ is used to indicate the frequency of the next + * DW. SDF transmission should be requested with %NL80211_CMD_FRAME and + * the device/driver shall take care of the actual transmission timing. + * This notification is only sent to the NAN interface owning socket + * (see %NL80211_ATTR_SOCKET_OWNER flag). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1609,6 +1618,8 @@ enum nl80211_commands { NL80211_CMD_ASSOC_MLO_RECONF, NL80211_CMD_EPCS_CFG, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -7409,6 +7420,10 @@ enum nl80211_nan_band_conf_attributes { * @NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL: Discovery beacon interval * in TUs. Valid range is 50-200 TUs. If not configured the device default * value is used. This is u8 (optional) + * @NL80211_NAN_CONF_NOTIFY_DW: If set, the driver will notify userspace about + * the upcoming discovery window with + * %NL80211_CMD_NAN_NEXT_DW_NOTIFICATION. + * This is a flag attribute. * @NUM_NL80211_NAN_CONF_ATTR: Internal. * @NL80211_NAN_CONF_ATTR_MAX: Highest NAN configuration attribute. * @@ -7423,6 +7438,7 @@ enum nl80211_nan_conf_attributes { NL80211_NAN_CONF_SCAN_PERIOD, NL80211_NAN_CONF_SCAN_DWELL_TIME, NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, + NL80211_NAN_CONF_NOTIFY_DW, /* keep last */ NUM_NL80211_NAN_CONF_ATTR, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 04679acc8135..d64145746b65 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -547,6 +547,7 @@ nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = NLA_POLICY_RANGE(NLA_U8, 50, 200), + [NL80211_NAN_CONF_NOTIFY_DW] = { .type = NLA_FLAG }, }; static const struct netlink_range_validation nl80211_punct_bitmap_range = { @@ -15627,6 +15628,11 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy, if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) conf->discovery_beacon_interval = nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); + + if (attrs[NL80211_NAN_CONF_NOTIFY_DW]) + conf->enable_dw_notification = + nla_get_flag(attrs[NL80211_NAN_CONF_NOTIFY_DW]); + out: if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { @@ -21764,6 +21770,45 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled) } EXPORT_SYMBOL(cfg80211_epcs_changed); +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_next_nan_dw_notif(wdev, chan); + + if (!wdev->owner_nlportid) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_unicast(wiphy_net(wiphy), msg, wdev->owner_nlportid); + + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 9b6074155d59..ff47e9bffd4f 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4166,6 +4166,22 @@ TRACE_EVENT(cfg80211_epcs_changed, WDEV_PR_ARG, __entry->enabled) ); +TRACE_EVENT(cfg80211_next_nan_dw_notif, + TP_PROTO(struct wireless_dev *wdev, + struct ieee80211_channel *chan), + TP_ARGS(wdev, chan), + TP_STRUCT__entry( + WDEV_ENTRY + CHAN_ENTRY + ), + TP_fast_assign( + WDEV_ASSIGN; + CHAN_ASSIGN(chan); + ), + TP_printk(WDEV_PR_FMT " " CHAN_PR_FMT, + WDEV_PR_ARG, CHAN_PR_ARG) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 1ccfd8db34fb3b1852284668094d7207499c2415 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:57 +0300 Subject: wifi: cfg80211: Add cluster joined notification APIs The drivers should notify upper layers and user space when a NAN device joins a cluster. This is needed, for example, to set the correct addr3 in SDF frames. Add API to report cluster join event. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.ad27b7b6e4d9.I70b213a2a49f18d1ba2ad325e67e8eff51cc7a1f@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++++++++++ include/uapi/linux/nl80211.h | 8 ++++++++ net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 19 +++++++++++++++++++ 4 files changed, 82 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0c1311d254be..1b10bd31bdd6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -10074,6 +10074,20 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, struct ieee80211_channel *chan, gfp_t gfp); +/** + * cfg80211_nan_cluster_joined - Notify about NAN cluster join + * @wdev: Pointer to the wireless device structure + * @cluster_id: Cluster ID of the NAN cluster that was joined or started + * @new_cluster: Indicates if this is a new cluster or an existing one + * @gfp: Memory allocation flags + * + * This function is used to notify user space when a NAN cluster has been + * joined, providing the cluster ID and a flag whether it is a new cluster. + */ +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d674608e2635..c5a7658b7297 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1357,6 +1357,9 @@ * the device/driver shall take care of the actual transmission timing. * This notification is only sent to the NAN interface owning socket * (see %NL80211_ATTR_SOCKET_OWNER flag). + * @NL80211_CMD_NAN_CLUSTER_JOINED: This command is used to notify + * user space that the NAN new cluster has been joined. The cluster ID is + * indicated by %NL80211_ATTR_MAC. * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1619,6 +1622,7 @@ enum nl80211_commands { NL80211_CMD_EPCS_CFG, NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, + NL80211_CMD_NAN_CLUSTER_JOINED, /* add new commands above here */ @@ -2957,6 +2961,9 @@ enum nl80211_commands { * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG. * See &enum nl80211_nan_conf_attributes for details. * This attribute is optional. + * @NL80211_ATTR_NAN_NEW_CLUSTER: Flag attribute indicating that a new + * NAN cluster has been created. This is used with + * %NL80211_CMD_NAN_CLUSTER_JOINED * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -3521,6 +3528,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_SHORT_BEACON, NL80211_ATTR_BSS_PARAM, NL80211_ATTR_NAN_CONFIG, + NL80211_ATTR_NAN_NEW_CLUSTER, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d64145746b65..904a725a4f4a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -21809,6 +21809,47 @@ void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, } EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_CLUSTER_JOINED); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cluster_id) || + (new_cluster && nla_put_flag(msg, NL80211_ATTR_NAN_NEW_CLUSTER))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(wiphy), msg, + wdev->owner_nlportid); + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_nan_cluster_joined); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ff47e9bffd4f..8a4c34112eb5 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4182,6 +4182,25 @@ TRACE_EVENT(cfg80211_next_nan_dw_notif, WDEV_PR_ARG, CHAN_PR_ARG) ); +TRACE_EVENT(cfg80211_nan_cluster_joined, + TP_PROTO(struct wireless_dev *wdev, + const u8 *cluster_id, + bool new_cluster), + TP_ARGS(wdev, cluster_id, new_cluster), + TP_STRUCT__entry( + WDEV_ENTRY + MAC_ENTRY(cluster_id) + __field(bool, new_cluster) + ), + TP_fast_assign( + WDEV_ASSIGN; + MAC_ASSIGN(cluster_id, cluster_id); + __entry->new_cluster = new_cluster; + ), + TP_printk(WDEV_PR_FMT " cluster_id %pMF%s", + WDEV_PR_ARG, __entry->cluster_id, + __entry->new_cluster ? " [new]" : "") +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 3cbadd84f5c4ea792c0df3506639a2cb57ba9b11 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:58 +0300 Subject: wifi: nl80211: Add more NAN capabilities Add better break down for NAN capabilities, as NAN has multiple optional features. This allows to better indicate which features are supported or or offloaded to the device. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.bb02cd8c1596.I01fb2e8dc3662b847f3c27117bc4e199fc96d0a3@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 55 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c5a7658b7297..423e258cdbd2 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2964,6 +2964,10 @@ enum nl80211_commands { * @NL80211_ATTR_NAN_NEW_CLUSTER: Flag attribute indicating that a new * NAN cluster has been created. This is used with * %NL80211_CMD_NAN_CLUSTER_JOINED + * @NL80211_ATTR_NAN_CAPABILITIES: Nested attribute for NAN capabilities. + * This is used with %NL80211_CMD_GET_WIPHY to indicate the NAN + * capabilities supported by the driver. See &enum nl80211_nan_capabilities + * for details. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -3529,6 +3533,7 @@ enum nl80211_attrs { NL80211_ATTR_BSS_PARAM, NL80211_ATTR_NAN_CONFIG, NL80211_ATTR_NAN_NEW_CLUSTER, + NL80211_ATTR_NAN_CAPABILITIES, /* add attributes here, update the policy in nl80211.c */ @@ -8362,4 +8367,54 @@ enum nl80211_s1g_short_beacon_attrs { __NL80211_S1G_SHORT_BEACON_ATTR_LAST - 1 }; +/** + * enum nl80211_nan_capabilities - NAN (Neighbor Aware Networking) + * capabilities. + * + * @__NL80211_NAN_CAPABILITIES_INVALID: Invalid. + * @NL80211_NAN_CAPA_CONFIGURABLE_SYNC: Flag attribute indicating that + * the device supports configurable synchronization. If set, the device + * should be able to handle %NL80211_ATTR_NAN_CONFIG + * attribute in the %NL80211_CMD_START_NAN (and change) command. + * @NL80211_NAN_CAPA_USERSPACE_DE: Flag attribute indicating that + * NAN Discovery Engine (DE) is not offloaded and the driver assumes + * user space DE implementation. When set, %NL80211_CMD_ADD_NAN_FUNCTION, + * %NL80211_CMD_DEL_NAN_FUNCTION and %NL80211_CMD_NAN_MATCH commands + * should not be used. In addition, the device/driver should support + * sending discovery window (DW) notifications using + * %NL80211_CMD_NAN_NEXT_DW_NOTIFICATION and handling transmission and + * reception of NAN SDF frames on NAN device interface during DW windows. + * (%NL80211_CMD_FRAME is used to transmit SDFs) + * @NL80211_NAN_CAPA_OP_MODE: u8 attribute indicating the supported operation + * modes as defined in Wi-Fi Aware (TM) specification Table 81 (Operation + * Mode field format). + * @NL80211_NAN_CAPA_NUM_ANTENNAS: u8 attribute indicating the number of + * TX and RX antennas supported by the device. Lower nibble indicates + * the number of TX antennas and upper nibble indicates the number of RX + * antennas. Value 0 indicates the information is not available. + * See table 79 of Wi-Fi Aware (TM) specification (Number of + * Antennas field). + * @NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME: u16 attribute indicating the + * maximum time in microseconds that the device requires to switch + * channels. + * @NL80211_NAN_CAPA_CAPABILITIES: u8 attribute containing the + * capabilities of the device as defined in Wi-Fi Aware (TM) + * specification Table 79 (Capabilities field). + * @__NL80211_NAN_CAPABILITIES_LAST: Internal + * @NL80211_NAN_CAPABILITIES_MAX: Highest NAN capability attribute. + */ +enum nl80211_nan_capabilities { + __NL80211_NAN_CAPABILITIES_INVALID, + + NL80211_NAN_CAPA_CONFIGURABLE_SYNC, + NL80211_NAN_CAPA_USERSPACE_DE, + NL80211_NAN_CAPA_OP_MODE, + NL80211_NAN_CAPA_NUM_ANTENNAS, + NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME, + NL80211_NAN_CAPA_CAPABILITIES, + /* keep last */ + __NL80211_NAN_CAPABILITIES_LAST, + NL80211_NAN_CAPABILITIES_MAX = __NL80211_NAN_CAPABILITIES_LAST - 1, +}; + #endif /* __LINUX_NL80211_H */ -- cgit v1.2.3 From b9c3d426c8a5823b3a1e5078719750c6abb0d2c1 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:12:59 +0300 Subject: wifi: cfg80211: Advertise supported NAN capabilities Allow drivers to specify the supported NAN capabilities and support advertising the NAN capabilities to user space. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.2976966556f5.Ic6e43b10049573180c909dad806f279cfb31143e@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 17 +++++++++++++++++ include/net/cfg80211.h | 38 ++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d350263f23f3..2110345de8ef 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -6065,4 +6065,21 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) _data + ieee80211_mle_common_size(_data),\ _len - ieee80211_mle_common_size(_data)) +/* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */ +#define NAN_OP_MODE_PHY_MODE_VHT 0x01 +#define NAN_OP_MODE_PHY_MODE_HE 0x10 +#define NAN_OP_MODE_PHY_MODE_MASK 0x11 +#define NAN_OP_MODE_80P80MHZ 0x02 +#define NAN_OP_MODE_160MHZ 0x04 +#define NAN_OP_MODE_PNDL_SUPPRTED 0x08 + +/* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification + * Table 79 + */ +#define NAN_DEV_CAPA_DFS_OWNER 0x01 +#define NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED 0x02 +#define NAN_DEV_CAPA_SIM_NDP_RX_SUPPORTED 0x04 +#define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 +#define NAN_DEV_CAPA_S3_SUPPORTED 0x10 + #endif /* LINUX_IEEE80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1b10bd31bdd6..e30c1886c530 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5711,6 +5711,42 @@ struct wiphy_radio { u32 antenna_mask; }; +/** + * enum wiphy_nan_flags - NAN capabilities + * + * @WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC: Device supports NAN configurable + * synchronization. + * @WIPHY_NAN_FLAGS_USERSPACE_DE: Device doesn't support DE offload. + */ +enum wiphy_nan_flags { + WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC = BIT(0), + WIPHY_NAN_FLAGS_USERSPACE_DE = BIT(1), +}; + +/** + * struct wiphy_nan_capa - NAN capabilities + * + * This structure describes the NAN capabilities of a wiphy. + * + * @flags: NAN capabilities flags, see &enum wiphy_nan_flags + * @op_mode: NAN operation mode, as defined in Wi-Fi Aware (TM) specification + * Table 81. + * @n_antennas: number of antennas supported by the device for Tx/Rx. Lower + * nibble indicates the number of TX antennas and upper nibble indicates the + * number of RX antennas. Value 0 indicates the information is not + * available. + * @max_channel_switch_time: maximum channel switch time in milliseconds. + * @dev_capabilities: NAN device capabilities as defined in Wi-Fi Aware (TM) + * specification Table 79 (Capabilities field). + */ +struct wiphy_nan_capa { + u32 flags; + u8 op_mode; + u8 n_antennas; + u16 max_channel_switch_time; + u8 dev_capabilities; +}; + #define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff /** @@ -5884,6 +5920,7 @@ struct wiphy_radio { * bitmap of &enum nl80211_band values. For instance, for * NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @nan_capa: NAN capabilities * * @txq_limit: configuration of internal TX queue frame limit * @txq_memory_limit: configuration internal TX queue memory limit @@ -6065,6 +6102,7 @@ struct wiphy { u32 bss_select_support; u8 nan_supported_bands; + struct wiphy_nan_capa nan_capa; u32 txq_limit; u32 txq_memory_limit; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 904a725a4f4a..bcd18ae59e84 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2605,6 +2605,41 @@ fail: return -ENOBUFS; } +static int nl80211_put_nan_capa(struct wiphy *wiphy, struct sk_buff *msg) +{ + struct nlattr *nan_caps; + + nan_caps = nla_nest_start(msg, NL80211_ATTR_NAN_CAPABILITIES); + if (!nan_caps) + return -ENOBUFS; + + if (wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC && + nla_put_flag(msg, NL80211_NAN_CAPA_CONFIGURABLE_SYNC)) + goto fail; + + if ((wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE) && + nla_put_flag(msg, NL80211_NAN_CAPA_USERSPACE_DE)) + goto fail; + + if (nla_put_u8(msg, NL80211_NAN_CAPA_OP_MODE, + wiphy->nan_capa.op_mode) || + nla_put_u8(msg, NL80211_NAN_CAPA_NUM_ANTENNAS, + wiphy->nan_capa.n_antennas) || + nla_put_u16(msg, NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME, + wiphy->nan_capa.max_channel_switch_time) || + nla_put_u8(msg, NL80211_NAN_CAPA_CAPABILITIES, + wiphy->nan_capa.dev_capabilities)) + goto fail; + + nla_nest_end(msg, nan_caps); + + return 0; + +fail: + nla_nest_cancel(msg, nan_caps); + return -ENOBUFS; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -3257,6 +3292,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_radios(&rdev->wiphy, msg)) goto nla_put_failure; + state->split_start++; + break; + case 18: + if (nl80211_put_nan_capa(&rdev->wiphy, msg)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; -- cgit v1.2.3 From 1884e2594b084a6b1eb438e5eda586f284d80fee Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:01 +0300 Subject: wifi: cfg80211: Store the NAN cluster ID When the driver indicates that the device has joined a cluster, store the cluster ID. This is needed for data path operations, e.g., filtering received frames etc. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.63e9fef2a3aa.I6c858185c9e71f84bd2c5174d7ee45902b4391c3@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ net/wireless/nl80211.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e30c1886c530..26fd42e189ce 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6681,6 +6681,9 @@ struct wireless_dev { struct { struct cfg80211_chan_def chandef; } ocb; + struct { + u8 cluster_id[ETH_ALEN] __aligned(2); + } nan; } u; struct { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 72f68a17c92b..4e0d40865441 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -21865,6 +21865,8 @@ void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster); + memcpy(wdev->u.nan.cluster_id, cluster_id, ETH_ALEN); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; -- cgit v1.2.3 From fc41f4a28ac4d462487903229494eeb266f68a40 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:02 +0300 Subject: wifi: mac80211: Support Tx of action frame for NAN Add support for sending management frame over a NAN Device interface: - Declare support for the supported management frames types. - Since action frame transmissions over a NAN Device interface do not necessarily require a channel configuration, e.g., they can be transmitted during DW, modify the Tx path to avoid accessing channel information for NAN Device interface. - In addition modify the points in the Tx path logic to account for cases that a band is not specified in the Tx information. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.23b160089228.I65a58af753bcbcfb5c4ad8ef372d546f889725ba@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ net/mac80211/main.c | 5 +++++ net/mac80211/offchannel.c | 5 ++++- net/mac80211/rate.c | 11 ++++++++++- net/mac80211/tx.c | 12 ++++++++++-- 5 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a45e4bee65d4..a5140ecf334b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3192,6 +3192,10 @@ ieee80211_get_tx_rate(const struct ieee80211_hw *hw, { if (WARN_ON_ONCE(c->control.rates[0].idx < 0)) return NULL; + + if (c->band >= NUM_NL80211_BANDS) + return NULL; + return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[0].idx]; } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 437f1363c982..27b3ec5deabe 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -746,6 +746,11 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | BIT(IEEE80211_STYPE_AUTH >> 4), }, + [NL80211_IFTYPE_NAN] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4), + }, }; static const struct ieee80211_ht_cap mac80211_ht_capa_mod_mask = { diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 13df6321634d..ae82533e3c02 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -8,7 +8,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2009 Johannes Berg - * Copyright (C) 2019, 2022-2024 Intel Corporation + * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include #include @@ -897,6 +897,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, need_offchan = true; break; case NL80211_IFTYPE_NAN: + break; default: return -EOPNOTSUPP; } @@ -910,6 +911,8 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, /* Check if the operating channel is the requested channel */ if (!params->chan && mlo_sta) { need_offchan = false; + } else if (sdata->vif.type == NL80211_IFTYPE_NAN) { + /* Frames can be sent during NAN schedule */ } else if (!need_offchan) { struct ieee80211_chanctx_conf *chanctx_conf = NULL; int i; diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 3cb2ad6d0b28..e441f8541603 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -4,7 +4,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2019, 2022-2024 Intel Corporation + * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include @@ -98,6 +98,9 @@ void rate_control_tx_status(struct ieee80211_local *local, if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; + if (st->info->band >= NUM_NL80211_BANDS) + return; + sband = local->hw.wiphy->bands[st->info->band]; spin_lock_bh(&sta->rate_ctrl_lock); @@ -419,6 +422,9 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; + if (!sband) + return false; + if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); @@ -898,6 +904,9 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, return; sdata = vif_to_sdata(vif); + if (info->band >= NUM_NL80211_BANDS) + return; + sband = sdata->local->hw.wiphy->bands[info->band]; if (ieee80211_is_tx_data(skb)) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a27e2af5d569..ba51198be94a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -59,6 +59,9 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; + if (info->band >= NUM_NL80211_BANDS) + return 0; + sband = local->hw.wiphy->bands[info->band]; txrate = &sband->bitrates[tx->rate.idx]; @@ -683,7 +686,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) memset(&txrc, 0, sizeof(txrc)); - sband = tx->local->hw.wiphy->bands[info->band]; + if (info->band < NUM_NL80211_BANDS) + sband = tx->local->hw.wiphy->bands[info->band]; + else + return TX_CONTINUE; len = min_t(u32, tx->skb->len + FCS_LEN, tx->local->hw.wiphy->frag_threshold); @@ -6288,7 +6294,9 @@ void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, enum nl80211_band band; rcu_read_lock(); - if (!ieee80211_vif_is_mld(&sdata->vif)) { + if (sdata->vif.type == NL80211_IFTYPE_NAN) { + band = NUM_NL80211_BANDS; + } else if (!ieee80211_vif_is_mld(&sdata->vif)) { WARN_ON(link_id >= 0); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); -- cgit v1.2.3 From 04f17cfea2442ef2ed01da7ba1f686a58a50048e Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:06 +0300 Subject: wifi: mac80211: Export an API to check if NAN is started So it can be used by drivers to check if NAN Device interface is started or not. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.c69652f77eb6.Ie4f3d197e0706e742e3d97614fadc11b22adfbc6@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/util.c | 8 ++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a5140ecf334b..a55085cf4ec4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7838,4 +7838,10 @@ int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, int n_vifs, enum ieee80211_chanctx_switch_mode mode); +/** + * ieee80211_vif_nan_started - Return whether a NAN vif is started + * @vif: the vif + * Return: %true iff the vif is a NAN interface and NAN is started + */ +bool ieee80211_vif_nan_started(struct ieee80211_vif *vif); #endif /* MAC80211_H */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 9eb35e3b9e52..123842b841f2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -4512,3 +4512,11 @@ void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe) sizeof(tpe->psd_reg_client[i].power)); } } + +bool ieee80211_vif_nan_started(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + return vif->type == NL80211_IFTYPE_NAN && sdata->u.nan.started; +} +EXPORT_SYMBOL_GPL(ieee80211_vif_nan_started); -- cgit v1.2.3 From d0688dc2b172d19e20fdb8be8c37930da12aaf88 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 18 Sep 2025 15:19:11 +1000 Subject: wifi: cfg80211: correctly implement and validate S1G chandef Currently, the S1G channelisation implementation differs from that of VHT, which is the PHY that S1G is based on. The major difference between the clock rate is 1/10th of VHT. However how their channelisation is represented within cfg80211 and mac80211 vastly differ. To rectify this, remove the use of IEEE80211_CHAN_1/2/4.. flags that were previously used to indicate the control channel width, however it should be implied that the control channels are 1MHz in the case of S1G. Additionally, introduce the invert - being IEEE80211_CHAN_NO_4/8/16MHz - that imply the control channel may not be used for a certain bandwidth. With these new flags, we can perform regulatory and chandef validation just as we would for VHT. To deal with the notion that S1G PHYs may contain a 2MHz primary channel, introduce a new variable, s1g_primary_2mhz, which indicates whether we are operating on a 2MHz primary channel. In this case, the chandef::chan points to the 1MHz primary channel pointed to by the primary channel location. Alongside this, introduce some new helper routines that can extract the sibling 1MHz channel. The sibling being the alternate 1MHz primary subchannel within the 2MHz primary channel that is not pointed to by chandef::chan. Furthermore, due to unique restrictions imposed on S1G PHYs, introduce a new flag, IEEE80211_CHAN_S1G_NO_PRIMARY, which states that the 1MHz channel cannot be used as a primary channel. This is assumed to be set by vendors as it is hardware and regdom specific, When we validate a 2MHz primary channel, we need to ensure both 1MHz subchannels do not contain this flag. If one or both of the 1MHz subchannels contain this flag then the 2MHz primary is not permitted for use as a primary channel. Properly integrate S1G channel validation such that it is implemented according with other PHY types such as VHT. Additionally, implement a new S1G-specific regulatory flag to allow cfg80211 to understand specific vendor requirements for S1G PHYs. Signed-off-by: Arien Judge Signed-off-by: Andrew Pope Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250918051913.500781-2-lachlan.hodges@morsemicro.com [remove redundant NL80211_ATTR_S1G_PRIMARY_2MHZ check] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 95 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 15 +++++++ net/wireless/chan.c | 103 +++++++++++++++++++++++++++++-------------- net/wireless/nl80211.c | 37 +++++++++------- net/wireless/reg.c | 76 ++++++++++--------------------- 5 files changed, 225 insertions(+), 101 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 26fd42e189ce..2d612c760dd1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -129,6 +129,13 @@ struct wiphy; * with very low power (VLP), even if otherwise set to NO_IR. * @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel, * even if otherwise set to NO_IR. + * @IEEE80211_CHAN_S1G_NO_PRIMARY: Prevents the channel for use as an S1G + * primary channel. Does not prevent the wider operating channel + * described by the chandef from being used. In order for a 2MHz primary + * to be used, both 1MHz subchannels shall not contain this flag. + * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = BIT(0), @@ -158,6 +165,10 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_CAN_MONITOR = BIT(24), IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP = BIT(25), IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY = BIT(26), + IEEE80211_CHAN_S1G_NO_PRIMARY = BIT(27), + IEEE80211_CHAN_NO_4MHZ = BIT(28), + IEEE80211_CHAN_NO_8MHZ = BIT(29), + IEEE80211_CHAN_NO_16MHZ = BIT(30), }; #define IEEE80211_CHAN_NO_HT40 \ @@ -821,6 +832,9 @@ struct key_params { * @punctured: mask of the punctured 20 MHz subchannels, with * bits turned on being disabled (punctured); numbered * from lower to higher frequency (like in the spec) + * @s1g_primary_2mhz: Indicates if the control channel pointed to + * by 'chan' exists as a 1MHz primary subchannel within an + * S1G 2MHz primary channel. */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -830,6 +844,7 @@ struct cfg80211_chan_def { struct ieee80211_edmg edmg; u16 freq1_offset; u16 punctured; + bool s1g_primary_2mhz; }; /* @@ -990,6 +1005,18 @@ cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef) return chandef->edmg.channels || chandef->edmg.bw_config; } +/** + * cfg80211_chandef_is_s1g - check if chandef represents an S1G channel + * @chandef: the channel definition + * + * Return: %true if S1G. + */ +static inline bool +cfg80211_chandef_is_s1g(const struct cfg80211_chan_def *chandef) +{ + return chandef->chan->band == NL80211_BAND_S1GHZ; +} + /** * cfg80211_chandef_compatible - check if two channel definitions are compatible * @chandef1: first channel definition @@ -10179,4 +10206,72 @@ ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file, void *data); #endif +/** + * cfg80211_s1g_get_start_freq_khz - get S1G chandef start frequency + * @chandef: the chandef to use + * + * Return: the chandefs starting frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_start_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz - bw_mhz * 500 + 500; +} + +/** + * cfg80211_s1g_get_end_freq_khz - get S1G chandef end frequency + * @chandef: the chandef to use + * + * Return: the chandefs ending frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_end_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz + bw_mhz * 500 - 500; +} + +/** + * cfg80211_s1g_get_primary_sibling - retrieve the sibling 1MHz subchannel + * for an S1G chandef using a 2MHz primary channel. + * @wiphy: wiphy the channel belongs to + * @chandef: the chandef to use + * + * When chandef::s1g_primary_2mhz is set to true, we are operating on a 2MHz + * primary channel. The 1MHz subchannel designated by the primary channel + * location exists within chandef::chan, whilst the 'sibling' is denoted as + * being the other 1MHz subchannel that make up the 2MHz primary channel. + * + * Returns: the sibling 1MHz &struct ieee80211_channel, or %NULL on failure. + */ +static inline struct ieee80211_channel * +cfg80211_s1g_get_primary_sibling(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 pri_1mhz_khz, sibling_1mhz_khz, op_low_1mhz_khz, pri_index; + + if (!chandef->s1g_primary_2mhz || width_mhz < 2) + return NULL; + + pri_1mhz_khz = ieee80211_channel_to_khz(chandef->chan); + op_low_1mhz_khz = cfg80211_s1g_get_start_freq_khz(chandef); + + /* + * Compute the index of the primary 1 MHz subchannel within the + * operating channel, relative to the lowest 1 MHz center frequency. + * Flip the least significant bit to select the even/odd sibling, + * then translate that index back into a channel frequency. + */ + pri_index = (pri_1mhz_khz - op_low_1mhz_khz) / 1000; + sibling_1mhz_khz = op_low_1mhz_khz + ((pri_index ^ 1) * 1000); + + return ieee80211_get_channel_khz(wiphy, sibling_1mhz_khz); +} + #endif /* __NET_CFG80211_H */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 423e258cdbd2..8134f10e4e6c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2969,6 +2969,10 @@ enum nl80211_commands { * capabilities supported by the driver. See &enum nl80211_nan_capabilities * for details. * + * @NL80211_ATTR_S1G_PRIMARY_2MHZ: flag attribute indicating that the S1G + * primary channel is 2 MHz wide, and the control channel designates + * the 1 MHz primary subchannel within that 2 MHz primary. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3535,6 +3539,8 @@ enum nl80211_attrs { NL80211_ATTR_NAN_NEW_CLUSTER, NL80211_ATTR_NAN_CAPABILITIES, + NL80211_ATTR_S1G_PRIMARY_2MHZ, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4432,6 +4438,12 @@ enum nl80211_wmm_rule { * very low power (VLP) AP, despite being NO_IR. * @NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY: This channel can be active in * 20 MHz bandwidth, despite being NO_IR. + * @NL80211_FREQUENCY_ATTR_NO_4MHZ: 4 MHz operation is not allowed on this + * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_8MHZ: 8 MHz operation is not allowed on this + * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_16MHZ: 16 MHz operation is not allowed on this + * channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4477,6 +4489,9 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_CAN_MONITOR, NL80211_FREQUENCY_ATTR_ALLOW_6GHZ_VLP_AP, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY, + NL80211_FREQUENCY_ATTR_NO_4MHZ, + NL80211_FREQUENCY_ATTR_NO_8MHZ, + NL80211_FREQUENCY_ATTR_NO_16MHZ, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 193734b7f9dc..68221b1ab45e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -100,6 +100,11 @@ static u32 cfg80211_get_end_freq(const struct cfg80211_chan_def *chandef, punctured = 0) : (punctured >>= 1))) \ if (!(punctured & 1)) +#define for_each_s1g_subchan(chandef, freq_khz) \ + for (freq_khz = cfg80211_s1g_get_start_freq_khz(chandef); \ + freq_khz <= cfg80211_s1g_get_end_freq_khz(chandef); \ + freq_khz += MHZ_TO_KHZ(1)) + struct cfg80211_per_bw_puncturing_values { u8 len; const u16 *valid_values; @@ -336,8 +341,7 @@ static bool cfg80211_valid_center_freq(u32 center, bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) { - u32 control_freq, oper_freq; - int oper_width, control_width; + u32 control_freq, control_freq_khz, start_khz, end_khz; if (!chandef->chan) return false; @@ -363,27 +367,16 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: - if (chandef->chan->band != NL80211_BAND_S1GHZ) - return false; - - control_freq = ieee80211_channel_to_khz(chandef->chan); - oper_freq = ieee80211_chandef_to_khz(chandef); - control_width = nl80211_chan_width_to_mhz( - ieee80211_s1g_channel_width( - chandef->chan)); - oper_width = cfg80211_chandef_get_width(chandef); - - if (oper_width < 0 || control_width < 0) + if (!cfg80211_chandef_is_s1g(chandef)) return false; if (chandef->center_freq2) return false; - if (control_freq + MHZ_TO_KHZ(control_width) / 2 > - oper_freq + MHZ_TO_KHZ(oper_width) / 2) - return false; + control_freq_khz = ieee80211_channel_to_khz(chandef->chan); + start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + end_khz = cfg80211_s1g_get_end_freq_khz(chandef); - if (control_freq - MHZ_TO_KHZ(control_width) / 2 < - oper_freq - MHZ_TO_KHZ(oper_width) / 2) + if (control_freq_khz < start_khz || control_freq_khz > end_khz) return false; break; case NL80211_CHAN_WIDTH_80P80: @@ -461,6 +454,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) !cfg80211_edmg_chandef_valid(chandef)) return false; + if (!cfg80211_chandef_is_s1g(chandef) && chandef->s1g_primary_2mhz) + return false; + return valid_puncturing_bitmap(chandef); } EXPORT_SYMBOL(cfg80211_chandef_valid); @@ -725,6 +721,10 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, { struct ieee80211_channel *c; + /* DFS is not required for S1G */ + if (cfg80211_chandef_is_s1g(chandef)) + return 0; + for_each_subchan(chandef, freq, cf) { c = ieee80211_get_channel_khz(wiphy, freq); if (!c) @@ -1130,6 +1130,55 @@ static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels, return true; } +static bool cfg80211_s1g_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + u32 freq_khz; + const struct ieee80211_channel *chan; + u32 pri_khz = ieee80211_channel_to_khz(chandef->chan); + u32 end_khz = cfg80211_s1g_get_end_freq_khz(chandef); + u32 start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 prohibited_flags = IEEE80211_CHAN_DISABLED; + + if (width_mhz >= 16) + prohibited_flags |= IEEE80211_CHAN_NO_16MHZ; + if (width_mhz >= 8) + prohibited_flags |= IEEE80211_CHAN_NO_8MHZ; + if (width_mhz >= 4) + prohibited_flags |= IEEE80211_CHAN_NO_4MHZ; + + if (chandef->chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + if (pri_khz < start_khz || pri_khz > end_khz) + return false; + + for_each_s1g_subchan(chandef, freq_khz) { + chan = ieee80211_get_channel_khz(wiphy, freq_khz); + if (!chan || (chan->flags & prohibited_flags)) + return false; + } + + if (chandef->s1g_primary_2mhz) { + u32 sib_khz; + const struct ieee80211_channel *sibling; + + sibling = cfg80211_s1g_get_primary_sibling(wiphy, chandef); + if (!sibling) + return false; + + if (sibling->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + sib_khz = ieee80211_channel_to_khz(sibling); + if (sib_khz < start_khz || sib_khz > end_khz) + return false; + } + + return true; +} + bool _cfg80211_chandef_usable(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, u32 prohibited_flags, @@ -1154,6 +1203,9 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) & IEEE80211_VHT_EXT_NSS_BW_CAPABLE; + if (cfg80211_chandef_is_s1g(chandef)) + return cfg80211_s1g_usable(wiphy, chandef); + if (edmg_cap->channels && !cfg80211_edmg_usable(wiphy, chandef->edmg.channels, @@ -1165,21 +1217,6 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { - case NL80211_CHAN_WIDTH_1: - width = 1; - break; - case NL80211_CHAN_WIDTH_2: - width = 2; - break; - case NL80211_CHAN_WIDTH_4: - width = 4; - break; - case NL80211_CHAN_WIDTH_8: - width = 8; - break; - case NL80211_CHAN_WIDTH_16: - width = 16; - break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4e0d40865441..de34a1d14073 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -931,6 +931,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, + [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1319,6 +1320,15 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, nla_put_flag(msg, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_4MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_4MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_8MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_8MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -3541,6 +3551,7 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, chandef->center_freq1 = KHZ_TO_MHZ(control_freq); chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; + chandef->s1g_primary_2mhz = false; if (!chandef->chan) { NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ], @@ -3584,27 +3595,20 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, return -EINVAL; } } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { - chandef->width = - nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (chandef->chan->band == NL80211_BAND_S1GHZ) { - /* User input error for channel width doesn't match channel */ - if (chandef->width != ieee80211_s1g_channel_width(chandef->chan)) { - NL_SET_ERR_MSG_ATTR(extack, - attrs[NL80211_ATTR_CHANNEL_WIDTH], - "bad channel width"); - return -EINVAL; - } - } + chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); - chandef->freq1_offset = - nla_get_u32_default(attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], - 0); + chandef->freq1_offset = nla_get_u32_default( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], 0); } + if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); + + chandef->s1g_primary_2mhz = nla_get_flag( + attrs[NL80211_ATTR_S1G_PRIMARY_2MHZ]); } if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { @@ -10455,8 +10459,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_free; } - /* ignore disabled channels */ + /* Ignore disabled / no primary channels */ if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; @@ -10478,6 +10483,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & + IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 3b0ac3437f81..73cab51f6379 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1707,6 +1707,16 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + if (is_s1g) { + if (max_bandwidth_khz < MHZ_TO_KHZ(16)) + bw_flags |= IEEE80211_CHAN_NO_16MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(8)) + bw_flags |= IEEE80211_CHAN_NO_8MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(4)) + bw_flags |= IEEE80211_CHAN_NO_4MHZ; + return bw_flags; + } + /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, @@ -1717,59 +1727,19 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (is_s1g) { - /* S1G is strict about non overlapping channels. We can - * calculate which bandwidth is allowed per channel by finding - * the largest bandwidth which cleanly divides the freq_range. - */ - int edge_offset; - int ch_bw = max_bandwidth_khz; - - while (ch_bw) { - edge_offset = (center_freq_khz - ch_bw / 2) - - freq_range->start_freq_khz; - if (edge_offset % ch_bw == 0) { - switch (KHZ_TO_MHZ(ch_bw)) { - case 1: - bw_flags |= IEEE80211_CHAN_1MHZ; - break; - case 2: - bw_flags |= IEEE80211_CHAN_2MHZ; - break; - case 4: - bw_flags |= IEEE80211_CHAN_4MHZ; - break; - case 8: - bw_flags |= IEEE80211_CHAN_8MHZ; - break; - case 16: - bw_flags |= IEEE80211_CHAN_16MHZ; - break; - default: - /* If we got here, no bandwidths fit on - * this frequency, ie. band edge. - */ - bw_flags |= IEEE80211_CHAN_DISABLED; - break; - } - break; - } - ch_bw /= 2; - } - } else { - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; - if (max_bandwidth_khz < MHZ_TO_KHZ(80)) - bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(160)) - bw_flags |= IEEE80211_CHAN_NO_160MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(320)) - bw_flags |= IEEE80211_CHAN_NO_320MHZ; - } + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags |= IEEE80211_CHAN_NO_HT40; + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(320)) + bw_flags |= IEEE80211_CHAN_NO_320MHZ; + return bw_flags; } -- cgit v1.2.3 From 31e7681da78d7e8d2d83185c0e640012a018f229 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 18 Sep 2025 15:19:12 +1000 Subject: wifi: mac80211: correctly initialise S1G chandef for STA When moving to the APs channel, ensure we correctly initialise the chandef and perform the required validation. Additionally, if the AP is beaconing on a 2MHz primary, calculate the 2MHz primary center frequency by extracting the sibling 1MHz primary and averaging the frequencies to find the 2MHz primary center frequency. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250918051913.500781-3-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 18 +++++++++++++++- net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/main.c | 6 ++++-- net/mac80211/mlme.c | 53 ++++++++++++++++++++++++++++++++++++++++------ net/mac80211/scan.c | 13 ++++++------ net/mac80211/util.c | 39 ++++++++++++++++++++++++++++------ 6 files changed, 109 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2110345de8ef..ddff9102f633 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1182,6 +1182,18 @@ enum ieee80211_s1g_chanwidth { IEEE80211_S1G_CHANWIDTH_16MHZ = 15, }; +/** + * enum ieee80211_s1g_pri_chanwidth - S1G primary channel widths + * described in IEEE80211-2024 Table 10-39. + * + * @IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: 2MHz primary channel + * @IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: 1MHz primary channel + */ +enum ieee80211_s1g_pri_chanwidth { + IEEE80211_S1G_PRI_CHANWIDTH_2MHZ = 0, + IEEE80211_S1G_PRI_CHANWIDTH_1MHZ = 1, +}; + #define WLAN_SA_QUERY_TR_ID_LEN 2 #define WLAN_MEMBERSHIP_LEN 8 #define WLAN_USER_POSITION_LEN 16 @@ -3170,8 +3182,12 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) -#define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ BIT(0) +#define S1G_OPER_CH_WIDTH_PRIMARY BIT(0) #define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) +#define S1G_OPER_CH_PRIMARY_LOCATION BIT(5) + +#define S1G_2M_PRIMARY_LOCATION_LOWER 0 +#define S1G_2M_PRIMARY_LOCATION_UPPER 1 /* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */ #define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS 0x01 diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 414058bced1a..73fd86ec1bce 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2710,7 +2710,8 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, +bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, + const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef, struct ieee80211_conn_settings *conn); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 27b3ec5deabe..eefa6f7e899b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1249,11 +1249,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (!dflt_chandef.chan) { /* * Assign the first enabled channel to dflt_chandef - * from the list of channels + * from the list of channels. For S1G interfaces + * ensure it can be used as a primary. */ for (i = 0; i < sband->n_channels; i++) if (!(sband->channels[i].flags & - IEEE80211_CHAN_DISABLED)) + (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_S1G_NO_PRIMARY))) break; /* if none found then use the first anyway */ if (i == sband->n_channels) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0e12309accbe..3b5827ea438e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -180,10 +180,11 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, /* get special S1G case out of the way */ if (sband->band == NL80211_BAND_S1GHZ) { - if (!ieee80211_chandef_s1g_oper(elems->s1g_oper, chandef)) { - sdata_info(sdata, - "Missing S1G Operation Element? Trying operating == primary\n"); - chandef->width = ieee80211_s1g_channel_width(channel); + if (!ieee80211_chandef_s1g_oper(sdata->local, elems->s1g_oper, + chandef)) { + /* Fallback to default 1MHz */ + chandef->width = NL80211_CHAN_WIDTH_1; + chandef->s1g_primary_2mhz = false; } return IEEE80211_CONN_MODE_S1G; @@ -1046,6 +1047,14 @@ again: ret = -EINVAL; goto free; } + + chanreq->oper = *ap_chandef; + if (!cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, + IEEE80211_CHAN_DISABLED)) { + ret = -EINVAL; + goto free; + } + return elems; case NL80211_BAND_6GHZ: if (ap_mode < IEEE80211_CONN_MODE_HE) { @@ -7292,6 +7301,38 @@ static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata, return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len); } +static bool +ieee80211_rx_beacon_freq_valid(struct ieee80211_local *local, + struct ieee80211_mgmt *mgmt, + struct ieee80211_rx_status *rx_status, + struct ieee80211_chanctx_conf *chanctx) +{ + u32 pri_2mhz_khz; + struct ieee80211_channel *s1g_sibling_1mhz; + u32 pri_khz = ieee80211_channel_to_khz(chanctx->def.chan); + u32 rx_khz = ieee80211_rx_status_to_khz(rx_status); + + if (rx_khz == pri_khz) + return true; + + if (!chanctx->def.s1g_primary_2mhz) + return false; + + /* + * If we have an S1G interface with a 2MHz primary, beacons are + * sent on the center frequency of the 2MHz primary. Find the sibling + * 1MHz channel and calculate the 2MHz primary center frequency. + */ + s1g_sibling_1mhz = cfg80211_s1g_get_primary_sibling(local->hw.wiphy, + &chanctx->def); + if (!s1g_sibling_1mhz) + return false; + + pri_2mhz_khz = + (pri_khz + ieee80211_channel_to_khz(s1g_sibling_1mhz)) / 2; + return rx_khz == pri_2mhz_khz; +} + static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_hdr *hdr, size_t len, struct ieee80211_rx_status *rx_status) @@ -7346,8 +7387,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, return; } - if (ieee80211_rx_status_to_khz(rx_status) != - ieee80211_channel_to_khz(chanctx_conf->def.chan)) { + if (!ieee80211_rx_beacon_freq_valid(local, mgmt, rx_status, + chanctx_conf)) { rcu_read_unlock(); return; } diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index dbf98aa4cd67..bb9563f50e7b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -996,15 +996,15 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; - /* For scanning on the S1G band, detect the channel width according to - * the channel being scanned. - */ + /* For S1G, only scan the 1MHz primaries. */ if (chan->band == NL80211_BAND_S1GHZ) { - local->scan_chandef.width = ieee80211_s1g_channel_width(chan); + local->scan_chandef.width = NL80211_CHAN_WIDTH_1; + local->scan_chandef.s1g_primary_2mhz = false; goto set_channel; } - /* If scanning on oper channel, use whatever channel-type + /* + * If scanning on oper channel, use whatever channel-type * is currently in use. */ if (chan == local->hw.conf.chandef.chan) @@ -1213,7 +1213,8 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band] || - band == NL80211_BAND_6GHZ) + band == NL80211_BAND_6GHZ || + band == NL80211_BAND_S1GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 123842b841f2..c9931537f9d2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3199,10 +3199,11 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, return true; } -bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, +bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, + const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef) { - u32 oper_freq; + u32 oper_khz, pri_1mhz_khz, pri_2mhz_khz; if (!oper) return false; @@ -3227,12 +3228,36 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, return false; } - oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch, - NL80211_BAND_S1GHZ); - chandef->center_freq1 = KHZ_TO_MHZ(oper_freq); - chandef->freq1_offset = oper_freq % 1000; + chandef->s1g_primary_2mhz = false; - return true; + switch (u8_get_bits(oper->ch_width, S1G_OPER_CH_WIDTH_PRIMARY)) { + case IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: + pri_1mhz_khz = ieee80211_channel_to_freq_khz( + oper->primary_ch, NL80211_BAND_S1GHZ); + break; + case IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: + chandef->s1g_primary_2mhz = true; + pri_2mhz_khz = ieee80211_channel_to_freq_khz( + oper->primary_ch, NL80211_BAND_S1GHZ); + + if (u8_get_bits(oper->ch_width, S1G_OPER_CH_PRIMARY_LOCATION) == + S1G_2M_PRIMARY_LOCATION_LOWER) + pri_1mhz_khz = pri_2mhz_khz - 500; + else + pri_1mhz_khz = pri_2mhz_khz + 500; + break; + default: + return false; + } + + oper_khz = ieee80211_channel_to_freq_khz(oper->oper_ch, + NL80211_BAND_S1GHZ); + chandef->center_freq1 = KHZ_TO_MHZ(oper_khz); + chandef->freq1_offset = oper_khz % 1000; + chandef->chan = + ieee80211_get_channel_khz(local->hw.wiphy, pri_1mhz_khz); + + return chandef->chan; } int ieee80211_put_srates_elem(struct sk_buff *skb, -- cgit v1.2.3 From cbcd507f01deb983d5cad0a25b6495930ab59593 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 18 Sep 2025 15:19:13 +1000 Subject: wifi: cfg80211: remove ieee80211_s1g_channel_width With the introduction of proper S1G channel flags, this function is no longer used. Remove it. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250918051913.500781-4-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 ---------- net/wireless/util.c | 27 --------------------------- 2 files changed, 37 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2d612c760dd1..1c041ce7a03b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6818,16 +6818,6 @@ ieee80211_channel_to_khz(const struct ieee80211_channel *chan) return MHZ_TO_KHZ(chan->center_freq) + chan->freq_offset; } -/** - * ieee80211_s1g_channel_width - get allowed channel width from @chan - * - * Only allowed for band NL80211_BAND_S1GHZ - * @chan: channel - * Return: The allowed channel width for this center_freq - */ -enum nl80211_chan_width -ieee80211_s1g_channel_width(const struct ieee80211_channel *chan); - /** * ieee80211_channel_to_freq_khz - convert channel number to frequency * @chan: channel number diff --git a/net/wireless/util.c b/net/wireless/util.c index d12d49134c88..f26440d18ad3 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -106,33 +106,6 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); -enum nl80211_chan_width -ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) -{ - if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) - return NL80211_CHAN_WIDTH_20_NOHT; - - /*S1G defines a single allowed channel width per channel. - * Extract that width here. - */ - if (chan->flags & IEEE80211_CHAN_1MHZ) - return NL80211_CHAN_WIDTH_1; - else if (chan->flags & IEEE80211_CHAN_2MHZ) - return NL80211_CHAN_WIDTH_2; - else if (chan->flags & IEEE80211_CHAN_4MHZ) - return NL80211_CHAN_WIDTH_4; - else if (chan->flags & IEEE80211_CHAN_8MHZ) - return NL80211_CHAN_WIDTH_8; - else if (chan->flags & IEEE80211_CHAN_16MHZ) - return NL80211_CHAN_WIDTH_16; - - pr_err("unknown channel width for channel at %dKHz?\n", - ieee80211_channel_to_khz(chan)); - - return NL80211_CHAN_WIDTH_1; -} -EXPORT_SYMBOL(ieee80211_s1g_channel_width); - int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ -- cgit v1.2.3 From e1b849cfa6b61f1c866a908c9e8dd9b5aaab820b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Apr 2025 17:12:59 +0200 Subject: writeback: Avoid contention on wb->list_lock when switching inodes There can be multiple inode switch works that are trying to switch inodes to / from the same wb. This can happen in particular if some cgroup exits which owns many (thousands) inodes and we need to switch them all. In this case several inode_switch_wbs_work_fn() instances will be just spinning on the same wb->list_lock while only one of them makes forward progress. This wastes CPU cycles and quickly leads to softlockup reports and unusable system. Instead of running several inode_switch_wbs_work_fn() instances in parallel switching to the same wb and contending on wb->list_lock, run just one work item per wb and manage a queue of isw items switching to this wb. Acked-by: Tejun Heo Signed-off-by: Jan Kara --- fs/fs-writeback.c | 99 +++++++++++++++++++++++++--------------- include/linux/backing-dev-defs.h | 4 ++ include/linux/writeback.h | 2 + mm/backing-dev.c | 5 ++ 4 files changed, 74 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641..b0e9092ccf04 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct rcu_work work; + /* List of queued switching contexts for the wb */ + struct llist_node list; /* * Multiple inodes can be switched at once. The switching procedure @@ -378,7 +379,6 @@ struct inode_switch_wbs_context { * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ - struct bdi_writeback *new_wb; struct inode *inodes[]; }; @@ -486,13 +486,11 @@ skip_switch: return switched; } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static void process_inode_switch_wbs(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw) { - struct inode_switch_wbs_context *isw = - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; struct inode **inodep; @@ -543,6 +541,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) atomic_dec(&isw_nr_in_flight); } +void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, + switch_work); + struct inode_switch_wbs_context *isw, *next_isw; + struct llist_node *list; + + /* + * Grab out reference to wb so that it cannot get freed under us + * after we process all the isw items. + */ + wb_get(new_wb); + while (1) { + list = llist_del_all(&new_wb->switch_wbs_ctxs); + /* Nothing to do? */ + if (!list) + break; + /* + * In addition to synchronizing among switchers, I_WB_SWITCH + * tells the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be + * visible. + */ + synchronize_rcu(); + + llist_for_each_entry_safe(isw, next_isw, list, list) + process_inode_switch_wbs(new_wb, isw); + } + wb_put(new_wb); +} + static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { @@ -572,6 +602,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode, return true; } +static void wb_queue_isw(struct bdi_writeback *wb, + struct inode_switch_wbs_context *isw) +{ + if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) + queue_work(isw_wq, &wb->switch_work); +} + /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -585,6 +622,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) @@ -609,40 +647,34 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!memcg_css) goto out_free; - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); - if (!isw->new_wb) + if (!new_wb) goto out_free; - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) goto out_free; isw->inodes[0] = inode; - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + wb_queue_isw(new_wb, isw); return; out_free: atomic_dec(&isw_nr_in_flight); - if (isw->new_wb) - wb_put(isw->new_wb); + if (new_wb) + wb_put(new_wb); kfree(isw); } -static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, +static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) continue; isw->inodes[*nr] = inode; @@ -666,6 +698,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb; int nr; bool restart = false; @@ -678,12 +711,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) for (memcg_css = wb->memcg_css->parent; memcg_css; memcg_css = memcg_css->parent) { - isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); - if (isw->new_wb) + new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (new_wb) break; } - if (unlikely(!isw->new_wb)) - isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + if (unlikely(!new_wb)) + new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; spin_lock(&wb->list_lock); @@ -695,27 +728,21 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * bandwidth restrictions, as writeback of inode metadata is not * accounted for. */ - restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr); if (!restart) - restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, + &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ if (nr == 0) { atomic_dec(&isw_nr_in_flight); - wb_put(isw->new_wb); + wb_put(new_wb); kfree(isw); return restart; } - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + wb_queue_isw(new_wb, isw); return restart; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 2ad261082bba..c5c9d89c73ed 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -152,6 +152,10 @@ struct bdi_writeback { struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ struct list_head offline_node; /* anchored at offline_cgwbs */ + struct work_struct switch_work; /* work used to perform inode switching + * to this wb */ + struct llist_head switch_wbs_ctxs; /* queued contexts for + * writeback switching */ union { struct work_struct release_work; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a2848d731a46..15a4bc4ab819 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } +void inode_switch_wbs_work_fn(struct work_struct *work); + #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..0beaca6bacf7 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work) wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); + WARN_ON_ONCE(work_pending(&wb->switch_work)); call_rcu(&wb->rcu, cgwb_free_rcu); } @@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi, wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); + INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn); + init_llist_head(&wb->switch_wbs_ctxs); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); @@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; + INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn); + init_llist_head(&bdi->wb.switch_wbs_ctxs); } return ret; } -- cgit v1.2.3 From 0cee64c547e3c9cda646af3e075a64f445ee8148 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Sep 2025 12:38:38 +0200 Subject: writeback: Add tracepoint to track pending inode switches Add trace_inode_switch_wbs_queue tracepoint to allow insight into how many inodes are queued to switch their bdi_writeback structure. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 2 ++ include/trace/events/writeback.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index af5f396449f1..52129267e3bd 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -667,6 +667,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) isw->inodes[0] = inode; + trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1); wb_queue_isw(new_wb, isw); return; @@ -752,6 +753,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) return restart; } + trace_inode_switch_wbs_queue(wb, new_wb, nr); wb_queue_isw(new_wb, isw); return restart; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 1e23919c0da9..c08aff044e80 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history, ) ); +TRACE_EVENT(inode_switch_wbs_queue, + + TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb, + unsigned int count), + + TP_ARGS(old_wb, new_wb, count), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(ino_t, old_cgroup_ino) + __field(ino_t, new_cgroup_ino) + __field(unsigned int, count) + ), + + TP_fast_assign( + strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); + __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); + __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); + __entry->count = count; + ), + + TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u", + __entry->name, + (unsigned long)__entry->old_cgroup_ino, + (unsigned long)__entry->new_cgroup_ino, + __entry->count + ) +); + TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, -- cgit v1.2.3 From e3e1812f8e25ac277f5cc9249802365300c582e3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:28 +0200 Subject: ns: move to_ns_common() to ns_common.h Move the helper to ns_common.h where it belongs. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 20 ++++++++++++++++++++ include/linux/nsproxy.h | 11 ----------- 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7d22ea50b098..bc2e0758e1c9 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -6,6 +6,15 @@ struct proc_ns_operations; +struct cgroup_namespace; +struct ipc_namespace; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct time_namespace; +struct user_namespace; +struct uts_namespace; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -13,4 +22,15 @@ struct ns_common { refcount_t count; }; +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns) + #endif diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a22..e6bec522b139 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -42,17 +42,6 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns->ns), \ - struct ipc_namespace *: &(__ns->ns), \ - struct net *: &(__ns->ns), \ - struct pid_namespace *: &(__ns->ns), \ - struct mnt_namespace *: &(__ns->ns), \ - struct time_namespace *: &(__ns->ns), \ - struct user_namespace *: &(__ns->ns), \ - struct uts_namespace *: &(__ns->ns)) - /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. -- cgit v1.2.3 From 9296f46a9645cf753d2522093485cebe77635aa6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:29 +0200 Subject: nsfs: add nsfs.h header And move the stuff out from proc_ns.h where it really doesn't belong. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/nsfs.h | 26 ++++++++++++++++++++++++++ include/linux/proc_ns.h | 13 +------------ 2 files changed, 27 insertions(+), 12 deletions(-) create mode 100644 include/linux/nsfs.h (limited to 'include') diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h new file mode 100644 index 000000000000..fb84aa538091 --- /dev/null +++ b/include/linux/nsfs.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner */ + +#ifndef _LINUX_NSFS_H +#define _LINUX_NSFS_H + +#include + +struct path; +struct task_struct; +struct proc_ns_operations; + +int ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +typedef struct ns_common *ns_get_path_helper_t(void *); +int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, + void *private_data); + +bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); + +int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +void nsfs_init(void); + +#endif /* _LINUX_NSFS_H */ + diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 4b20375f3783..5e1a4b378b79 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -5,7 +5,7 @@ #ifndef _LINUX_PROC_NS_H #define _LINUX_PROC_NS_H -#include +#include #include struct pid_namespace; @@ -75,16 +75,5 @@ static inline int ns_alloc_inum(struct ns_common *ns) #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) -extern int ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -typedef struct ns_common *ns_get_path_helper_t(void *); -extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, - void *private_data); - -extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); - -extern int ns_get_name(char *buf, size_t size, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -extern void nsfs_init(void); #endif /* _LINUX_PROC_NS_H */ -- cgit v1.2.3 From 660def10b01b248fd97255afacb7b0e305ac833a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:30 +0200 Subject: ns: uniformly initialize ns_common No point in cargo-culting the same code across all the different types. Use one common initializer. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 5e1a4b378b79..dbb119bda097 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -72,6 +72,22 @@ static inline int ns_alloc_inum(struct ns_common *ns) return proc_alloc_inum(&ns->inum); } +static inline int ns_common_init(struct ns_common *ns, + const struct proc_ns_operations *ops, + bool alloc_inum) +{ + if (alloc_inum) { + int ret; + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + } + refcount_set(&ns->count, 1); + ns->stashed = NULL; + ns->ops = ops; + return 0; +} + #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) -- cgit v1.2.3 From 86c5aba210b145d7de011a5abaf9b785aa70a183 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:39 +0200 Subject: ns: remove ns_alloc_inum() It's now unused. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index dbb119bda097..e50d312f9fee 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,12 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -static inline int ns_alloc_inum(struct ns_common *ns) -{ - WRITE_ONCE(ns->stashed, NULL); - return proc_alloc_inum(&ns->inum); -} - static inline int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, bool alloc_inum) -- cgit v1.2.3 From 885fc8ac0a4dc70f5d87b80b0977292870e35c60 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:40 +0200 Subject: nstree: make iterator generic Move the namespace iteration infrastructure originally introduced for mount namespaces into a generic library usable by all namespace types. Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 9 ++ include/linux/nstree.h | 91 ++++++++++++++++++ include/linux/proc_ns.h | 3 + kernel/Makefile | 2 +- kernel/nstree.c | 233 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 include/linux/nstree.h create mode 100644 kernel/nstree.c (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index bc2e0758e1c9..7224072cccc5 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -3,6 +3,7 @@ #define _LINUX_NS_COMMON_H #include +#include struct proc_ns_operations; @@ -20,6 +21,14 @@ struct ns_common { const struct proc_ns_operations *ops; unsigned int inum; refcount_t count; + union { + struct { + u64 ns_id; + struct rb_node ns_tree_node; + struct list_head ns_list_node; + }; + struct rcu_head ns_rcu; + }; }; #define to_ns_common(__ns) \ diff --git a/include/linux/nstree.h b/include/linux/nstree.h new file mode 100644 index 000000000000..29ad6402260c --- /dev/null +++ b/include/linux/nstree.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NSTREE_H +#define _LINUX_NSTREE_H + +#include +#include +#include +#include +#include +#include + +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + +extern struct ns_tree cgroup_ns_tree; +extern struct ns_tree ipc_ns_tree; +extern struct ns_tree mnt_ns_tree; +extern struct ns_tree net_ns_tree; +extern struct ns_tree pid_ns_tree; +extern struct ns_tree time_ns_tree; +extern struct ns_tree user_ns_tree; +extern struct ns_tree uts_ns_tree; + +#define to_ns_tree(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(cgroup_ns_tree), \ + struct ipc_namespace *: &(ipc_ns_tree), \ + struct net *: &(net_ns_tree), \ + struct pid_namespace *: &(pid_ns_tree), \ + struct mnt_namespace *: &(mnt_ns_tree), \ + struct time_namespace *: &(time_ns_tree), \ + struct user_namespace *: &(user_ns_tree), \ + struct uts_namespace *: &(uts_ns_tree)) + +u64 ns_tree_gen_id(struct ns_common *ns); +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, + bool previous); + +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) +{ + ns_tree_gen_id(ns); + __ns_tree_add_raw(ns, ns_tree); +} + +/** + * ns_tree_add_raw - Add a namespace to a namespace + * @ns: Namespace to add + * + * This function adds a namespace to the appropriate namespace tree + * without assigning a id. + */ +#define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_add - Add a namespace to a namespace tree + * @ns: Namespace to add + * + * This function assigns a new id to the namespace and adds it to the + * appropriate namespace tree and list. + */ +#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_remove - Remove a namespace from a namespace tree + * @ns: Namespace to remove + * + * This function removes a namespace from the appropriate namespace + * tree and list. + */ +#define ns_tree_remove(__ns) __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns)) + +#define ns_tree_adjoined_rcu(__ns, __previous) \ + __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) + +#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) + +#endif /* _LINUX_NSTREE_H */ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index e50d312f9fee..7f89f0829e60 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -79,6 +79,9 @@ static inline int ns_common_init(struct ns_common *ns, refcount_set(&ns->count, 1); ns->stashed = NULL; ns->ops = ops; + ns->ns_id = 0; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); return 0; } diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235..b807516a1b43 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ + kthread.o sys_ni.o nsproxy.o nstree.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o diff --git a/kernel/nstree.c b/kernel/nstree.c new file mode 100644 index 000000000000..bbe8bedc924c --- /dev/null +++ b/kernel/nstree.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +struct ns_tree mnt_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), + .type = CLONE_NEWNS, +}; + +struct ns_tree net_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), + .type = CLONE_NEWNET, +}; +EXPORT_SYMBOL_GPL(net_ns_tree); + +struct ns_tree uts_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), + .type = CLONE_NEWUTS, +}; + +struct ns_tree user_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), + .type = CLONE_NEWUSER, +}; + +struct ns_tree ipc_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), + .type = CLONE_NEWIPC, +}; + +struct ns_tree pid_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), + .type = CLONE_NEWPID, +}; + +struct ns_tree cgroup_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), + .type = CLONE_NEWCGROUP, +}; + +struct ns_tree time_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), + .type = CLONE_NEWTIME, +}; + +DEFINE_COOKIE(namespace_cookie); + +static inline struct ns_common *node_to_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_tree_node); +} + +static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct ns_common *ns_a = node_to_ns(a); + struct ns_common *ns_b = node_to_ns(b); + u64 ns_id_a = ns_a->ns_id; + u64 ns_id_b = ns_b->ns_id; + + if (ns_id_a < ns_id_b) + return -1; + if (ns_id_a > ns_id_b) + return 1; + return 0; +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +{ + struct rb_node *node, *prev; + + VFS_WARN_ON_ONCE(!ns->ns_id); + + write_seqlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + + node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); + /* + * If there's no previous entry simply add it after the + * head and if there is add it after the previous entry. + */ + prev = rb_prev(&ns->ns_tree_node); + if (!prev) + list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); + else + list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + + write_sequnlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(node); +} + +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +{ + VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); + VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + + write_seqlock(&ns_tree->ns_tree_lock); + rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); + list_bidir_del_rcu(&ns->ns_list_node); + RB_CLEAR_NODE(&ns->ns_tree_node); + write_sequnlock(&ns_tree->ns_tree_lock); +} +EXPORT_SYMBOL_GPL(__ns_tree_remove); + +static int ns_find(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns(node); + + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + + +static struct ns_tree *ns_tree_from_type(int ns_type) +{ + switch (ns_type) { + case CLONE_NEWCGROUP: + return &cgroup_ns_tree; + case CLONE_NEWIPC: + return &ipc_ns_tree; + case CLONE_NEWNS: + return &mnt_ns_tree; + case CLONE_NEWNET: + return &net_ns_tree; + case CLONE_NEWPID: + return &pid_ns_tree; + case CLONE_NEWUSER: + return &user_ns_tree; + case CLONE_NEWUTS: + return &uts_ns_tree; + case CLONE_NEWTIME: + return &time_ns_tree; + } + + return NULL; +} + +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree *ns_tree; + struct rb_node *node; + unsigned int seq; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + + do { + seq = read_seqbegin(&ns_tree->ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + if (node) + break; + } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + + if (!node) + return NULL; + + VFS_WARN_ON_ONCE(node_to_ns(node)->ops->type != ns_type); + + return node_to_ns(node); +} + +/** + * ns_tree_adjoined_rcu - find the next/previous namespace in the same + * tree + * @ns: namespace to start from + * @previous: if true find the previous namespace, otherwise the next + * + * Find the next or previous namespace in the same tree as @ns. If + * there is no next/previous namespace, -ENOENT is returned. + */ +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, bool previous) +{ + struct list_head *list; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); + + if (previous) + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + else + list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); + if (list_is_head(list, &ns_tree->ns_list)) + return ERR_PTR(-ENOENT); + + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ops->type != ns_tree->type); + + return list_entry_rcu(list, struct ns_common, ns_list_node); +} + +/** + * ns_tree_gen_id - generate a new namespace id + * @ns: namespace to generate id for + * + * Generates a new namespace id and assigns it to the namespace. All + * namespaces types share the same id space and thus can be compared + * directly. IOW, when two ids of two namespace are equal, they are + * identical. + */ +u64 ns_tree_gen_id(struct ns_common *ns) +{ + guard(preempt)(); + ns->ns_id = gen_cookie_next(&namespace_cookie); + return ns->ns_id; +} -- cgit v1.2.3 From b36c823b9a4be5b0c8e38c3fd60cade7d41c216c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:46 +0200 Subject: time: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Reviewed-by: Thomas Gleixner Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 5 +++++ init/main.c | 2 ++ kernel/time/namespace.c | 11 ++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc94..7f6af7a9771e 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,7 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); @@ -108,6 +109,10 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #else +static inline void __init time_ns_init(void) +{ +} + static inline int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { diff --git a/init/main.c b/init/main.c index 0ee0ee7b7c2c..e7d2c57c65a7 100644 --- a/init/main.c +++ b/init/main.c @@ -103,6 +103,7 @@ #include #include #include +#include #include #include @@ -1072,6 +1073,7 @@ void start_kernel(void) fork_init(); proc_caches_init(); uts_ns_init(); + time_ns_init(); key_init(); security_init(); dbg_late_init(); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0be93d8f2896..408f60d0a3b6 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: @@ -250,11 +252,13 @@ out: void free_time_ns(struct time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); __free_page(ns->vvar_page); - kfree(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static struct time_namespace *to_time_ns(struct ns_common *ns) @@ -487,3 +491,8 @@ struct time_namespace init_time_ns = { .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} -- cgit v1.2.3 From d7afdf889561058068ab46fd8f306c70ef29216a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:49 +0200 Subject: ns: add to__ns() to respective headers Every namespace type has a container_of(ns, , ns) static inline function that is currently not exposed in the header. So we have a bunch of places that open-code it via container_of(). Move it to the headers so we can use it directly. Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 5 +++++ include/linux/ipc_namespace.h | 5 +++++ include/linux/pid_namespace.h | 5 +++++ include/linux/time_namespace.h | 4 ++++ include/linux/user_namespace.h | 5 +++++ include/linux/utsname.h | 5 +++++ include/net/net_namespace.h | 5 +++++ ipc/namespace.c | 5 ----- kernel/cgroup/namespace.c | 5 ----- kernel/pid_namespace.c | 5 ----- kernel/time/namespace.c | 5 ----- kernel/user_namespace.c | 5 ----- kernel/utsname.c | 5 ----- net/core/net_namespace.c | 5 ----- 14 files changed, 34 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..9ca25346f7cb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -794,6 +794,11 @@ extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611a..924e4754374f 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,6 +129,11 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) +static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) +{ + return container_of(ns, struct ipc_namespace, ns); +} + extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..ba0efc8c8596 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -54,6 +54,11 @@ extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 7f6af7a9771e..a47a4ce4183e 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,10 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +static inline struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a0bb6d012137..a09056ad090e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -168,6 +168,11 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412b..5d34c4f0f945 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -30,6 +30,11 @@ struct uts_namespace { extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + static inline void get_uts_ns(struct uts_namespace *ns) { refcount_inc(&ns->ns.count); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275..fd090ceb80bf 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -262,6 +262,11 @@ void ipx_unregister_sysctl(void); #ifdef CONFIG_NET_NS void __put_net(struct net *net); +static inline struct net *to_net_ns(struct ns_common *ns) +{ + return container_of(ns, struct net, ns); +} + /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { diff --git a/ipc/namespace.c b/ipc/namespace.c index 9f923c1a1eb3..89588819956b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -209,11 +209,6 @@ void put_ipc_ns(struct ipc_namespace *ns) } } -static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) -{ - return container_of(ns, struct ipc_namespace, ns); -} - static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index fc12c416dfeb..5a327914b565 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -89,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 228ae20299f9..9b327420309e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -345,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 408f60d0a3b6..20b65f90549e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -261,11 +261,6 @@ void free_time_ns(struct time_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); -} - static struct ns_common *timens_get(struct task_struct *task) { struct time_namespace *ns = NULL; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ade5b6806c5c..cfb0e28f2779 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1325,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns) } EXPORT_SYMBOL(current_in_userns); -static inline struct user_namespace *to_user_ns(struct ns_common *ns) -{ - return container_of(ns, struct user_namespace, ns); -} - static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; diff --git a/kernel/utsname.c b/kernel/utsname.c index 64155417ae0c..a682830742d3 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -103,11 +103,6 @@ void free_uts_ns(struct uts_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); -} - static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 169ec22c4758..a57b3cda8dbc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1541,11 +1541,6 @@ static struct ns_common *netns_get(struct task_struct *task) return net ? &net->ns : NULL; } -static inline struct net *to_net_ns(struct ns_common *ns) -{ - return container_of(ns, struct net, ns); -} - static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); -- cgit v1.2.3 From d2afdb73f8ad77b49eca9d110d0c54bf30d1df0f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:50 +0200 Subject: nsfs: add current_in_namespace() Add a helper to easily check whether a given namespace is the caller's current namespace. This is currently open-coded in a lot of places. Simply switch on the type and compare the results. Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/nsfs.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h index fb84aa538091..e5a5fa83d36b 100644 --- a/include/linux/nsfs.h +++ b/include/linux/nsfs.h @@ -5,6 +5,8 @@ #define _LINUX_NSFS_H #include +#include +#include struct path; struct task_struct; @@ -22,5 +24,17 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops); void nsfs_init(void); -#endif /* _LINUX_NSFS_H */ +#define __current_namespace_from_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: current->nsproxy->cgroup_ns, \ + struct ipc_namespace *: current->nsproxy->ipc_ns, \ + struct net *: current->nsproxy->net_ns, \ + struct pid_namespace *: task_active_pid_ns(current), \ + struct mnt_namespace *: current->nsproxy->mnt_ns, \ + struct time_namespace *: current->nsproxy->time_ns, \ + struct user_namespace *: current_user_ns(), \ + struct uts_namespace *: current->nsproxy->uts_ns) + +#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) +#endif /* _LINUX_NSFS_H */ -- cgit v1.2.3 From 5222470b2fbb3740f931f189db33dd1367b1ae75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:51 +0200 Subject: nsfs: support file handles A while ago we added support for file handles to pidfs so pidfds can be encoded and decoded as file handles. Userspace has adopted this quickly and it's proven very useful. Implement file handles for namespaces as well. A process is not always able to open /proc/self/ns/. That requires procfs to be mounted and for /proc/self/ or /proc/self/ns/ to not be overmounted. However, userspace can always derive a namespace fd from a pidfd. And that always works for a task's own namespace. There's no need to introduce unnecessary behavioral differences between /proc/self/ns/ fds, pidfd-derived namespace fds, and file-handle-derived namespace fds. So namespace file handles are always decodable if the caller is located in the namespace the file handle refers to. This also allows a task to e.g., store a set of file handles to its namespaces in a file on-disk so it can verify when it gets rexeced that they're still valid and so on. This is akin to the pidfd use-case. Or just plainly for namespace comparison reasons where a file handle to the task's own namespace can be easily compared against others. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/nsfs.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/exportfs.h | 6 ++ include/uapi/linux/nsfs.h | 9 +++ 3 files changed, 173 insertions(+) (limited to 'include') diff --git a/fs/nsfs.c b/fs/nsfs.c index 80e631aeb3ce..926e2680414e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,6 +13,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "mount.h" #include "internal.h" @@ -417,12 +423,164 @@ static const struct stashed_operations nsfs_stashed_ops = { .put_data = nsfs_put_data, }; +#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) +#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) + +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct ns_common *ns = inode->i_private; + int len = *max_len; + + if (parent) + return FILEID_INVALID; + + if (len < NSFS_FID_SIZE_U32_VER0) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + return FILEID_INVALID; + } else if (len > NSFS_FID_SIZE_U32_LATEST) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + } + + fid->ns_id = ns->ns_id; + fid->ns_type = ns->ops->type; + fid->ns_inum = inode->i_ino; + return FILEID_NSFS; +} + +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct path path __free(path_put) = {}; + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct user_namespace *owning_ns = NULL; + struct ns_common *ns; + int ret; + + if (fh_len < NSFS_FID_SIZE_U32_VER0) + return NULL; + + /* Check that any trailing bytes are zero. */ + if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && + memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, + fh_len - NSFS_FID_SIZE_U32_LATEST)) + return NULL; + + switch (fh_type) { + case FILEID_NSFS: + break; + default: + return NULL; + } + + scoped_guard(rcu) { + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); + if (!ns) + return NULL; + + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); + VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (!refcount_inc_not_zero(&ns->count)) + return NULL; + } + + switch (ns->ops->type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + if (!current_in_namespace(to_cg_ns(ns))) + owning_ns = to_cg_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + if (!current_in_namespace(to_ipc_ns(ns))) + owning_ns = to_ipc_ns(ns)->user_ns; + break; +#endif + case CLONE_NEWNS: + if (!current_in_namespace(to_mnt_ns(ns))) + owning_ns = to_mnt_ns(ns)->user_ns; + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + if (!current_in_namespace(to_net_ns(ns))) + owning_ns = to_net_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + if (!current_in_namespace(to_pid_ns(ns))) { + owning_ns = to_pid_ns(ns)->user_ns; + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + if (!current_in_namespace(to_time_ns(ns))) + owning_ns = to_time_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + if (!current_in_namespace(to_user_ns(ns))) + owning_ns = to_user_ns(ns); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + if (!current_in_namespace(to_uts_ns(ns))) + owning_ns = to_uts_ns(ns)->user_ns; + break; +#endif + default: + return ERR_PTR(-EOPNOTSUPP); + } + + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + + /* path_from_stashed() unconditionally consumes the reference. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ERR_PTR(ret); + + return no_free_ptr(path.dentry); +} + +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + /* nsfs_fh_to_dentry() performs all permission checks. */ + return 0; +} + +static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +{ + return file_open_root(path, "", oflags, 0); +} + +static const struct export_operations nsfs_export_operations = { + .encode_fh = nsfs_encode_fh, + .fh_to_dentry = nsfs_fh_to_dentry, + .open = nsfs_export_open, + .permission = nsfs_export_permission, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &nsfs_ops; + ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cfb0dd1ea49c..3aac58a520c7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -122,6 +122,12 @@ enum fid_type { FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, FILEID_BCACHEFS_WITH_PARENT = 0xb2, + /* + * + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. + */ + FILEID_NSFS = 0xf1, + /* * 64 bit unique kernfs id */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 97d8d80d139f..fa86fe3c8bd3 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -53,4 +53,13 @@ enum init_ns_ino { MNT_NS_INIT_INO = 0xEFFFFFF8U, }; +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; +}; + +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From e83f0b5d10dcf62833008327cb661c7d118bca85 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:52 +0200 Subject: nsfs: support exhaustive file handles Pidfd file handles are exhaustive meaning they don't require a handle on another pidfd to pass to open_by_handle_at() so it can derive the filesystem to decode in. Instead it can be derived from the file handle itself. The same is possible for namespace file handles. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/fhandle.c | 6 ++++++ fs/internal.h | 1 + fs/nsfs.c | 10 ++++++++++ include/uapi/linux/fcntl.h | 1 + 4 files changed, 18 insertions(+) (limited to 'include') diff --git a/fs/fhandle.c b/fs/fhandle.c index 68a7d2861c58..dd734d8828d0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root) return 0; } + if (fd == FD_NSFS_ROOT) { + nsfs_get_root(root); + return 0; + } + return -EBADF; } diff --git a/fs/internal.h b/fs/internal.h index 38e8aab27bbd..a33d18ee5b74 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); void pidfs_get_root(struct path *path); +void nsfs_get_root(struct path *path); diff --git a/fs/nsfs.c b/fs/nsfs.c index 926e2680414e..22765fcab18e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -25,6 +25,14 @@ static struct vfsmount *nsfs_mnt; +static struct path nsfs_root_path = {}; + +void nsfs_get_root(struct path *path) +{ + *path = nsfs_root_path; + path_get(path); +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); static const struct file_operations ns_file_operations = { @@ -598,4 +606,6 @@ void __init nsfs_init(void) if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; + nsfs_root_path.mnt = nsfs_mnt; + nsfs_root_path.dentry = nsfs_mnt->mnt_root; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index f291ab4f94eb..3741ea1b73d8 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -111,6 +111,7 @@ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ #define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ -- cgit v1.2.3 From f861225b9ee9cb2da1c7b2f5f921856cb8ca86bb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:53 +0200 Subject: nsfs: add missing id retrieval support The mount namespace has supported id retrieval for a while already. Add support for the other types as well. Signed-off-by: Christian Brauner --- fs/nsfs.c | 25 +++++++++++++------------ include/uapi/linux/nsfs.h | 6 ++++-- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/nsfs.c b/fs/nsfs.c index 22765fcab18e..8484bc4dd3de 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -177,6 +177,7 @@ static bool nsfs_ioctl_valid(unsigned int cmd) case NS_GET_TGID_FROM_PIDNS: case NS_GET_PID_IN_PIDNS: case NS_GET_TGID_IN_PIDNS: + case NS_GET_ID: return true; } @@ -226,18 +227,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, argp = (uid_t __user *) arg; uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); - case NS_GET_MNTNS_ID: { - __u64 __user *idp; - __u64 id; - - if (ns->ops->type != CLONE_NEWNS) - return -EINVAL; - - mnt_ns = container_of(ns, struct mnt_namespace, ns); - idp = (__u64 __user *)arg; - id = mnt_ns->ns.ns_id; - return put_user(id, idp); - } case NS_GET_PID_FROM_PIDNS: fallthrough; case NS_GET_TGID_FROM_PIDNS: @@ -283,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, ret = -ESRCH; return ret; } + case NS_GET_MNTNS_ID: + if (ns->ops->type != CLONE_NEWNS) + return -EINVAL; + fallthrough; + case NS_GET_ID: { + __u64 __user *idp; + __u64 id; + + idp = (__u64 __user *)arg; + id = ns->ns_id; + return put_user(id, idp); + } } /* extensible ioctls */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index fa86fe3c8bd3..5d5bf22464c9 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -16,8 +16,6 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) -/* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ @@ -42,6 +40,10 @@ struct mnt_ns_info { /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +/* Retrieve namespace identifiers. */ +#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) +#define NS_GET_ID _IOR(NSIO, 13, __u64) + enum init_ns_ino { IPC_NS_INIT_INO = 0xEFFFFFFFU, UTS_NS_INIT_INO = 0xEFFFFFFEU, -- cgit v1.2.3 From 93f67a7ddadf6ed8997c000df9790e5d64617196 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:00 +0200 Subject: uts: split namespace into separate header We have dedicated headers for all namespace types. Add one for the uts namespace as well. Now it's consistent for all namespace types. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/uts_namespace.h | 65 +++++++++++++++++++++++++++++++++++++++++++ include/linux/utsname.h | 58 +------------------------------------- 2 files changed, 66 insertions(+), 57 deletions(-) create mode 100644 include/linux/uts_namespace.h (limited to 'include') diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h new file mode 100644 index 000000000000..c2b619bb4e57 --- /dev/null +++ b/include/linux/uts_namespace.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_UTS_NAMESPACE_H +#define _LINUX_UTS_NAMESPACE_H + +#include +#include + +struct user_namespace; +extern struct user_namespace init_user_ns; + +struct uts_namespace { + struct new_utsname name; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct ns_common ns; +} __randomize_layout; + +extern struct uts_namespace init_uts_ns; + +#ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + +static inline void get_uts_ns(struct uts_namespace *ns) +{ + refcount_inc(&ns->ns.count); +} + +extern struct uts_namespace *copy_utsname(unsigned long flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns); +extern void free_uts_ns(struct uts_namespace *ns); + +static inline void put_uts_ns(struct uts_namespace *ns) +{ + if (refcount_dec_and_test(&ns->ns.count)) + free_uts_ns(ns); +} + +void uts_ns_init(void); +#else +static inline void get_uts_ns(struct uts_namespace *ns) +{ +} + +static inline void put_uts_ns(struct uts_namespace *ns) +{ +} + +static inline struct uts_namespace *copy_utsname(unsigned long flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns) +{ + if (flags & CLONE_NEWUTS) + return ERR_PTR(-EINVAL); + + return old_ns; +} + +static inline void uts_ns_init(void) +{ +} +#endif + +#endif /* _LINUX_UTS_NAMESPACE_H */ diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 5d34c4f0f945..547bd4439706 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include enum uts_proc { UTS_PROC_ARCH, @@ -18,62 +18,6 @@ enum uts_proc { UTS_PROC_DOMAINNAME, }; -struct user_namespace; -extern struct user_namespace init_user_ns; - -struct uts_namespace { - struct new_utsname name; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct ns_common ns; -} __randomize_layout; -extern struct uts_namespace init_uts_ns; - -#ifdef CONFIG_UTS_NS -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); -} - -static inline void get_uts_ns(struct uts_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -extern struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns); -extern void free_uts_ns(struct uts_namespace *ns); - -static inline void put_uts_ns(struct uts_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_uts_ns(ns); -} - -void uts_ns_init(void); -#else -static inline void get_uts_ns(struct uts_namespace *ns) -{ -} - -static inline void put_uts_ns(struct uts_namespace *ns) -{ -} - -static inline struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns) -{ - if (flags & CLONE_NEWUTS) - return ERR_PTR(-EINVAL); - - return old_ns; -} - -static inline void uts_ns_init(void) -{ -} -#endif - #ifdef CONFIG_PROC_SYSCTL extern void uts_proc_notify(enum uts_proc proc); #else -- cgit v1.2.3 From b2a0b192084acd0a86d66cbbc61e17ba1f5bd583 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:01 +0200 Subject: mnt: expose pointer to init_mnt_ns There's various scenarios where we need to know whether we are in the initial set of namespaces or not to e.g., shortcut permission checking. All namespaces expose that information. Let's do that too. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 27 ++++++++++++++++----------- include/linux/mnt_namespace.h | 2 ++ 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index a68998449698..f0bddc9cf2a6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6008,27 +6008,32 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, return ret; } +struct mnt_namespace init_mnt_ns = { + .ns.inum = PROC_MNT_INIT_INO, + .ns.ops = &mntns_operations, + .user_ns = &init_user_ns, + .ns.count = REFCOUNT_INIT(1), + .passive = REFCOUNT_INIT(1), + .mounts = RB_ROOT, + .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), +}; + static void __init init_mount_tree(void) { struct vfsmount *mnt; struct mount *m; - struct mnt_namespace *ns; struct path root; mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, true); - if (IS_ERR(ns)) - panic("Can't allocate initial namespace"); - ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); - ns->root = m; - ns->nr_mounts = 1; - mnt_add_to_ns(ns, m); - init_task.nsproxy->mnt_ns = ns; - get_mnt_ns(ns); + init_mnt_ns.root = m; + init_mnt_ns.nr_mounts = 1; + mnt_add_to_ns(&init_mnt_ns, m); + init_task.nsproxy->mnt_ns = &init_mnt_ns; + get_mnt_ns(&init_mnt_ns); root.mnt = mnt; root.dentry = mnt->mnt_root; @@ -6036,7 +6041,7 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); - ns_tree_add(ns); + ns_tree_add(&init_mnt_ns); } void __init mnt_init(void) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 70b366b64816..6d1c4c218c14 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,6 +11,8 @@ struct fs_struct; struct user_namespace; struct ns_common; +extern struct mnt_namespace init_mnt_ns; + extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); -- cgit v1.2.3 From f74ca6da113d5d4b21c00bb4da3f3c137162b4fe Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:02 +0200 Subject: nscommon: move to separate file It's really awkward spilling the ns common infrastructure into multiple headers. Move it to a separate file. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 3 +++ include/linux/proc_ns.h | 19 ------------------- kernel/Makefile | 2 +- kernel/nscommon.c | 21 +++++++++++++++++++++ 4 files changed, 25 insertions(+), 20 deletions(-) create mode 100644 kernel/nscommon.c (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7224072cccc5..78b17fe80b62 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -31,6 +31,9 @@ struct ns_common { }; }; +int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, + bool alloc_inum); + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 7f89f0829e60..9f21670b5824 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,25 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -static inline int ns_common_init(struct ns_common *ns, - const struct proc_ns_operations *ops, - bool alloc_inum) -{ - if (alloc_inum) { - int ret; - ret = proc_alloc_inum(&ns->inum); - if (ret) - return ret; - } - refcount_set(&ns->count, 1); - ns->stashed = NULL; - ns->ops = ops; - ns->ns_id = 0; - RB_CLEAR_NODE(&ns->ns_tree_node); - INIT_LIST_HEAD(&ns->ns_list_node); - return 0; -} - #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) diff --git a/kernel/Makefile b/kernel/Makefile index b807516a1b43..1f48f7cd2d7b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o nstree.o \ + kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o diff --git a/kernel/nscommon.c b/kernel/nscommon.c new file mode 100644 index 000000000000..ebf4783d0505 --- /dev/null +++ b/kernel/nscommon.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include + +int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, + bool alloc_inum) +{ + if (alloc_inum) { + int ret; + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + } + refcount_set(&ns->count, 1); + ns->stashed = NULL; + ns->ops = ops; + ns->ns_id = 0; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); + return 0; +} -- cgit v1.2.3 From 5fc6bef178f1b644f1439e520c8f83bfc83a1252 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:03 +0200 Subject: cgroup: split namespace into separate header We have dedicated headers for all namespace types. Add one for the cgroup namespace as well. Now it's consistent for all namespace types and easy to figure out what to include. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 51 +----------------------------------- include/linux/cgroup_namespace.h | 56 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 50 deletions(-) create mode 100644 include/linux/cgroup_namespace.h (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9ca25346f7cb..5156fed8cbc3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,7 @@ #include #include +#include struct kernel_clone_args; @@ -783,56 +784,6 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ -struct cgroup_namespace { - struct ns_common ns; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct css_set *root_cset; -}; - -extern struct cgroup_namespace init_cgroup_ns; - -#ifdef CONFIG_CGROUPS - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -void free_cgroup_ns(struct cgroup_namespace *ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns); - -int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns); - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -static inline void put_cgroup_ns(struct cgroup_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_cgroup_ns(ns); -} - -#else /* !CONFIG_CGROUPS */ - -static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } -static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - return old_ns; -} - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } -static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } - -#endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h new file mode 100644 index 000000000000..c02bb76c5e32 --- /dev/null +++ b/include/linux/cgroup_namespace.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CGROUP_NAMESPACE_H +#define _LINUX_CGROUP_NAMESPACE_H + +struct cgroup_namespace { + struct ns_common ns; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct css_set *root_cset; +}; + +extern struct cgroup_namespace init_cgroup_ns; + +#ifdef CONFIG_CGROUPS + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +void free_cgroup_ns(struct cgroup_namespace *ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns); + +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) +{ + refcount_inc(&ns->ns.count); +} + +static inline void put_cgroup_ns(struct cgroup_namespace *ns) +{ + if (refcount_dec_and_test(&ns->ns.count)) + free_cgroup_ns(ns); +} + +#else /* !CONFIG_CGROUPS */ + +static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } +static inline struct cgroup_namespace * +copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + return old_ns; +} + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } +static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } + +#endif /* !CONFIG_CGROUPS */ + +#endif /* _LINUX_CGROUP_NAMESPACE_H */ -- cgit v1.2.3 From cc47f434271ba90c18c16e0bba360df38a8bc954 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:04 +0200 Subject: nsfs: add inode number for anon namespace Add an inode number anonymous namespaces. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/uapi/linux/nsfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 5d5bf22464c9..e098759ec917 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -53,6 +53,9 @@ enum init_ns_ino { TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, MNT_NS_INIT_INO = 0xEFFFFFF8U, +#ifdef __KERNEL__ + MNT_NS_ANON_INO = 0xEFFFFFF7U, +#endif }; struct nsfs_file_handle { -- cgit v1.2.3 From 5612ff3ec588be09f11a9424db6d1186bcdeb3fa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:07 +0200 Subject: nscommon: simplify initialization There's a lot of information that namespace implementers don't need to know about at all. Encapsulate this all in the initialization helper. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 5 +++-- include/linux/ns_common.h | 39 +++++++++++++++++++++++++++++++++++++-- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/nscommon.c | 17 ++++++++--------- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- 10 files changed, 55 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index b2fcb901ad8c..699b8c770c47 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4104,8 +4104,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } if (anon) - new_ns->ns.inum = MNT_NS_ANON_INO; - ret = ns_common_init(&new_ns->ns, &mntns_operations, !anon); + ret = ns_common_init_inum(new_ns, &mntns_operations, MNT_NS_ANON_INO); + else + ret = ns_common_init(new_ns, &mntns_operations); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 78b17fe80b62..05c7a7dd211b 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -16,6 +16,15 @@ struct time_namespace; struct user_namespace; struct uts_namespace; +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -31,8 +40,7 @@ struct ns_common { }; }; -int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, - bool alloc_inum); +int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); #define to_ns_common(__ns) \ _Generic((__ns), \ @@ -45,4 +53,31 @@ int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, struct user_namespace *: &(__ns)->ns, \ struct uts_namespace *: &(__ns)->ns) +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: &init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define ns_common_init(__ns, __ops) \ + __ns_common_init(to_ns_common(__ns), __ops, (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) + #endif diff --git a/ipc/namespace.c b/ipc/namespace.c index 89588819956b..0f8bbd18a475 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -62,7 +62,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_common_init(&ns->ns, &ipcns_operations, true); + err = ns_common_init(ns, &ipcns_operations); if (err) goto fail_free; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 5a327914b565..d928c557e28b 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -27,7 +27,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_common_init(&new_ns->ns, &cgroupns_operations, true); + ret = ns_common_init(new_ns, &cgroupns_operations); if (ret) return ERR_PTR(ret); ns_tree_add(new_ns); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index e10fad8afe61..c3a90bb665ad 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,21 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include -int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, - bool alloc_inum) +int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { - if (alloc_inum && !ns->inum) { - int ret; - ret = proc_alloc_inum(&ns->inum); - if (ret) - return ret; - } refcount_set(&ns->count, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); - return 0; + + if (inum) { + ns->inum = inum; + return 0; + } + return proc_alloc_inum(&ns->inum); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 9b327420309e..170757c265c2 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -103,7 +103,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_common_init(&ns->ns, &pidns_operations, true); + err = ns_common_init(ns, &pidns_operations); if (err) goto out_free_idr; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 20b65f90549e..ce8e952104a7 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -97,7 +97,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns->vvar_page) goto fail_free; - err = ns_common_init(&ns->ns, &timens_operations, true); + err = ns_common_init(ns, &timens_operations); if (err) goto fail_free_page; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index cfb0e28f2779..db9f0463219c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -126,7 +126,7 @@ int create_user_ns(struct cred *new) ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_common_init(&ns->ns, &userns_operations, true); + ret = ns_common_init(ns, &userns_operations); if (ret) goto fail_free; diff --git a/kernel/utsname.c b/kernel/utsname.c index a682830742d3..399888be66bd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -50,7 +50,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - err = ns_common_init(&ns->ns, &utsns_operations, true); + err = ns_common_init(ns, &utsns_operations); if (err) goto fail_free; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9df236811454..e50897fba8cd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -409,7 +409,7 @@ static __net_init int preinit_net(struct net *net, struct user_namespace *user_n ns_ops = NULL; #endif - ret = ns_common_init(&net->ns, ns_ops, true); + ret = ns_common_init(net, ns_ops); if (ret) return ret; -- cgit v1.2.3 From 5d36370f34312776d202e5c35d1a786d8b07a9c3 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Wed, 17 Sep 2025 08:32:50 +0100 Subject: ALSA: compress: add raw opus codec define and opus decoder structs Adds a raw opus codec define and raw opus decoder structs. This is for raw OPUS packets not packed in any type of container (for instance OGG container). The decoder struct fields are taken from corresponding RFC document: RFC 7845 Section 5. Cc: Srinivas Kandagatla Cc: Vinod Koul Co-developed-by: Annemarie Porter Signed-off-by: Annemarie Porter Signed-off-by: Alexey Klimov Signed-off-by: Takashi Iwai --- include/uapi/sound/compress_params.h | 43 +++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/compress_params.h b/include/uapi/sound/compress_params.h index bc7648a30746..faf4fa911f7f 100644 --- a/include/uapi/sound/compress_params.h +++ b/include/uapi/sound/compress_params.h @@ -43,7 +43,8 @@ #define SND_AUDIOCODEC_BESPOKE ((__u32) 0x0000000E) #define SND_AUDIOCODEC_ALAC ((__u32) 0x0000000F) #define SND_AUDIOCODEC_APE ((__u32) 0x00000010) -#define SND_AUDIOCODEC_MAX SND_AUDIOCODEC_APE +#define SND_AUDIOCODEC_OPUS_RAW ((__u32) 0x00000011) +#define SND_AUDIOCODEC_MAX SND_AUDIOCODEC_OPUS_RAW /* * Profile and modes are listed with bit masks. This allows for a @@ -324,6 +325,45 @@ struct snd_dec_ape { __u32 seek_table_present; } __attribute__((packed, aligned(4))); +/** + * struct snd_dec_opus - Opus decoder parameters (raw opus packets) + * @version: Usually should be '1' but can be split into major (4 upper bits) + * and minor (4 lower bits) sub-fields. + * @num_channels: Number of output channels. + * @pre_skip: Number of samples to discard at 48 kHz. + * @sample_rate: Sample rate of original input. + * @output_gain: Gain to apply when decoding (in Q7.8 format). + * @mapping_family: Order and meaning of output channels. Only values 0 and 1 + * are expected; values 2..255 are not recommended for playback. + * + * Optional channel mapping table. Describes mapping of opus streams to decoded + * channels. + * @struct snd_dec_opus_ch_map + * @stream_count: Number of streams encoded in each Ogg packet. + * @coupled_count: Number of streams whose decoders are used for two + * channels. + * @channel_map: describes which decoded channel to be used for each one. + * See RFC doc for details. + * This supports only mapping families 0 and 1, therefore max + * number of channels is 8. + * + * These options were extracted from RFC7845 Section 5. + */ + +struct snd_dec_opus { + __u8 version; + __u8 num_channels; + __u16 pre_skip; + __u32 sample_rate; + __u16 output_gain; + __u8 mapping_family; + struct snd_dec_opus_ch_map { + __u8 stream_count; + __u8 coupled_count; + __u8 channel_map[8]; + } chan_map; +} __attribute__((packed, aligned(4))); + union snd_codec_options { struct snd_enc_wma wma; struct snd_enc_vorbis vorbis; @@ -334,6 +374,7 @@ union snd_codec_options { struct snd_dec_wma wma_d; struct snd_dec_alac alac_d; struct snd_dec_ape ape_d; + struct snd_dec_opus opus_d; struct { __u32 out_sample_rate; } src_d; -- cgit v1.2.3 From b07d2514b91c30ab16fdf8f9cc3523bef969becf Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Wed, 17 Sep 2025 08:32:51 +0100 Subject: ALSA: compress_offload: increase SNDRV_COMPRESS_VERSION minor version by 1 Since addition of raw opus codec support we need to update compress API minor version by one. Bump the SNDRV_COMPRESS_VERSION to 0.4.1. Signed-off-by: Alexey Klimov Acked-by: Vinod Koul Signed-off-by: Takashi Iwai --- include/uapi/sound/compress_offload.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/compress_offload.h b/include/uapi/sound/compress_offload.h index 26f756cc2e62..b610683fd8db 100644 --- a/include/uapi/sound/compress_offload.h +++ b/include/uapi/sound/compress_offload.h @@ -13,7 +13,7 @@ #include #include -#define SNDRV_COMPRESS_VERSION SNDRV_PROTOCOL_VERSION(0, 4, 0) +#define SNDRV_COMPRESS_VERSION SNDRV_PROTOCOL_VERSION(0, 4, 1) /** * struct snd_compressed_buffer - compressed buffer * @fragment_size: size of buffer fragment in bytes -- cgit v1.2.3 From be5f21d3985f00827e09b798f7a07ebd6dd7f54a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:08 +0200 Subject: ns: add ns_common_free() And drop ns_free_inum(). Anything common that can be wasted centrally should be wasted in the new common helper. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- include/linux/ns_common.h | 3 +++ include/linux/proc_ns.h | 2 -- ipc/namespace.c | 4 ++-- kernel/cgroup/namespace.c | 2 +- kernel/nscommon.c | 5 +++++ kernel/pid_namespace.c | 4 ++-- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 4 ++-- kernel/utsname.c | 2 +- net/core/net_namespace.c | 4 ++-- 11 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index 699b8c770c47..b9f94769ec11 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4082,7 +4082,7 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) - ns_free_inum(&ns->ns); + ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } @@ -4154,7 +4154,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); - ns_free_inum(&new_ns->ns); + ns_common_free(ns); dec_mnt_namespaces(new_ns->ucounts); mnt_ns_release(new_ns); return ERR_CAST(new); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 05c7a7dd211b..19833ac547f9 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -41,6 +41,7 @@ struct ns_common { }; int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); +void __ns_common_free(struct ns_common *ns); #define to_ns_common(__ns) \ _Generic((__ns), \ @@ -80,4 +81,6 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, #define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) +#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) + #endif diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 9f21670b5824..08016f6e0e6f 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,8 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -#define ns_free_inum(ns) proc_free_inum((ns)->inum) - #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) #endif /* _LINUX_PROC_NS_H */ diff --git a/ipc/namespace.c b/ipc/namespace.c index 0f8bbd18a475..09d261a1a2aa 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -97,7 +97,7 @@ fail_mq: fail_put: put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kfree(ns); fail_dec: @@ -161,7 +161,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); kfree(ns); } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index d928c557e28b..16ead7508371 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -40,7 +40,7 @@ void free_cgroup_ns(struct cgroup_namespace *ns) put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c3a90bb665ad..7c1b07e2a6c9 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -18,3 +18,8 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, } return proc_alloc_inum(&ns->inum); } + +void __ns_common_free(struct ns_common *ns) +{ + proc_free_inum(ns->inum); +} diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 170757c265c2..27e2dd9ee051 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -127,7 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns return ns; out_free_inum: - ns_free_inum(&ns->ns); + ns_common_free(ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); @@ -152,7 +152,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_tree_remove(ns); unregister_pidns_sysctls(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index ce8e952104a7..d49c73015d6e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -255,7 +255,7 @@ void free_time_ns(struct time_namespace *ns) ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index db9f0463219c..32406bcab526 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -165,7 +165,7 @@ fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: @@ -220,7 +220,7 @@ static void free_user_ns(struct work_struct *work) #endif retire_userns_sysctls(ns); key_free_user_ns(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); diff --git a/kernel/utsname.c b/kernel/utsname.c index 399888be66bd..95d733eb2c98 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -98,7 +98,7 @@ void free_uts_ns(struct uts_namespace *ns) ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e50897fba8cd..a6a3de56a81c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -590,7 +590,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: - ns_free_inum(&net->ns); + ns_common_free(net); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif @@ -713,7 +713,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); - ns_free_inum(&net->ns); + ns_common_free(net); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); -- cgit v1.2.3 From 224ef741ce87aa6474b82e0eb76e0e8e1bafe544 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:46 +0200 Subject: ns: add reference count helpers Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 19833ac547f9..65e258e1fdc6 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -43,16 +43,24 @@ struct ns_common { int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns)->ns, \ - struct ipc_namespace *: &(__ns)->ns, \ - struct mnt_namespace *: &(__ns)->ns, \ - struct net *: &(__ns)->ns, \ - struct pid_namespace *: &(__ns)->ns, \ - struct time_namespace *: &(__ns)->ns, \ - struct user_namespace *: &(__ns)->ns, \ - struct uts_namespace *: &(__ns)->ns) +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + const struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + const struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + const struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + const struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + const struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + const struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + const struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns, \ + const struct uts_namespace *: &(__ns)->ns) #define ns_init_inum(__ns) \ _Generic((__ns), \ @@ -83,4 +91,21 @@ void __ns_common_free(struct ns_common *ns); #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) +{ + return refcount_dec_and_test(&ns->count); +} + +static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) +{ + return refcount_inc_not_zero(&ns->count); +} + +#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->count) +#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->count) +#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) +#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) +#define ns_ref_put_and_lock(__ns, __lock) \ + refcount_dec_and_lock(&to_ns_common((__ns))->count, (__lock)) + #endif -- cgit v1.2.3 From 06099e374f3ab818f0501671b21493ba2e1b94b9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:48 +0200 Subject: cgroup: port to ns_ref_*() helpers Stop accessing ns.count directly. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h index c02bb76c5e32..b7dbf4d623d2 100644 --- a/include/linux/cgroup_namespace.h +++ b/include/linux/cgroup_namespace.h @@ -29,12 +29,12 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, static inline void get_cgroup_ns(struct cgroup_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_cgroup_ns(ns); } -- cgit v1.2.3 From d4825c99d6a738c565d5142ce37369368a4352da Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:49 +0200 Subject: ipc: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ipc_namespace.h | 4 ++-- ipc/namespace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 924e4754374f..21eff63f47da 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -140,14 +140,14 @@ extern struct ipc_namespace *copy_ipcs(unsigned long flags, static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) { if (ns) { - if (refcount_inc_not_zero(&ns->ns.count)) + if (ns_ref_get(ns)) return ns; } diff --git a/ipc/namespace.c b/ipc/namespace.c index 09d261a1a2aa..bd85d1c9d2c2 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -199,7 +199,7 @@ static void free_ipc(struct work_struct *unused) */ void put_ipc_ns(struct ipc_namespace *ns) { - if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { + if (ns_ref_put_and_lock(ns, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); -- cgit v1.2.3 From 07897b38eadf5a370a6001790239f23036d5b970 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:50 +0200 Subject: pid: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 2 +- kernel/pid_namespace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index ba0efc8c8596..5b2f29d369c4 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -62,7 +62,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 27e2dd9ee051..162f5fb63d75 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -169,7 +169,7 @@ static void destroy_pid_namespace_work(struct work_struct *work) parent = ns->parent; destroy_pid_namespace(ns); ns = parent; - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); + } while (ns != &init_pid_ns && ns_ref_put(ns)); } struct pid_namespace *copy_pid_ns(unsigned long flags, @@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns != &init_pid_ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); -- cgit v1.2.3 From e0c173f1fa02c0b08720aa8aa0cc91c3063146ae Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:51 +0200 Subject: time: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index a47a4ce4183e..f3b9567cf1f4 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -44,7 +44,7 @@ extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -57,7 +57,7 @@ struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_time_ns(ns); } -- cgit v1.2.3 From 96d997ea5ad1911cc393ffdb5c928b532f2f921a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:52 +0200 Subject: user: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/user_namespace.h | 4 ++-- kernel/user_namespace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a09056ad090e..9a9aebbf96b9 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -176,7 +176,7 @@ static inline struct user_namespace *to_user_ns(struct ns_common *ns) static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -186,7 +186,7 @@ extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { - if (ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns_ref_put(ns)) __put_user_ns(ns); } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 32406bcab526..f9df45c46235 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -225,7 +225,7 @@ static void free_user_ns(struct work_struct *work) kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; - } while (refcount_dec_and_test(&parent->ns.count)); + } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns) -- cgit v1.2.3 From 2438b7d63ad866d6b2bb7b8d3455a6365d9b0fbe Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:56 +0200 Subject: uts: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/uts_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h index c2b619bb4e57..23b4f0e1b338 100644 --- a/include/linux/uts_namespace.h +++ b/include/linux/uts_namespace.h @@ -25,7 +25,7 @@ static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) static inline void get_uts_ns(struct uts_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } extern struct uts_namespace *copy_utsname(unsigned long flags, @@ -34,7 +34,7 @@ extern void free_uts_ns(struct uts_namespace *ns); static inline void put_uts_ns(struct uts_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_uts_ns(ns); } -- cgit v1.2.3 From 99d33ce100cbc982647c9299cadb1277cfad503e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:57 +0200 Subject: net: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/net/net_namespace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fd090ceb80bf..3e7c825e5810 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -270,7 +270,7 @@ static inline struct net *to_net_ns(struct ns_common *ns) /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { - refcount_inc(&net->ns.count); + ns_ref_inc(net); return net; } @@ -281,7 +281,7 @@ static inline struct net *maybe_get_net(struct net *net) * exists. If the reference count is zero this * function fails and returns NULL. */ - if (!refcount_inc_not_zero(&net->ns.count)) + if (!ns_ref_get(net)) net = NULL; return net; } @@ -289,7 +289,7 @@ static inline struct net *maybe_get_net(struct net *net) /* Try using put_net_track() instead */ static inline void put_net(struct net *net) { - if (refcount_dec_and_test(&net->ns.count)) + if (ns_ref_put(net)) __put_net(net); } @@ -301,7 +301,7 @@ int net_eq(const struct net *net1, const struct net *net2) static inline int check_net(const struct net *net) { - return refcount_read(&net->ns.count) != 0; + return ns_ref_read(net) != 0; } void net_drop_ns(void *); -- cgit v1.2.3 From 024596a4e2802e457a9f92af79f246fa9631f8de Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:59 +0200 Subject: ns: rename to __ns_ref Make it easier to grep and rename to ns_count. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- include/linux/ns_common.h | 12 ++++++------ init/version-timestamp.c | 2 +- ipc/msgutil.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/nscommon.c | 2 +- kernel/pid.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user.c | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index 9109069d85cd..740a6ba524d0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6015,7 +6015,7 @@ struct mnt_namespace init_mnt_ns = { .ns.inum = PROC_MNT_INIT_INO, .ns.ops = &mntns_operations, .user_ns = &init_user_ns, - .ns.count = REFCOUNT_INIT(1), + .ns.__ns_ref = REFCOUNT_INIT(1), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 65e258e1fdc6..aea8528d799a 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -29,7 +29,7 @@ struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; - refcount_t count; + refcount_t __ns_ref; /* do not use directly */ union { struct { u64 ns_id; @@ -93,19 +93,19 @@ void __ns_common_free(struct ns_common *ns); static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - return refcount_dec_and_test(&ns->count); + return refcount_dec_and_test(&ns->__ns_ref); } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - return refcount_inc_not_zero(&ns->count); + return refcount_inc_not_zero(&ns->__ns_ref); } -#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->count) -#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->count) +#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) +#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) #define ns_ref_put_and_lock(__ns, __lock) \ - refcount_dec_and_lock(&to_ns_common((__ns))->count, (__lock)) + refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) #endif diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 043cbf80a766..547e522e6016 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,7 +8,7 @@ #include struct uts_namespace init_uts_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, diff --git a/ipc/msgutil.c b/ipc/msgutil.c index bbf61275df41..d0f7dcf4c208 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -27,7 +27,7 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. */ struct ipc_namespace init_ipc_ns = { - .ns.count = REFCOUNT_INIT(1), + .ns.__ns_ref = REFCOUNT_INIT(1), .user_ns = &init_user_ns, .ns.inum = PROC_IPC_INIT_INO, #ifdef CONFIG_IPC_NS diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 092e6bf081ed..a0e24adceef0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -219,7 +219,7 @@ static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_D /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 7c1b07e2a6c9..7aa2be6a0c32 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -5,7 +5,7 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { - refcount_set(&ns->count, 1); + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; diff --git a/kernel/pid.c b/kernel/pid.c index c45a28c16cd2..e222426f745d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,7 +71,7 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index d49c73015d6e..d70bdfb7b001 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -480,7 +480,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = PROC_TIME_INIT_INO, .ns.ops = &timens_operations, diff --git a/kernel/user.c b/kernel/user.c index f46b1d41163b..17a742fb4e10 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,7 +65,7 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.count = REFCOUNT_INIT(3), + .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .ns.inum = PROC_USER_INIT_INO, -- cgit v1.2.3 From daf4c2929fb792d24af0cd7bb6ca1f2949190fa4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:34 -0700 Subject: bpf: bpf_verifier_state->cleaned flag instead of REG_LIVE_DONE Prepare for bpf_reg_state->live field removal by introducing a separate flag to track if clean_verifier_state() had been applied to the state. No functional changes. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-1-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/log.c | 6 ++---- kernel/bpf/verifier.c | 13 ++++--------- 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 020de62bd09c..ac16da8b49dc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -45,7 +45,6 @@ enum bpf_reg_liveness { REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ - REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; #define ITER_PREFIX "bpf_iter_" @@ -445,6 +444,7 @@ struct bpf_verifier_state { bool speculative; bool in_sleepable; + bool cleaned; /* first and last insn idx of this verifier state */ u32 first_insn_idx; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index e4983c1303e7..0d6d7bfb2fd0 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -545,14 +545,12 @@ static char slot_type_char[] = { static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) - verbose(env, "_"); + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + verbose(env, "_"); if (live & REG_LIVE_READ) verbose(env, "r"); if (live & REG_LIVE_WRITTEN) verbose(env, "w"); - if (live & REG_LIVE_DONE) - verbose(env, "D"); } #define UNUM_MAX_DECIMAL U16_MAX diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aef6b266f08d..47cec5c8abff 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1758,6 +1758,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return err; dst_state->speculative = src->speculative; dst_state->in_sleepable = src->in_sleepable; + dst_state->cleaned = src->cleaned; dst_state->curframe = src->curframe; dst_state->branches = src->branches; dst_state->parent = src->parent; @@ -3589,11 +3590,6 @@ static int mark_reg_read(struct bpf_verifier_env *env, /* if read wasn't screened by an earlier write ... */ if (writes && state->live & REG_LIVE_WRITTEN) break; - if (verifier_bug_if(parent->live & REG_LIVE_DONE, env, - "type %s var_off %lld off %d", - reg_type_str(env, parent->type), - parent->var_off.value, parent->off)) - return -EFAULT; /* The first condition is more likely to be true than the * second, checked it first. */ @@ -18501,7 +18497,6 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < BPF_REG_FP; i++) { live = st->regs[i].live; /* liveness must not touch this register anymore */ - st->regs[i].live |= REG_LIVE_DONE; if (!(live & REG_LIVE_READ)) /* since the register is unused, clear its state * to make further comparison simpler @@ -18512,7 +18507,6 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { live = st->stack[i].spilled_ptr.live; /* liveness must not touch this stack slot anymore */ - st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; if (!(live & REG_LIVE_READ)) { __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) @@ -18526,6 +18520,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env, { int i; + st->cleaned = true; for (i = 0; i <= st->curframe; i++) clean_func_state(env, st->frame[i]); } @@ -18553,7 +18548,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * their final liveness marks are already propagated. * Hence when the verifier completes the search of state list in is_state_visited() * we can call this clean_live_states() function to mark all liveness states - * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' + * as st->cleaned to indicate that 'parent' pointers of 'struct bpf_reg_state' * will not be used. * This function also clears the registers and stack for states that !READ * to simplify state merging. @@ -18576,7 +18571,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) continue; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) + if (sl->state.cleaned) /* all regs in this state in all frames were already marked */ continue; if (incomplete_read_marks(env, &sl->state)) -- cgit v1.2.3 From 3b20d3c120bae1e18ee11aa04531b161743db682 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:37 -0700 Subject: bpf: declare a few utility functions as internal api Namely, rename the following functions and add prototypes to bpf_verifier.h: - find_containing_subprog -> bpf_find_containing_subprog - insn_successors -> bpf_insn_successors - calls_callback -> bpf_calls_callback - fmt_stack_mask -> bpf_fmt_stack_mask Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-4-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ kernel/bpf/verifier.c | 34 ++++++++++++++++------------------ 2 files changed, 21 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ac16da8b49dc..93563564bde5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1065,4 +1065,9 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno); +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); +int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); +bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 74a96a0d6c8a..921a5fa06df7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2979,7 +2979,7 @@ static int cmp_subprogs(const void *a, const void *b) } /* Find subprogram that contains instruction at 'off' */ -static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off) +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *vals = env->subprog_info; int l, r, m; @@ -3004,7 +3004,7 @@ static int find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; - p = find_containing_subprog(env, off); + p = bpf_find_containing_subprog(env, off); if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; @@ -4211,7 +4211,7 @@ static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) } } /* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ -static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) { DECLARE_BITMAP(mask, 64); bool first = true; @@ -4266,8 +4266,6 @@ static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_histo } } -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); - /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -4294,7 +4292,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); verbose(env, "mark_precise: frame%d: regs=%s ", bt->frame, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); verbose_insn(env, insn); @@ -4495,7 +4493,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * backtracking, as these registers are set by the function * invoking callback. */ - if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) + if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx)) for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { @@ -4934,7 +4932,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, bt_frame_reg_mask(bt, fr)); verbose(env, "mark_precise: frame%d: parent state regs=%s ", fr, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_stack_mask(bt, fr)); verbose(env, "stack=%s: ", env->tmp_str_buf); print_verifier_state(env, st, fr, true); @@ -11023,7 +11021,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) "At callback return", "R0"); return -EINVAL; } - if (!calls_callback(env, callee->callsite)) { + if (!bpf_calls_callback(env, callee->callsite)) { verifier_bug(env, "in callback at %d, callsite %d !calls_callback", *insn_idx, callee->callsite); return -EFAULT; @@ -17298,7 +17296,7 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; - subprog = find_containing_subprog(env, off); + subprog = bpf_find_containing_subprog(env, off); subprog->changes_pkt_data = true; } @@ -17306,7 +17304,7 @@ static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; - subprog = find_containing_subprog(env, off); + subprog = bpf_find_containing_subprog(env, off); subprog->might_sleep = true; } @@ -17320,8 +17318,8 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) { struct bpf_subprog_info *caller, *callee; - caller = find_containing_subprog(env, t); - callee = find_containing_subprog(env, w); + caller = bpf_find_containing_subprog(env, t); + callee = bpf_find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; caller->might_sleep |= callee->might_sleep; } @@ -17391,7 +17389,7 @@ static void mark_calls_callback(struct bpf_verifier_env *env, int idx) env->insn_aux_data[idx].calls_callback = true; } -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) +bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx) { return env->insn_aux_data[insn_idx].calls_callback; } @@ -19439,7 +19437,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) goto hit; } } - if (calls_callback(env, insn_idx)) { + if (bpf_calls_callback(env, insn_idx)) { if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) goto hit; goto skip_inf_loop_check; @@ -24171,7 +24169,7 @@ static bool can_jump(struct bpf_insn *insn) return false; } -static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) { struct bpf_insn *insn = &prog->insnsi[idx]; int i = 0, insn_sz; @@ -24387,7 +24385,7 @@ static int compute_live_registers(struct bpf_verifier_env *env) u16 new_out = 0; u16 new_in = 0; - succ_num = insn_successors(env->prog, insn_idx, succ); + succ_num = bpf_insn_successors(env->prog, insn_idx, succ); for (int s = 0; s < succ_num; ++s) new_out |= state[succ[s]].in; new_in = (new_out & ~live->def) | live->use; @@ -24556,7 +24554,7 @@ dfs_continue: stack[stack_sz++] = w; } /* Visit 'w' successors */ - succ_cnt = insn_successors(env->prog, w, succ); + succ_cnt = bpf_insn_successors(env->prog, w, succ); for (j = 0; j < succ_cnt; ++j) { if (pre[succ[j]]) { low[w] = min(low[w], low[succ[j]]); -- cgit v1.2.3 From efcda22aa541bbda827e54302baf9ae4fd44cdf2 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:38 -0700 Subject: bpf: compute instructions postorder per subprogram The next patch would require doing postorder traversal of individual subprograms. Facilitate this by moving env->cfg.insn_postorder computation from check_cfg() to a separate pass, as check_cfg() descends into called subprograms (and it needs to, because of merge_callee_effects() logic). env->cfg.insn_postorder is used only by compute_live_registers(), this function does not track cross subprogram dependencies, thus the change does not affect it's operation. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-5-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +++- kernel/bpf/verifier.c | 68 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 93563564bde5..bd87e80f9423 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -665,6 +665,7 @@ struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ + u32 postorder_start; /* The idx to the env->cfg.insn_postorder */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. fastcall_stack_off) @@ -794,7 +795,10 @@ struct bpf_verifier_env { struct { int *insn_state; int *insn_stack; - /* vector of instruction indexes sorted in post-order */ + /* + * vector of instruction indexes sorted in post-order, grouped by subprogram, + * see bpf_subprog_info->postorder_start. + */ int *insn_postorder; int cur_stack; /* current position in the insn_postorder vector */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 921a5fa06df7..dc8d26dc9bf1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17863,7 +17863,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state, *insn_postorder; + int *insn_stack, *insn_state; int ex_insn_beg, i, ret = 0; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); @@ -17876,14 +17876,6 @@ static int check_cfg(struct bpf_verifier_env *env) return -ENOMEM; } - insn_postorder = env->cfg.insn_postorder = - kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); - if (!insn_postorder) { - kvfree(insn_state); - kvfree(insn_stack); - return -ENOMEM; - } - ex_insn_beg = env->exception_callback_subprog ? env->subprog_info[env->exception_callback_subprog].start : 0; @@ -17901,7 +17893,6 @@ walk_cfg: case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; - insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; @@ -17955,6 +17946,56 @@ err_free: return ret; } +/* + * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range + * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start) + * with indices of 'i' instructions in postorder. + */ +static int compute_postorder(struct bpf_verifier_env *env) +{ + u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2]; + int *stack = NULL, *postorder = NULL, *state = NULL; + + postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + stack = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + if (!postorder || !state || !stack) { + kvfree(postorder); + kvfree(state); + kvfree(stack); + return -ENOMEM; + } + cur_postorder = 0; + for (i = 0; i < env->subprog_cnt; i++) { + env->subprog_info[i].postorder_start = cur_postorder; + stack[0] = env->subprog_info[i].start; + stack_sz = 1; + do { + top = stack[stack_sz - 1]; + state[top] |= DISCOVERED; + if (state[top] & EXPLORED) { + postorder[cur_postorder++] = top; + stack_sz--; + continue; + } + succ_cnt = bpf_insn_successors(env->prog, top, succ); + for (s = 0; s < succ_cnt; ++s) { + if (!state[succ[s]]) { + stack[stack_sz++] = succ[s]; + state[succ[s]] |= DISCOVERED; + } + } + state[top] |= EXPLORED; + } while (stack_sz); + } + env->subprog_info[i].postorder_start = cur_postorder; + env->cfg.insn_postorder = postorder; + env->cfg.cur_postorder = cur_postorder; + kvfree(stack); + kvfree(state); + return 0; +} + static int check_abnormal_return(struct bpf_verifier_env *env) { int i; @@ -24422,9 +24463,6 @@ static int compute_live_registers(struct bpf_verifier_env *env) out: kvfree(state); - kvfree(env->cfg.insn_postorder); - env->cfg.insn_postorder = NULL; - env->cfg.cur_postorder = 0; return err; } @@ -24727,6 +24765,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = compute_postorder(env); + if (ret < 0) + goto skip_full_check; + ret = check_attach_btf_id(env); if (ret) goto skip_full_check; -- cgit v1.2.3 From b3698c356ad92bcdb9920655bc9df02a2a8946f9 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:39 -0700 Subject: bpf: callchain sensitive stack liveness tracking using CFG This commit adds a flow-sensitive, context-sensitive, path-insensitive data flow analysis for live stack slots: - flow-sensitive: uses program control flow graph to compute data flow values; - context-sensitive: collects data flow values for each possible call chain in a program; - path-insensitive: does not distinguish between separate control flow graph paths reaching the same instruction. Compared to the current path-sensitive analysis, this approach trades some precision for not having to enumerate every path in the program. This gives a theoretical capability to run the analysis before main verification pass. See cover letter for motivation. The basic idea is as follows: - Data flow values indicate stack slots that might be read and stack slots that are definitely written. - Data flow values are collected for each (call chain, instruction number) combination in the program. - Within a subprogram, data flow values are propagated using control flow graph. - Data flow values are transferred from entry instructions of callee subprograms to call sites in caller subprograms. In other words, a tree of all possible call chains is constructed. Each node of this tree represents a subprogram. Read and write marks are collected for each instruction of each node. Live stack slots are first computed for lower level nodes. Then, information about outer stack slots that might be read or are definitely written by a subprogram is propagated one level up, to the corresponding call instructions of the upper nodes. Procedure repeats until root node is processed. In the absence of value range analysis, stack read/write marks are collected during main verification pass, and data flow computation is triggered each time verifier.c:states_equal() needs to query the information. Implementation details are documented in kernel/bpf/liveness.c. Quantitative data about verification performance changes and memory consumption is in the cover letter. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-6-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 14 + kernel/bpf/Makefile | 2 +- kernel/bpf/liveness.c | 677 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 692 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/liveness.c (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index bd87e80f9423..2e3bdd50e2ba 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -745,6 +745,8 @@ struct bpf_scc_info { struct bpf_scc_visit visits[]; }; +struct bpf_liveness; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -846,6 +848,7 @@ struct bpf_verifier_env { struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; struct bpf_scc_callchain callchain_buf; + struct bpf_liveness *liveness; /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; @@ -1074,4 +1077,15 @@ int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); +int bpf_stack_liveness_init(struct bpf_verifier_env *env); +void bpf_stack_liveness_free(struct bpf_verifier_env *env); +int bpf_update_live_stack(struct bpf_verifier_env *env); +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frameno, u32 insn_idx, u64 mask); +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frameno, u64 mask); +int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx); +int bpf_commit_stack_write_marks(struct bpf_verifier_env *env); +int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st); +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi); +void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f6cf8c2af5f7..7fd0badfacb1 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c new file mode 100644 index 000000000000..6f9dfaaf6e64 --- /dev/null +++ b/kernel/bpf/liveness.c @@ -0,0 +1,677 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include + +/* + * This file implements live stack slots analysis. After accumulating + * stack usage data, the analysis answers queries about whether a + * particular stack slot may be read by an instruction or any of it's + * successors. This data is consumed by the verifier states caching + * mechanism to decide which stack slots are important when looking for a + * visited state corresponding to the current state. + * + * The analysis is call chain sensitive, meaning that data is collected + * and queried for tuples (call chain, subprogram instruction index). + * Such sensitivity allows identifying if some subprogram call always + * leads to writes in the caller's stack. + * + * The basic idea is as follows: + * - As the verifier accumulates a set of visited states, the analysis instance + * accumulates a conservative estimate of stack slots that can be read + * or must be written for each visited tuple (call chain, instruction index). + * - If several states happen to visit the same instruction with the same + * call chain, stack usage information for the corresponding tuple is joined: + * - "may_read" set represents a union of all possibly read slots + * (any slot in "may_read" set might be read at or after the instruction); + * - "must_write" set represents an intersection of all possibly written slots + * (any slot in "must_write" set is guaranteed to be written by the instruction). + * - The analysis is split into two phases: + * - read and write marks accumulation; + * - read and write marks propagation. + * - The propagation phase is a textbook live variable data flow analysis: + * + * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)] + * state[cc, i].live_before = + * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read + * + * Where: + * - `U` stands for set union + * - `/` stands for set difference; + * - `cc` stands for a call chain; + * - `i` and `s` are instruction indexes; + * + * The above equations are computed for each call chain and instruction + * index until state stops changing. + * - Additionally, in order to transfer "must_write" information from a + * subprogram to call instructions invoking this subprogram, + * the "must_write_acc" set is tracked for each (cc, i) tuple. + * A set of stack slots that are guaranteed to be written by this + * instruction or any of its successors (within the subprogram). + * The equation for "must_write_acc" propagation looks as follows: + * + * state[cc, i].must_write_acc = + * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)] + * U state[cc, i].must_write + * + * (An intersection of all "must_write_acc" for instruction successors + * plus all "must_write" slots for the instruction itself). + * - After the propagation phase completes for a subprogram, information from + * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain: + * - "must_write_acc" set is intersected with the call site's "must_write" set; + * - "may_read" set is added to the call site's "may_read" set. + * - Any live stack queries must be taken after the propagation phase. + * - Accumulation and propagation phases can be entered multiple times, + * at any point in time: + * - "may_read" set only grows; + * - "must_write" set only shrinks; + * - for each visited verifier state with zero branches, all relevant + * read and write marks are already recorded by the analysis instance. + * + * Technically, the analysis is facilitated by the following data structures: + * - Call chain: for given verifier state, the call chain is a tuple of call + * instruction indexes leading to the current subprogram plus the subprogram + * entry point index. + * - Function instance: for a given call chain, for each instruction in + * the current subprogram, a mapping between instruction index and a + * set of "may_read", "must_write" and other marks accumulated for this + * instruction. + * - A hash table mapping call chains to function instances. + */ + +struct callchain { + u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */ + /* cached subprog_info[*].start for functions owning the frames: + * - sp_starts[curframe] used to get insn relative index within current function; + * - sp_starts[0..current-1] used for fast callchain_frame_up(). + */ + u32 sp_starts[MAX_CALL_FRAMES]; + u32 curframe; /* depth of callsites and sp_starts arrays */ +}; + +struct per_frame_masks { + u64 may_read; /* stack slots that may be read by this instruction */ + u64 must_write; /* stack slots written by this instruction */ + u64 must_write_acc; /* stack slots written by this instruction and its successors */ + u64 live_before; /* stack slots that may be read by this insn and its successors */ +}; + +/* + * A function instance created for a specific callchain. + * Encapsulates read and write marks for each instruction in the function. + * Marks are tracked for each frame in the callchain. + */ +struct func_instance { + struct hlist_node hl_node; + struct callchain callchain; + u32 insn_cnt; /* cached number of insns in the function */ + bool updated; + bool must_write_dropped; + /* Per frame, per instruction masks, frames allocated lazily. */ + struct per_frame_masks *frames[MAX_CALL_FRAMES]; + /* For each instruction a flag telling if "must_write" had been initialized for it. */ + bool *must_write_set; +}; + +struct live_stack_query { + struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */ + u32 curframe; + u32 insn_idx; +}; + +struct bpf_liveness { + DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */ + struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */ + /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */ + struct func_instance *cur_instance; + /* + * Below fields are used to accumulate stack write marks for instruction at + * @write_insn_idx before submitting the marks to @cur_instance. + */ + u64 write_masks_acc[MAX_CALL_FRAMES]; + u32 write_insn_idx; +}; + +/* Compute callchain corresponding to state @st at depth @frameno */ +static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st, + struct callchain *callchain, u32 frameno) +{ + struct bpf_subprog_info *subprog_info = env->subprog_info; + u32 i; + + memset(callchain, 0, sizeof(*callchain)); + for (i = 0; i <= frameno; i++) { + callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start; + if (i < st->curframe) + callchain->callsites[i] = st->frame[i + 1]->callsite; + } + callchain->curframe = frameno; + callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe]; +} + +static u32 hash_callchain(struct callchain *callchain) +{ + return jhash2(callchain->callsites, callchain->curframe, 0); +} + +static bool same_callsites(struct callchain *a, struct callchain *b) +{ + int i; + + if (a->curframe != b->curframe) + return false; + for (i = a->curframe; i >= 0; i--) + if (a->callsites[i] != b->callsites[i]) + return false; + return true; +} + +/* + * Find existing or allocate new function instance corresponding to @callchain. + * Instances are accumulated in env->liveness->func_instances and persist + * until the end of the verification process. + */ +static struct func_instance *__lookup_instance(struct bpf_verifier_env *env, + struct callchain *callchain) +{ + struct bpf_liveness *liveness = env->liveness; + struct bpf_subprog_info *subprog; + struct func_instance *result; + u32 subprog_sz, size, key; + + key = hash_callchain(callchain); + hash_for_each_possible(liveness->func_instances, result, hl_node, key) + if (same_callsites(&result->callchain, callchain)) + return result; + + subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]); + subprog_sz = (subprog + 1)->start - subprog->start; + size = sizeof(struct func_instance); + result = kvzalloc(size, GFP_KERNEL_ACCOUNT); + if (!result) + return ERR_PTR(-ENOMEM); + result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set), + GFP_KERNEL_ACCOUNT); + if (!result->must_write_set) + return ERR_PTR(-ENOMEM); + memcpy(&result->callchain, callchain, sizeof(*callchain)); + result->insn_cnt = subprog_sz; + hash_add(liveness->func_instances, &result->hl_node, key); + return result; +} + +static struct func_instance *lookup_instance(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + u32 frameno) +{ + struct callchain callchain; + + compute_callchain(env, st, &callchain, frameno); + return __lookup_instance(env, &callchain); +} + +int bpf_stack_liveness_init(struct bpf_verifier_env *env) +{ + env->liveness = kvzalloc(sizeof(*env->liveness), GFP_KERNEL_ACCOUNT); + if (!env->liveness) + return -ENOMEM; + hash_init(env->liveness->func_instances); + return 0; +} + +void bpf_stack_liveness_free(struct bpf_verifier_env *env) +{ + struct func_instance *instance; + struct hlist_node *tmp; + int bkt, i; + + if (!env->liveness) + return; + hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) { + for (i = 0; i <= instance->callchain.curframe; i++) + kvfree(instance->frames[i]); + kvfree(instance->must_write_set); + kvfree(instance); + } + kvfree(env->liveness); +} + +/* + * Convert absolute instruction index @insn_idx to an index relative + * to start of the function corresponding to @instance. + */ +static int relative_idx(struct func_instance *instance, u32 insn_idx) +{ + return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe]; +} + +static struct per_frame_masks *get_frame_masks(struct func_instance *instance, + u32 frame, u32 insn_idx) +{ + if (!instance->frames[frame]) + return NULL; + + return &instance->frames[frame][relative_idx(instance, insn_idx)]; +} + +static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env, + struct func_instance *instance, + u32 frame, u32 insn_idx) +{ + struct per_frame_masks *arr; + + if (!instance->frames[frame]) { + arr = kvcalloc(instance->insn_cnt, sizeof(*arr), GFP_KERNEL_ACCOUNT); + instance->frames[frame] = arr; + if (!arr) + return ERR_PTR(-ENOMEM); + } + return get_frame_masks(instance, frame, insn_idx); +} + +void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env) +{ + env->liveness->cur_instance = NULL; +} + +/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */ +static int ensure_cur_instance(struct bpf_verifier_env *env) +{ + struct bpf_liveness *liveness = env->liveness; + struct func_instance *instance; + + if (liveness->cur_instance) + return 0; + + instance = lookup_instance(env, env->cur_state, env->cur_state->curframe); + if (IS_ERR(instance)) + return PTR_ERR(instance); + + liveness->cur_instance = instance; + return 0; +} + +/* Accumulate may_read masks for @frame at @insn_idx */ +static int mark_stack_read(struct bpf_verifier_env *env, + struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask) +{ + struct per_frame_masks *masks; + u64 new_may_read; + + masks = alloc_frame_masks(env, instance, frame, insn_idx); + if (IS_ERR(masks)) + return PTR_ERR(masks); + new_may_read = masks->may_read | mask; + if (new_may_read != masks->may_read && + ((new_may_read | masks->live_before) != masks->live_before)) + instance->updated = true; + masks->may_read |= mask; + return 0; +} + +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask) +{ + int err; + + err = ensure_cur_instance(env); + err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask); + return err; +} + +static void reset_stack_write_marks(struct bpf_verifier_env *env, + struct func_instance *instance, u32 insn_idx) +{ + struct bpf_liveness *liveness = env->liveness; + int i; + + liveness->write_insn_idx = insn_idx; + for (i = 0; i <= instance->callchain.curframe; i++) + liveness->write_masks_acc[i] = 0; +} + +int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx) +{ + struct bpf_liveness *liveness = env->liveness; + int err; + + err = ensure_cur_instance(env); + if (err) + return err; + + reset_stack_write_marks(env, liveness->cur_instance, insn_idx); + return 0; +} + +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask) +{ + env->liveness->write_masks_acc[frame] |= mask; +} + +static int commit_stack_write_marks(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct bpf_liveness *liveness = env->liveness; + u32 idx, frame, curframe, old_must_write; + struct per_frame_masks *masks; + u64 mask; + + if (!instance) + return 0; + + curframe = instance->callchain.curframe; + idx = relative_idx(instance, liveness->write_insn_idx); + for (frame = 0; frame <= curframe; frame++) { + mask = liveness->write_masks_acc[frame]; + /* avoid allocating frames for zero masks */ + if (mask == 0 && !instance->must_write_set[idx]) + continue; + masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx); + if (IS_ERR(masks)) + return PTR_ERR(masks); + old_must_write = masks->must_write; + /* + * If instruction at this callchain is seen for a first time, set must_write equal + * to @mask. Otherwise take intersection with the previous value. + */ + if (instance->must_write_set[idx]) + mask &= old_must_write; + if (old_must_write != mask) { + masks->must_write = mask; + instance->updated = true; + } + if (old_must_write & ~mask) + instance->must_write_dropped = true; + } + instance->must_write_set[idx] = true; + liveness->write_insn_idx = 0; + return 0; +} + +/* + * Merge stack writes marks in @env->liveness->write_masks_acc + * with information already in @env->liveness->cur_instance. + */ +int bpf_commit_stack_write_marks(struct bpf_verifier_env *env) +{ + return commit_stack_write_marks(env, env->liveness->cur_instance); +} + +static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain) +{ + char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf); + char *buf = env->tmp_str_buf; + int i; + + buf += snprintf(buf, buf_end - buf, "("); + for (i = 0; i <= callchain->curframe; i++) + buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]); + snprintf(buf, buf_end - buf, ")"); + return env->tmp_str_buf; +} + +static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain, + char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new) +{ + u64 changed_bits = old ^ new; + u64 new_ones = new & changed_bits; + u64 new_zeros = ~new & changed_bits; + + if (!changed_bits) + return; + bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx); + if (new_ones) { + bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones); + bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf); + } + if (new_zeros) { + bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros); + bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf); + } + bpf_log(&env->log, "\n"); +} + +static struct func_instance *get_outer_instance(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct callchain callchain = instance->callchain; + + /* Adjust @callchain to represent callchain one frame up */ + callchain.callsites[callchain.curframe] = 0; + callchain.sp_starts[callchain.curframe] = 0; + callchain.curframe--; + callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe]; + return __lookup_instance(env, &callchain); +} + +static u32 callchain_subprog_start(struct callchain *callchain) +{ + return callchain->sp_starts[callchain->curframe]; +} + +/* + * Transfer @may_read and @must_write_acc marks from the first instruction of @instance, + * to the call instruction in function instance calling @instance. + */ +static int propagate_to_outer_instance(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct callchain *callchain = &instance->callchain; + u32 this_subprog_start, callsite, frame; + struct func_instance *outer_instance; + struct per_frame_masks *insn; + int err; + + this_subprog_start = callchain_subprog_start(callchain); + outer_instance = get_outer_instance(env, instance); + callsite = callchain->callsites[callchain->curframe - 1]; + + reset_stack_write_marks(env, outer_instance, callsite); + for (frame = 0; frame < callchain->curframe; frame++) { + insn = get_frame_masks(instance, frame, this_subprog_start); + if (!insn) + continue; + bpf_mark_stack_write(env, frame, insn->must_write_acc); + err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before); + if (err) + return err; + } + commit_stack_write_marks(env, outer_instance); + return 0; +} + +static inline bool update_insn(struct bpf_verifier_env *env, + struct func_instance *instance, u32 frame, u32 insn_idx) +{ + struct bpf_insn_aux_data *aux = env->insn_aux_data; + u64 new_before, new_after, must_write_acc; + struct per_frame_masks *insn, *succ_insn; + u32 succ_num, s, succ[2]; + bool changed; + + succ_num = bpf_insn_successors(env->prog, insn_idx, succ); + if (unlikely(succ_num == 0)) + return false; + + changed = false; + insn = get_frame_masks(instance, frame, insn_idx); + new_before = 0; + new_after = 0; + /* + * New "must_write_acc" is an intersection of all "must_write_acc" + * of successors plus all "must_write" slots of instruction itself. + */ + must_write_acc = U64_MAX; + for (s = 0; s < succ_num; ++s) { + succ_insn = get_frame_masks(instance, frame, succ[s]); + new_after |= succ_insn->live_before; + must_write_acc &= succ_insn->must_write_acc; + } + must_write_acc |= insn->must_write; + /* + * New "live_before" is a union of all "live_before" of successors + * minus slots written by instruction plus slots read by instruction. + */ + new_before = (new_after & ~insn->must_write) | insn->may_read; + changed |= new_before != insn->live_before; + changed |= must_write_acc != insn->must_write_acc; + if (unlikely(env->log.level & BPF_LOG_LEVEL2) && + (insn->may_read || insn->must_write || + insn_idx == callchain_subprog_start(&instance->callchain) || + aux[insn_idx].prune_point)) { + log_mask_change(env, &instance->callchain, "live", + frame, insn_idx, insn->live_before, new_before); + log_mask_change(env, &instance->callchain, "written", + frame, insn_idx, insn->must_write_acc, must_write_acc); + } + insn->live_before = new_before; + insn->must_write_acc = must_write_acc; + return changed; +} + +/* Fixed-point computation of @live_before and @must_write_acc marks */ +static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance) +{ + u32 i, frame, po_start, po_end, cnt, this_subprog_start; + struct callchain *callchain = &instance->callchain; + int *insn_postorder = env->cfg.insn_postorder; + struct bpf_subprog_info *subprog; + struct per_frame_masks *insn; + bool changed; + int err; + + this_subprog_start = callchain_subprog_start(callchain); + /* + * If must_write marks were updated must_write_acc needs to be reset + * (to account for the case when new must_write sets became smaller). + */ + if (instance->must_write_dropped) { + for (frame = 0; frame <= callchain->curframe; frame++) { + if (!instance->frames[frame]) + continue; + + for (i = 0; i < instance->insn_cnt; i++) { + insn = get_frame_masks(instance, frame, this_subprog_start + i); + insn->must_write_acc = 0; + } + } + } + + subprog = bpf_find_containing_subprog(env, this_subprog_start); + po_start = subprog->postorder_start; + po_end = (subprog + 1)->postorder_start; + cnt = 0; + /* repeat until fixed point is reached */ + do { + cnt++; + changed = false; + for (frame = 0; frame <= instance->callchain.curframe; frame++) { + if (!instance->frames[frame]) + continue; + + for (i = po_start; i < po_end; i++) + changed |= update_insn(env, instance, frame, insn_postorder[i]); + } + } while (changed); + + if (env->log.level & BPF_LOG_LEVEL2) + bpf_log(&env->log, "%s live stack update done in %d iterations\n", + fmt_callchain(env, callchain), cnt); + + /* transfer marks accumulated for outer frames to outer func instance (caller) */ + if (callchain->curframe > 0) { + err = propagate_to_outer_instance(env, instance); + if (err) + return err; + } + + return 0; +} + +/* + * Prepare all callchains within @env->cur_state for querying. + * This function should be called after each verifier.c:pop_stack() + * and whenever verifier.c:do_check_insn() processes subprogram exit. + * This would guarantee that visited verifier states with zero branches + * have their bpf_mark_stack_{read,write}() effects propagated in + * @env->liveness. + */ +int bpf_update_live_stack(struct bpf_verifier_env *env) +{ + struct func_instance *instance; + int err, frame; + + bpf_reset_live_stack_callchain(env); + for (frame = env->cur_state->curframe; frame >= 0; --frame) { + instance = lookup_instance(env, env->cur_state, frame); + if (IS_ERR(instance)) + return PTR_ERR(instance); + + if (instance->updated) { + err = update_instance(env, instance); + if (err) + return err; + instance->updated = false; + instance->must_write_dropped = false; + } + } + return 0; +} + +static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi) +{ + struct per_frame_masks *masks; + + masks = get_frame_masks(instance, frameno, insn_idx); + return masks && (masks->live_before & BIT(spi)); +} + +int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct live_stack_query *q = &env->liveness->live_stack_query; + struct func_instance *instance; + u32 frame; + + memset(q, 0, sizeof(*q)); + for (frame = 0; frame <= st->curframe; frame++) { + instance = lookup_instance(env, st, frame); + if (IS_ERR(instance)) + return PTR_ERR(instance); + q->instances[frame] = instance; + } + q->curframe = st->curframe; + q->insn_idx = st->insn_idx; + return 0; +} + +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi) +{ + /* + * Slot is alive if it is read before q->st->insn_idx in current func instance, + * or if for some outer func instance: + * - alive before callsite if callsite calls callback, otherwise + * - alive after callsite + */ + struct live_stack_query *q = &env->liveness->live_stack_query; + struct func_instance *instance, *curframe_instance; + u32 i, callsite; + bool alive; + + curframe_instance = q->instances[q->curframe]; + if (is_live_before(curframe_instance, q->insn_idx, frameno, spi)) + return true; + + for (i = frameno; i < q->curframe; i++) { + callsite = curframe_instance->callchain.callsites[i]; + instance = q->instances[i]; + alive = bpf_calls_callback(env, callsite) + ? is_live_before(instance, callsite, frameno, spi) + : is_live_before(instance, callsite + 1, frameno, spi); + if (alive) + return true; + } + + return false; +} -- cgit v1.2.3 From ccf25a67c7e29cfa6815d193054789b45ef825ad Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:41 -0700 Subject: bpf: signal error if old liveness is more conservative than new Unlike the new algorithm, register chain based liveness tracking is fully path sensitive, and thus should be strictly more accurate. Validate the new algorithm by signaling an error whenever it considers a stack slot dead while the old algorithm considers it alive. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-8-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e3bdd50e2ba..dec5da3a2e59 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -852,6 +852,7 @@ struct bpf_verifier_env { /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; + bool internal_error; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb931a144b95..f70e34a38c13 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18576,6 +18576,11 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { if (!bpf_stack_slot_alive(env, st->frameno, i)) { + if (st->stack[i].spilled_ptr.live & REG_LIVE_READ) { + verifier_bug(env, "incorrect live marks #1 for insn %d frameno %d spi %d\n", + env->insn_idx, st->frameno, i); + env->internal_error = true; + } __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; @@ -19546,6 +19551,8 @@ skip_inf_loop_check: loop = incomplete_read_marks(env, &sl->state); if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: + if (env->internal_error) + return -EFAULT; sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. @@ -19660,6 +19667,8 @@ hit: return 1; } miss: + if (env->internal_error) + return -EFAULT; /* when new state is not going to be added do not increase miss count. * Otherwise several loop iterations will remove the state * recorded earlier. The goal of these heuristics is to have -- cgit v1.2.3 From 107e169799057bc6a379ddb625cbe1e51cfc7d72 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:42 -0700 Subject: bpf: disable and remove registers chain based liveness Remove register chain based liveness tracking: - struct bpf_reg_state->{parent,live} fields are no longer needed; - REG_LIVE_WRITTEN marks are superseded by bpf_mark_stack_write() calls; - mark_reg_read() calls are superseded by bpf_mark_stack_read(); - log.c:print_liveness() is superseded by logging in liveness.c; - propagate_liveness() is superseded by bpf_update_live_stack(); - no need to establish register chains in is_state_visited() anymore; - fix a bunch of tests expecting "_w" suffixes in verifier log messages. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-9-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/verifier.rst | 264 ----------------- include/linux/bpf_verifier.h | 25 -- kernel/bpf/log.c | 26 +- kernel/bpf/verifier.c | 315 ++------------------- tools/testing/selftests/bpf/prog_tests/align.c | 178 ++++++------ tools/testing/selftests/bpf/prog_tests/spin_lock.c | 12 +- .../selftests/bpf/prog_tests/test_veristat.c | 44 +-- .../selftests/bpf/progs/exceptions_assert.c | 34 +-- .../selftests/bpf/progs/iters_state_safety.c | 4 +- .../selftests/bpf/progs/iters_testmod_seq.c | 6 +- .../selftests/bpf/progs/mem_rdonly_untrusted.c | 4 +- .../testing/selftests/bpf/progs/verifier_bounds.c | 38 +-- .../selftests/bpf/progs/verifier_global_ptr_args.c | 4 +- tools/testing/selftests/bpf/progs/verifier_ldsx.c | 2 +- .../selftests/bpf/progs/verifier_precision.c | 16 +- .../selftests/bpf/progs/verifier_scalar_ids.c | 10 +- .../selftests/bpf/progs/verifier_spill_fill.c | 40 +-- .../bpf/progs/verifier_subprog_precision.c | 6 +- tools/testing/selftests/bpf/verifier/bpf_st_mem.c | 4 +- 19 files changed, 226 insertions(+), 806 deletions(-) (limited to 'include') diff --git a/Documentation/bpf/verifier.rst b/Documentation/bpf/verifier.rst index 95e6f80a407e..510d15bc697b 100644 --- a/Documentation/bpf/verifier.rst +++ b/Documentation/bpf/verifier.rst @@ -347,270 +347,6 @@ However, only the value of register ``r1`` is important to successfully finish verification. The goal of the liveness tracking algorithm is to spot this fact and figure out that both states are actually equivalent. -Data structures -~~~~~~~~~~~~~~~ - -Liveness is tracked using the following data structures:: - - enum bpf_reg_liveness { - REG_LIVE_NONE = 0, - REG_LIVE_READ32 = 0x1, - REG_LIVE_READ64 = 0x2, - REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, - REG_LIVE_WRITTEN = 0x4, - REG_LIVE_DONE = 0x8, - }; - - struct bpf_reg_state { - ... - struct bpf_reg_state *parent; - ... - enum bpf_reg_liveness live; - ... - }; - - struct bpf_stack_state { - struct bpf_reg_state spilled_ptr; - ... - }; - - struct bpf_func_state { - struct bpf_reg_state regs[MAX_BPF_REG]; - ... - struct bpf_stack_state *stack; - } - - struct bpf_verifier_state { - struct bpf_func_state *frame[MAX_CALL_FRAMES]; - struct bpf_verifier_state *parent; - ... - } - -* ``REG_LIVE_NONE`` is an initial value assigned to ``->live`` fields upon new - verifier state creation; - -* ``REG_LIVE_WRITTEN`` means that the value of the register (or stack slot) is - defined by some instruction verified between this verifier state's parent and - verifier state itself; - -* ``REG_LIVE_READ{32,64}`` means that the value of the register (or stack slot) - is read by a some child state of this verifier state; - -* ``REG_LIVE_DONE`` is a marker used by ``clean_verifier_state()`` to avoid - processing same verifier state multiple times and for some sanity checks; - -* ``->live`` field values are formed by combining ``enum bpf_reg_liveness`` - values using bitwise or. - -Register parentage chains -~~~~~~~~~~~~~~~~~~~~~~~~~ - -In order to propagate information between parent and child states, a *register -parentage chain* is established. Each register or stack slot is linked to a -corresponding register or stack slot in its parent state via a ``->parent`` -pointer. This link is established upon state creation in ``is_state_visited()`` -and might be modified by ``set_callee_state()`` called from -``__check_func_call()``. - -The rules for correspondence between registers / stack slots are as follows: - -* For the current stack frame, registers and stack slots of the new state are - linked to the registers and stack slots of the parent state with the same - indices. - -* For the outer stack frames, only callee saved registers (r6-r9) and stack - slots are linked to the registers and stack slots of the parent state with the - same indices. - -* When function call is processed a new ``struct bpf_func_state`` instance is - allocated, it encapsulates a new set of registers and stack slots. For this - new frame, parent links for r6-r9 and stack slots are set to nil, parent links - for r1-r5 are set to match caller r1-r5 parent links. - -This could be illustrated by the following diagram (arrows stand for -``->parent`` pointers):: - - ... ; Frame #0, some instructions - --- checkpoint #0 --- - 1 : r6 = 42 ; Frame #0 - --- checkpoint #1 --- - 2 : call foo() ; Frame #0 - ... ; Frame #1, instructions from foo() - --- checkpoint #2 --- - ... ; Frame #1, instructions from foo() - --- checkpoint #3 --- - exit ; Frame #1, return from foo() - 3 : r1 = r6 ; Frame #0 <- current state - - +-------------------------------+-------------------------------+ - | Frame #0 | Frame #1 | - Checkpoint +-------------------------------+-------------------------------+ - #0 | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - ^ ^ ^ ^ - | | | | - Checkpoint +-------------------------------+ - #1 | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - ^ ^ ^ - |_______|_______|_______________ - | | | - nil nil | | | nil nil - | | | | | | | - Checkpoint +-------------------------------+-------------------------------+ - #2 | r0 | r1-r5 | r6-r9 | fp-8 ... | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+-------------------------------+ - ^ ^ ^ ^ ^ - nil nil | | | | | - | | | | | | | - Checkpoint +-------------------------------+-------------------------------+ - #3 | r0 | r1-r5 | r6-r9 | fp-8 ... | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+-------------------------------+ - ^ ^ - nil nil | | - | | | | - Current +-------------------------------+ - state | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - \ - r6 read mark is propagated via these links - all the way up to checkpoint #1. - The checkpoint #1 contains a write mark for r6 - because of instruction (1), thus read propagation - does not reach checkpoint #0 (see section below). - -Liveness marks tracking -~~~~~~~~~~~~~~~~~~~~~~~ - -For each processed instruction, the verifier tracks read and written registers -and stack slots. The main idea of the algorithm is that read marks propagate -back along the state parentage chain until they hit a write mark, which 'screens -off' earlier states from the read. The information about reads is propagated by -function ``mark_reg_read()`` which could be summarized as follows:: - - mark_reg_read(struct bpf_reg_state *state, ...): - parent = state->parent - while parent: - if state->live & REG_LIVE_WRITTEN: - break - if parent->live & REG_LIVE_READ64: - break - parent->live |= REG_LIVE_READ64 - state = parent - parent = state->parent - -Notes: - -* The read marks are applied to the **parent** state while write marks are - applied to the **current** state. The write mark on a register or stack slot - means that it is updated by some instruction in the straight-line code leading - from the parent state to the current state. - -* Details about REG_LIVE_READ32 are omitted. - -* Function ``propagate_liveness()`` (see section :ref:`read_marks_for_cache_hits`) - might override the first parent link. Please refer to the comments in the - ``propagate_liveness()`` and ``mark_reg_read()`` source code for further - details. - -Because stack writes could have different sizes ``REG_LIVE_WRITTEN`` marks are -applied conservatively: stack slots are marked as written only if write size -corresponds to the size of the register, e.g. see function ``save_register_state()``. - -Consider the following example:: - - 0: (*u64)(r10 - 8) = 0 ; define 8 bytes of fp-8 - --- checkpoint #0 --- - 1: (*u32)(r10 - 8) = 1 ; redefine lower 4 bytes - 2: r1 = (*u32)(r10 - 8) ; read lower 4 bytes defined at (1) - 3: r2 = (*u32)(r10 - 4) ; read upper 4 bytes defined at (0) - -As stated above, the write at (1) does not count as ``REG_LIVE_WRITTEN``. Should -it be otherwise, the algorithm above wouldn't be able to propagate the read mark -from (3) to checkpoint #0. - -Once the ``BPF_EXIT`` instruction is reached ``update_branch_counts()`` is -called to update the ``->branches`` counter for each verifier state in a chain -of parent verifier states. When the ``->branches`` counter reaches zero the -verifier state becomes a valid entry in a set of cached verifier states. - -Each entry of the verifier states cache is post-processed by a function -``clean_live_states()``. This function marks all registers and stack slots -without ``REG_LIVE_READ{32,64}`` marks as ``NOT_INIT`` or ``STACK_INVALID``. -Registers/stack slots marked in this way are ignored in function ``stacksafe()`` -called from ``states_equal()`` when a state cache entry is considered for -equivalence with a current state. - -Now it is possible to explain how the example from the beginning of the section -works:: - - 0: call bpf_get_prandom_u32() - 1: r1 = 0 - 2: if r0 == 0 goto +1 - 3: r0 = 1 - --- checkpoint[0] --- - 4: r0 = r1 - 5: exit - -* At instruction #2 branching point is reached and state ``{ r0 == 0, r1 == 0, pc == 4 }`` - is pushed to states processing queue (pc stands for program counter). - -* At instruction #4: - - * ``checkpoint[0]`` states cache entry is created: ``{ r0 == 1, r1 == 0, pc == 4 }``; - * ``checkpoint[0].r0`` is marked as written; - * ``checkpoint[0].r1`` is marked as read; - -* At instruction #5 exit is reached and ``checkpoint[0]`` can now be processed - by ``clean_live_states()``. After this processing ``checkpoint[0].r1`` has a - read mark and all other registers and stack slots are marked as ``NOT_INIT`` - or ``STACK_INVALID`` - -* The state ``{ r0 == 0, r1 == 0, pc == 4 }`` is popped from the states queue - and is compared against a cached state ``{ r1 == 0, pc == 4 }``, the states - are considered equivalent. - -.. _read_marks_for_cache_hits: - -Read marks propagation for cache hits -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Another point is the handling of read marks when a previously verified state is -found in the states cache. Upon cache hit verifier must behave in the same way -as if the current state was verified to the program exit. This means that all -read marks, present on registers and stack slots of the cached state, must be -propagated over the parentage chain of the current state. Example below shows -why this is important. Function ``propagate_liveness()`` handles this case. - -Consider the following state parentage chain (S is a starting state, A-E are -derived states, -> arrows show which state is derived from which):: - - r1 read - <------------- A[r1] == 0 - C[r1] == 0 - S ---> A ---> B ---> exit E[r1] == 1 - | - ` ---> C ---> D - | - ` ---> E ^ - |___ suppose all these - ^ states are at insn #Y - | - suppose all these - states are at insn #X - -* Chain of states ``S -> A -> B -> exit`` is verified first. - -* While ``B -> exit`` is verified, register ``r1`` is read and this read mark is - propagated up to state ``A``. - -* When chain of states ``C -> D`` is verified the state ``D`` turns out to be - equivalent to state ``B``. - -* The read mark for ``r1`` has to be propagated to state ``C``, otherwise state - ``C`` might get mistakenly marked as equivalent to state ``E`` even though - values for register ``r1`` differ between ``C`` and ``E``. - Understanding eBPF verifier messages ==================================== diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index dec5da3a2e59..c7515da8500c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -26,27 +26,6 @@ /* Patch buffer size */ #define INSN_BUF_SIZE 32 -/* Liveness marks, used for registers and spilled-regs (in stack slots). - * Read marks propagate upwards until they find a write mark; they record that - * "one of this state's descendants read this reg" (and therefore the reg is - * relevant for states_equal() checks). - * Write marks collect downwards and do not propagate; they record that "the - * straight-line code that reached this state (from its parent) wrote this reg" - * (and therefore that reads propagated from this state or its descendants - * should not propagate to its parent). - * A state with a write mark can receive read marks; it just won't propagate - * them to its parent, since the write mark is a property, not of the state, - * but of the link between it and its parent. See mark_reg_read() and - * mark_stack_slot_read() in kernel/bpf/verifier.c. - */ -enum bpf_reg_liveness { - REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ - REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ - REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ - REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, - REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ -}; - #define ITER_PREFIX "bpf_iter_" enum bpf_iter_state { @@ -211,8 +190,6 @@ struct bpf_reg_state { * allowed and has the same effect as bpf_sk_release(sk). */ u32 ref_obj_id; - /* parentage chain for liveness checking */ - struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' @@ -225,7 +202,6 @@ struct bpf_reg_state { * patching which only happens after main verification finished. */ s32 subreg_def; - enum bpf_reg_liveness live; /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ bool precise; }; @@ -852,7 +828,6 @@ struct bpf_verifier_env { /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; - bool internal_error; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 0d6d7bfb2fd0..f50533169cc3 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -542,17 +542,6 @@ static char slot_type_char[] = { [STACK_IRQ_FLAG] = 'f' }; -static void print_liveness(struct bpf_verifier_env *env, - enum bpf_reg_liveness live) -{ - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) - verbose(env, "_"); - if (live & REG_LIVE_READ) - verbose(env, "r"); - if (live & REG_LIVE_WRITTEN) - verbose(env, "w"); -} - #define UNUM_MAX_DECIMAL U16_MAX #define SNUM_MAX_DECIMAL S16_MAX #define SNUM_MIN_DECIMAL S16_MIN @@ -770,7 +759,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie if (!print_all && !reg_scratched(env, i)) continue; verbose(env, " R%d", i); - print_liveness(env, reg->live); verbose(env, "="); print_reg_state(env, state, reg); } @@ -803,9 +791,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie break; types_buf[j] = '\0'; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); print_reg_state(env, state, reg); break; case STACK_DYNPTR: @@ -814,7 +800,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie reg = &state->stack[i].spilled_ptr; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) verbose_a("id=%d", reg->id); @@ -829,9 +814,8 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie if (!reg->ref_obj_id) continue; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)", + (-i - 1) * BPF_REG_SIZE, iter_type_str(reg->iter.btf, reg->iter.btf_id), reg->ref_obj_id, iter_state_str(reg->iter.state), reg->iter.depth); @@ -839,9 +823,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie case STACK_MISC: case STACK_ZERO: default: - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); break; } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f70e34a38c13..e1da2471442b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -787,8 +787,6 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; @@ -806,29 +804,6 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot? - * - * While we don't allow reading STACK_INVALID, it is still possible to - * do <8 byte writes marking some but not all slots as STACK_MISC. Then, - * helpers or insns can do partial read of that part without failing, - * but check_stack_range_initialized, check_stack_read_var_off, and - * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of - * the slot conservatively. Hence we need to prevent those liveness - * marking walks. - * - * This was not a problem before because STACK_INVALID is only set by - * default (where the default reg state has its reg->parent as NULL), or - * in clean_live_states after REG_LIVE_DONE (at which point - * mark_reg_read won't walk reg->parent chain), but not randomly during - * verifier state exploration (like we did above). Hence, for our case - * parentage chain will still be live (i.e. reg->parent may be - * non-NULL), while earlier reg->parent was NULL, so we need - * REG_LIVE_WRITTEN to screen off read marker propagation when it is - * done later on reads or by mark_dynptr_read as well to unnecessary - * mark registers in verifier state. - */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); } @@ -938,9 +913,6 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - /* Same reason as unmark_stack_slots_dynptr above */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; @@ -1059,7 +1031,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, else st->type |= PTR_UNTRUSTED; } - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = i == 0 ? id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; @@ -1095,9 +1066,6 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, __mark_reg_not_init(env, st); - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; - for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_INVALID; @@ -1194,7 +1162,6 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, bpf_mark_stack_write(env, reg->frameno, BIT(spi)); __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = id; st->irq.kfunc_class = kfunc_class; @@ -1248,8 +1215,6 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r __mark_reg_not_init(env, st); - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, reg->frameno, BIT(spi)); for (i = 0; i < BPF_REG_SIZE; i++) @@ -2901,8 +2866,6 @@ static void init_reg_state(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(env, regs, i); - regs[i].live = REG_LIVE_NONE; - regs[i].parent = NULL; regs[i].subreg_def = DEF_NOT_SUBREG; } @@ -3583,64 +3546,12 @@ next: return 0; } -/* Parentage chain of this register (or stack slot) should take care of all - * issues like callee-saved registers, stack slot allocation time, etc. - */ -static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_reg_state *state, - struct bpf_reg_state *parent, u8 flag) -{ - bool writes = parent == state->parent; /* Observe write marks */ - int cnt = 0; - - while (parent) { - /* if read wasn't screened by an earlier write ... */ - if (writes && state->live & REG_LIVE_WRITTEN) - break; - /* The first condition is more likely to be true than the - * second, checked it first. - */ - if ((parent->live & REG_LIVE_READ) == flag || - parent->live & REG_LIVE_READ64) - /* The parentage chain never changes and - * this parent was already marked as LIVE_READ. - * There is no need to keep walking the chain again and - * keep re-marking all parents as LIVE_READ. - * This case happens when the same register is read - * multiple times without writes into it in-between. - * Also, if parent has the stronger REG_LIVE_READ64 set, - * then no need to set the weak REG_LIVE_READ32. - */ - break; - /* ... then we depend on parent's value */ - parent->live |= flag; - /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ - if (flag == REG_LIVE_READ64) - parent->live &= ~REG_LIVE_READ32; - state = parent; - parent = state->parent; - writes = true; - cnt++; - } - - if (env->longest_mark_read_walk < cnt) - env->longest_mark_read_walk = cnt; - return 0; -} - static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots) { - struct bpf_func_state *state = func(env, reg); int err, i; for (i = 0; i < nr_slots; i++) { - struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; - - err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); - if (err) - return err; - err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i)); if (err) return err; @@ -3852,15 +3763,13 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r if (rw64) mark_insn_zext(env, reg); - return mark_reg_read(env, reg, reg->parent, - rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); + return 0; } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); @@ -5065,12 +4974,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, /* Copy src state preserving dst->parent and dst->live fields */ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) { - struct bpf_reg_state *parent = dst->parent; - enum bpf_reg_liveness live = dst->live; - *dst = *src; - dst->parent = parent; - dst->live = live; } static void save_register_state(struct bpf_verifier_env *env, @@ -5081,8 +4985,6 @@ static void save_register_state(struct bpf_verifier_env *env, int i; copy_register_state(&state->stack[spi].spilled_ptr, reg); - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; @@ -5231,17 +5133,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, for (i = 0; i < BPF_REG_SIZE; i++) scrub_spilled_slot(&state->stack[spi].slot_type[i]); - /* only mark the slot as written if all 8 bytes were written - * otherwise read propagation may incorrectly stop too soon - * when stack slots are partially written. - * This heuristic means that read propagation will be - * conservative, since it will add reg_live_read marks - * to stack slots all the way to first state when programs - * writes+reads less than 8 bytes - */ - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - /* when we zero initialize stack slots mark them as such */ if ((reg && register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { @@ -5434,7 +5325,6 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, dst_regno); } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } /* Read the stack at 'off' and put the results into the register indicated by @@ -5481,7 +5371,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno < 0) return 0; @@ -5535,7 +5424,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* not restoring original register state */ } } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (dst_regno >= 0) { /* restore register state from stack */ copy_register_state(&state->regs[dst_regno], reg); @@ -5543,7 +5431,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE @@ -5555,7 +5442,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; @@ -5569,7 +5455,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno >= 0) mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); insn_flags = 0; /* we are not restoring spilled register */ @@ -8197,13 +8082,10 @@ mark: /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ - mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi)); if (err) return err; - /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not + /* We do not call bpf_mark_stack_write(), as we can not * be sure that whether stack slot is written to or not. Hence, * we must still conservatively propagate reads upwards even if * helper may write to the entire memory range. @@ -11041,8 +10923,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) } /* we are going to rely on register's precise value */ - err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64); - err = err ?: mark_chain_precision(env, BPF_REG_0); + err = mark_chain_precision(env, BPF_REG_0); if (err) return err; @@ -11946,17 +11827,11 @@ static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_re if (regno == BPF_REG_0) { /* Function return value */ - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = reg_size == sizeof(u64) ? DEF_NOT_SUBREG : env->insn_idx + 1; - } else { + } else if (reg_size == sizeof(u64)) { /* Function argument */ - if (reg_size == sizeof(u64)) { - mark_insn_zext(env, reg); - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - } else { - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); - } + mark_insn_zext(env, reg); } } @@ -15710,7 +15585,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ @@ -15729,7 +15603,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (!no_sext) dst_reg->id = 0; coerce_reg_to_size_sx(dst_reg, insn->off >> 3); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -15755,7 +15628,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ if (!is_src_reg_u32) dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ @@ -15766,7 +15638,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) copy_register_state(dst_reg, src_reg); if (!no_sext) dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; coerce_subreg_to_size_sx(dst_reg, insn->off >> 3); } @@ -18576,11 +18447,6 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { if (!bpf_stack_slot_alive(env, st->frameno, i)) { - if (st->stack[i].spilled_ptr.live & REG_LIVE_READ) { - verifier_bug(env, "incorrect live marks #1 for insn %d frameno %d spi %d\n", - env->insn_idx, st->frameno, i); - env->internal_error = true; - } __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; @@ -18609,25 +18475,23 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * but a lot of states will get revised from liveness point of view when * the verifier explores other branches. * Example: - * 1: r0 = 1 + * 1: *(u64)(r10 - 8) = 1 * 2: if r1 == 100 goto pc+1 - * 3: r0 = 2 - * 4: exit - * when the verifier reaches exit insn the register r0 in the state list of - * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch - * of insn 2 and goes exploring further. At the insn 4 it will walk the - * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. + * 3: *(u64)(r10 - 8) = 2 + * 4: r0 = *(u64)(r10 - 8) + * 5: exit + * when the verifier reaches exit insn the stack slot -8 in the state list of + * insn 2 is not yet marked alive. Then the verifier pops the other_branch + * of insn 2 and goes exploring further. After the insn 4 read, liveness + * analysis would propagate read mark for -8 at insn 2. * * Since the verifier pushes the branch states as it sees them while exploring * the program the condition of walking the branch instruction for the second * time means that all states below this branch were already explored and * their final liveness marks are already propagated. * Hence when the verifier completes the search of state list in is_state_visited() - * we can call this clean_live_states() function to mark all liveness states - * as st->cleaned to indicate that 'parent' pointers of 'struct bpf_reg_state' - * will not be used. - * This function also clears the registers and stack for states that !READ - * to simplify state merging. + * we can call this clean_live_states() function to clear dead the registers and stack + * slots to simplify state merging. * * Important note here that walking the same branch instruction in the callee * doesn't meant that the states are DONE. The verifier has to compare @@ -18802,7 +18666,6 @@ static struct bpf_reg_state unbound_reg; static __init int unbound_reg_init(void) { __mark_reg_unknown_imprecise(&unbound_reg); - unbound_reg.live |= REG_LIVE_READ; return 0; } late_initcall(unbound_reg_init); @@ -19097,91 +18960,6 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } -/* Return 0 if no propagation happened. Return negative error code if error - * happened. Otherwise, return the propagated bit. - */ -static int propagate_liveness_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - struct bpf_reg_state *parent_reg) -{ - u8 parent_flag = parent_reg->live & REG_LIVE_READ; - u8 flag = reg->live & REG_LIVE_READ; - int err; - - /* When comes here, read flags of PARENT_REG or REG could be any of - * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need - * of propagation if PARENT_REG has strongest REG_LIVE_READ64. - */ - if (parent_flag == REG_LIVE_READ64 || - /* Or if there is no read flag from REG. */ - !flag || - /* Or if the read flag from REG is the same as PARENT_REG. */ - parent_flag == flag) - return 0; - - err = mark_reg_read(env, reg, parent_reg, flag); - if (err) - return err; - - return flag; -} - -/* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at an - * equivalent state (jump target or such) we didn't arrive by the straight-line - * code, so read marks in the state must propagate to the parent regardless - * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() is for. - */ -static int propagate_liveness(struct bpf_verifier_env *env, - const struct bpf_verifier_state *vstate, - struct bpf_verifier_state *vparent, - bool *changed) -{ - struct bpf_reg_state *state_reg, *parent_reg; - struct bpf_func_state *state, *parent; - int i, frame, err = 0; - bool tmp = false; - - changed = changed ?: &tmp; - if (vparent->curframe != vstate->curframe) { - WARN(1, "propagate_live: parent frame %d current frame %d\n", - vparent->curframe, vstate->curframe); - return -EFAULT; - } - /* Propagate read liveness of registers... */ - BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - for (frame = 0; frame <= vstate->curframe; frame++) { - parent = vparent->frame[frame]; - state = vstate->frame[frame]; - parent_reg = parent->regs; - state_reg = state->regs; - /* We don't need to worry about FP liveness, it's read-only */ - for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - err = propagate_liveness_reg(env, &state_reg[i], - &parent_reg[i]); - if (err < 0) - return err; - *changed |= err > 0; - if (err == REG_LIVE_READ64) - mark_insn_zext(env, &parent_reg[i]); - } - - /* Propagate stack slots. */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - parent_reg = &parent->stack[i].spilled_ptr; - state_reg = &state->stack[i].spilled_ptr; - err = propagate_liveness_reg(env, state_reg, - parent_reg); - *changed |= err > 0; - if (err < 0) - return err; - } - } - return 0; -} - /* find precise scalars in the previous equivalent state and * propagate them into the current state */ @@ -19201,8 +18979,7 @@ static int propagate_precision(struct bpf_verifier_env *env, first = true; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) + !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) @@ -19219,8 +18996,7 @@ static int propagate_precision(struct bpf_verifier_env *env, continue; state_reg = &state->stack[i].spilled_ptr; if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) + !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) @@ -19270,9 +19046,6 @@ static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visi changed = false; for (backedge = visit->backedges; backedge; backedge = backedge->next) { st = &backedge->state; - err = propagate_liveness(env, st->equal_state, st, &changed); - if (err) - return err; err = propagate_precision(env, st->equal_state, st, &changed); if (err) return err; @@ -19296,7 +19069,7 @@ static bool states_maybe_looping(struct bpf_verifier_state *old, fcur = cur->frame[fr]; for (i = 0; i < MAX_BPF_REG; i++) if (memcmp(&fold->regs[i], &fcur->regs[i], - offsetof(struct bpf_reg_state, parent))) + offsetof(struct bpf_reg_state, frameno))) return false; return true; } @@ -19394,7 +19167,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new; bool force_new_state, add_new_state, loop; - int i, j, n, err, states_cnt = 0; + int n, err, states_cnt = 0; struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || @@ -19551,28 +19324,16 @@ skip_inf_loop_check: loop = incomplete_read_marks(env, &sl->state); if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: - if (env->internal_error) - return -EFAULT; sl->hit_cnt++; - /* reached equivalent register/stack state, - * prune the search. - * Registers read by the continuation are read by us. - * If we have any write marks in env->cur_state, they - * will prevent corresponding reads in the continuation - * from reaching our parent (an explored_state). Our - * own state will get the read marks recorded, but - * they'll be immediately forgotten as we're pruning - * this state and will pop a new one. - */ - err = propagate_liveness(env, &sl->state, cur, NULL); /* if previous state reached the exit with precision and * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ + err = 0; if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0, 0); + err = push_jmp_history(env, cur, 0, 0); err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; @@ -19667,8 +19428,6 @@ hit: return 1; } miss: - if (env->internal_error) - return -EFAULT; /* when new state is not going to be added do not increase miss count. * Otherwise several loop iterations will remove the state * recorded earlier. The goal of these heuristics is to have @@ -19754,38 +19513,6 @@ miss: cur->dfs_depth = new->dfs_depth + 1; clear_jmp_history(cur); list_add(&new_sl->node, head); - - /* connect new state to parentage chain. Current frame needs all - * registers connected. Only r6 - r9 of the callers are alive (pushed - * to the stack implicitly by JITs) so in callers' frames connect just - * r6 - r9 as an optimization. Callers will have r1 - r5 connected to - * the state of the call instruction (with WRITTEN set), and r0 comes - * from callee with its full parentage chain, anyway. - */ - /* clear write marks in current state: the writes we did are not writes - * our child did, so they don't screen off its reads from us. - * (There are no read marks in current state, because reads always mark - * their parent and current state never has children yet. Only - * explored_states can get read marks.) - */ - for (j = 0; j <= cur->curframe; j++) { - for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; - for (i = 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].live = REG_LIVE_NONE; - } - - /* all stack frames are accessible from callee, clear them all */ - for (j = 0; j <= cur->curframe; j++) { - struct bpf_func_state *frame = cur->frame[j]; - struct bpf_func_state *newframe = new->frame[j]; - - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { - frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; - frame->stack[i].spilled_ptr.parent = - &newframe->stack[i].spilled_ptr; - } - } return 0; } diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 1d53a8561ee2..24c509ce4e5b 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -42,11 +42,11 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "2"}, - {1, "R3_w", "4"}, - {2, "R3_w", "8"}, - {3, "R3_w", "16"}, - {4, "R3_w", "32"}, + {0, "R3", "2"}, + {1, "R3", "4"}, + {2, "R3", "8"}, + {3, "R3", "16"}, + {4, "R3", "32"}, }, }, { @@ -70,17 +70,17 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "1"}, - {1, "R3_w", "2"}, - {2, "R3_w", "4"}, - {3, "R3_w", "8"}, - {4, "R3_w", "16"}, - {5, "R3_w", "1"}, - {6, "R4_w", "32"}, - {7, "R4_w", "16"}, - {8, "R4_w", "8"}, - {9, "R4_w", "4"}, - {10, "R4_w", "2"}, + {0, "R3", "1"}, + {1, "R3", "2"}, + {2, "R3", "4"}, + {3, "R3", "8"}, + {4, "R3", "16"}, + {5, "R3", "1"}, + {6, "R4", "32"}, + {7, "R4", "16"}, + {8, "R4", "8"}, + {9, "R4", "4"}, + {10, "R4", "2"}, }, }, { @@ -99,12 +99,12 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "4"}, - {1, "R3_w", "8"}, - {2, "R3_w", "10"}, - {3, "R4_w", "8"}, - {4, "R4_w", "12"}, - {5, "R4_w", "14"}, + {0, "R3", "4"}, + {1, "R3", "8"}, + {2, "R3", "10"}, + {3, "R4", "8"}, + {4, "R4", "12"}, + {5, "R4", "14"}, }, }, { @@ -121,10 +121,10 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "7"}, - {1, "R3_w", "7"}, - {2, "R3_w", "14"}, - {3, "R3_w", "56"}, + {0, "R3", "7"}, + {1, "R3", "7"}, + {2, "R3", "14"}, + {3, "R3", "56"}, }, }, @@ -162,19 +162,19 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R0_w", "pkt(off=8,r=8)"}, - {6, "R3_w", "var_off=(0x0; 0xff)"}, - {7, "R3_w", "var_off=(0x0; 0x1fe)"}, - {8, "R3_w", "var_off=(0x0; 0x3fc)"}, - {9, "R3_w", "var_off=(0x0; 0x7f8)"}, - {10, "R3_w", "var_off=(0x0; 0xff0)"}, - {12, "R3_w", "pkt_end()"}, - {17, "R4_w", "var_off=(0x0; 0xff)"}, - {18, "R4_w", "var_off=(0x0; 0x1fe0)"}, - {19, "R4_w", "var_off=(0x0; 0xff0)"}, - {20, "R4_w", "var_off=(0x0; 0x7f8)"}, - {21, "R4_w", "var_off=(0x0; 0x3fc)"}, - {22, "R4_w", "var_off=(0x0; 0x1fe)"}, + {6, "R0", "pkt(off=8,r=8)"}, + {6, "R3", "var_off=(0x0; 0xff)"}, + {7, "R3", "var_off=(0x0; 0x1fe)"}, + {8, "R3", "var_off=(0x0; 0x3fc)"}, + {9, "R3", "var_off=(0x0; 0x7f8)"}, + {10, "R3", "var_off=(0x0; 0xff0)"}, + {12, "R3", "pkt_end()"}, + {17, "R4", "var_off=(0x0; 0xff)"}, + {18, "R4", "var_off=(0x0; 0x1fe0)"}, + {19, "R4", "var_off=(0x0; 0xff0)"}, + {20, "R4", "var_off=(0x0; 0x7f8)"}, + {21, "R4", "var_off=(0x0; 0x3fc)"}, + {22, "R4", "var_off=(0x0; 0x1fe)"}, }, }, { @@ -195,16 +195,16 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R3_w", "var_off=(0x0; 0xff)"}, - {7, "R4_w", "var_off=(0x0; 0xff)"}, - {8, "R4_w", "var_off=(0x0; 0xff)"}, - {9, "R4_w", "var_off=(0x0; 0xff)"}, - {10, "R4_w", "var_off=(0x0; 0x1fe)"}, - {11, "R4_w", "var_off=(0x0; 0xff)"}, - {12, "R4_w", "var_off=(0x0; 0x3fc)"}, - {13, "R4_w", "var_off=(0x0; 0xff)"}, - {14, "R4_w", "var_off=(0x0; 0x7f8)"}, - {15, "R4_w", "var_off=(0x0; 0xff0)"}, + {6, "R3", "var_off=(0x0; 0xff)"}, + {7, "R4", "var_off=(0x0; 0xff)"}, + {8, "R4", "var_off=(0x0; 0xff)"}, + {9, "R4", "var_off=(0x0; 0xff)"}, + {10, "R4", "var_off=(0x0; 0x1fe)"}, + {11, "R4", "var_off=(0x0; 0xff)"}, + {12, "R4", "var_off=(0x0; 0x3fc)"}, + {13, "R4", "var_off=(0x0; 0xff)"}, + {14, "R4", "var_off=(0x0; 0x7f8)"}, + {15, "R4", "var_off=(0x0; 0xff0)"}, }, }, { @@ -235,14 +235,14 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {2, "R5_w", "pkt(r=0)"}, - {4, "R5_w", "pkt(off=14,r=0)"}, - {5, "R4_w", "pkt(off=14,r=0)"}, + {2, "R5", "pkt(r=0)"}, + {4, "R5", "pkt(off=14,r=0)"}, + {5, "R4", "pkt(off=14,r=0)"}, {9, "R2", "pkt(r=18)"}, {10, "R5", "pkt(off=14,r=18)"}, - {10, "R4_w", "var_off=(0x0; 0xff)"}, - {13, "R4_w", "var_off=(0x0; 0xffff)"}, - {14, "R4_w", "var_off=(0x0; 0xffff)"}, + {10, "R4", "var_off=(0x0; 0xff)"}, + {13, "R4", "var_off=(0x0; 0xffff)"}, + {14, "R4", "var_off=(0x0; 0xffff)"}, }, }, { @@ -299,12 +299,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {7, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {7, "R6", "var_off=(0x0; 0x3fc)"}, /* Offset is added to packet pointer R5, resulting in * known fixed offset, and variable offset from R6. */ - {11, "R5_w", "pkt(id=1,off=14,"}, + {11, "R5", "pkt(id=1,off=14,"}, /* At the time the word size load is performed from R5, * it's total offset is NET_IP_ALIGN + reg->off (0) + * reg->aux_off (14) which is 16. Then the variable @@ -320,12 +320,12 @@ static struct bpf_align_test tests[] = { * instruction to validate R5 state. We also check * that R4 is what it should be in such case. */ - {18, "R4_w", "var_off=(0x0; 0x3fc)"}, - {18, "R5_w", "var_off=(0x0; 0x3fc)"}, + {18, "R4", "var_off=(0x0; 0x3fc)"}, + {18, "R5", "var_off=(0x0; 0x3fc)"}, /* Constant offset is added to R5, resulting in * reg->off of 14. */ - {19, "R5_w", "pkt(id=2,off=14,"}, + {19, "R5", "pkt(id=2,off=14,"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off * (14) which is 16. Then the variable offset is 4-byte @@ -337,21 +337,21 @@ static struct bpf_align_test tests[] = { /* Constant offset is added to R5 packet pointer, * resulting in reg->off value of 14. */ - {26, "R5_w", "pkt(off=14,r=8)"}, + {26, "R5", "pkt(off=14,r=8)"}, /* Variable offset is added to R5, resulting in a * variable offset of (4n). See comment for insn #18 * for R4 = R5 trick. */ - {28, "R4_w", "var_off=(0x0; 0x3fc)"}, - {28, "R5_w", "var_off=(0x0; 0x3fc)"}, + {28, "R4", "var_off=(0x0; 0x3fc)"}, + {28, "R5", "var_off=(0x0; 0x3fc)"}, /* Constant is added to R5 again, setting reg->off to 18. */ - {29, "R5_w", "pkt(id=3,off=18,"}, + {29, "R5", "pkt(id=3,off=18,"}, /* And once more we add a variable; resulting var_off * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {31, "R4_w", "var_off=(0x0; 0x7fc)"}, - {31, "R5_w", "var_off=(0x0; 0x7fc)"}, + {31, "R4", "var_off=(0x0; 0x7fc)"}, + {31, "R5", "var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. Then the variable offset is (4n), so @@ -397,12 +397,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {7, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {7, "R6", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ - {8, "R6_w", "var_off=(0x2; 0x7fc)"}, + {8, "R6", "var_off=(0x2; 0x7fc)"}, /* Packet pointer has (4n+2) offset */ - {11, "R5_w", "var_off=(0x2; 0x7fc)"}, + {11, "R5", "var_off=(0x2; 0x7fc)"}, {12, "R4", "var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) @@ -414,11 +414,11 @@ static struct bpf_align_test tests[] = { /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ - {17, "R6_w", "var_off=(0x0; 0x3fc)"}, + {17, "R6", "var_off=(0x0; 0x3fc)"}, /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). */ - {19, "R5_w", "var_off=(0x2; 0xffc)"}, + {19, "R5", "var_off=(0x2; 0xffc)"}, {20, "R4", "var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) @@ -459,18 +459,18 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .matches = { - {3, "R5_w", "pkt_end()"}, + {3, "R5", "pkt_end()"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {5, "R5_w", "var_off=(0x0; 0xfffffffffffffffc)"}, + {5, "R5", "var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. */ - {6, "R5_w", "var_off=(0x2; 0xfffffffffffffffc)"}, + {6, "R5", "var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ {9, "R5", "var_off=(0x2; 0x7ffffffffffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, - {12, "R4_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {11, "R6", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {12, "R4", "var_off=(0x2; 0x7ffffffffffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -478,7 +478,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {15, "R6", "var_off=(0x2; 0x7ffffffffffffffc)"}, } }, { @@ -513,12 +513,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {8, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {8, "R6", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ - {9, "R6_w", "var_off=(0x2; 0x7fc)"}, + {9, "R6", "var_off=(0x2; 0x7fc)"}, /* New unknown value in R7 is (4n) */ - {10, "R7_w", "var_off=(0x0; 0x3fc)"}, + {10, "R7", "var_off=(0x0; 0x3fc)"}, /* Subtracting it from R6 blows our unsigned bounds */ {11, "R6", "var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ @@ -566,16 +566,16 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {9, "R6_w", "var_off=(0x0; 0x3c)"}, + {6, "R2", "pkt(r=8)"}, + {9, "R6", "var_off=(0x0; 0x3c)"}, /* Adding 14 makes R6 be (4n+2) */ - {10, "R6_w", "var_off=(0x2; 0x7c)"}, + {10, "R6", "var_off=(0x2; 0x7c)"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w", "var_off=(0xffffffffffffff82; 0x7c)"}, + {13, "R5", "var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ - {14, "R7_w", "var_off=(0x0; 0x7fc)"}, + {14, "R7", "var_off=(0x0; 0x7fc)"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w", "var_off=(0x2; 0x7fc)"}, + {16, "R5", "var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index e3ea5dc2f697..254fbfeab06a 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -13,22 +13,22 @@ static struct { const char *err_msg; } spin_lock_fail_tests[] = { { "lock_id_kptr_preserve", - "5: (bf) r1 = r0 ; R0_w=ptr_foo(id=2,ref_obj_id=2) " - "R1_w=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" + "5: (bf) r1 = r0 ; R0=ptr_foo(id=2,ref_obj_id=2) " + "R1=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" "R1 type=ptr_ expected=percpu_ptr_" }, { "lock_id_global_zero", - "; R1_w=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" + "; R1=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" - " R0_w=map_value(id=1,map=array_map,ks=4,vs=8)" - " R1_w=map_value(id=1,map=array_map,ks=4,vs=8)\n" + " R0=map_value(id=1,map=array_map,ks=4,vs=8)" + " R1=map_value(id=1,map=array_map,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_innermapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" " R0=map_value(id=2,ks=4,vs=8)" - " R1_w=map_value(id=2,ks=4,vs=8)\n" + " R1=map_value(id=2,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mismatch_kptr_kptr", "bpf_spin_unlock of different lock" }, diff --git a/tools/testing/selftests/bpf/prog_tests/test_veristat.c b/tools/testing/selftests/bpf/prog_tests/test_veristat.c index 367f47e4a936..b38c16b4247f 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_veristat.c +++ b/tools/testing/selftests/bpf/prog_tests/test_veristat.c @@ -75,26 +75,26 @@ static void test_set_global_vars_succeeds(void) " -vl2 > %s", fix->veristat, fix->tmpfile); read(fix->fd, fix->output, fix->sz); - __CHECK_STR("_w=0xf000000000000001 ", "var_s64 = 0xf000000000000001"); - __CHECK_STR("_w=0xfedcba9876543210 ", "var_u64 = 0xfedcba9876543210"); - __CHECK_STR("_w=0x80000000 ", "var_s32 = -0x80000000"); - __CHECK_STR("_w=0x76543210 ", "var_u32 = 0x76543210"); - __CHECK_STR("_w=0x8000 ", "var_s16 = -32768"); - __CHECK_STR("_w=0xecec ", "var_u16 = 60652"); - __CHECK_STR("_w=128 ", "var_s8 = -128"); - __CHECK_STR("_w=255 ", "var_u8 = 255"); - __CHECK_STR("_w=11 ", "var_ea = EA2"); - __CHECK_STR("_w=12 ", "var_eb = EB2"); - __CHECK_STR("_w=13 ", "var_ec = EC2"); - __CHECK_STR("_w=1 ", "var_b = 1"); - __CHECK_STR("_w=170 ", "struct1[2].struct2[1][2].u.var_u8[2]=170"); - __CHECK_STR("_w=0xaaaa ", "union1.var_u16 = 0xaaaa"); - __CHECK_STR("_w=171 ", "arr[3]= 171"); - __CHECK_STR("_w=172 ", "arr[EA2] =172"); - __CHECK_STR("_w=10 ", "enum_arr[EC2]=EA3"); - __CHECK_STR("_w=173 ", "matrix[31][7][11]=173"); - __CHECK_STR("_w=174 ", "struct1[2].struct2[1][2].u.mat[5][3]=174"); - __CHECK_STR("_w=175 ", "struct11[7][5].struct2[0][1].u.mat[3][0]=175"); + __CHECK_STR("=0xf000000000000001 ", "var_s64 = 0xf000000000000001"); + __CHECK_STR("=0xfedcba9876543210 ", "var_u64 = 0xfedcba9876543210"); + __CHECK_STR("=0x80000000 ", "var_s32 = -0x80000000"); + __CHECK_STR("=0x76543210 ", "var_u32 = 0x76543210"); + __CHECK_STR("=0x8000 ", "var_s16 = -32768"); + __CHECK_STR("=0xecec ", "var_u16 = 60652"); + __CHECK_STR("=128 ", "var_s8 = -128"); + __CHECK_STR("=255 ", "var_u8 = 255"); + __CHECK_STR("=11 ", "var_ea = EA2"); + __CHECK_STR("=12 ", "var_eb = EB2"); + __CHECK_STR("=13 ", "var_ec = EC2"); + __CHECK_STR("=1 ", "var_b = 1"); + __CHECK_STR("=170 ", "struct1[2].struct2[1][2].u.var_u8[2]=170"); + __CHECK_STR("=0xaaaa ", "union1.var_u16 = 0xaaaa"); + __CHECK_STR("=171 ", "arr[3]= 171"); + __CHECK_STR("=172 ", "arr[EA2] =172"); + __CHECK_STR("=10 ", "enum_arr[EC2]=EA3"); + __CHECK_STR("=173 ", "matrix[31][7][11]=173"); + __CHECK_STR("=174 ", "struct1[2].struct2[1][2].u.mat[5][3]=174"); + __CHECK_STR("=175 ", "struct11[7][5].struct2[0][1].u.mat[3][0]=175"); out: teardown_fixture(fix); @@ -117,8 +117,8 @@ static void test_set_global_vars_from_file_succeeds(void) SYS(out, "%s set_global_vars.bpf.o -G \"@%s\" -vl2 > %s", fix->veristat, input_file, fix->tmpfile); read(fix->fd, fix->output, fix->sz); - __CHECK_STR("_w=0x8000 ", "var_s16 = -32768"); - __CHECK_STR("_w=0xecec ", "var_u16 = 60652"); + __CHECK_STR("=0x8000 ", "var_s16 = -32768"); + __CHECK_STR("=0xecec ", "var_u16 = 60652"); out: close(fd); diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index 5e0a1ca96d4e..a01c2736890f 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -18,43 +18,43 @@ return *(u64 *)num; \ } -__msg(": R0_w=0xffffffff80000000") +__msg(": R0=0xffffffff80000000") check_assert(s64, ==, eq_int_min, INT_MIN); -__msg(": R0_w=0x7fffffff") +__msg(": R0=0x7fffffff") check_assert(s64, ==, eq_int_max, INT_MAX); -__msg(": R0_w=0") +__msg(": R0=0") check_assert(s64, ==, eq_zero, 0); -__msg(": R0_w=0x8000000000000000 R1_w=0x8000000000000000") +__msg(": R0=0x8000000000000000 R1=0x8000000000000000") check_assert(s64, ==, eq_llong_min, LLONG_MIN); -__msg(": R0_w=0x7fffffffffffffff R1_w=0x7fffffffffffffff") +__msg(": R0=0x7fffffffffffffff R1=0x7fffffffffffffff") check_assert(s64, ==, eq_llong_max, LLONG_MAX); -__msg(": R0_w=scalar(id=1,smax=0x7ffffffe)") +__msg(": R0=scalar(id=1,smax=0x7ffffffe)") check_assert(s64, <, lt_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") check_assert(s64, <, lt_zero, 0); -__msg(": R0_w=scalar(id=1,smax=0xffffffff7fffffff") +__msg(": R0=scalar(id=1,smax=0xffffffff7fffffff") check_assert(s64, <, lt_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smax=0x7fffffff)") +__msg(": R0=scalar(id=1,smax=0x7fffffff)") check_assert(s64, <=, le_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smax=0)") +__msg(": R0=scalar(id=1,smax=0)") check_assert(s64, <=, le_zero, 0); -__msg(": R0_w=scalar(id=1,smax=0xffffffff80000000") +__msg(": R0=scalar(id=1,smax=0xffffffff80000000") check_assert(s64, <=, le_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >, gt_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >, gt_zero, 0); -__msg(": R0_w=scalar(id=1,smin=0xffffffff80000001") +__msg(": R0=scalar(id=1,smin=0xffffffff80000001") check_assert(s64, >, gt_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >=, ge_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >=, ge_zero, 0); -__msg(": R0_w=scalar(id=1,smin=0xffffffff80000000") +__msg(": R0=scalar(id=1,smin=0xffffffff80000000") check_assert(s64, >=, ge_neg, INT_MIN); SEC("?tc") diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index b381ac0c736c..d273b46dfc7c 100644 --- a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -30,7 +30,7 @@ int force_clang_to_emit_btf_for_externs(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") int create_and_destroy(void *ctx) { struct bpf_iter_num iter; @@ -196,7 +196,7 @@ int leak_iter_from_subprog_fail(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") int valid_stack_reuse(void *ctx) { struct bpf_iter_num iter; diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 6543d5b6e0a9..83791348bed5 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -20,7 +20,7 @@ __s64 res_empty; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_empty(const void *ctx) @@ -38,7 +38,7 @@ __s64 res_full; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_full(const void *ctx) @@ -58,7 +58,7 @@ static volatile int zero = 0; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_truncated(const void *ctx) diff --git a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c index 4f94c971ae86..3b984b6ae7c0 100644 --- a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c +++ b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c @@ -8,8 +8,8 @@ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r8 = *(u64 *)(r7 +0) ; R7_w=ptr_nameidata(off={{[0-9]+}}) R8_w=rdonly_untrusted_mem(sz=0)") -__msg("r9 = *(u8 *)(r8 +0) ; R8_w=rdonly_untrusted_mem(sz=0) R9_w=scalar") +__msg("r8 = *(u64 *)(r7 +0) ; R7=ptr_nameidata(off={{[0-9]+}}) R8=rdonly_untrusted_mem(sz=0)") +__msg("r9 = *(u8 *)(r8 +0) ; R8=rdonly_untrusted_mem(sz=0) R9=scalar") int btf_id_to_ptr_mem(void *ctx) { struct task_struct *task; diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index fbccc20555f4..0a72e0228ea9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -926,7 +926,7 @@ l1_%=: r0 = 0; \ SEC("socket") __description("bounds check for non const xor src dst") __success __log_level(2) -__msg("5: (af) r0 ^= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__msg("5: (af) r0 ^= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") __naked void non_const_xor_src_dst(void) { asm volatile (" \ @@ -947,7 +947,7 @@ __naked void non_const_xor_src_dst(void) SEC("socket") __description("bounds check for non const or src dst") __success __log_level(2) -__msg("5: (4f) r0 |= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__msg("5: (4f) r0 |= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") __naked void non_const_or_src_dst(void) { asm volatile (" \ @@ -968,7 +968,7 @@ __naked void non_const_or_src_dst(void) SEC("socket") __description("bounds check for non const mul regs") __success __log_level(2) -__msg("5: (2f) r0 *= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") +__msg("5: (2f) r0 *= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") __naked void non_const_mul_regs(void) { asm volatile (" \ @@ -1241,7 +1241,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("multiply mixed sign bounds. test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") __naked void mult_mixed0_sign(void) { asm volatile ( @@ -1264,7 +1264,7 @@ __naked void mult_mixed0_sign(void) SEC("tc") __description("multiply mixed sign bounds. test 2") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=smin32=-100,smax=smax32=200)") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=smin32=-100,smax=smax32=200)") __naked void mult_mixed1_sign(void) { asm volatile ( @@ -1287,7 +1287,7 @@ __naked void mult_mixed1_sign(void) SEC("tc") __description("multiply negative bounds") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") __naked void mult_sign_bounds(void) { asm volatile ( @@ -1311,7 +1311,7 @@ __naked void mult_sign_bounds(void) SEC("tc") __description("multiply bounds that don't cross signed boundary") __success __log_level(2) -__msg("r8 *= r6 {{.*}}; R6_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) R8_w=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") +__msg("r8 *= r6 {{.*}}; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) R8=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") __naked void mult_no_sign_crossing(void) { asm volatile ( @@ -1331,7 +1331,7 @@ __naked void mult_no_sign_crossing(void) SEC("tc") __description("multiplication overflow, result in unbounded reg. test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__msg("r6 *= r7 {{.*}}; R6=scalar()") __naked void mult_unsign_ovf(void) { asm volatile ( @@ -1353,7 +1353,7 @@ __naked void mult_unsign_ovf(void) SEC("tc") __description("multiplication overflow, result in unbounded reg. test 2") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__msg("r6 *= r7 {{.*}}; R6=scalar()") __naked void mult_sign_ovf(void) { asm volatile ( @@ -1376,7 +1376,7 @@ __naked void mult_sign_ovf(void) SEC("socket") __description("64-bit addition, all outcomes overflow") __success __log_level(2) -__msg("5: (0f) r3 += r3 {{.*}} R3_w=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") +__msg("5: (0f) r3 += r3 {{.*}} R3=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") __retval(0) __naked void add64_full_overflow(void) { @@ -1396,7 +1396,7 @@ __naked void add64_full_overflow(void) SEC("socket") __description("64-bit addition, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("4: (0f) r3 += r3 {{.*}} R3_w=scalar()") +__msg("4: (0f) r3 += r3 {{.*}} R3=scalar()") __retval(0) __naked void add64_partial_overflow(void) { @@ -1416,7 +1416,7 @@ __naked void add64_partial_overflow(void) SEC("socket") __description("32-bit addition overflow, all outcomes overflow") __success __log_level(2) -__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") +__msg("4: (0c) w3 += w3 {{.*}} R3=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") __retval(0) __naked void add32_full_overflow(void) { @@ -1436,7 +1436,7 @@ __naked void add32_full_overflow(void) SEC("socket") __description("32-bit addition, partial overflow, result in unbounded u32 bounds") __success __log_level(2) -__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__msg("4: (0c) w3 += w3 {{.*}} R3=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") __retval(0) __naked void add32_partial_overflow(void) { @@ -1456,7 +1456,7 @@ __naked void add32_partial_overflow(void) SEC("socket") __description("64-bit subtraction, all outcomes underflow") __success __log_level(2) -__msg("6: (1f) r3 -= r1 {{.*}} R3_w=scalar(umin=1,umax=0x8000000000000000)") +__msg("6: (1f) r3 -= r1 {{.*}} R3=scalar(umin=1,umax=0x8000000000000000)") __retval(0) __naked void sub64_full_overflow(void) { @@ -1477,7 +1477,7 @@ __naked void sub64_full_overflow(void) SEC("socket") __description("64-bit subtraction, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("3: (1f) r3 -= r2 {{.*}} R3_w=scalar()") +__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar()") __retval(0) __naked void sub64_partial_overflow(void) { @@ -1496,7 +1496,7 @@ __naked void sub64_partial_overflow(void) SEC("socket") __description("32-bit subtraction overflow, all outcomes underflow") __success __log_level(2) -__msg("5: (1c) w3 -= w1 {{.*}} R3_w=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") +__msg("5: (1c) w3 -= w1 {{.*}} R3=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") __retval(0) __naked void sub32_full_overflow(void) { @@ -1517,7 +1517,7 @@ __naked void sub32_full_overflow(void) SEC("socket") __description("32-bit subtraction, partial overflow, result in unbounded u32 bounds") __success __log_level(2) -__msg("3: (1c) w3 -= w2 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__msg("3: (1c) w3 -= w2 {{.*}} R3=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") __retval(0) __naked void sub32_partial_overflow(void) { @@ -1617,7 +1617,7 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, positive overlap") __success __log_level(2) __flag(BPF_F_TEST_REG_INVARIANTS) -__msg("3: (2d) if r0 > r1 {{.*}} R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f))") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f))") __retval(0) __naked void bounds_deduct_positive_overlap(void) { @@ -1650,7 +1650,7 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, two overlaps") __failure __flag(BPF_F_TEST_REG_INVARIANTS) -__msg("3: (2d) if r0 > r1 {{.*}} R0_w=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") __msg("frame pointer is read only") __naked void bounds_deduct_two_overlaps(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index 181da86ba5f0..6630a92b1b47 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -215,7 +215,7 @@ __weak int subprog_untrusted(const volatile struct task_struct *restrict task __ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r1 = {{.*}}; {{.*}}R1_w=trusted_ptr_task_struct()") +__msg("r1 = {{.*}}; {{.*}}R1=trusted_ptr_task_struct()") __msg("Func#1 ('subprog_untrusted') is global and assumed valid.") __msg("Validating subprog_untrusted() func#1...") __msg(": R1=untrusted_ptr_task_struct") @@ -278,7 +278,7 @@ __weak int subprog_enum_untrusted(enum bpf_attach_type *p __arg_untrusted) SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r1 = {{.*}}; {{.*}}R1_w=trusted_ptr_task_struct()") +__msg("r1 = {{.*}}; {{.*}}R1=trusted_ptr_task_struct()") __msg("Func#1 ('subprog_void_untrusted') is global and assumed valid.") __msg("Validating subprog_void_untrusted() func#1...") __msg(": R1=rdonly_untrusted_mem(sz=0)") diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index 52edee41caf6..f087ffb79f20 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -65,7 +65,7 @@ __naked void ldsx_s32(void) SEC("socket") __description("LDSX, S8 range checking, privileged") __log_level(2) __success __retval(1) -__msg("R1_w=scalar(smin=smin32=-128,smax=smax32=127)") +__msg("R1=scalar(smin=smin32=-128,smax=smax32=127)") __naked void ldsx_s8_range_priv(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 73fee2aec698..1fe090cd6744 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -144,21 +144,21 @@ SEC("?raw_tp") __success __log_level(2) /* * Without the bug fix there will be no history between "last_idx 3 first_idx 3" - * and "parent state regs=" lines. "R0_w=6" parts are here to help anchor + * and "parent state regs=" lines. "R0=6" parts are here to help anchor * expected log messages to the one specific mark_chain_precision operation. * * This is quite fragile: if verifier checkpointing heuristic changes, this * might need adjusting. */ -__msg("2: (07) r0 += 1 ; R0_w=6") +__msg("2: (07) r0 += 1 ; R0=6") __msg("3: (35) if r0 >= 0xa goto pc+1") __msg("mark_precise: frame0: last_idx 3 first_idx 3 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 2: (07) r0 += 1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (07) r0 += 1") __msg("mark_precise: frame0: regs=r0 stack= before 4: (05) goto pc-4") __msg("mark_precise: frame0: regs=r0 stack= before 3: (35) if r0 >= 0xa goto pc+1") -__msg("mark_precise: frame0: parent state regs= stack=: R0_rw=P4") -__msg("3: R0_w=6") +__msg("mark_precise: frame0: parent state regs= stack=: R0=P4") +__msg("3: R0=6") __naked int state_loop_first_last_equal(void) { asm volatile ( @@ -233,8 +233,8 @@ __naked void bpf_cond_op_not_r10(void) SEC("lsm.s/socket_connect") __success __log_level(2) -__msg("0: (b7) r0 = 1 ; R0_w=1") -__msg("1: (84) w0 = -w0 ; R0_w=0xffffffff") +__msg("0: (b7) r0 = 1 ; R0=1") +__msg("1: (84) w0 = -w0 ; R0=0xffffffff") __msg("mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (84) w0 = -w0") __msg("mark_precise: frame0: regs=r0 stack= before 0: (b7) r0 = 1") @@ -268,8 +268,8 @@ __naked int bpf_neg_3(void) SEC("lsm.s/socket_connect") __success __log_level(2) -__msg("0: (b7) r0 = 1 ; R0_w=1") -__msg("1: (87) r0 = -r0 ; R0_w=-1") +__msg("0: (b7) r0 = 1 ; R0=1") +__msg("1: (87) r0 = -r0 ; R0=-1") __msg("mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (87) r0 = -r0") __msg("mark_precise: frame0: regs=r0 stack= before 0: (b7) r0 = 1") diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index dba3ca728f6e..c0ce690ddb68 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -353,7 +353,7 @@ __flag(BPF_F_TEST_STATE_FREQ) * collect_linked_regs() can't tie more than 6 registers for a single insn. */ __msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") -__msg("9: (bf) r6 = r6 ; R6_w=scalar(id=2") +__msg("9: (bf) r6 = r6 ; R6=scalar(id=2") /* check that r{0-5} are marked precise after 'if' */ __msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") __msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") @@ -779,12 +779,12 @@ __success __retval(0) /* Check that verifier believes r1/r0 are zero at exit */ __log_level(2) -__msg("4: (77) r1 >>= 32 ; R1_w=0") -__msg("5: (bf) r0 = r1 ; R0_w=0 R1_w=0") +__msg("4: (77) r1 >>= 32 ; R1=0") +__msg("5: (bf) r0 = r1 ; R0=0 R1=0") __msg("6: (95) exit") __msg("from 3 to 4") -__msg("4: (77) r1 >>= 32 ; R1_w=0") -__msg("5: (bf) r0 = r1 ; R0_w=0 R1_w=0") +__msg("4: (77) r1 >>= 32 ; R1=0") +__msg("5: (bf) r0 = r1 ; R0=0 R1=0") __msg("6: (95) exit") /* Verify that statements to randomize upper half of r1 had not been * generated. diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 1e5a511e8494..7a13dbd794b2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -506,17 +506,17 @@ SEC("raw_tp") __log_level(2) __success /* fp-8 is spilled IMPRECISE value zero (represented by a zero value fake reg) */ -__msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8_w=0") +__msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8=0") /* but fp-16 is spilled IMPRECISE zero const reg */ -__msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=0 R10=fp0 fp-16_w=0") +__msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0=0 R10=fp0 fp-16=0") /* validate that assigning R2 from STACK_SPILL with zero value doesn't mark register * precise immediately; if necessary, it will be marked precise later */ -__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8_w=0") +__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2=0 R10=fp0 fp-8=0") /* similarly, when R2 is assigned from spilled register, it is initially * imprecise, but will be marked precise later once it is used in precise context */ -__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2_w=0 R10=fp0 fp-16_w=0") +__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2=0 R10=fp0 fp-16=0") __msg("11: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 11 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 10: (71) r2 = *(u8 *)(r10 -9)") @@ -598,7 +598,7 @@ __log_level(2) __success /* fp-4 is STACK_ZERO */ __msg("2: (62) *(u32 *)(r10 -4) = 0 ; R10=fp0 fp-8=0000????") -__msg("4: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8=0000????") +__msg("4: (71) r2 = *(u8 *)(r10 -1) ; R2=0 R10=fp0 fp-8=0000????") __msg("5: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 4: (71) r2 = *(u8 *)(r10 -1)") @@ -640,25 +640,25 @@ SEC("raw_tp") __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) __success /* make sure fp-8 is IMPRECISE fake register spill */ -__msg("3: (7a) *(u64 *)(r10 -8) = 1 ; R10=fp0 fp-8_w=1") +__msg("3: (7a) *(u64 *)(r10 -8) = 1 ; R10=fp0 fp-8=1") /* and fp-16 is spilled IMPRECISE const reg */ -__msg("5: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=1 R10=fp0 fp-16_w=1") +__msg("5: (7b) *(u64 *)(r10 -16) = r0 ; R0=1 R10=fp0 fp-16=1") /* validate load from fp-8, which was initialized using BPF_ST_MEM */ -__msg("8: (79) r2 = *(u64 *)(r10 -8) ; R2_w=1 R10=fp0 fp-8=1") +__msg("8: (79) r2 = *(u64 *)(r10 -8) ; R2=1 R10=fp0 fp-8=1") __msg("9: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 8: (79) r2 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6") /* note, fp-8 is precise, fp-16 is not yet precise, we'll get there */ -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_rw=P1 fp-16_w=1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (7a) *(u64 *)(r10 -8) = 1") -__msg("10: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ -__msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2_w=1 R10=fp0 fp-16=1") +__msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2=1 R10=fp0 fp-16=1") __msg("13: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 13 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 12: (79) r2 = *(u64 *)(r10 -16)") @@ -668,12 +668,12 @@ __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2") __msg("mark_precise: frame0: regs= stack=-16 before 8: (79) r2 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6") /* now both fp-8 and fp-16 are precise, very good */ -__msg("mark_precise: frame0: parent state regs= stack=-16: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_rw=P1 fp-16_rw=P1") +__msg("mark_precise: frame0: parent state regs= stack=-16: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=P1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") __naked void stack_load_preserves_const_precision(void) { asm volatile ( @@ -719,22 +719,22 @@ __success /* make sure fp-8 is 32-bit FAKE subregister spill */ __msg("3: (62) *(u32 *)(r10 -8) = 1 ; R10=fp0 fp-8=????1") /* but fp-16 is spilled IMPRECISE zero const reg */ -__msg("5: (63) *(u32 *)(r10 -16) = r0 ; R0_w=1 R10=fp0 fp-16=????1") +__msg("5: (63) *(u32 *)(r10 -16) = r0 ; R0=1 R10=fp0 fp-16=????1") /* validate load from fp-8, which was initialized using BPF_ST_MEM */ -__msg("8: (61) r2 = *(u32 *)(r10 -8) ; R2_w=1 R10=fp0 fp-8=????1") +__msg("8: (61) r2 = *(u32 *)(r10 -8) ; R2=1 R10=fp0 fp-8=????1") __msg("9: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 8: (61) r2 = *(u32 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6") -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_r=????P1 fp-16=????1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (62) *(u32 *)(r10 -8) = 1") -__msg("10: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ -__msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2_w=1 R10=fp0 fp-16=????1") +__msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2=1 R10=fp0 fp-16=????1") __msg("13: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 13 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 12: (61) r2 = *(u32 *)(r10 -16)") @@ -743,12 +743,12 @@ __msg("mark_precise: frame0: regs= stack=-16 before 10: (73) *(u8 *)(r1 +0) = r2 __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2") __msg("mark_precise: frame0: regs= stack=-16 before 8: (61) r2 = *(u32 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6") -__msg("mark_precise: frame0: parent state regs= stack=-16: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_r=????P1 fp-16_r=????P1") +__msg("mark_precise: frame0: parent state regs= stack=-16: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????P1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") __naked void stack_load_preserves_const_precision_subreg(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 9d415f7ce599..ac3e418c2a96 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -105,7 +105,7 @@ __msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") __msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") __msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit") __msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10") -__msg("7: R0_w=scalar") +__msg("7: R0=scalar") __naked int fp_precise_subprog_result(void) { asm volatile ( @@ -141,7 +141,7 @@ __msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1") * anyways, at which point we'll break precision chain */ __msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10") -__msg("7: R0_w=scalar") +__msg("7: R0=scalar") __naked int sneaky_fp_precise_subprog_result(void) { asm volatile ( @@ -681,7 +681,7 @@ __msg("mark_precise: frame0: last_idx 10 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r7 stack= before 9: (bf) r1 = r8") __msg("mark_precise: frame0: regs=r7 stack= before 8: (27) r7 *= 4") __msg("mark_precise: frame0: regs=r7 stack= before 7: (79) r7 = *(u64 *)(r10 -8)") -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=2 R6_w=1 R8_rw=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8_rw=P1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=2 R6=1 R8=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8=P1") __msg("mark_precise: frame0: last_idx 18 first_idx 0 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit") __msg("mark_precise: frame1: regs= stack= before 17: (0f) r0 += r2") diff --git a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c index b616575c3b00..ce13002c7a19 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c +++ b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c @@ -93,7 +93,7 @@ .expected_attach_type = BPF_SK_LOOKUP, .result = VERBOSE_ACCEPT, .runs = -1, - .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8_w=-44\ + .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8=-44\ 2: (c5) if r0 s< 0x0 goto pc+2\ - R0_w=-44", + R0=-44", }, -- cgit v1.2.3 From 79f047c7d968b21ff4b72bd70c4533140553c56c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:43 -0700 Subject: bpf: table based bpf_insn_successors() Converting bpf_insn_successors() to use lookup table makes it ~1.5 times faster. Also remove unnecessary conditionals: - `idx + 1 < prog->len` is unnecessary because after check_cfg() all jump targets are guaranteed to be within a program; - `i == 0 || succ[0] != dst` is unnecessary because any client of bpf_insn_successors() can handle duplicate edges: - compute_live_registers() - compute_scc() Moving bpf_insn_successors() to liveness.c allows its inlining in liveness.c:__update_stack_liveness(). Such inlining speeds up __update_stack_liveness() by ~40%. bpf_insn_successors() is used in both verifier.c and liveness.c. perf shows such move does not negatively impact users in verifier.c, as these are executed only once before main varification pass. Unlike __update_stack_liveness() which can be triggered multiple times. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-10-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/liveness.c | 56 ++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 72 +------------------------------------------- 3 files changed, 58 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c7515da8500c..4c497e839526 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1049,6 +1049,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st u32 frameno); struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); +int bpf_jmp_offset(struct bpf_insn *insn); int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 6f9dfaaf6e64..3c611aba7f52 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -433,6 +433,62 @@ static void log_mask_change(struct bpf_verifier_env *env, struct callchain *call bpf_log(&env->log, "\n"); } +int bpf_jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + +__diag_push(); +__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl"); + +inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +{ + static const struct opcode_info { + bool can_jump; + bool can_fallthrough; + } opcode_info_tbl[256] = { + [0 ... 255] = {.can_jump = false, .can_fallthrough = true}, + #define _J(code, ...) \ + [BPF_JMP | code] = __VA_ARGS__, \ + [BPF_JMP32 | code] = __VA_ARGS__ + + _J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}), + _J(BPF_JA, {.can_jump = true, .can_fallthrough = false}), + _J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}), + #undef _J + }; + struct bpf_insn *insn = &prog->insnsi[idx]; + const struct opcode_info *opcode_info; + int i = 0, insn_sz; + + opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)]; + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + if (opcode_info->can_fallthrough) + succ[i++] = idx + insn_sz; + + if (opcode_info->can_jump) + succ[i++] = idx + bpf_jmp_offset(insn) + 1; + + return i; +} + +__diag_pop(); + static struct func_instance *get_outer_instance(struct bpf_verifier_env *env, struct func_instance *instance) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e1da2471442b..1d4183bc3cd1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3485,15 +3485,6 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } -static int jmp_offset(struct bpf_insn *insn) -{ - u8 code = insn->code; - - if (code == (BPF_JMP32 | BPF_JA)) - return insn->imm; - return insn->off; -} - static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3520,7 +3511,7 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - off = i + jmp_offset(&insn[i]) + 1; + off = i + bpf_jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -23944,67 +23935,6 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } -static bool can_fallthrough(struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - - if (class != BPF_JMP && class != BPF_JMP32) - return true; - - if (opcode == BPF_EXIT || opcode == BPF_JA) - return false; - - return true; -} - -static bool can_jump(struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - - if (class != BPF_JMP && class != BPF_JMP32) - return false; - - switch (opcode) { - case BPF_JA: - case BPF_JEQ: - case BPF_JNE: - case BPF_JLT: - case BPF_JLE: - case BPF_JGT: - case BPF_JGE: - case BPF_JSGT: - case BPF_JSGE: - case BPF_JSLT: - case BPF_JSLE: - case BPF_JCOND: - case BPF_JSET: - return true; - } - - return false; -} - -int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) -{ - struct bpf_insn *insn = &prog->insnsi[idx]; - int i = 0, insn_sz; - u32 dst; - - insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; - if (can_fallthrough(insn) && idx + 1 < prog->len) - succ[i++] = idx + insn_sz; - - if (can_jump(insn)) { - dst = idx + jmp_offset(insn) + 1; - if (i == 0 || succ[0] != dst) - succ[i++] = dst; - } - - return i; -} - /* Each field is a register bitmask */ struct insn_live_regs { u16 use; /* registers read by instruction */ -- cgit v1.2.3 From fbd401e95e569ad0307e4301012f2d8e1ec1ee98 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Sep 2025 23:10:31 +0200 Subject: ACPI: processor: idle: Redefine two functions as void Notice that acpi_processor_power_init() and acpi_processor_power_exit() don't need to return any values because their callers don't check them anyway, so redefine those functions as void. While at it, rearrange the code in acpi_processor_power_init() to reduce the indentation level, get rid of a redundant local variable in that function, and rephrase a code comment in it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) --- drivers/acpi/processor_idle.c | 41 +++++++++++++++++++---------------------- include/acpi/processor.h | 4 ++-- 2 files changed, 21 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5dacf41d7cc0..698d14c19587 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -1400,47 +1400,45 @@ void acpi_processor_unregister_idle_driver(void) cpuidle_unregister_driver(&acpi_idle_driver); } -int acpi_processor_power_init(struct acpi_processor *pr) +void acpi_processor_power_init(struct acpi_processor *pr) { - int retval; struct cpuidle_device *dev; if (disabled_by_idle_boot_param()) - return 0; + return; acpi_processor_cstate_first_run_checks(); if (!acpi_processor_get_power_info(pr)) pr->flags.power_setup_done = 1; - if (pr->flags.power) { - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - per_cpu(acpi_cpuidle_device, pr->id) = dev; + if (!pr->flags.power) + return; - acpi_processor_setup_cpuidle_dev(pr, dev); + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return; - /* Register per-cpu cpuidle_device. Cpuidle driver - * must already be registered before registering device - */ - retval = cpuidle_register_device(dev); - if (retval) { + per_cpu(acpi_cpuidle_device, pr->id) = dev; - per_cpu(acpi_cpuidle_device, pr->id) = NULL; - kfree(dev); - return retval; - } + acpi_processor_setup_cpuidle_dev(pr, dev); + + /* + * Register a cpuidle device for this CPU. The cpuidle driver using + * this device is expected to be registered. + */ + if (cpuidle_register_device(dev)) { + per_cpu(acpi_cpuidle_device, pr->id) = NULL; + kfree(dev); } - return 0; } -int acpi_processor_power_exit(struct acpi_processor *pr) +void acpi_processor_power_exit(struct acpi_processor *pr) { struct cpuidle_device *dev = per_cpu(acpi_cpuidle_device, pr->id); if (disabled_by_idle_boot_param()) - return 0; + return; if (pr->flags.power) { cpuidle_unregister_device(dev); @@ -1448,7 +1446,6 @@ int acpi_processor_power_exit(struct acpi_processor *pr) } pr->flags.power_setup_done = 0; - return 0; } MODULE_IMPORT_NS("ACPI_PROCESSOR_IDLE"); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 6ee4a69412de..24fdaa3c2899 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -419,8 +419,8 @@ static inline void acpi_processor_throttling_init(void) {} /* in processor_idle.c */ extern struct cpuidle_driver acpi_idle_driver; #ifdef CONFIG_ACPI_PROCESSOR_IDLE -int acpi_processor_power_init(struct acpi_processor *pr); -int acpi_processor_power_exit(struct acpi_processor *pr); +void acpi_processor_power_init(struct acpi_processor *pr); +void acpi_processor_power_exit(struct acpi_processor *pr); int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); void acpi_processor_register_idle_driver(void); -- cgit v1.2.3 From f8d2f8205be8cceef2dd3c0e68e7af3c5f83c75c Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Thu, 18 Sep 2025 08:52:02 -0700 Subject: psp: make struct sock argument const in psp_sk_get_assoc_rcu() This function does not need a mutable reference to its argument. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20250918155205.2197603-2-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- include/net/psp/functions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 91ba06733321..fb3cbe8427ea 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -124,7 +124,7 @@ psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); } -static inline struct psp_assoc *psp_sk_get_assoc_rcu(struct sock *sk) +static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) { struct inet_timewait_sock *tw; struct psp_assoc *pas; -- cgit v1.2.3 From 803cdb6ddca3e24418226e17e4b1c1134619aca8 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Thu, 18 Sep 2025 08:52:03 -0700 Subject: psp: fix preemptive inet_twsk() cast in psp_sk_get_assoc_rcu() It is weird to cast to a timewait_sock before checking sk_state, even if the use is after such a check. Remove the tw local variable, and use inet_twsk() directly in the timewait branch. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20250918155205.2197603-3-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- include/net/psp/functions.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index fb3cbe8427ea..980de7e58f8a 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -126,7 +126,6 @@ psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) { - struct inet_timewait_sock *tw; struct psp_assoc *pas; int state; @@ -134,9 +133,9 @@ static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) if (!sk_is_inet(sk) || state & TCPF_NEW_SYN_RECV) return NULL; - tw = inet_twsk(sk); - pas = state & TCPF_TIME_WAIT ? rcu_dereference(tw->psp_assoc) : - rcu_dereference(sk->psp_assoc); + pas = state & TCPF_TIME_WAIT ? + rcu_dereference(inet_twsk(sk)->psp_assoc) : + rcu_dereference(sk->psp_assoc); return pas; } -- cgit v1.2.3 From 28bb24dadd0ed70aed43cf9af3a54c22c3ce04b2 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Thu, 18 Sep 2025 08:52:04 -0700 Subject: psp: don't use flags for checking sk_state Using flags to check sk_state only makes sense to check for a subset of states in parallel e.g. sk_fullsock(). We are not doing that here. Compare for individual states directly. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20250918155205.2197603-4-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- include/net/psp/functions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 980de7e58f8a..ef7743664da3 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -129,11 +129,11 @@ static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) struct psp_assoc *pas; int state; - state = 1 << READ_ONCE(sk->sk_state); - if (!sk_is_inet(sk) || state & TCPF_NEW_SYN_RECV) + state = READ_ONCE(sk->sk_state); + if (!sk_is_inet(sk) || state == TCP_NEW_SYN_RECV) return NULL; - pas = state & TCPF_TIME_WAIT ? + pas = state == TCP_TIME_WAIT ? rcu_dereference(inet_twsk(sk)->psp_assoc) : rcu_dereference(sk->psp_assoc); return pas; -- cgit v1.2.3 From f1bf77491d5e48ab5477f585ee5fca2aa524bd15 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 18 Sep 2025 19:25:35 +0000 Subject: psp: Fix typo in kdoc for struct psp_dev_caps.assoc_drv_spc. assoc_drv_spc is the size of psp_assoc.drv_data[]. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250918192539.1587586-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/psp/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/psp/types.h b/include/net/psp/types.h index d9688e66cf09..31cee64b7c86 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -98,7 +98,7 @@ struct psp_dev_caps { /** * @assoc_drv_spc: size of driver-specific state in Tx assoc - * Determines the size of struct psp_assoc::drv_spc + * Determines the size of struct psp_assoc::drv_data */ u32 assoc_drv_spc; }; -- cgit v1.2.3 From 32a8d2a197c1d2d36badb401657aa193938f071c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 17 Sep 2025 16:12:01 +0100 Subject: net: stmmac: rework mac_interface and phy_interface documentation Based on new research, it has come to light that the comment that I added in a014c35556b9 ("net: stmmac: clarify difference between "interface" and "phy_interface"") is not fully correct. Update the comment to properly describe the difference between the two. All of the DTS files in the kernel tree do not mention the "mac-mode" property, which results in mac_interface and phy_interface being the same. Also, none of the platform glue drivers set mac_interface to anything but PHY_INTERFACE_MODE_NA. This means that for all the platforms known to mainline, mac_interface is either the same as phy_interface, or it is PHY_INTERFACE_MODE_NA. Thus, updating the definition for mac_interface in stmmac.h has no material effect on current uses known to mainline, but the change opens the door to cleaning up all uses. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uytpB-00000006H23-0pRi@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index e284f04964bf..f14f34ec6d5e 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -190,18 +190,32 @@ struct plat_stmmacenet_data { int bus_id; int phy_addr; /* MAC ----- optional PCS ----- SerDes ----- optional PHY ----- Media - * ^ ^ - * mac_interface phy_interface + * ^ ^ + * mac_interface phy_interface * - * mac_interface is the MAC-side interface, which may be the same - * as phy_interface if there is no intervening PCS. If there is a - * PCS, then mac_interface describes the interface mode between the - * MAC and PCS, and phy_interface describes the interface mode - * between the PCS and PHY. + * The Synopsys dwmac core only covers the MAC and an optional + * integrated PCS. Where the integrated PCS is used with a SerDes, + * e.g. for 1000base-X or Cisco SGMII, the connection between the + * PCS and SerDes will be TBI. + * + * Where the Synopsys dwmac core has been instantiated with multiple + * interface modes, these are selected via core-external configuration + * which is sampled when the dwmac core is reset. How this is done is + * platform glue specific, but this defines the interface used from + * the Synopsys dwmac core to the rest of the SoC. + * + * Where PCS other than the optional integrated Synopsys dwmac PCS + * is used, this counts as "the rest of the SoC" in the above + * paragraph. + * + * Thus, mac_interface is of little use inside the stmmac code; + * please do not use unless there is a definite requirement, and + * make sure to gain review feedback first. */ phy_interface_t mac_interface; /* phy_interface is the PHY-side interface - the interface used by - * an attached PHY. + * an attached PHY or SFP etc. This is equivalent to the interface + * that phylink uses. */ phy_interface_t phy_interface; struct stmmac_mdio_bus_data *mdio_bus_data; -- cgit v1.2.3 From 6b0ed6a3a89cd2d04980e15a44c645bebb077418 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 17 Sep 2025 16:12:47 +0100 Subject: net: stmmac: remove mac_interface mac_interface has served little purpose, and has only caused confusion. Now that we have cleaned up all platform glue drivers which should not have been using mac_interface, there are no users remaining. Remove mac_interface. This results in the special dwmac specific "mac-mode" DT property becoming redundant, and an in case, no DTS files in the kernel make use of this property. Add a warning if the property is set, and it is different from the "phy-mode". Signed-off-by: Russell King (Oracle) Acked-by: Vladimir Zapolskiy Link: https://patch.msgid.link/E1uytpv-00000006H2x-196h@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 -- drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c | 1 - drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +---- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 6 +++++- include/linux/stmmac.h | 11 +++-------- 5 files changed, 9 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index dd82dc2189e9..592aa9d636e5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -98,8 +98,6 @@ static void loongson_default_data(struct pci_dev *pdev, /* Set default value for multicast hash bins */ plat->multicast_filter_bins = 256; - plat->mac_interface = PHY_INTERFACE_MODE_NA; - /* Set default value for unicast filter entries */ plat->unicast_filter_entries = 1; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c index c0c44916f849..2562a6d036a2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c @@ -41,7 +41,6 @@ static int lpc18xx_dwmac_probe(struct platform_device *pdev) if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); - plat_dat->mac_interface = PHY_INTERFACE_MODE_NA; plat_dat->has_gmac = true; reg = syscon_regmap_lookup_by_compatible("nxp,lpc1850-creg"); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index a23017a886f3..d17820d9e7f1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1118,10 +1118,7 @@ static const struct phylink_mac_ops stmmac_phylink_mac_ops = { */ static void stmmac_check_pcs_mode(struct stmmac_priv *priv) { - int interface = priv->plat->mac_interface; - - if (interface == PHY_INTERFACE_MODE_NA) - interface = priv->plat->phy_interface; + int interface = priv->plat->phy_interface; if (priv->dma_cap.pcs) { if ((interface == PHY_INTERFACE_MODE_RGMII) || diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index a3e077f225d1..712ef235f0f4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -453,8 +453,12 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) return ERR_PTR(phy_mode); plat->phy_interface = phy_mode; + rc = stmmac_of_get_mac_mode(np); - plat->mac_interface = rc < 0 ? plat->phy_interface : rc; + if (rc >= 0 && rc != phy_mode) + dev_warn(&pdev->dev, + "\"mac-mode\" property used for %s but differs to \"phy-mode\" of %s, and will be ignored. Please report.\n", + phy_modes(rc), phy_modes(phy_mode)); /* Some wrapper drivers still rely on phy_node. Let's save it while * they are not converted to phylink. */ diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index f14f34ec6d5e..fa1318bac06c 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -190,8 +190,8 @@ struct plat_stmmacenet_data { int bus_id; int phy_addr; /* MAC ----- optional PCS ----- SerDes ----- optional PHY ----- Media - * ^ ^ - * mac_interface phy_interface + * ^ + * phy_interface * * The Synopsys dwmac core only covers the MAC and an optional * integrated PCS. Where the integrated PCS is used with a SerDes, @@ -208,12 +208,7 @@ struct plat_stmmacenet_data { * is used, this counts as "the rest of the SoC" in the above * paragraph. * - * Thus, mac_interface is of little use inside the stmmac code; - * please do not use unless there is a definite requirement, and - * make sure to gain review feedback first. - */ - phy_interface_t mac_interface; - /* phy_interface is the PHY-side interface - the interface used by + * phy_interface is the PHY-side interface - the interface used by * an attached PHY or SFP etc. This is equivalent to the interface * that phylink uses. */ -- cgit v1.2.3 From b73b8146d7ff68e245525adb944a4c998d423d59 Mon Sep 17 00:00:00 2001 From: Alasdair McWilliam Date: Wed, 17 Sep 2025 10:55:42 +0100 Subject: rtnetlink: add needed_{head,tail}room attributes Various network interface types make use of needed_{head,tail}room values to efficiently reserve buffer space for additional encapsulation headers, such as VXLAN, Geneve, IPSec, etc. However, it is not currently possible to query these values in a generic way. Introduce ability to query the needed_{head,tail}room values of a network device via rtnetlink, such that applications that may wish to use these values can do so. For example, Cilium agent iterates over present devices based on user config (direct routing, vxlan, geneve, wireguard etc.) and in future will configure netkit in order to expose the needed_{head,tail}room into K8s pods. See b9ed315d3c4c ("netkit: Allow for configuring needed_{head,tail}room"). Suggested-by: Daniel Borkmann Signed-off-by: Alasdair McWilliam Reviewed-by: Daniel Borkmann Link: https://patch.msgid.link/20250917095543.14039-1-alasdair@mcwilliam.dev Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 6 ++++++ include/uapi/linux/if_link.h | 2 ++ net/core/rtnetlink.c | 10 +++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 6ab31f86854d..2a23e9699c0b 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -1057,6 +1057,12 @@ attribute-sets: - name: netns-immutable type: u8 + - + name: headroom + type: u16 + - + name: tailroom + type: u16 - name: prop-list-link-attrs subset-of: link-attrs diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 45f56c9f95d9..3b491d96e52e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -379,6 +379,8 @@ enum { IFLA_DPLL_PIN, IFLA_MAX_PACING_OFFLOAD_HORIZON, IFLA_NETNS_IMMUTABLE, + IFLA_HEADROOM, + IFLA_TAILROOM, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 094b085cff20..d9e68ca84926 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1326,6 +1326,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + + nla_total_size(2) /* IFLA_HEADROOM */ + + nla_total_size(2) /* IFLA_TAILROOM */ + 0; } @@ -2091,7 +2093,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, - atomic_read(&dev->carrier_down_count))) + atomic_read(&dev->carrier_down_count)) || + nla_put_u16(skb, IFLA_HEADROOM, + READ_ONCE(dev->needed_headroom)) || + nla_put_u16(skb, IFLA_TAILROOM, + READ_ONCE(dev->needed_tailroom))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) @@ -2243,6 +2249,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, + [IFLA_HEADROOM] = { .type = NLA_REJECT }, + [IFLA_TAILROOM] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit v1.2.3 From 1c7e4a618509476658bafba35fffb3a5cfb213b1 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 15 Sep 2025 11:19:54 +0200 Subject: net: ipv4: make udp_v4_early_demux explicitly return drop reason udp_v4_early_demux already returns drop reasons as it either returns 0 or ip_mc_validate_source, which itself returns drop reasons. Its return value is also already used as a drop reason itself. Makes this explicit by making it return drop reasons. Signed-off-by: Antoine Tenart Reviewed-by: David Ahern Link: https://patch.msgid.link/20250915091958.15382-2-atenart@kernel.org Signed-off-by: Jakub Kicinski --- include/net/udp.h | 2 +- net/ipv4/ip_input.c | 2 +- net/ipv4/udp.c | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index eecd64097f91..059a0cee5f55 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -404,7 +404,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, return __skb_recv_udp(sk, flags, &off, err); } -int udp_v4_early_demux(struct sk_buff *skb); +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index a09aca2c8567..8878e865ddf6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -319,7 +319,7 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, } int tcp_v4_early_demux(struct sk_buff *skb); -int udp_v4_early_demux(struct sk_buff *skb); +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0c40426628eb..85cfc32eb2cc 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2811,7 +2811,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -int udp_v4_early_demux(struct sk_buff *skb) +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct in_device *in_dev = NULL; @@ -2825,7 +2825,7 @@ int udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return 0; + return SKB_NOT_DROPPED_YET; iph = ip_hdr(skb); uh = udp_hdr(skb); @@ -2834,12 +2834,12 @@ int udp_v4_early_demux(struct sk_buff *skb) in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return 0; + return SKB_NOT_DROPPED_YET; ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return 0; + return SKB_NOT_DROPPED_YET; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, @@ -2850,7 +2850,7 @@ int udp_v4_early_demux(struct sk_buff *skb) } if (!sk) - return 0; + return SKB_NOT_DROPPED_YET; skb->sk = sk; DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); @@ -2877,7 +2877,7 @@ int udp_v4_early_demux(struct sk_buff *skb) ip4h_dscp(iph), skb->dev, in_dev, &itag); } - return 0; + return SKB_NOT_DROPPED_YET; } int udp_rcv(struct sk_buff *skb) -- cgit v1.2.3 From b34df17d588de926212527a2f2ce72bc4e330260 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 18 Sep 2025 05:25:57 -0700 Subject: net: netpoll: remove unused netpoll pointer from netpoll_info The netpoll_info structure contains an useless pointer back to its associated netpoll. This field is never used, and the assignment in __netpoll_setup() is does not comtemplate multiple instances, as reported by Jay[1]. Drop both the member and its initialization to simplify the structure. Link: https://lore.kernel.org/all/2930648.1757463506@famine/ [1] Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250918-netpoll_jv-v1-1-67d50eeb2c26@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 1 - net/core/netpoll.c | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index b5ea9882eda8..f22eec466040 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -55,7 +55,6 @@ struct netpoll_info { struct delayed_work tx_work; - struct netpoll *netpoll; struct rcu_head rcu; }; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5f65b62346d4..c58faa747165 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -591,7 +591,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) np->dev = ndev; strscpy(np->dev_name, ndev->name, IFNAMSIZ); - npinfo->netpoll = np; /* fill up the skb queue */ refill_skbs(np); -- cgit v1.2.3 From 8be1f299041220512195e40590bb4984f297ae48 Mon Sep 17 00:00:00 2001 From: Troy Mitchell Date: Thu, 11 Sep 2025 11:34:03 +0800 Subject: dt-bindings: clock: spacemit: introduce i2s pre-clock to fix i2s clock Previously, the K1 clock driver did not include the parent clocks of the I2S sysclk. Introduce pre-clock to fix I2S clock. Otherwise, the I2S clock may not work as expected. This patch adds their definitions to allow proper registration in the driver and usage in the device tree. Fixes: 1b72c59db0add ("clk: spacemit: Add clock support for SpacemiT K1 SoC") Acked-by: Krzysztof Kozlowski Signed-off-by: Troy Mitchell Signed-off-by: Stephen Boyd --- include/dt-bindings/clock/spacemit,k1-syscon.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/spacemit,k1-syscon.h b/include/dt-bindings/clock/spacemit,k1-syscon.h index 505205453d7f..0f8b59d6753c 100644 --- a/include/dt-bindings/clock/spacemit,k1-syscon.h +++ b/include/dt-bindings/clock/spacemit,k1-syscon.h @@ -77,6 +77,10 @@ #define CLK_I2S_BCLK 30 #define CLK_APB 31 #define CLK_WDT_BUS 32 +#define CLK_I2S_153P6 33 +#define CLK_I2S_153P6_BASE 34 +#define CLK_I2S_SYSCLK_SRC 35 +#define CLK_I2S_BCLK_FACTOR 36 /* MPMU resets */ #define RESET_WDT 0 -- cgit v1.2.3 From 519cff1d85694cbdf33b27591740e7e37348e6b4 Mon Sep 17 00:00:00 2001 From: Troy Mitchell Date: Thu, 11 Sep 2025 11:34:05 +0800 Subject: clk: spacemit: fix i2s clock Defining i2s_bclk and i2s_sysclk as fixed-rate clocks is insufficient for real I2S use cases. Moreover, the current I2S clock configuration does not work as expected due to missing parent clocks. This patch adds the missing parent clocks, defines i2s_sysclk as a DDN clock, and i2s_bclk as a DIV clock. A special note for i2s_bclk: From the register definition, the i2s_bclk divider always implies an additional 1/2 factor. The following table shows the correspondence between index and frequency division coefficients: | index | div | |-------|-------| | 0 | 2 | | 1 | 4 | | 2 | 6 | | 3 | 8 | From a software perspective, introducing i2s_bclk_factor as the parent of i2s_bclk is sufficient to address the issue. The I2S-related clock registers can be found here [1]. Link: https://developer.spacemit.com/documentation?token=LCrKwWDasiJuROkVNusc2pWTnEb [1] Fixes: 1b72c59db0add ("clk: spacemit: Add clock support for SpacemiT K1 SoC") Co-developer: Jinmei Wei Suggested-by: Haylen Chu Signed-off-by: Jinmei Wei Signed-off-by: Troy Mitchell Signed-off-by: Stephen Boyd --- drivers/clk/spacemit/ccu-k1.c | 28 ++++++++++++++++++++++++++-- include/soc/spacemit/k1-syscon.h | 1 + 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/clk/spacemit/ccu-k1.c b/drivers/clk/spacemit/ccu-k1.c index 14a88d0372dd..f5a9fe6ba185 100644 --- a/drivers/clk/spacemit/ccu-k1.c +++ b/drivers/clk/spacemit/ccu-k1.c @@ -141,8 +141,28 @@ CCU_DDN_DEFINE(slow_uart2_48, pll1_d4_614p4, MPMU_SUCCR_1, 16, 13, 0, 13, 2, 0); CCU_GATE_DEFINE(wdt_clk, CCU_PARENT_HW(pll1_d96_25p6), MPMU_WDTPCR, BIT(1), 0); -CCU_FACTOR_GATE_DEFINE(i2s_sysclk, CCU_PARENT_HW(pll1_d16_153p6), MPMU_ISCCR, BIT(31), 50, 1); -CCU_FACTOR_GATE_DEFINE(i2s_bclk, CCU_PARENT_HW(i2s_sysclk), MPMU_ISCCR, BIT(29), 1, 1); +CCU_FACTOR_DEFINE(i2s_153p6, CCU_PARENT_HW(pll1_d8_307p2), 2, 1); + +static const struct clk_parent_data i2s_153p6_base_parents[] = { + CCU_PARENT_HW(i2s_153p6), + CCU_PARENT_HW(pll1_d8_307p2), +}; +CCU_MUX_DEFINE(i2s_153p6_base, i2s_153p6_base_parents, MPMU_FCCR, 29, 1, 0); + +static const struct clk_parent_data i2s_sysclk_src_parents[] = { + CCU_PARENT_HW(pll1_d96_25p6), + CCU_PARENT_HW(i2s_153p6_base) +}; +CCU_MUX_GATE_DEFINE(i2s_sysclk_src, i2s_sysclk_src_parents, MPMU_ISCCR, 30, 1, BIT(31), 0); + +CCU_DDN_DEFINE(i2s_sysclk, i2s_sysclk_src, MPMU_ISCCR, 0, 15, 15, 12, 1, 0); + +CCU_FACTOR_DEFINE(i2s_bclk_factor, CCU_PARENT_HW(i2s_sysclk), 2, 1); +/* + * Divider of i2s_bclk always implies a 1/2 factor, which is + * described by i2s_bclk_factor. + */ +CCU_DIV_GATE_DEFINE(i2s_bclk, CCU_PARENT_HW(i2s_bclk_factor), MPMU_ISCCR, 27, 2, BIT(29), 0); static const struct clk_parent_data apb_parents[] = { CCU_PARENT_HW(pll1_d96_25p6), @@ -775,6 +795,10 @@ static struct clk_hw *k1_ccu_mpmu_hws[] = { [CLK_I2S_BCLK] = &i2s_bclk.common.hw, [CLK_APB] = &apb_clk.common.hw, [CLK_WDT_BUS] = &wdt_bus_clk.common.hw, + [CLK_I2S_153P6] = &i2s_153p6.common.hw, + [CLK_I2S_153P6_BASE] = &i2s_153p6_base.common.hw, + [CLK_I2S_SYSCLK_SRC] = &i2s_sysclk_src.common.hw, + [CLK_I2S_BCLK_FACTOR] = &i2s_bclk_factor.common.hw, }; static const struct spacemit_ccu_data k1_ccu_mpmu_data = { diff --git a/include/soc/spacemit/k1-syscon.h b/include/soc/spacemit/k1-syscon.h index c59bd7a38e5b..354751562c55 100644 --- a/include/soc/spacemit/k1-syscon.h +++ b/include/soc/spacemit/k1-syscon.h @@ -30,6 +30,7 @@ to_spacemit_ccu_adev(struct auxiliary_device *adev) /* MPMU register offset */ #define MPMU_POSR 0x0010 +#define MPMU_FCCR 0x0008 #define POSR_PLL1_LOCK BIT(27) #define POSR_PLL2_LOCK BIT(28) #define POSR_PLL3_LOCK BIT(29) -- cgit v1.2.3 From f75f66683ded09f7135aef2e763c245a07c8271a Mon Sep 17 00:00:00 2001 From: Dan Moulding Date: Mon, 8 Sep 2025 10:12:43 -0600 Subject: crypto: comp - Use same definition of context alloc and free ops In commit 42d9f6c77479 ("crypto: acomp - Move scomp stream allocation code into acomp"), the crypto_acomp_streams struct was made to rely on having the alloc_ctx and free_ctx operations defined in the same order as the scomp_alg struct. But in that same commit, the alloc_ctx and free_ctx members of scomp_alg may be randomized by structure layout randomization, since they are contained in a pure ops structure (containing only function pointers). If the pointers within scomp_alg are randomized, but those in crypto_acomp_streams are not, then the order may no longer match. This fixes the problem by removing the union from scomp_alg so that both crypto_acomp_streams and scomp_alg will share the same definition of alloc_ctx and free_ctx, ensuring they will always have the same layout. Signed-off-by: Dan Moulding Suggested-by: Herbert Xu Fixes: 42d9f6c77479 ("crypto: acomp - Move scomp stream allocation code into acomp") Signed-off-by: Herbert Xu --- crypto/842.c | 6 ++++-- crypto/lz4.c | 6 ++++-- crypto/lz4hc.c | 6 ++++-- crypto/lzo-rle.c | 6 ++++-- crypto/lzo.c | 6 ++++-- drivers/crypto/nx/nx-common-powernv.c | 6 ++++-- drivers/crypto/nx/nx-common-pseries.c | 6 ++++-- include/crypto/internal/scompress.h | 11 +---------- 8 files changed, 29 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/crypto/842.c b/crypto/842.c index 8c257c40e2b9..4007e87bed80 100644 --- a/crypto/842.c +++ b/crypto/842.c @@ -54,8 +54,10 @@ static int crypto842_sdecompress(struct crypto_scomp *tfm, } static struct scomp_alg scomp = { - .alloc_ctx = crypto842_alloc_ctx, - .free_ctx = crypto842_free_ctx, + .streams = { + .alloc_ctx = crypto842_alloc_ctx, + .free_ctx = crypto842_free_ctx, + }, .compress = crypto842_scompress, .decompress = crypto842_sdecompress, .base = { diff --git a/crypto/lz4.c b/crypto/lz4.c index 7a984ae5ae52..57b713516aef 100644 --- a/crypto/lz4.c +++ b/crypto/lz4.c @@ -68,8 +68,10 @@ static int lz4_sdecompress(struct crypto_scomp *tfm, const u8 *src, } static struct scomp_alg scomp = { - .alloc_ctx = lz4_alloc_ctx, - .free_ctx = lz4_free_ctx, + .streams = { + .alloc_ctx = lz4_alloc_ctx, + .free_ctx = lz4_free_ctx, + }, .compress = lz4_scompress, .decompress = lz4_sdecompress, .base = { diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c index 9c61d05b6214..bb84f8a68cb5 100644 --- a/crypto/lz4hc.c +++ b/crypto/lz4hc.c @@ -66,8 +66,10 @@ static int lz4hc_sdecompress(struct crypto_scomp *tfm, const u8 *src, } static struct scomp_alg scomp = { - .alloc_ctx = lz4hc_alloc_ctx, - .free_ctx = lz4hc_free_ctx, + .streams = { + .alloc_ctx = lz4hc_alloc_ctx, + .free_ctx = lz4hc_free_ctx, + }, .compress = lz4hc_scompress, .decompress = lz4hc_sdecompress, .base = { diff --git a/crypto/lzo-rle.c b/crypto/lzo-rle.c index ba013f2d5090..794e7ec49536 100644 --- a/crypto/lzo-rle.c +++ b/crypto/lzo-rle.c @@ -70,8 +70,10 @@ static int lzorle_sdecompress(struct crypto_scomp *tfm, const u8 *src, } static struct scomp_alg scomp = { - .alloc_ctx = lzorle_alloc_ctx, - .free_ctx = lzorle_free_ctx, + .streams = { + .alloc_ctx = lzorle_alloc_ctx, + .free_ctx = lzorle_free_ctx, + }, .compress = lzorle_scompress, .decompress = lzorle_sdecompress, .base = { diff --git a/crypto/lzo.c b/crypto/lzo.c index 7867e2c67c4e..d43242b24b4e 100644 --- a/crypto/lzo.c +++ b/crypto/lzo.c @@ -70,8 +70,10 @@ static int lzo_sdecompress(struct crypto_scomp *tfm, const u8 *src, } static struct scomp_alg scomp = { - .alloc_ctx = lzo_alloc_ctx, - .free_ctx = lzo_free_ctx, + .streams = { + .alloc_ctx = lzo_alloc_ctx, + .free_ctx = lzo_free_ctx, + }, .compress = lzo_scompress, .decompress = lzo_sdecompress, .base = { diff --git a/drivers/crypto/nx/nx-common-powernv.c b/drivers/crypto/nx/nx-common-powernv.c index fd0a98b2fb1b..0493041ea088 100644 --- a/drivers/crypto/nx/nx-common-powernv.c +++ b/drivers/crypto/nx/nx-common-powernv.c @@ -1043,8 +1043,10 @@ static struct scomp_alg nx842_powernv_alg = { .base.cra_priority = 300, .base.cra_module = THIS_MODULE, - .alloc_ctx = nx842_powernv_crypto_alloc_ctx, - .free_ctx = nx842_crypto_free_ctx, + .streams = { + .alloc_ctx = nx842_powernv_crypto_alloc_ctx, + .free_ctx = nx842_crypto_free_ctx, + }, .compress = nx842_crypto_compress, .decompress = nx842_crypto_decompress, }; diff --git a/drivers/crypto/nx/nx-common-pseries.c b/drivers/crypto/nx/nx-common-pseries.c index f528e072494a..fc0222ebe807 100644 --- a/drivers/crypto/nx/nx-common-pseries.c +++ b/drivers/crypto/nx/nx-common-pseries.c @@ -1020,8 +1020,10 @@ static struct scomp_alg nx842_pseries_alg = { .base.cra_priority = 300, .base.cra_module = THIS_MODULE, - .alloc_ctx = nx842_pseries_crypto_alloc_ctx, - .free_ctx = nx842_crypto_free_ctx, + .streams = { + .alloc_ctx = nx842_pseries_crypto_alloc_ctx, + .free_ctx = nx842_crypto_free_ctx, + }, .compress = nx842_crypto_compress, .decompress = nx842_crypto_decompress, }; diff --git a/include/crypto/internal/scompress.h b/include/crypto/internal/scompress.h index 533d6c16a491..6a2c5f2e90f9 100644 --- a/include/crypto/internal/scompress.h +++ b/include/crypto/internal/scompress.h @@ -18,11 +18,8 @@ struct crypto_scomp { /** * struct scomp_alg - synchronous compression algorithm * - * @alloc_ctx: Function allocates algorithm specific context - * @free_ctx: Function frees context allocated with alloc_ctx * @compress: Function performs a compress operation * @decompress: Function performs a de-compress operation - * @base: Common crypto API algorithm data structure * @streams: Per-cpu memory for algorithm * @calg: Cmonn algorithm data structure shared with acomp */ @@ -34,13 +31,7 @@ struct scomp_alg { unsigned int slen, u8 *dst, unsigned int *dlen, void *ctx); - union { - struct { - void *(*alloc_ctx)(void); - void (*free_ctx)(void *ctx); - }; - struct crypto_acomp_streams streams; - }; + struct crypto_acomp_streams streams; union { struct COMP_ALG_COMMON; -- cgit v1.2.3 From bee8a520eb84950193d0566ea2c2e46406a4b6ce Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 9 Sep 2025 17:50:56 +0800 Subject: rhashtable: Use rcu_dereference_all and rcu_dereference_all_check Add rcu_dereference_all and rcu_dereference_all_check so that library code such as rhashtable can be used with any RCU variant. As it stands rcu_dereference is used within rashtable, which creates false-positive warnings if the user calls it from another RCU context, such as preempt_disable(). Use the rcu_dereference_all and rcu_dereference_all_check calls in rhashtable to suppress these warnings. Also replace the rcu_dereference_raw calls in the list iterators with rcu_dereference_all to uncover buggy calls. Reported-by: Menglong Dong Signed-off-by: Herbert Xu Reviewed-by: Paul E. McKenney Signed-off-by: Herbert Xu --- include/linux/rcupdate.h | 26 ++++++++++++++++++++++++++ include/linux/rhashtable.h | 14 +++++++------- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb..448eb1f0cb48 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -713,6 +713,24 @@ do { \ (c) || rcu_read_lock_sched_held(), \ __rcu) +/** + * rcu_dereference_all_check() - rcu_dereference_all with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is similar to rcu_dereference_check(), but allows protection + * by all forms of vanilla RCU readers, including preemption disabled, + * bh-disabled, and interrupt-disabled regions of code. Note that "vanilla + * RCU" excludes SRCU and the various Tasks RCU flavors. Please note + * that this macro should not be backported to any Linux-kernel version + * preceding v5.0 due to changes in synchronize_rcu() semantics prior + * to that version. + */ +#define rcu_dereference_all_check(p, c) \ + __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ + (c) || rcu_read_lock_any_held(), \ + __rcu) + /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. @@ -767,6 +785,14 @@ do { \ */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) +/** + * rcu_dereference_all() - fetch RCU-all-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing + * + * Makes rcu_dereference_check() do the dirty work. + */ +#define rcu_dereference_all(p) rcu_dereference_all_check(p, 0) + /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index e740157f3cd7..05a221ce79a6 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -272,13 +272,13 @@ struct rhash_lock_head __rcu **rht_bucket_nested_insert( rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_rcu(p, ht) \ - rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) + rcu_dereference_all_check(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_bucket(p, tbl, hash) \ rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_dereference_bucket_rcu(p, tbl, hash) \ - rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) + rcu_dereference_all_check(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) @@ -373,7 +373,7 @@ static inline struct rhash_head *__rht_ptr( static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { - return __rht_ptr(rcu_dereference(*bkt), bkt); + return __rht_ptr(rcu_dereference_all(*bkt), bkt); } static inline struct rhash_head *rht_ptr( @@ -497,7 +497,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, for (({barrier(); }), \ pos = head; \ !rht_is_a_nulls(pos); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_all(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain @@ -513,7 +513,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, for (({barrier(); }), \ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ !rht_is_a_nulls(pos); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_all(pos->next)) /** * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head @@ -560,7 +560,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, * list returned by rhltable_lookup. */ #define rhl_for_each_rcu(pos, list) \ - for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) + for (pos = list; pos; pos = rcu_dereference_all(pos->next)) /** * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type @@ -574,7 +574,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, */ #define rhl_for_each_entry_rcu(tpos, pos, list, member) \ for (pos = list; pos && rht_entry(tpos, pos, member); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_all(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) -- cgit v1.2.3 From 3d716c51e0e8791f8dd72479a3e6d5e7650ac35e Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 13 Sep 2025 18:57:51 +0800 Subject: crypto: hisilicon/qm - mask axi error before memory init After the device memory is cleared, if the software sends the doorbell operation, the hardware may trigger a axi error when processing the doorbell. This error is caused by memory clearing and hardware access to address 0. Therefore, the axi error is masked during this period. Signed-off-by: Weili Qian Signed-off-by: Chenghai Huang Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/hpre/hpre_main.c | 100 +++++++++++++++++++---------- drivers/crypto/hisilicon/qm.c | 66 +++++++++++++------ drivers/crypto/hisilicon/sec2/sec_main.c | 90 ++++++++++++++++++-------- drivers/crypto/hisilicon/zip/zip_main.c | 102 ++++++++++++++++++++---------- include/linux/hisi_acc_qm.h | 21 ++++-- 5 files changed, 257 insertions(+), 122 deletions(-) (limited to 'include') diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c index f437f361a2c9..718abe3fa5fe 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_main.c +++ b/drivers/crypto/hisilicon/hpre/hpre_main.c @@ -39,6 +39,7 @@ #define HPRE_HAC_RAS_NFE_ENB 0x301414 #define HPRE_HAC_RAS_FE_ENB 0x301418 #define HPRE_HAC_INT_SET 0x301500 +#define HPRE_AXI_ERROR_MASK GENMASK(21, 10) #define HPRE_RNG_TIMEOUT_NUM 0x301A34 #define HPRE_CORE_INT_ENABLE 0 #define HPRE_RDCHN_INI_ST 0x301a00 @@ -798,8 +799,7 @@ static void hpre_master_ooo_ctrl(struct hisi_qm *qm, bool enable) val1 = readl(qm->io_base + HPRE_AM_OOO_SHUTDOWN_ENB); if (enable) { val1 |= HPRE_AM_OOO_SHUTDOWN_ENABLE; - val2 = hisi_qm_get_hw_info(qm, hpre_basic_info, - HPRE_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + val2 = qm->err_info.dev_err.shutdown_mask; } else { val1 &= ~HPRE_AM_OOO_SHUTDOWN_ENABLE; val2 = 0x0; @@ -813,38 +813,33 @@ static void hpre_master_ooo_ctrl(struct hisi_qm *qm, bool enable) static void hpre_hw_error_disable(struct hisi_qm *qm) { - u32 ce, nfe; - - ce = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_CE_MASK_CAP, qm->cap_ver); - nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver); + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; /* disable hpre hw error interrupts */ - writel(ce | nfe | HPRE_HAC_RAS_FE_ENABLE, qm->io_base + HPRE_INT_MASK); + writel(err_mask, qm->io_base + HPRE_INT_MASK); /* disable HPRE block master OOO when nfe occurs on Kunpeng930 */ hpre_master_ooo_ctrl(qm, false); } static void hpre_hw_error_enable(struct hisi_qm *qm) { - u32 ce, nfe, err_en; - - ce = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_CE_MASK_CAP, qm->cap_ver); - nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver); + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; /* clear HPRE hw error source if having */ - writel(ce | nfe | HPRE_HAC_RAS_FE_ENABLE, qm->io_base + HPRE_HAC_SOURCE_INT); + writel(err_mask, qm->io_base + HPRE_HAC_SOURCE_INT); /* configure error type */ - writel(ce, qm->io_base + HPRE_RAS_CE_ENB); - writel(nfe, qm->io_base + HPRE_RAS_NFE_ENB); - writel(HPRE_HAC_RAS_FE_ENABLE, qm->io_base + HPRE_RAS_FE_ENB); + writel(dev_err->ce, qm->io_base + HPRE_RAS_CE_ENB); + writel(dev_err->nfe, qm->io_base + HPRE_RAS_NFE_ENB); + writel(dev_err->fe, qm->io_base + HPRE_RAS_FE_ENB); /* enable HPRE block master OOO when nfe occurs on Kunpeng930 */ hpre_master_ooo_ctrl(qm, true); /* enable hpre hw error interrupts */ - err_en = ce | nfe | HPRE_HAC_RAS_FE_ENABLE; - writel(~err_en, qm->io_base + HPRE_INT_MASK); + writel(~err_mask, qm->io_base + HPRE_INT_MASK); } static inline struct hisi_qm *hpre_file_to_qm(struct hpre_debugfs_file *file) @@ -1399,9 +1394,8 @@ static void hpre_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts) static void hpre_disable_error_report(struct hisi_qm *qm, u32 err_type) { - u32 nfe_mask; + u32 nfe_mask = qm->err_info.dev_err.nfe; - nfe_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver); writel(nfe_mask & (~err_type), qm->io_base + HPRE_RAS_NFE_ENB); } @@ -1422,11 +1416,11 @@ static enum acc_err_result hpre_get_err_result(struct hisi_qm *qm) err_status = hpre_get_hw_err_status(qm); if (err_status) { - if (err_status & qm->err_info.ecc_2bits_mask) + if (err_status & qm->err_info.dev_err.ecc_2bits_mask) qm->err_status.is_dev_ecc_mbit = true; hpre_log_hw_error(qm, err_status); - if (err_status & qm->err_info.dev_reset_mask) { + if (err_status & qm->err_info.dev_err.reset_mask) { /* Disable the same error reporting until device is recovered. */ hpre_disable_error_report(qm, err_status); return ACC_ERR_NEED_RESET; @@ -1442,28 +1436,64 @@ static bool hpre_dev_is_abnormal(struct hisi_qm *qm) u32 err_status; err_status = hpre_get_hw_err_status(qm); - if (err_status & qm->err_info.dev_shutdown_mask) + if (err_status & qm->err_info.dev_err.shutdown_mask) return true; return false; } +static void hpre_disable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + u32 val; + + val = ~(err_mask & (~HPRE_AXI_ERROR_MASK)); + writel(val, qm->io_base + HPRE_INT_MASK); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask & (~HPRE_AXI_ERROR_MASK), + qm->io_base + HPRE_OOO_SHUTDOWN_SEL); +} + +static void hpre_enable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + + /* clear axi error source */ + writel(HPRE_AXI_ERROR_MASK, qm->io_base + HPRE_HAC_SOURCE_INT); + + writel(~err_mask, qm->io_base + HPRE_INT_MASK); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask, qm->io_base + HPRE_OOO_SHUTDOWN_SEL); +} + static void hpre_err_info_init(struct hisi_qm *qm) { struct hisi_qm_err_info *err_info = &qm->err_info; + struct hisi_qm_err_mask *qm_err = &err_info->qm_err; + struct hisi_qm_err_mask *dev_err = &err_info->dev_err; + + qm_err->fe = HPRE_HAC_RAS_FE_ENABLE; + qm_err->ce = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_QM_CE_MASK_CAP, qm->cap_ver); + qm_err->nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_QM_NFE_MASK_CAP, qm->cap_ver); + qm_err->shutdown_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, + HPRE_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + qm_err->reset_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, + HPRE_QM_RESET_MASK_CAP, qm->cap_ver); + qm_err->ecc_2bits_mask = QM_ECC_MBIT; + + dev_err->fe = HPRE_HAC_RAS_FE_ENABLE; + dev_err->ce = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_CE_MASK_CAP, qm->cap_ver); + dev_err->nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver); + dev_err->shutdown_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, + HPRE_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + dev_err->reset_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, + HPRE_RESET_MASK_CAP, qm->cap_ver); + dev_err->ecc_2bits_mask = HPRE_CORE_ECC_2BIT_ERR | HPRE_OOO_ECC_2BIT_ERR; - err_info->fe = HPRE_HAC_RAS_FE_ENABLE; - err_info->ce = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_QM_CE_MASK_CAP, qm->cap_ver); - err_info->nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_QM_NFE_MASK_CAP, qm->cap_ver); - err_info->ecc_2bits_mask = HPRE_CORE_ECC_2BIT_ERR | HPRE_OOO_ECC_2BIT_ERR; - err_info->dev_shutdown_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, - HPRE_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->qm_shutdown_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, - HPRE_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->qm_reset_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, - HPRE_QM_RESET_MASK_CAP, qm->cap_ver); - err_info->dev_reset_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, - HPRE_RESET_MASK_CAP, qm->cap_ver); err_info->msi_wr_port = HPRE_WR_MSI_PORT; err_info->acpi_rst = "HRST"; } @@ -1481,6 +1511,8 @@ static const struct hisi_qm_err_ini hpre_err_ini = { .err_info_init = hpre_err_info_init, .get_err_result = hpre_get_err_result, .dev_is_abnormal = hpre_dev_is_abnormal, + .disable_axi_error = hpre_disable_axi_error, + .enable_axi_error = hpre_enable_axi_error, }; static int hpre_pf_probe_init(struct hpre *hpre) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index c1ffaae78532..d0d1fc45b8fe 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -147,9 +147,9 @@ #define QM_RAS_CE_TIMES_PER_IRQ 1 #define QM_OOO_SHUTDOWN_SEL 0x1040f8 #define QM_AXI_RRESP_ERR BIT(0) -#define QM_ECC_MBIT BIT(2) #define QM_DB_TIMEOUT BIT(10) #define QM_OF_FIFO_OF BIT(11) +#define QM_RAS_AXI_ERROR (BIT(0) | BIT(1) | BIT(12)) #define QM_RESET_WAIT_TIMEOUT 400 #define QM_PEH_VENDOR_ID 0x1000d8 @@ -165,7 +165,6 @@ #define ACC_MASTER_TRANS_RETURN 0x300150 #define ACC_MASTER_GLOBAL_CTRL 0x300000 #define ACC_AM_CFG_PORT_WR_EN 0x30001c -#define QM_RAS_NFE_MBIT_DISABLE ~QM_ECC_MBIT #define ACC_AM_ROB_ECC_INT_STS 0x300104 #define ACC_ROB_ECC_ERR_MULTPL BIT(1) #define QM_MSI_CAP_ENABLE BIT(16) @@ -522,7 +521,7 @@ static bool qm_check_dev_error(struct hisi_qm *qm) return false; err_status = qm_get_hw_error_status(pf_qm); - if (err_status & pf_qm->err_info.qm_shutdown_mask) + if (err_status & pf_qm->err_info.qm_err.shutdown_mask) return true; if (pf_qm->err_ini->dev_is_abnormal) @@ -1397,17 +1396,17 @@ static void qm_hw_error_init_v1(struct hisi_qm *qm) static void qm_hw_error_cfg(struct hisi_qm *qm) { - struct hisi_qm_err_info *err_info = &qm->err_info; + struct hisi_qm_err_mask *qm_err = &qm->err_info.qm_err; - qm->error_mask = err_info->nfe | err_info->ce | err_info->fe; + qm->error_mask = qm_err->nfe | qm_err->ce | qm_err->fe; /* clear QM hw residual error source */ writel(qm->error_mask, qm->io_base + QM_ABNORMAL_INT_SOURCE); /* configure error type */ - writel(err_info->ce, qm->io_base + QM_RAS_CE_ENABLE); + writel(qm_err->ce, qm->io_base + QM_RAS_CE_ENABLE); writel(QM_RAS_CE_TIMES_PER_IRQ, qm->io_base + QM_RAS_CE_THRESHOLD); - writel(err_info->nfe, qm->io_base + QM_RAS_NFE_ENABLE); - writel(err_info->fe, qm->io_base + QM_RAS_FE_ENABLE); + writel(qm_err->nfe, qm->io_base + QM_RAS_NFE_ENABLE); + writel(qm_err->fe, qm->io_base + QM_RAS_FE_ENABLE); } static void qm_hw_error_init_v2(struct hisi_qm *qm) @@ -1436,7 +1435,7 @@ static void qm_hw_error_init_v3(struct hisi_qm *qm) qm_hw_error_cfg(qm); /* enable close master ooo when hardware error happened */ - writel(qm->err_info.qm_shutdown_mask, qm->io_base + QM_OOO_SHUTDOWN_SEL); + writel(qm->err_info.qm_err.shutdown_mask, qm->io_base + QM_OOO_SHUTDOWN_SEL); irq_unmask = ~qm->error_mask; irq_unmask &= readl(qm->io_base + QM_ABNORMAL_INT_MASK); @@ -1498,6 +1497,7 @@ static void qm_log_hw_error(struct hisi_qm *qm, u32 error_status) static enum acc_err_result qm_hw_error_handle_v2(struct hisi_qm *qm) { + struct hisi_qm_err_mask *qm_err = &qm->err_info.qm_err; u32 error_status; error_status = qm_get_hw_error_status(qm); @@ -1506,17 +1506,16 @@ static enum acc_err_result qm_hw_error_handle_v2(struct hisi_qm *qm) qm->err_status.is_qm_ecc_mbit = true; qm_log_hw_error(qm, error_status); - if (error_status & qm->err_info.qm_reset_mask) { + if (error_status & qm_err->reset_mask) { /* Disable the same error reporting until device is recovered. */ - writel(qm->err_info.nfe & (~error_status), - qm->io_base + QM_RAS_NFE_ENABLE); + writel(qm_err->nfe & (~error_status), qm->io_base + QM_RAS_NFE_ENABLE); return ACC_ERR_NEED_RESET; } /* Clear error source if not need reset. */ writel(error_status, qm->io_base + QM_ABNORMAL_INT_SOURCE); - writel(qm->err_info.nfe, qm->io_base + QM_RAS_NFE_ENABLE); - writel(qm->err_info.ce, qm->io_base + QM_RAS_CE_ENABLE); + writel(qm_err->nfe, qm->io_base + QM_RAS_NFE_ENABLE); + writel(qm_err->ce, qm->io_base + QM_RAS_CE_ENABLE); } return ACC_ERR_RECOVERED; @@ -4227,9 +4226,9 @@ static void qm_dev_ecc_mbit_handle(struct hisi_qm *qm) !qm->err_status.is_qm_ecc_mbit && !qm->err_ini->close_axi_master_ooo) { nfe_enb = readl(qm->io_base + QM_RAS_NFE_ENABLE); - writel(nfe_enb & QM_RAS_NFE_MBIT_DISABLE, + writel(nfe_enb & ~qm->err_info.qm_err.ecc_2bits_mask, qm->io_base + QM_RAS_NFE_ENABLE); - writel(QM_ECC_MBIT, qm->io_base + QM_ABNORMAL_INT_SET); + writel(qm->err_info.qm_err.ecc_2bits_mask, qm->io_base + QM_ABNORMAL_INT_SET); } } @@ -4508,12 +4507,12 @@ static void qm_restart_prepare(struct hisi_qm *qm) qm->io_base + ACC_AM_CFG_PORT_WR_EN); /* clear dev ecc 2bit error source if having */ - value = qm_get_dev_err_status(qm) & qm->err_info.ecc_2bits_mask; + value = qm_get_dev_err_status(qm) & qm->err_info.dev_err.ecc_2bits_mask; if (value && qm->err_ini->clear_dev_hw_err_status) qm->err_ini->clear_dev_hw_err_status(qm, value); /* clear QM ecc mbit error source */ - writel(QM_ECC_MBIT, qm->io_base + QM_ABNORMAL_INT_SOURCE); + writel(qm->err_info.qm_err.ecc_2bits_mask, qm->io_base + QM_ABNORMAL_INT_SOURCE); /* clear AM Reorder Buffer ecc mbit source */ writel(ACC_ROB_ECC_ERR_MULTPL, qm->io_base + ACC_AM_ROB_ECC_INT_STS); @@ -4540,6 +4539,34 @@ clear_flags: qm->err_status.is_dev_ecc_mbit = false; } +static void qm_disable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *qm_err = &qm->err_info.qm_err; + u32 val; + + val = ~(qm->error_mask & (~QM_RAS_AXI_ERROR)); + writel(val, qm->io_base + QM_ABNORMAL_INT_MASK); + if (qm->ver > QM_HW_V2) + writel(qm_err->shutdown_mask & (~QM_RAS_AXI_ERROR), + qm->io_base + QM_OOO_SHUTDOWN_SEL); + + if (qm->err_ini->disable_axi_error) + qm->err_ini->disable_axi_error(qm); +} + +static void qm_enable_axi_error(struct hisi_qm *qm) +{ + /* clear axi error source */ + writel(QM_RAS_AXI_ERROR, qm->io_base + QM_ABNORMAL_INT_SOURCE); + + writel(~qm->error_mask, qm->io_base + QM_ABNORMAL_INT_MASK); + if (qm->ver > QM_HW_V2) + writel(qm->err_info.qm_err.shutdown_mask, qm->io_base + QM_OOO_SHUTDOWN_SEL); + + if (qm->err_ini->enable_axi_error) + qm->err_ini->enable_axi_error(qm); +} + static int qm_controller_reset_done(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; @@ -4573,6 +4600,7 @@ static int qm_controller_reset_done(struct hisi_qm *qm) qm_restart_prepare(qm); hisi_qm_dev_err_init(qm); + qm_disable_axi_error(qm); if (qm->err_ini->open_axi_master_ooo) qm->err_ini->open_axi_master_ooo(qm); @@ -4595,7 +4623,7 @@ static int qm_controller_reset_done(struct hisi_qm *qm) ret = qm_wait_vf_prepare_finish(qm); if (ret) pci_err(pdev, "failed to start by vfs in soft reset!\n"); - + qm_enable_axi_error(qm); qm_cmd_init(qm); qm_restart_done(qm); diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index bdb2d52ee1b6..19fda486fefb 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -47,6 +47,8 @@ #define SEC_RAS_FE_ENB_MSK 0x0 #define SEC_OOO_SHUTDOWN_SEL 0x301014 #define SEC_RAS_DISABLE 0x0 +#define SEC_AXI_ERROR_MASK (BIT(0) | BIT(1)) + #define SEC_MEM_START_INIT_REG 0x301100 #define SEC_MEM_INIT_DONE_REG 0x301104 @@ -713,8 +715,7 @@ static void sec_master_ooo_ctrl(struct hisi_qm *qm, bool enable) val1 = readl(qm->io_base + SEC_CONTROL_REG); if (enable) { val1 |= SEC_AXI_SHUTDOWN_ENABLE; - val2 = hisi_qm_get_hw_info(qm, sec_basic_info, - SEC_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + val2 = qm->err_info.dev_err.shutdown_mask; } else { val1 &= SEC_AXI_SHUTDOWN_DISABLE; val2 = 0x0; @@ -728,7 +729,8 @@ static void sec_master_ooo_ctrl(struct hisi_qm *qm, bool enable) static void sec_hw_error_enable(struct hisi_qm *qm) { - u32 ce, nfe; + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; if (qm->ver == QM_HW_V1) { writel(SEC_CORE_INT_DISABLE, qm->io_base + SEC_CORE_INT_MASK); @@ -736,22 +738,19 @@ static void sec_hw_error_enable(struct hisi_qm *qm) return; } - ce = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_CE_MASK_CAP, qm->cap_ver); - nfe = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_NFE_MASK_CAP, qm->cap_ver); - /* clear SEC hw error source if having */ - writel(ce | nfe | SEC_RAS_FE_ENB_MSK, qm->io_base + SEC_CORE_INT_SOURCE); + writel(err_mask, qm->io_base + SEC_CORE_INT_SOURCE); /* enable RAS int */ - writel(ce, qm->io_base + SEC_RAS_CE_REG); - writel(SEC_RAS_FE_ENB_MSK, qm->io_base + SEC_RAS_FE_REG); - writel(nfe, qm->io_base + SEC_RAS_NFE_REG); + writel(dev_err->ce, qm->io_base + SEC_RAS_CE_REG); + writel(dev_err->fe, qm->io_base + SEC_RAS_FE_REG); + writel(dev_err->nfe, qm->io_base + SEC_RAS_NFE_REG); /* enable SEC block master OOO when nfe occurs on Kunpeng930 */ sec_master_ooo_ctrl(qm, true); /* enable SEC hw error interrupts */ - writel(ce | nfe | SEC_RAS_FE_ENB_MSK, qm->io_base + SEC_CORE_INT_MASK); + writel(err_mask, qm->io_base + SEC_CORE_INT_MASK); } static void sec_hw_error_disable(struct hisi_qm *qm) @@ -1108,9 +1107,8 @@ static void sec_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts) static void sec_disable_error_report(struct hisi_qm *qm, u32 err_type) { - u32 nfe_mask; + u32 nfe_mask = qm->err_info.dev_err.nfe; - nfe_mask = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_NFE_MASK_CAP, qm->cap_ver); writel(nfe_mask & (~err_type), qm->io_base + SEC_RAS_NFE_REG); } @@ -1129,11 +1127,11 @@ static enum acc_err_result sec_get_err_result(struct hisi_qm *qm) err_status = sec_get_hw_err_status(qm); if (err_status) { - if (err_status & qm->err_info.ecc_2bits_mask) + if (err_status & qm->err_info.dev_err.ecc_2bits_mask) qm->err_status.is_dev_ecc_mbit = true; sec_log_hw_error(qm, err_status); - if (err_status & qm->err_info.dev_reset_mask) { + if (err_status & qm->err_info.dev_err.reset_mask) { /* Disable the same error reporting until device is recovered. */ sec_disable_error_report(qm, err_status); return ACC_ERR_NEED_RESET; @@ -1149,28 +1147,62 @@ static bool sec_dev_is_abnormal(struct hisi_qm *qm) u32 err_status; err_status = sec_get_hw_err_status(qm); - if (err_status & qm->err_info.dev_shutdown_mask) + if (err_status & qm->err_info.dev_err.shutdown_mask) return true; return false; } +static void sec_disable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + + writel(err_mask & ~SEC_AXI_ERROR_MASK, qm->io_base + SEC_CORE_INT_MASK); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask & (~SEC_AXI_ERROR_MASK), + qm->io_base + SEC_OOO_SHUTDOWN_SEL); +} + +static void sec_enable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + + /* clear axi error source */ + writel(SEC_AXI_ERROR_MASK, qm->io_base + SEC_CORE_INT_SOURCE); + + writel(err_mask, qm->io_base + SEC_CORE_INT_MASK); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask, qm->io_base + SEC_OOO_SHUTDOWN_SEL); +} + static void sec_err_info_init(struct hisi_qm *qm) { struct hisi_qm_err_info *err_info = &qm->err_info; + struct hisi_qm_err_mask *qm_err = &err_info->qm_err; + struct hisi_qm_err_mask *dev_err = &err_info->dev_err; + + qm_err->fe = SEC_RAS_FE_ENB_MSK; + qm_err->ce = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_QM_CE_MASK_CAP, qm->cap_ver); + qm_err->nfe = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_QM_NFE_MASK_CAP, qm->cap_ver); + qm_err->shutdown_mask = hisi_qm_get_hw_info(qm, sec_basic_info, + SEC_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + qm_err->reset_mask = hisi_qm_get_hw_info(qm, sec_basic_info, + SEC_QM_RESET_MASK_CAP, qm->cap_ver); + qm_err->ecc_2bits_mask = QM_ECC_MBIT; + + dev_err->fe = SEC_RAS_FE_ENB_MSK; + dev_err->ce = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_CE_MASK_CAP, qm->cap_ver); + dev_err->nfe = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_NFE_MASK_CAP, qm->cap_ver); + dev_err->shutdown_mask = hisi_qm_get_hw_info(qm, sec_basic_info, + SEC_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + dev_err->reset_mask = hisi_qm_get_hw_info(qm, sec_basic_info, + SEC_RESET_MASK_CAP, qm->cap_ver); + dev_err->ecc_2bits_mask = SEC_CORE_INT_STATUS_M_ECC; - err_info->fe = SEC_RAS_FE_ENB_MSK; - err_info->ce = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_QM_CE_MASK_CAP, qm->cap_ver); - err_info->nfe = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_QM_NFE_MASK_CAP, qm->cap_ver); - err_info->ecc_2bits_mask = SEC_CORE_INT_STATUS_M_ECC; - err_info->qm_shutdown_mask = hisi_qm_get_hw_info(qm, sec_basic_info, - SEC_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->dev_shutdown_mask = hisi_qm_get_hw_info(qm, sec_basic_info, - SEC_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->qm_reset_mask = hisi_qm_get_hw_info(qm, sec_basic_info, - SEC_QM_RESET_MASK_CAP, qm->cap_ver); - err_info->dev_reset_mask = hisi_qm_get_hw_info(qm, sec_basic_info, - SEC_RESET_MASK_CAP, qm->cap_ver); err_info->msi_wr_port = BIT(0); err_info->acpi_rst = "SRST"; } @@ -1188,6 +1220,8 @@ static const struct hisi_qm_err_ini sec_err_ini = { .err_info_init = sec_err_info_init, .get_err_result = sec_get_err_result, .dev_is_abnormal = sec_dev_is_abnormal, + .disable_axi_error = sec_disable_axi_error, + .enable_axi_error = sec_enable_axi_error, }; static int sec_pf_probe_init(struct sec_dev *sec) diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index 62cd090e13af..6b5cad82c856 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -65,6 +65,7 @@ #define HZIP_SRAM_ECC_ERR_NUM_SHIFT 16 #define HZIP_SRAM_ECC_ERR_ADDR_SHIFT 24 #define HZIP_CORE_INT_MASK_ALL GENMASK(12, 0) +#define HZIP_AXI_ERROR_MASK (BIT(2) | BIT(3)) #define HZIP_SQE_SIZE 128 #define HZIP_PF_DEF_Q_NUM 64 #define HZIP_PF_DEF_Q_BASE 0 @@ -662,8 +663,7 @@ static void hisi_zip_master_ooo_ctrl(struct hisi_qm *qm, bool enable) val1 = readl(qm->io_base + HZIP_SOFT_CTRL_ZIP_CONTROL); if (enable) { val1 |= HZIP_AXI_SHUTDOWN_ENABLE; - val2 = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + val2 = qm->err_info.dev_err.shutdown_mask; } else { val1 &= ~HZIP_AXI_SHUTDOWN_ENABLE; val2 = 0x0; @@ -677,7 +677,8 @@ static void hisi_zip_master_ooo_ctrl(struct hisi_qm *qm, bool enable) static void hisi_zip_hw_error_enable(struct hisi_qm *qm) { - u32 nfe, ce; + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; if (qm->ver == QM_HW_V1) { writel(HZIP_CORE_INT_MASK_ALL, @@ -686,33 +687,29 @@ static void hisi_zip_hw_error_enable(struct hisi_qm *qm) return; } - nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver); - ce = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_CE_MASK_CAP, qm->cap_ver); - /* clear ZIP hw error source if having */ - writel(ce | nfe | HZIP_CORE_INT_RAS_FE_ENB_MASK, qm->io_base + HZIP_CORE_INT_SOURCE); + writel(err_mask, qm->io_base + HZIP_CORE_INT_SOURCE); /* configure error type */ - writel(ce, qm->io_base + HZIP_CORE_INT_RAS_CE_ENB); - writel(HZIP_CORE_INT_RAS_FE_ENB_MASK, qm->io_base + HZIP_CORE_INT_RAS_FE_ENB); - writel(nfe, qm->io_base + HZIP_CORE_INT_RAS_NFE_ENB); + writel(dev_err->ce, qm->io_base + HZIP_CORE_INT_RAS_CE_ENB); + writel(dev_err->fe, qm->io_base + HZIP_CORE_INT_RAS_FE_ENB); + writel(dev_err->nfe, qm->io_base + HZIP_CORE_INT_RAS_NFE_ENB); hisi_zip_master_ooo_ctrl(qm, true); /* enable ZIP hw error interrupts */ - writel(0, qm->io_base + HZIP_CORE_INT_MASK_REG); + writel(~err_mask, qm->io_base + HZIP_CORE_INT_MASK_REG); hisi_dae_hw_error_enable(qm); } static void hisi_zip_hw_error_disable(struct hisi_qm *qm) { - u32 nfe, ce; + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; /* disable ZIP hw error interrupts */ - nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver); - ce = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_CE_MASK_CAP, qm->cap_ver); - writel(ce | nfe | HZIP_CORE_INT_RAS_FE_ENB_MASK, qm->io_base + HZIP_CORE_INT_MASK_REG); + writel(err_mask, qm->io_base + HZIP_CORE_INT_MASK_REG); hisi_zip_master_ooo_ctrl(qm, false); @@ -1186,9 +1183,8 @@ static void hisi_zip_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts) static void hisi_zip_disable_error_report(struct hisi_qm *qm, u32 err_type) { - u32 nfe_mask; + u32 nfe_mask = qm->err_info.dev_err.nfe; - nfe_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver); writel(nfe_mask & (~err_type), qm->io_base + HZIP_CORE_INT_RAS_NFE_ENB); } @@ -1230,14 +1226,14 @@ static enum acc_err_result hisi_zip_get_err_result(struct hisi_qm *qm) /* Get device hardware new error status */ err_status = hisi_zip_get_hw_err_status(qm); if (err_status) { - if (err_status & qm->err_info.ecc_2bits_mask) + if (err_status & qm->err_info.dev_err.ecc_2bits_mask) qm->err_status.is_dev_ecc_mbit = true; hisi_zip_log_hw_error(qm, err_status); - if (err_status & qm->err_info.dev_reset_mask) { + if (err_status & qm->err_info.dev_err.reset_mask) { /* Disable the same error reporting until device is recovered. */ hisi_zip_disable_error_report(qm, err_status); - return ACC_ERR_NEED_RESET; + zip_result = ACC_ERR_NEED_RESET; } else { hisi_zip_clear_hw_err_status(qm, err_status); } @@ -1255,7 +1251,7 @@ static bool hisi_zip_dev_is_abnormal(struct hisi_qm *qm) u32 err_status; err_status = hisi_zip_get_hw_err_status(qm); - if (err_status & qm->err_info.dev_shutdown_mask) + if (err_status & qm->err_info.dev_err.shutdown_mask) return true; return hisi_dae_dev_is_abnormal(qm); @@ -1266,23 +1262,59 @@ static int hisi_zip_set_priv_status(struct hisi_qm *qm) return hisi_dae_close_axi_master_ooo(qm); } +static void hisi_zip_disable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + u32 val; + + val = ~(err_mask & (~HZIP_AXI_ERROR_MASK)); + writel(val, qm->io_base + HZIP_CORE_INT_MASK_REG); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask & (~HZIP_AXI_ERROR_MASK), + qm->io_base + HZIP_OOO_SHUTDOWN_SEL); +} + +static void hisi_zip_enable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + + /* clear axi error source */ + writel(HZIP_AXI_ERROR_MASK, qm->io_base + HZIP_CORE_INT_SOURCE); + + writel(~err_mask, qm->io_base + HZIP_CORE_INT_MASK_REG); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask, qm->io_base + HZIP_OOO_SHUTDOWN_SEL); +} + static void hisi_zip_err_info_init(struct hisi_qm *qm) { struct hisi_qm_err_info *err_info = &qm->err_info; + struct hisi_qm_err_mask *qm_err = &err_info->qm_err; + struct hisi_qm_err_mask *dev_err = &err_info->dev_err; + + qm_err->fe = HZIP_CORE_INT_RAS_FE_ENB_MASK; + qm_err->ce = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_QM_CE_MASK_CAP, qm->cap_ver); + qm_err->nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, + ZIP_QM_NFE_MASK_CAP, qm->cap_ver); + qm_err->ecc_2bits_mask = QM_ECC_MBIT; + qm_err->reset_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, + ZIP_QM_RESET_MASK_CAP, qm->cap_ver); + qm_err->shutdown_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, + ZIP_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + + dev_err->fe = HZIP_CORE_INT_RAS_FE_ENB_MASK; + dev_err->ce = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_CE_MASK_CAP, qm->cap_ver); + dev_err->nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver); + dev_err->ecc_2bits_mask = HZIP_CORE_INT_STATUS_M_ECC; + dev_err->shutdown_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, + ZIP_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + dev_err->reset_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, + ZIP_RESET_MASK_CAP, qm->cap_ver); - err_info->fe = HZIP_CORE_INT_RAS_FE_ENB_MASK; - err_info->ce = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_QM_CE_MASK_CAP, qm->cap_ver); - err_info->nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_QM_NFE_MASK_CAP, qm->cap_ver); - err_info->ecc_2bits_mask = HZIP_CORE_INT_STATUS_M_ECC; - err_info->qm_shutdown_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->dev_shutdown_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->qm_reset_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_QM_RESET_MASK_CAP, qm->cap_ver); - err_info->dev_reset_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_RESET_MASK_CAP, qm->cap_ver); err_info->msi_wr_port = HZIP_WR_PORT; err_info->acpi_rst = "ZRST"; } @@ -1302,6 +1334,8 @@ static const struct hisi_qm_err_ini hisi_zip_err_ini = { .get_err_result = hisi_zip_get_err_result, .set_priv_status = hisi_zip_set_priv_status, .dev_is_abnormal = hisi_zip_dev_is_abnormal, + .disable_axi_error = hisi_zip_disable_axi_error, + .enable_axi_error = hisi_zip_enable_axi_error, }; static int hisi_zip_pf_probe_init(struct hisi_zip *hisi_zip) diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index f2254ddc327c..c4690e365ade 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -104,6 +104,8 @@ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ #define UACCE_MODE_DESC "0(default) means only register to crypto, 1 means both register to crypto and uacce" +#define QM_ECC_MBIT BIT(2) + enum qm_stop_reason { QM_NORMAL, QM_SOFT_RESET, @@ -240,19 +242,22 @@ enum acc_err_result { ACC_ERR_RECOVERED, }; -struct hisi_qm_err_info { - char *acpi_rst; - u32 msi_wr_port; +struct hisi_qm_err_mask { u32 ecc_2bits_mask; - u32 qm_shutdown_mask; - u32 dev_shutdown_mask; - u32 qm_reset_mask; - u32 dev_reset_mask; + u32 shutdown_mask; + u32 reset_mask; u32 ce; u32 nfe; u32 fe; }; +struct hisi_qm_err_info { + char *acpi_rst; + u32 msi_wr_port; + struct hisi_qm_err_mask qm_err; + struct hisi_qm_err_mask dev_err; +}; + struct hisi_qm_err_status { u32 is_qm_ecc_mbit; u32 is_dev_ecc_mbit; @@ -273,6 +278,8 @@ struct hisi_qm_err_ini { enum acc_err_result (*get_err_result)(struct hisi_qm *qm); bool (*dev_is_abnormal)(struct hisi_qm *qm); int (*set_priv_status)(struct hisi_qm *qm); + void (*disable_axi_error)(struct hisi_qm *qm); + void (*enable_axi_error)(struct hisi_qm *qm); }; struct hisi_qm_cap_info { -- cgit v1.2.3 From 79525b51acc1c8e331ab47eb131a99f5370a76c2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 19 Sep 2025 12:38:58 -0700 Subject: io_uring: fix nvme's 32b cqes on mixed cq The nvme uring_cmd only uses 32b CQEs. If the ring uses a mixed CQ, then we need to make sure we flag the completion as a 32b CQE. On the other hand, if nvme uring_cmd was using a dedicated 32b CQE, the posting was missing the extra memcpy because it only applied to bit CQEs on a mixed CQ. Fixes: e26dca67fde1943 ("io_uring: add support for IORING_SETUP_CQE_MIXED") Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 2 +- include/linux/io_uring/cmd.h | 20 ++++++++++++++++---- io_uring/io_uring.h | 2 +- io_uring/uring_cmd.c | 11 +++++++---- 4 files changed, 25 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 6b3ac8ae3f34..e28bb9113f64 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -410,7 +410,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, if (pdu->bio) blk_rq_unmap_user(pdu->bio); - io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags); + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags); } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c8185f54fde9..02d50f08f668 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -56,8 +56,8 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, * Note: the caller should never hard code @issue_flags and is only allowed * to pass the mask provided by the core io_uring code. */ -void io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, - unsigned issue_flags); +void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, + unsigned issue_flags, bool is_cqe32); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, io_uring_cmd_tw_t task_work_cb, @@ -104,8 +104,8 @@ static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, { return -EOPNOTSUPP; } -static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, - u64 ret2, unsigned issue_flags) +static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, + u64 ret2, unsigned issue_flags, bool is_cqe32) { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, @@ -159,6 +159,18 @@ static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd) return cmd_to_io_kiocb(cmd)->ctx; } +static inline void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, + u64 res2, unsigned issue_flags) +{ + return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, false); +} + +static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret, + u64 res2, unsigned issue_flags) +{ + return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true); +} + int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, void (*release)(void *), unsigned int index, unsigned int issue_flags); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e92099d8fcb7..af4c11310652 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -275,7 +275,7 @@ static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, return false; memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (is_cqe32) { + if (ctx->flags & IORING_SETUP_CQE32 || is_cqe32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 5562e8491c5b..a688c9f1a21c 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -151,8 +151,8 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, - unsigned issue_flags) +void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, + unsigned issue_flags, bool is_cqe32) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -165,8 +165,11 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, req_set_fail(req); io_req_set_res(req, ret, 0); - if (req->ctx->flags & IORING_SETUP_CQE32) + if (is_cqe32) { + if (req->ctx->flags & IORING_SETUP_CQE_MIXED) + req->cqe.flags |= IORING_CQE_F_32; io_req_set_cqe32_extra(req, res2, 0); + } io_req_uring_cleanup(req, issue_flags); if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ @@ -180,7 +183,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, io_req_task_work_add(req); } } -EXPORT_SYMBOL_GPL(io_uring_cmd_done); +EXPORT_SYMBOL_GPL(__io_uring_cmd_done); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -- cgit v1.2.3 From 9e622804d57e2d08f0271200606bd1270f75126f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Aug 2025 11:10:20 -0400 Subject: Bluetooth: hci_event: Fix UAF in hci_acl_create_conn_sync This fixes the following UFA in hci_acl_create_conn_sync where a connection still pending is command submission (conn->state == BT_OPEN) maybe freed, also since this also can happen with the likes of hci_le_create_conn_sync fix it as well: BUG: KASAN: slab-use-after-free in hci_acl_create_conn_sync+0x5ef/0x790 net/bluetooth/hci_sync.c:6861 Write of size 2 at addr ffff88805ffcc038 by task kworker/u11:2/9541 CPU: 1 UID: 0 PID: 9541 Comm: kworker/u11:2 Not tainted 6.16.0-rc7 #3 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Workqueue: hci3 hci_cmd_sync_work Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x230 mm/kasan/report.c:480 kasan_report+0x118/0x150 mm/kasan/report.c:593 hci_acl_create_conn_sync+0x5ef/0x790 net/bluetooth/hci_sync.c:6861 hci_cmd_sync_work+0x210/0x3a0 net/bluetooth/hci_sync.c:332 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Allocated by task 123736: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] __hci_conn_add+0x233/0x1b30 net/bluetooth/hci_conn.c:939 hci_conn_add_unset net/bluetooth/hci_conn.c:1051 [inline] hci_connect_acl+0x16c/0x4e0 net/bluetooth/hci_conn.c:1634 pair_device+0x418/0xa70 net/bluetooth/mgmt.c:3556 hci_mgmt_cmd+0x9c9/0xef0 net/bluetooth/hci_sock.c:1719 hci_sock_sendmsg+0x6ca/0xef0 net/bluetooth/hci_sock.c:1839 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg+0x219/0x270 net/socket.c:727 sock_write_iter+0x258/0x330 net/socket.c:1131 new_sync_write fs/read_write.c:593 [inline] vfs_write+0x54b/0xa90 fs/read_write.c:686 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 103680: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x18e/0x440 mm/slub.c:4842 device_release+0x9c/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x22b/0x480 lib/kobject.c:737 hci_conn_cleanup net/bluetooth/hci_conn.c:175 [inline] hci_conn_del+0x8ff/0xcb0 net/bluetooth/hci_conn.c:1173 hci_conn_complete_evt+0x3c7/0x1040 net/bluetooth/hci_event.c:3199 hci_event_func net/bluetooth/hci_event.c:7477 [inline] hci_event_packet+0x7e0/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Last potentially related work creation: kasan_save_stack+0x3e/0x60 mm/kasan/common.c:47 kasan_record_aux_stack+0xbd/0xd0 mm/kasan/generic.c:548 insert_work+0x3d/0x330 kernel/workqueue.c:2183 __queue_work+0xbd9/0xfe0 kernel/workqueue.c:2345 queue_delayed_work_on+0x18b/0x280 kernel/workqueue.c:2561 pairing_complete+0x1e7/0x2b0 net/bluetooth/mgmt.c:3451 pairing_complete_cb+0x1ac/0x230 net/bluetooth/mgmt.c:3487 hci_connect_cfm include/net/bluetooth/hci_core.h:2064 [inline] hci_conn_failed+0x24d/0x310 net/bluetooth/hci_conn.c:1275 hci_conn_complete_evt+0x3c7/0x1040 net/bluetooth/hci_event.c:3199 hci_event_func net/bluetooth/hci_event.c:7477 [inline] hci_event_packet+0x7e0/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Fixes: aef2aa4fa98e ("Bluetooth: hci_event: Fix creating hci_conn object on error status") Reported-by: Junvyyang, Tencent Zhuque Lab Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 21 +++++++++++++++++++++ net/bluetooth/hci_event.c | 26 +++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6906af7a8f24..6560b32f3125 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1245,6 +1245,27 @@ static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_role(struct hci_dev *hdev, + __u8 type, __u8 role, + bdaddr_t *ba) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == type && c->role == role && !bacmp(&c->dst, ba)) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev, bdaddr_t *ba, __u8 ba_type) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 97f543824bb0..fe49e8a7969f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3087,8 +3087,18 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + /* Check for existing connection: + * + * 1. If it doesn't exist then it must be receiver/slave role. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like a fuzzing + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); - if (!conn) { + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. */ @@ -5638,8 +5648,18 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr); - if (!conn) { + /* Check for existing connection: + * + * 1. If it doesn't exist then use the role to create a new object. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like a fuzzing + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ + conn = hci_conn_hash_lookup_role(hdev, LE_LINK, role, bdaddr); + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. */ -- cgit v1.2.3 From 64e7df08ed43e45aa1a8382b459b516d04d47e99 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Thu, 24 Jul 2025 10:39:05 +0200 Subject: dt-bindings: clock: mt7622: Add AFE_MRGIF clock Add the missing AFE Merge Interface clock to MT7622 to make use of it in the audio subsystem. While at it, also remove the useless CLK_AUDIO_NR_CLK definition. Signed-off-by: AngeloGioacchino Del Regno Acked-by: Rob Herring (Arm) Signed-off-by: Stephen Boyd --- include/dt-bindings/clock/mt7622-clk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/dt-bindings/clock/mt7622-clk.h b/include/dt-bindings/clock/mt7622-clk.h index c12e7eab0788..a173eb132892 100644 --- a/include/dt-bindings/clock/mt7622-clk.h +++ b/include/dt-bindings/clock/mt7622-clk.h @@ -228,7 +228,7 @@ #define CLK_AUDIO_MEM_ASRC4 44 #define CLK_AUDIO_MEM_ASRC5 45 #define CLK_AUDIO_AFE_CONN 46 -#define CLK_AUDIO_NR_CLK 47 +#define CLK_AUDIO_AFE_MRGIF 47 /* SSUSBSYS */ -- cgit v1.2.3 From dd240e95f1bee671f58148dea25e3be7cb39b50d Mon Sep 17 00:00:00 2001 From: Laura Nao Date: Mon, 15 Sep 2025 17:19:29 +0200 Subject: dt-bindings: clock: mediatek: Describe MT8196 clock controllers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce binding documentation for system clocks, functional clocks, and PEXTP0/1 and UFS reset controllers on MediaTek MT8196. This binding also includes a handle to the hardware voter, a fixed-function MCU designed to aggregate votes from the application processor and other remote processors to manage clocks and power domains. The HWV on MT8196/MT6991 is incomplete and requires software to manually enable power supplies, parent clocks, and FENC, as well as write to both the HWV MMIO and the controller registers. Because of these constraints, the HWV cannot be modeled using generic clock, power domain, or interconnect APIs. Instead, a custom phandle is exceptionally used to provide direct, syscon-like register access to drivers. Reviewed-by: Nícolas F. R. A. Prado Co-developed-by: AngeloGioacchino Del Regno Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Krzysztof Kozlowski Signed-off-by: Laura Nao Signed-off-by: Stephen Boyd --- .../bindings/clock/mediatek,mt8196-clock.yaml | 112 +++ .../bindings/clock/mediatek,mt8196-sys-clock.yaml | 107 +++ include/dt-bindings/clock/mediatek,mt8196-clock.h | 803 +++++++++++++++++++++ include/dt-bindings/reset/mediatek,mt8196-resets.h | 26 + 4 files changed, 1048 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/mediatek,mt8196-clock.yaml create mode 100644 Documentation/devicetree/bindings/clock/mediatek,mt8196-sys-clock.yaml create mode 100644 include/dt-bindings/clock/mediatek,mt8196-clock.h create mode 100644 include/dt-bindings/reset/mediatek,mt8196-resets.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt8196-clock.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt8196-clock.yaml new file mode 100644 index 000000000000..bfdbd2e4a167 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/mediatek,mt8196-clock.yaml @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/mediatek,mt8196-clock.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek Functional Clock Controller for MT8196 + +maintainers: + - Guangjie Song + - Laura Nao + +description: | + The clock architecture in MediaTek SoCs is structured like below: + PLLs --> + dividers --> + muxes + --> + clock gate + + The device nodes provide clock gate control in different IP blocks. + +properties: + compatible: + items: + - enum: + - mediatek,mt8196-imp-iic-wrap-c + - mediatek,mt8196-imp-iic-wrap-e + - mediatek,mt8196-imp-iic-wrap-n + - mediatek,mt8196-imp-iic-wrap-w + - mediatek,mt8196-mdpsys0 + - mediatek,mt8196-mdpsys1 + - mediatek,mt8196-pericfg-ao + - mediatek,mt8196-pextp0cfg-ao + - mediatek,mt8196-pextp1cfg-ao + - mediatek,mt8196-ufscfg-ao + - mediatek,mt8196-vencsys + - mediatek,mt8196-vencsys-c1 + - mediatek,mt8196-vencsys-c2 + - mediatek,mt8196-vdecsys + - mediatek,mt8196-vdecsys-soc + - mediatek,mt8196-vdisp-ao + - const: syscon + + reg: + maxItems: 1 + + '#clock-cells': + const: 1 + + '#reset-cells': + const: 1 + description: + Reset lines for PEXTP0/1 and UFS blocks. + + mediatek,hardware-voter: + $ref: /schemas/types.yaml#/definitions/phandle + description: | + Phandle to the "Hardware Voter" (HWV), as named in the vendor + documentation for MT8196/MT6991. + + The HWV is a SoC-internal fixed-function MCU used to collect votes from + both the Application Processor and other remote processors within the SoC. + It is intended to transparently enable or disable hardware resources (such + as power domains or clocks) based on internal vote aggregation handled by + the MCU's internal state machine. + + However, in practice, this design is incomplete. While the HWV performs + some internal vote aggregation,software is still required to + - Manually enable power supplies externally, if present and if required + - Manually enable parent clocks via direct MMIO writes to clock controllers + - Enable the FENC after the clock has been ungated via direct MMIO + writes to clock controllers + + As such, the HWV behaves more like a hardware-managed clock reference + counter than a true voter. Furthermore, it is not a separate + controller. It merely serves as an alternative interface to the same + underlying clock or power controller. Actual control still requires + direct access to the controller's own MMIO register space, in + addition to writing to the HWV's MMIO region. + + For this reason, a custom phandle is used here - drivers need to directly + access the HWV MMIO region in a syscon-like fashion, due to how the + hardware is wired. This differs from true hardware voting systems, which + typically do not require custom phandles and rely instead on generic APIs + (clocks, power domains, interconnects). + + The name "hardware-voter" is retained to match vendor documentation, but + this should not be reused or misunderstood as a proper voting mechanism. + +required: + - compatible + - reg + - '#clock-cells' + +additionalProperties: false + +examples: + - | + pericfg_ao: clock-controller@16640000 { + compatible = "mediatek,mt8196-pericfg-ao", "syscon"; + reg = <0x16640000 0x1000>; + mediatek,hardware-voter = <&scp_hwv>; + #clock-cells = <1>; + }; + - | + pextp0cfg_ao: clock-controller@169b0000 { + compatible = "mediatek,mt8196-pextp0cfg-ao", "syscon"; + reg = <0x169b0000 0x1000>; + #clock-cells = <1>; + #reset-cells = <1>; + }; diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt8196-sys-clock.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt8196-sys-clock.yaml new file mode 100644 index 000000000000..660ab64f390d --- /dev/null +++ b/Documentation/devicetree/bindings/clock/mediatek,mt8196-sys-clock.yaml @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/mediatek,mt8196-sys-clock.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek System Clock Controller for MT8196 + +maintainers: + - Guangjie Song + - Laura Nao + +description: | + The clock architecture in MediaTek SoCs is structured like below: + PLLs --> + dividers --> + muxes + --> + clock gate + + The apmixedsys, apmixedsys_gp2, vlpckgen, armpll, ccipll, mfgpll and ptppll + provide most of the PLLs which are generated from the SoC's 26MHZ crystal oscillator. + The topckgen, topckgen_gp2 and vlpckgen provide dividers and muxes which + provide the clock source to other IP blocks. + +properties: + compatible: + items: + - enum: + - mediatek,mt8196-apmixedsys + - mediatek,mt8196-armpll-b-pll-ctrl + - mediatek,mt8196-armpll-bl-pll-ctrl + - mediatek,mt8196-armpll-ll-pll-ctrl + - mediatek,mt8196-apmixedsys-gp2 + - mediatek,mt8196-ccipll-pll-ctrl + - mediatek,mt8196-mfgpll-pll-ctrl + - mediatek,mt8196-mfgpll-sc0-pll-ctrl + - mediatek,mt8196-mfgpll-sc1-pll-ctrl + - mediatek,mt8196-ptppll-pll-ctrl + - mediatek,mt8196-topckgen + - mediatek,mt8196-topckgen-gp2 + - mediatek,mt8196-vlpckgen + - const: syscon + + reg: + maxItems: 1 + + '#clock-cells': + const: 1 + + mediatek,hardware-voter: + $ref: /schemas/types.yaml#/definitions/phandle + description: | + Phandle to the "Hardware Voter" (HWV), as named in the vendor + documentation for MT8196/MT6991. + + The HWV is a SoC-internal fixed-function MCU used to collect votes from + both the Application Processor and other remote processors within the SoC. + It is intended to transparently enable or disable hardware resources (such + as power domains or clocks) based on internal vote aggregation handled by + the MCU's internal state machine. + + However, in practice, this design is incomplete. While the HWV performs + some internal vote aggregation,software is still required to + - Manually enable power supplies externally, if present and if required + - Manually enable parent clocks via direct MMIO writes to clock controllers + - Enable the FENC after the clock has been ungated via direct MMIO + writes to clock controllers + + As such, the HWV behaves more like a hardware-managed clock reference + counter than a true voter. Furthermore, it is not a separate + controller. It merely serves as an alternative interface to the same + underlying clock or power controller. Actual control still requires + direct access to the controller's own MMIO register space, in + addition to writing to the HWV's MMIO region. + + For this reason, a custom phandle is used here - drivers need to directly + access the HWV MMIO region in a syscon-like fashion, due to how the + hardware is wired. This differs from true hardware voting systems, which + typically do not require custom phandles and rely instead on generic APIs + (clocks, power domains, interconnects). + + The name "hardware-voter" is retained to match vendor documentation, but + this should not be reused or misunderstood as a proper voting mechanism. + +required: + - compatible + - reg + - '#clock-cells' + +additionalProperties: false + +examples: + - | + apmixedsys_clk: syscon@10000800 { + compatible = "mediatek,mt8196-apmixedsys", "syscon"; + reg = <0x10000800 0x1000>; + #clock-cells = <1>; + }; + - | + topckgen: syscon@10000000 { + compatible = "mediatek,mt8196-topckgen", "syscon"; + reg = <0x10000000 0x800>; + mediatek,hardware-voter = <&scp_hwv>; + #clock-cells = <1>; + }; + diff --git a/include/dt-bindings/clock/mediatek,mt8196-clock.h b/include/dt-bindings/clock/mediatek,mt8196-clock.h new file mode 100644 index 000000000000..ae0946ab7621 --- /dev/null +++ b/include/dt-bindings/clock/mediatek,mt8196-clock.h @@ -0,0 +1,803 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +/* + * Copyright (c) 2025 MediaTek Inc. + * Guangjie Song + * Copyright (c) 2025 Collabora Ltd. + * Laura Nao + */ + +#ifndef _DT_BINDINGS_CLK_MT8196_H +#define _DT_BINDINGS_CLK_MT8196_H + +/* CKSYS */ +#define CLK_TOP_AXI 0 +#define CLK_TOP_MEM_SUB 1 +#define CLK_TOP_IO_NOC 2 +#define CLK_TOP_P_AXI 3 +#define CLK_TOP_UFS_PEXTP0_AXI 4 +#define CLK_TOP_PEXTP1_USB_AXI 5 +#define CLK_TOP_P_FMEM_SUB 6 +#define CLK_TOP_PEXPT0_MEM_SUB 7 +#define CLK_TOP_PEXTP1_USB_MEM_SUB 8 +#define CLK_TOP_P_NOC 9 +#define CLK_TOP_EMI_N 10 +#define CLK_TOP_EMI_S 11 +#define CLK_TOP_AP2CONN_HOST 12 +#define CLK_TOP_ATB 13 +#define CLK_TOP_CIRQ 14 +#define CLK_TOP_PBUS_156M 15 +#define CLK_TOP_EFUSE 16 +#define CLK_TOP_MCL3GIC 17 +#define CLK_TOP_MCINFRA 18 +#define CLK_TOP_DSP 19 +#define CLK_TOP_MFG_REF 20 +#define CLK_TOP_MFG_EB 21 +#define CLK_TOP_UART 22 +#define CLK_TOP_SPI0_BCLK 23 +#define CLK_TOP_SPI1_BCLK 24 +#define CLK_TOP_SPI2_BCLK 25 +#define CLK_TOP_SPI3_BCLK 26 +#define CLK_TOP_SPI4_BCLK 27 +#define CLK_TOP_SPI5_BCLK 28 +#define CLK_TOP_SPI6_BCLK 29 +#define CLK_TOP_SPI7_BCLK 30 +#define CLK_TOP_MSDC30_1 31 +#define CLK_TOP_MSDC30_2 32 +#define CLK_TOP_DISP_PWM 33 +#define CLK_TOP_USB_TOP_1P 34 +#define CLK_TOP_USB_XHCI_1P 35 +#define CLK_TOP_USB_FMCNT_P1 36 +#define CLK_TOP_I2C_P 37 +#define CLK_TOP_I2C_EAST 38 +#define CLK_TOP_I2C_WEST 39 +#define CLK_TOP_I2C_NORTH 40 +#define CLK_TOP_AES_UFSFDE 41 +#define CLK_TOP_UFS 42 +#define CLK_TOP_AUD_1 43 +#define CLK_TOP_AUD_2 44 +#define CLK_TOP_ADSP 45 +#define CLK_TOP_ADSP_UARTHUB_B 46 +#define CLK_TOP_DPMAIF_MAIN 47 +#define CLK_TOP_PWM 48 +#define CLK_TOP_MCUPM 49 +#define CLK_TOP_IPSEAST 50 +#define CLK_TOP_TL 51 +#define CLK_TOP_TL_P1 52 +#define CLK_TOP_TL_P2 53 +#define CLK_TOP_EMI_INTERFACE_546 54 +#define CLK_TOP_SDF 55 +#define CLK_TOP_UARTHUB_BCLK 56 +#define CLK_TOP_DPSW_CMP_26M 57 +#define CLK_TOP_SMAP 58 +#define CLK_TOP_SSR_PKA 59 +#define CLK_TOP_SSR_DMA 60 +#define CLK_TOP_SSR_KDF 61 +#define CLK_TOP_SSR_RNG 62 +#define CLK_TOP_SPU0 63 +#define CLK_TOP_SPU1 64 +#define CLK_TOP_DXCC 65 +#define CLK_TOP_APLL_I2SIN0 66 +#define CLK_TOP_APLL_I2SIN1 67 +#define CLK_TOP_APLL_I2SIN2 68 +#define CLK_TOP_APLL_I2SIN3 69 +#define CLK_TOP_APLL_I2SIN4 70 +#define CLK_TOP_APLL_I2SIN6 71 +#define CLK_TOP_APLL_I2SOUT0 72 +#define CLK_TOP_APLL_I2SOUT1 73 +#define CLK_TOP_APLL_I2SOUT2 74 +#define CLK_TOP_APLL_I2SOUT3 75 +#define CLK_TOP_APLL_I2SOUT4 76 +#define CLK_TOP_APLL_I2SOUT6 77 +#define CLK_TOP_APLL_FMI2S 78 +#define CLK_TOP_APLL_TDMOUT 79 +#define CLK_TOP_APLL12_DIV_TDMOUT_M 80 +#define CLK_TOP_APLL12_DIV_TDMOUT_B 81 +#define CLK_TOP_MAINPLL_D3 82 +#define CLK_TOP_MAINPLL_D4 83 +#define CLK_TOP_MAINPLL_D4_D2 84 +#define CLK_TOP_MAINPLL_D4_D4 85 +#define CLK_TOP_MAINPLL_D4_D8 86 +#define CLK_TOP_MAINPLL_D5 87 +#define CLK_TOP_MAINPLL_D5_D2 88 +#define CLK_TOP_MAINPLL_D5_D4 89 +#define CLK_TOP_MAINPLL_D5_D8 90 +#define CLK_TOP_MAINPLL_D6 91 +#define CLK_TOP_MAINPLL_D6_D2 92 +#define CLK_TOP_MAINPLL_D7 93 +#define CLK_TOP_MAINPLL_D7_D2 94 +#define CLK_TOP_MAINPLL_D7_D4 95 +#define CLK_TOP_MAINPLL_D7_D8 96 +#define CLK_TOP_MAINPLL_D9 97 +#define CLK_TOP_UNIVPLL_D4 98 +#define CLK_TOP_UNIVPLL_D4_D2 99 +#define CLK_TOP_UNIVPLL_D4_D4 100 +#define CLK_TOP_UNIVPLL_D4_D8 101 +#define CLK_TOP_UNIVPLL_D5 102 +#define CLK_TOP_UNIVPLL_D5_D2 103 +#define CLK_TOP_UNIVPLL_D5_D4 104 +#define CLK_TOP_UNIVPLL_D6 105 +#define CLK_TOP_UNIVPLL_D6_D2 106 +#define CLK_TOP_UNIVPLL_D6_D4 107 +#define CLK_TOP_UNIVPLL_D6_D8 108 +#define CLK_TOP_UNIVPLL_D6_D16 109 +#define CLK_TOP_UNIVPLL_192M 110 +#define CLK_TOP_UNIVPLL_192M_D4 111 +#define CLK_TOP_UNIVPLL_192M_D8 112 +#define CLK_TOP_UNIVPLL_192M_D16 113 +#define CLK_TOP_UNIVPLL_192M_D32 114 +#define CLK_TOP_UNIVPLL_192M_D10 115 +#define CLK_TOP_TVDPLL1_D2 116 +#define CLK_TOP_MSDCPLL_D2 117 +#define CLK_TOP_OSC_D2 118 +#define CLK_TOP_OSC_D3 119 +#define CLK_TOP_OSC_D4 120 +#define CLK_TOP_OSC_D5 121 +#define CLK_TOP_OSC_D7 122 +#define CLK_TOP_OSC_D8 123 +#define CLK_TOP_OSC_D10 124 +#define CLK_TOP_OSC_D14 125 +#define CLK_TOP_OSC_D20 126 +#define CLK_TOP_OSC_D32 127 +#define CLK_TOP_OSC_D40 128 +#define CLK_TOP_SFLASH 129 + +/* APMIXEDSYS */ +#define CLK_APMIXED_MAINPLL 0 +#define CLK_APMIXED_UNIVPLL 1 +#define CLK_APMIXED_MSDCPLL 2 +#define CLK_APMIXED_ADSPPLL 3 +#define CLK_APMIXED_EMIPLL 4 +#define CLK_APMIXED_EMIPLL2 5 +#define CLK_APMIXED_NET1PLL 6 +#define CLK_APMIXED_SGMIIPLL 7 + +/* CKSYS_GP2 */ +#define CLK_TOP2_SENINF0 0 +#define CLK_TOP2_SENINF1 1 +#define CLK_TOP2_SENINF2 2 +#define CLK_TOP2_SENINF3 3 +#define CLK_TOP2_SENINF4 4 +#define CLK_TOP2_SENINF5 5 +#define CLK_TOP2_IMG1 6 +#define CLK_TOP2_IPE 7 +#define CLK_TOP2_CAM 8 +#define CLK_TOP2_CAMTM 9 +#define CLK_TOP2_DPE 10 +#define CLK_TOP2_VDEC 11 +#define CLK_TOP2_CCUSYS 12 +#define CLK_TOP2_CCUTM 13 +#define CLK_TOP2_VENC 14 +#define CLK_TOP2_DP1 15 +#define CLK_TOP2_DP0 16 +#define CLK_TOP2_DISP 17 +#define CLK_TOP2_MDP 18 +#define CLK_TOP2_MMINFRA 19 +#define CLK_TOP2_MMINFRA_SNOC 20 +#define CLK_TOP2_MMUP 21 +#define CLK_TOP2_MMINFRA_AO 22 +#define CLK_TOP2_MAINPLL2_D2 23 +#define CLK_TOP2_MAINPLL2_D3 24 +#define CLK_TOP2_MAINPLL2_D4 25 +#define CLK_TOP2_MAINPLL2_D4_D2 26 +#define CLK_TOP2_MAINPLL2_D4_D4 27 +#define CLK_TOP2_MAINPLL2_D5 28 +#define CLK_TOP2_MAINPLL2_D5_D2 29 +#define CLK_TOP2_MAINPLL2_D6 30 +#define CLK_TOP2_MAINPLL2_D6_D2 31 +#define CLK_TOP2_MAINPLL2_D7 32 +#define CLK_TOP2_MAINPLL2_D7_D2 33 +#define CLK_TOP2_MAINPLL2_D9 34 +#define CLK_TOP2_UNIVPLL2_D3 35 +#define CLK_TOP2_UNIVPLL2_D4 36 +#define CLK_TOP2_UNIVPLL2_D4_D2 37 +#define CLK_TOP2_UNIVPLL2_D5 38 +#define CLK_TOP2_UNIVPLL2_D5_D2 39 +#define CLK_TOP2_UNIVPLL2_D6 40 +#define CLK_TOP2_UNIVPLL2_D6_D2 41 +#define CLK_TOP2_UNIVPLL2_D6_D4 42 +#define CLK_TOP2_UNIVPLL2_D7 43 +#define CLK_TOP2_IMGPLL_D2 44 +#define CLK_TOP2_IMGPLL_D4 45 +#define CLK_TOP2_IMGPLL_D5 46 +#define CLK_TOP2_IMGPLL_D5_D2 47 +#define CLK_TOP2_MMPLL2_D3 48 +#define CLK_TOP2_MMPLL2_D4 49 +#define CLK_TOP2_MMPLL2_D4_D2 50 +#define CLK_TOP2_MMPLL2_D5 51 +#define CLK_TOP2_MMPLL2_D5_D2 52 +#define CLK_TOP2_MMPLL2_D6 53 +#define CLK_TOP2_MMPLL2_D6_D2 54 +#define CLK_TOP2_MMPLL2_D7 55 +#define CLK_TOP2_MMPLL2_D9 56 +#define CLK_TOP2_TVDPLL1_D4 57 +#define CLK_TOP2_TVDPLL1_D8 58 +#define CLK_TOP2_TVDPLL1_D16 59 +#define CLK_TOP2_TVDPLL2_D2 60 +#define CLK_TOP2_TVDPLL2_D4 61 +#define CLK_TOP2_TVDPLL2_D8 62 +#define CLK_TOP2_TVDPLL2_D16 63 +#define CLK_TOP2_DVO 64 +#define CLK_TOP2_DVO_FAVT 65 +#define CLK_TOP2_TVDPLL3_D2 66 +#define CLK_TOP2_TVDPLL3_D4 67 +#define CLK_TOP2_TVDPLL3_D8 68 +#define CLK_TOP2_TVDPLL3_D16 69 + +/* APMIXEDSYS_GP2 */ +#define CLK_APMIXED2_MAINPLL2 0 +#define CLK_APMIXED2_UNIVPLL2 1 +#define CLK_APMIXED2_MMPLL2 2 +#define CLK_APMIXED2_IMGPLL 3 +#define CLK_APMIXED2_TVDPLL1 4 +#define CLK_APMIXED2_TVDPLL2 5 +#define CLK_APMIXED2_TVDPLL3 6 + +/* IMP_IIC_WRAP_E */ +#define CLK_IMPE_I2C5 0 + +/* IMP_IIC_WRAP_W */ +#define CLK_IMPW_I2C0 0 +#define CLK_IMPW_I2C3 1 +#define CLK_IMPW_I2C6 2 +#define CLK_IMPW_I2C10 3 + +/* IMP_IIC_WRAP_N */ +#define CLK_IMPN_I2C1 0 +#define CLK_IMPN_I2C2 1 +#define CLK_IMPN_I2C4 2 +#define CLK_IMPN_I2C7 3 +#define CLK_IMPN_I2C8 4 +#define CLK_IMPN_I2C9 5 + +/* IMP_IIC_WRAP_C */ +#define CLK_IMPC_I2C11 0 +#define CLK_IMPC_I2C12 1 +#define CLK_IMPC_I2C13 2 +#define CLK_IMPC_I2C14 3 + +/* PERICFG_AO */ +#define CLK_PERI_AO_UART0_BCLK 0 +#define CLK_PERI_AO_UART1_BCLK 1 +#define CLK_PERI_AO_UART2_BCLK 2 +#define CLK_PERI_AO_UART3_BCLK 3 +#define CLK_PERI_AO_UART4_BCLK 4 +#define CLK_PERI_AO_UART5_BCLK 5 +#define CLK_PERI_AO_PWM_X16W_HCLK 6 +#define CLK_PERI_AO_PWM_X16W_BCLK 7 +#define CLK_PERI_AO_PWM_PWM_BCLK0 8 +#define CLK_PERI_AO_PWM_PWM_BCLK1 9 +#define CLK_PERI_AO_PWM_PWM_BCLK2 10 +#define CLK_PERI_AO_PWM_PWM_BCLK3 11 +#define CLK_PERI_AO_SPI0_BCLK 12 +#define CLK_PERI_AO_SPI1_BCLK 13 +#define CLK_PERI_AO_SPI2_BCLK 14 +#define CLK_PERI_AO_SPI3_BCLK 15 +#define CLK_PERI_AO_SPI4_BCLK 16 +#define CLK_PERI_AO_SPI5_BCLK 17 +#define CLK_PERI_AO_SPI6_BCLK 18 +#define CLK_PERI_AO_SPI7_BCLK 19 +#define CLK_PERI_AO_AP_DMA_X32W_BCLK 20 +#define CLK_PERI_AO_MSDC1_MSDC_SRC 21 +#define CLK_PERI_AO_MSDC1_HCLK 22 +#define CLK_PERI_AO_MSDC1_AXI 23 +#define CLK_PERI_AO_MSDC1_HCLK_WRAP 24 +#define CLK_PERI_AO_MSDC2_MSDC_SRC 25 +#define CLK_PERI_AO_MSDC2_HCLK 26 +#define CLK_PERI_AO_MSDC2_AXI 27 +#define CLK_PERI_AO_MSDC2_HCLK_WRAP 28 +#define CLK_PERI_AO_FLASHIF_FLASH 29 +#define CLK_PERI_AO_FLASHIF_27M 30 +#define CLK_PERI_AO_FLASHIF_DRAM 31 +#define CLK_PERI_AO_FLASHIF_AXI 32 +#define CLK_PERI_AO_FLASHIF_BCLK 33 + +/* UFSCFG_AO */ +#define CLK_UFSAO_UNIPRO_TX_SYM 0 +#define CLK_UFSAO_UNIPRO_RX_SYM0 1 +#define CLK_UFSAO_UNIPRO_RX_SYM1 2 +#define CLK_UFSAO_UNIPRO_SYS 3 +#define CLK_UFSAO_UNIPRO_SAP 4 +#define CLK_UFSAO_PHY_SAP 5 +#define CLK_UFSAO_UFSHCI_UFS 6 +#define CLK_UFSAO_UFSHCI_AES 7 + +/* PEXTP0CFG_AO */ +#define CLK_PEXT_PEXTP_MAC_P0_TL 0 +#define CLK_PEXT_PEXTP_MAC_P0_REF 1 +#define CLK_PEXT_PEXTP_PHY_P0_MCU_BUS 2 +#define CLK_PEXT_PEXTP_PHY_P0_PEXTP_REF 3 +#define CLK_PEXT_PEXTP_MAC_P0_AXI_250 4 +#define CLK_PEXT_PEXTP_MAC_P0_AHB_APB 5 +#define CLK_PEXT_PEXTP_MAC_P0_PL_P 6 +#define CLK_PEXT_PEXTP_VLP_AO_P0_LP 7 + +/* PEXTP1CFG_AO */ +#define CLK_PEXT1_PEXTP_MAC_P1_TL 0 +#define CLK_PEXT1_PEXTP_MAC_P1_REF 1 +#define CLK_PEXT1_PEXTP_MAC_P2_TL 2 +#define CLK_PEXT1_PEXTP_MAC_P2_REF 3 +#define CLK_PEXT1_PEXTP_PHY_P1_MCU_BUS 4 +#define CLK_PEXT1_PEXTP_PHY_P1_PEXTP_REF 5 +#define CLK_PEXT1_PEXTP_PHY_P2_MCU_BUS 6 +#define CLK_PEXT1_PEXTP_PHY_P2_PEXTP_REF 7 +#define CLK_PEXT1_PEXTP_MAC_P1_AXI_250 8 +#define CLK_PEXT1_PEXTP_MAC_P1_AHB_APB 9 +#define CLK_PEXT1_PEXTP_MAC_P1_PL_P 10 +#define CLK_PEXT1_PEXTP_MAC_P2_AXI_250 11 +#define CLK_PEXT1_PEXTP_MAC_P2_AHB_APB 12 +#define CLK_PEXT1_PEXTP_MAC_P2_PL_P 13 +#define CLK_PEXT1_PEXTP_VLP_AO_P1_LP 14 +#define CLK_PEXT1_PEXTP_VLP_AO_P2_LP 15 + +/* VLP_CKSYS */ +#define CLK_VLP_APLL1 0 +#define CLK_VLP_APLL2 1 +#define CLK_VLP_SCP 2 +#define CLK_VLP_SCP_SPI 3 +#define CLK_VLP_SCP_IIC 4 +#define CLK_VLP_SCP_IIC_HS 5 +#define CLK_VLP_PWRAP_ULPOSC 6 +#define CLK_VLP_SPMI_M_TIA_32K 7 +#define CLK_VLP_APXGPT_26M_B 8 +#define CLK_VLP_DPSW 9 +#define CLK_VLP_DPSW_CENTRAL 10 +#define CLK_VLP_SPMI_M_MST 11 +#define CLK_VLP_DVFSRC 12 +#define CLK_VLP_PWM_VLP 13 +#define CLK_VLP_AXI_VLP 14 +#define CLK_VLP_SYSTIMER_26M 15 +#define CLK_VLP_SSPM 16 +#define CLK_VLP_SRCK 17 +#define CLK_VLP_CAMTG0 18 +#define CLK_VLP_CAMTG1 19 +#define CLK_VLP_CAMTG2 20 +#define CLK_VLP_CAMTG3 21 +#define CLK_VLP_CAMTG4 22 +#define CLK_VLP_CAMTG5 23 +#define CLK_VLP_CAMTG6 24 +#define CLK_VLP_CAMTG7 25 +#define CLK_VLP_SSPM_26M 26 +#define CLK_VLP_ULPOSC_SSPM 27 +#define CLK_VLP_VLP_PBUS_26M 28 +#define CLK_VLP_DEBUG_ERR_FLAG 29 +#define CLK_VLP_DPMSRDMA 30 +#define CLK_VLP_VLP_PBUS_156M 31 +#define CLK_VLP_SPM 32 +#define CLK_VLP_MMINFRA 33 +#define CLK_VLP_USB_TOP 34 +#define CLK_VLP_USB_XHCI 35 +#define CLK_VLP_NOC_VLP 36 +#define CLK_VLP_AUDIO_H 37 +#define CLK_VLP_AUD_ENGEN1 38 +#define CLK_VLP_AUD_ENGEN2 39 +#define CLK_VLP_AUD_INTBUS 40 +#define CLK_VLP_SPVLP_26M 41 +#define CLK_VLP_SPU0_VLP 42 +#define CLK_VLP_SPU1_VLP 43 +#define CLK_VLP_CLK26M 44 +#define CLK_VLP_APLL1_D4 45 +#define CLK_VLP_APLL1_D8 46 +#define CLK_VLP_APLL2_D4 47 +#define CLK_VLP_APLL2_D8 48 + +/* DISPSYS_CONFIG */ +#define CLK_MM_CONFIG 0 +#define CLK_MM_DISP_MUTEX0 1 +#define CLK_MM_DISP_AAL0 2 +#define CLK_MM_DISP_AAL1 3 +#define CLK_MM_DISP_C3D0 4 +#define CLK_MM_DISP_C3D1 5 +#define CLK_MM_DISP_C3D2 6 +#define CLK_MM_DISP_C3D3 7 +#define CLK_MM_DISP_CCORR0 8 +#define CLK_MM_DISP_CCORR1 9 +#define CLK_MM_DISP_CCORR2 10 +#define CLK_MM_DISP_CCORR3 11 +#define CLK_MM_DISP_CHIST0 12 +#define CLK_MM_DISP_CHIST1 13 +#define CLK_MM_DISP_COLOR0 14 +#define CLK_MM_DISP_COLOR1 15 +#define CLK_MM_DISP_DITHER0 16 +#define CLK_MM_DISP_DITHER1 17 +#define CLK_MM_DISP_DLI_ASYNC0 18 +#define CLK_MM_DISP_DLI_ASYNC1 19 +#define CLK_MM_DISP_DLI_ASYNC2 20 +#define CLK_MM_DISP_DLI_ASYNC3 21 +#define CLK_MM_DISP_DLI_ASYNC4 22 +#define CLK_MM_DISP_DLI_ASYNC5 23 +#define CLK_MM_DISP_DLI_ASYNC6 24 +#define CLK_MM_DISP_DLI_ASYNC7 25 +#define CLK_MM_DISP_DLI_ASYNC8 26 +#define CLK_MM_DISP_DLI_ASYNC9 27 +#define CLK_MM_DISP_DLI_ASYNC10 28 +#define CLK_MM_DISP_DLI_ASYNC11 29 +#define CLK_MM_DISP_DLI_ASYNC12 30 +#define CLK_MM_DISP_DLI_ASYNC13 31 +#define CLK_MM_DISP_DLI_ASYNC14 32 +#define CLK_MM_DISP_DLI_ASYNC15 33 +#define CLK_MM_DISP_DLO_ASYNC0 34 +#define CLK_MM_DISP_DLO_ASYNC1 35 +#define CLK_MM_DISP_DLO_ASYNC2 36 +#define CLK_MM_DISP_DLO_ASYNC3 37 +#define CLK_MM_DISP_DLO_ASYNC4 38 +#define CLK_MM_DISP_DLO_ASYNC5 39 +#define CLK_MM_DISP_DLO_ASYNC6 40 +#define CLK_MM_DISP_DLO_ASYNC7 41 +#define CLK_MM_DISP_DLO_ASYNC8 42 +#define CLK_MM_DISP_GAMMA0 43 +#define CLK_MM_DISP_GAMMA1 44 +#define CLK_MM_MDP_AAL0 45 +#define CLK_MM_MDP_AAL1 46 +#define CLK_MM_MDP_RDMA0 47 +#define CLK_MM_DISP_POSTMASK0 48 +#define CLK_MM_DISP_POSTMASK1 49 +#define CLK_MM_MDP_RSZ0 50 +#define CLK_MM_MDP_RSZ1 51 +#define CLK_MM_DISP_SPR0 52 +#define CLK_MM_DISP_TDSHP0 53 +#define CLK_MM_DISP_TDSHP1 54 +#define CLK_MM_DISP_WDMA0 55 +#define CLK_MM_DISP_Y2R0 56 +#define CLK_MM_SMI_SUB_COMM0 57 +#define CLK_MM_DISP_FAKE_ENG0 58 + +/* DISPSYS1_CONFIG */ +#define CLK_MM1_DISPSYS1_CONFIG 0 +#define CLK_MM1_DISPSYS1_S_CONFIG 1 +#define CLK_MM1_DISP_MUTEX0 2 +#define CLK_MM1_DISP_DLI_ASYNC20 3 +#define CLK_MM1_DISP_DLI_ASYNC21 4 +#define CLK_MM1_DISP_DLI_ASYNC22 5 +#define CLK_MM1_DISP_DLI_ASYNC23 6 +#define CLK_MM1_DISP_DLI_ASYNC24 7 +#define CLK_MM1_DISP_DLI_ASYNC25 8 +#define CLK_MM1_DISP_DLI_ASYNC26 9 +#define CLK_MM1_DISP_DLI_ASYNC27 10 +#define CLK_MM1_DISP_DLI_ASYNC28 11 +#define CLK_MM1_DISP_RELAY0 12 +#define CLK_MM1_DISP_RELAY1 13 +#define CLK_MM1_DISP_RELAY2 14 +#define CLK_MM1_DISP_RELAY3 15 +#define CLK_MM1_DISP_DP_INTF0 16 +#define CLK_MM1_DISP_DP_INTF1 17 +#define CLK_MM1_DISP_DSC_WRAP0 18 +#define CLK_MM1_DISP_DSC_WRAP1 19 +#define CLK_MM1_DISP_DSC_WRAP2 20 +#define CLK_MM1_DISP_DSC_WRAP3 21 +#define CLK_MM1_DISP_DSI0 22 +#define CLK_MM1_DISP_DSI1 23 +#define CLK_MM1_DISP_DSI2 24 +#define CLK_MM1_DISP_DVO0 25 +#define CLK_MM1_DISP_GDMA0 26 +#define CLK_MM1_DISP_MERGE0 27 +#define CLK_MM1_DISP_MERGE1 28 +#define CLK_MM1_DISP_MERGE2 29 +#define CLK_MM1_DISP_ODDMR0 30 +#define CLK_MM1_DISP_POSTALIGN0 31 +#define CLK_MM1_DISP_DITHER2 32 +#define CLK_MM1_DISP_R2Y0 33 +#define CLK_MM1_DISP_SPLITTER0 34 +#define CLK_MM1_DISP_SPLITTER1 35 +#define CLK_MM1_DISP_SPLITTER2 36 +#define CLK_MM1_DISP_SPLITTER3 37 +#define CLK_MM1_DISP_VDCM0 38 +#define CLK_MM1_DISP_WDMA1 39 +#define CLK_MM1_DISP_WDMA2 40 +#define CLK_MM1_DISP_WDMA3 41 +#define CLK_MM1_DISP_WDMA4 42 +#define CLK_MM1_MDP_RDMA1 43 +#define CLK_MM1_SMI_LARB0 44 +#define CLK_MM1_MOD1 45 +#define CLK_MM1_MOD2 46 +#define CLK_MM1_MOD3 47 +#define CLK_MM1_MOD4 48 +#define CLK_MM1_MOD5 49 +#define CLK_MM1_MOD6 50 +#define CLK_MM1_CG0 51 +#define CLK_MM1_CG1 52 +#define CLK_MM1_CG2 53 +#define CLK_MM1_CG3 54 +#define CLK_MM1_CG4 55 +#define CLK_MM1_CG5 56 +#define CLK_MM1_CG6 57 +#define CLK_MM1_CG7 58 +#define CLK_MM1_F26M 59 + +/* OVLSYS_CONFIG */ +#define CLK_OVLSYS_CONFIG 0 +#define CLK_OVL_FAKE_ENG0 1 +#define CLK_OVL_FAKE_ENG1 2 +#define CLK_OVL_MUTEX0 3 +#define CLK_OVL_EXDMA0 4 +#define CLK_OVL_EXDMA1 5 +#define CLK_OVL_EXDMA2 6 +#define CLK_OVL_EXDMA3 7 +#define CLK_OVL_EXDMA4 8 +#define CLK_OVL_EXDMA5 9 +#define CLK_OVL_EXDMA6 10 +#define CLK_OVL_EXDMA7 11 +#define CLK_OVL_EXDMA8 12 +#define CLK_OVL_EXDMA9 13 +#define CLK_OVL_BLENDER0 14 +#define CLK_OVL_BLENDER1 15 +#define CLK_OVL_BLENDER2 16 +#define CLK_OVL_BLENDER3 17 +#define CLK_OVL_BLENDER4 18 +#define CLK_OVL_BLENDER5 19 +#define CLK_OVL_BLENDER6 20 +#define CLK_OVL_BLENDER7 21 +#define CLK_OVL_BLENDER8 22 +#define CLK_OVL_BLENDER9 23 +#define CLK_OVL_OUTPROC0 24 +#define CLK_OVL_OUTPROC1 25 +#define CLK_OVL_OUTPROC2 26 +#define CLK_OVL_OUTPROC3 27 +#define CLK_OVL_OUTPROC4 28 +#define CLK_OVL_OUTPROC5 29 +#define CLK_OVL_MDP_RSZ0 30 +#define CLK_OVL_MDP_RSZ1 31 +#define CLK_OVL_DISP_WDMA0 32 +#define CLK_OVL_DISP_WDMA1 33 +#define CLK_OVL_UFBC_WDMA0 34 +#define CLK_OVL_MDP_RDMA0 35 +#define CLK_OVL_MDP_RDMA1 36 +#define CLK_OVL_BWM0 37 +#define CLK_OVL_DLI0 38 +#define CLK_OVL_DLI1 39 +#define CLK_OVL_DLI2 40 +#define CLK_OVL_DLI3 41 +#define CLK_OVL_DLI4 42 +#define CLK_OVL_DLI5 43 +#define CLK_OVL_DLI6 44 +#define CLK_OVL_DLI7 45 +#define CLK_OVL_DLI8 46 +#define CLK_OVL_DLO0 47 +#define CLK_OVL_DLO1 48 +#define CLK_OVL_DLO2 49 +#define CLK_OVL_DLO3 50 +#define CLK_OVL_DLO4 51 +#define CLK_OVL_DLO5 52 +#define CLK_OVL_DLO6 53 +#define CLK_OVL_DLO7 54 +#define CLK_OVL_DLO8 55 +#define CLK_OVL_DLO9 56 +#define CLK_OVL_DLO10 57 +#define CLK_OVL_DLO11 58 +#define CLK_OVL_DLO12 59 +#define CLK_OVLSYS_RELAY0 60 +#define CLK_OVL_INLINEROT0 61 +#define CLK_OVL_SMI 62 +#define CLK_OVL_SMI_SMI 63 + + +/* OVLSYS1_CONFIG */ +#define CLK_OVL1_OVLSYS_CONFIG 0 +#define CLK_OVL1_OVL_FAKE_ENG0 1 +#define CLK_OVL1_OVL_FAKE_ENG1 2 +#define CLK_OVL1_OVL_MUTEX0 3 +#define CLK_OVL1_OVL_EXDMA0 4 +#define CLK_OVL1_OVL_EXDMA1 5 +#define CLK_OVL1_OVL_EXDMA2 6 +#define CLK_OVL1_OVL_EXDMA3 7 +#define CLK_OVL1_OVL_EXDMA4 8 +#define CLK_OVL1_OVL_EXDMA5 9 +#define CLK_OVL1_OVL_EXDMA6 10 +#define CLK_OVL1_OVL_EXDMA7 11 +#define CLK_OVL1_OVL_EXDMA8 12 +#define CLK_OVL1_OVL_EXDMA9 13 +#define CLK_OVL1_OVL_BLENDER0 14 +#define CLK_OVL1_OVL_BLENDER1 15 +#define CLK_OVL1_OVL_BLENDER2 16 +#define CLK_OVL1_OVL_BLENDER3 17 +#define CLK_OVL1_OVL_BLENDER4 18 +#define CLK_OVL1_OVL_BLENDER5 19 +#define CLK_OVL1_OVL_BLENDER6 20 +#define CLK_OVL1_OVL_BLENDER7 21 +#define CLK_OVL1_OVL_BLENDER8 22 +#define CLK_OVL1_OVL_BLENDER9 23 +#define CLK_OVL1_OVL_OUTPROC0 24 +#define CLK_OVL1_OVL_OUTPROC1 25 +#define CLK_OVL1_OVL_OUTPROC2 26 +#define CLK_OVL1_OVL_OUTPROC3 27 +#define CLK_OVL1_OVL_OUTPROC4 28 +#define CLK_OVL1_OVL_OUTPROC5 29 +#define CLK_OVL1_OVL_MDP_RSZ0 30 +#define CLK_OVL1_OVL_MDP_RSZ1 31 +#define CLK_OVL1_OVL_DISP_WDMA0 32 +#define CLK_OVL1_OVL_DISP_WDMA1 33 +#define CLK_OVL1_OVL_UFBC_WDMA0 34 +#define CLK_OVL1_OVL_MDP_RDMA0 35 +#define CLK_OVL1_OVL_MDP_RDMA1 36 +#define CLK_OVL1_OVL_BWM0 37 +#define CLK_OVL1_DLI0 38 +#define CLK_OVL1_DLI1 39 +#define CLK_OVL1_DLI2 40 +#define CLK_OVL1_DLI3 41 +#define CLK_OVL1_DLI4 42 +#define CLK_OVL1_DLI5 43 +#define CLK_OVL1_DLI6 44 +#define CLK_OVL1_DLI7 45 +#define CLK_OVL1_DLI8 46 +#define CLK_OVL1_DLO0 47 +#define CLK_OVL1_DLO1 48 +#define CLK_OVL1_DLO2 49 +#define CLK_OVL1_DLO3 50 +#define CLK_OVL1_DLO4 51 +#define CLK_OVL1_DLO5 52 +#define CLK_OVL1_DLO6 53 +#define CLK_OVL1_DLO7 54 +#define CLK_OVL1_DLO8 55 +#define CLK_OVL1_DLO9 56 +#define CLK_OVL1_DLO10 57 +#define CLK_OVL1_DLO11 58 +#define CLK_OVL1_DLO12 59 +#define CLK_OVL1_OVLSYS_RELAY0 60 +#define CLK_OVL1_OVL_INLINEROT0 61 +#define CLK_OVL1_SMI 62 + + +/* VDEC_SOC_GCON_BASE */ +#define CLK_VDE1_LARB1_CKEN 0 +#define CLK_VDE1_LAT_CKEN 1 +#define CLK_VDE1_LAT_ACTIVE 2 +#define CLK_VDE1_LAT_CKEN_ENG 3 +#define CLK_VDE1_VDEC_CKEN 4 +#define CLK_VDE1_VDEC_ACTIVE 5 +#define CLK_VDE1_VDEC_CKEN_ENG 6 +#define CLK_VDE1_VDEC_SOC_APTV_EN 7 +#define CLK_VDE1_VDEC_SOC_APTV_TOP_EN 8 +#define CLK_VDE1_VDEC_SOC_IPS_EN 9 + +/* VDEC_GCON_BASE */ +#define CLK_VDE2_LARB1_CKEN 0 +#define CLK_VDE2_LAT_CKEN 1 +#define CLK_VDE2_LAT_ACTIVE 2 +#define CLK_VDE2_LAT_CKEN_ENG 3 +#define CLK_VDE2_VDEC_CKEN 4 +#define CLK_VDE2_VDEC_ACTIVE 5 +#define CLK_VDE2_VDEC_CKEN_ENG 6 + +/* VENC_GCON */ +#define CLK_VEN1_CKE0_LARB 0 +#define CLK_VEN1_CKE1_VENC 1 +#define CLK_VEN1_CKE2_JPGENC 2 +#define CLK_VEN1_CKE3_JPGDEC 3 +#define CLK_VEN1_CKE4_JPGDEC_C1 4 +#define CLK_VEN1_CKE5_GALS 5 +#define CLK_VEN1_CKE29_VENC_ADAB_CTRL 6 +#define CLK_VEN1_CKE29_VENC_XPC_CTRL 7 +#define CLK_VEN1_CKE6_GALS_SRAM 8 +#define CLK_VEN1_RES_FLAT 9 + +/* VENC_GCON_CORE1 */ +#define CLK_VEN2_CKE0_LARB 0 +#define CLK_VEN2_CKE1_VENC 1 +#define CLK_VEN2_CKE2_JPGENC 2 +#define CLK_VEN2_CKE3_JPGDEC 3 +#define CLK_VEN2_CKE5_GALS 4 +#define CLK_VEN2_CKE29_VENC_XPC_CTRL 5 +#define CLK_VEN2_CKE6_GALS_SRAM 6 +#define CLK_VEN2_RES_FLAT 7 + +/* VENC_GCON_CORE2 */ +#define CLK_VEN_C2_CKE0_LARB 0 +#define CLK_VEN_C2_CKE1_VENC 1 +#define CLK_VEN_C2_CKE5_GALS 2 +#define CLK_VEN_C2_CKE29_VENC_XPC_CTRL 3 +#define CLK_VEN_C2_CKE6_GALS_SRAM 4 +#define CLK_VEN_C2_RES_FLAT 5 + +/* MDPSYS_CONFIG */ +#define CLK_MDP_MDP_MUTEX0 0 +#define CLK_MDP_SMI0 1 +#define CLK_MDP_SMI0_SMI 2 +#define CLK_MDP_APB_BUS 3 +#define CLK_MDP_MDP_RDMA0 4 +#define CLK_MDP_MDP_RDMA1 5 +#define CLK_MDP_MDP_RDMA2 6 +#define CLK_MDP_MDP_BIRSZ0 7 +#define CLK_MDP_MDP_HDR0 8 +#define CLK_MDP_MDP_AAL0 9 +#define CLK_MDP_MDP_RSZ0 10 +#define CLK_MDP_MDP_RSZ2 11 +#define CLK_MDP_MDP_TDSHP0 12 +#define CLK_MDP_MDP_COLOR0 13 +#define CLK_MDP_MDP_WROT0 14 +#define CLK_MDP_MDP_WROT1 15 +#define CLK_MDP_MDP_WROT2 16 +#define CLK_MDP_MDP_FAKE_ENG0 17 +#define CLK_MDP_APB_DB 18 +#define CLK_MDP_MDP_DLI_ASYNC0 19 +#define CLK_MDP_MDP_DLI_ASYNC1 20 +#define CLK_MDP_MDP_DLO_ASYNC0 21 +#define CLK_MDP_MDP_DLO_ASYNC1 22 +#define CLK_MDP_MDP_DLI_ASYNC2 23 +#define CLK_MDP_MDP_DLO_ASYNC2 24 +#define CLK_MDP_MDP_DLO_ASYNC3 25 +#define CLK_MDP_IMG_DL_ASYNC0 26 +#define CLK_MDP_MDP_RROT0 27 +#define CLK_MDP_MDP_MERGE0 28 +#define CLK_MDP_MDP_C3D0 29 +#define CLK_MDP_MDP_FG0 30 +#define CLK_MDP_MDP_CLA2 31 +#define CLK_MDP_MDP_DLO_ASYNC4 32 +#define CLK_MDP_VPP_RSZ0 33 +#define CLK_MDP_VPP_RSZ1 34 +#define CLK_MDP_MDP_DLO_ASYNC5 35 +#define CLK_MDP_IMG0 36 +#define CLK_MDP_F26M 37 +#define CLK_MDP_IMG_DL_RELAY0 38 +#define CLK_MDP_IMG_DL_RELAY1 39 + +/* MDPSYS1_CONFIG */ +#define CLK_MDP1_MDP_MUTEX0 0 +#define CLK_MDP1_SMI0 1 +#define CLK_MDP1_SMI0_SMI 2 +#define CLK_MDP1_APB_BUS 3 +#define CLK_MDP1_MDP_RDMA0 4 +#define CLK_MDP1_MDP_RDMA1 5 +#define CLK_MDP1_MDP_RDMA2 6 +#define CLK_MDP1_MDP_BIRSZ0 7 +#define CLK_MDP1_MDP_HDR0 8 +#define CLK_MDP1_MDP_AAL0 9 +#define CLK_MDP1_MDP_RSZ0 10 +#define CLK_MDP1_MDP_RSZ2 11 +#define CLK_MDP1_MDP_TDSHP0 12 +#define CLK_MDP1_MDP_COLOR0 13 +#define CLK_MDP1_MDP_WROT0 14 +#define CLK_MDP1_MDP_WROT1 15 +#define CLK_MDP1_MDP_WROT2 16 +#define CLK_MDP1_MDP_FAKE_ENG0 17 +#define CLK_MDP1_APB_DB 18 +#define CLK_MDP1_MDP_DLI_ASYNC0 19 +#define CLK_MDP1_MDP_DLI_ASYNC1 20 +#define CLK_MDP1_MDP_DLO_ASYNC0 21 +#define CLK_MDP1_MDP_DLO_ASYNC1 22 +#define CLK_MDP1_MDP_DLI_ASYNC2 23 +#define CLK_MDP1_MDP_DLO_ASYNC2 24 +#define CLK_MDP1_MDP_DLO_ASYNC3 25 +#define CLK_MDP1_IMG_DL_ASYNC0 26 +#define CLK_MDP1_MDP_RROT0 27 +#define CLK_MDP1_MDP_MERGE0 28 +#define CLK_MDP1_MDP_C3D0 29 +#define CLK_MDP1_MDP_FG0 30 +#define CLK_MDP1_MDP_CLA2 31 +#define CLK_MDP1_MDP_DLO_ASYNC4 32 +#define CLK_MDP1_VPP_RSZ0 33 +#define CLK_MDP1_VPP_RSZ1 34 +#define CLK_MDP1_MDP_DLO_ASYNC5 35 +#define CLK_MDP1_IMG0 36 +#define CLK_MDP1_F26M 37 +#define CLK_MDP1_IMG_DL_RELAY0 38 +#define CLK_MDP1_IMG_DL_RELAY1 39 + +/* DISP_VDISP_AO_CONFIG */ +#define CLK_MM_V_DISP_VDISP_AO_CONFIG 0 +#define CLK_MM_V_DISP_DPC 1 +#define CLK_MM_V_SMI_SUB_SOMM0 2 + +/* MFGPLL_PLL_CTRL */ +#define CLK_MFG_AO_MFGPLL 0 + +/* MFGPLL_SC0_PLL_CTRL */ +#define CLK_MFGSC0_AO_MFGPLL_SC0 0 + +/* MFGPLL_SC1_PLL_CTRL */ +#define CLK_MFGSC1_AO_MFGPLL_SC1 0 + +/* CCIPLL_PLL_CTRL */ +#define CLK_CCIPLL 0 + +/* ARMPLL_LL_PLL_CTRL */ +#define CLK_CPLL_ARMPLL_LL 0 + +/* ARMPLL_BL_PLL_CTRL */ +#define CLK_CPBL_ARMPLL_BL 0 + +/* ARMPLL_B_PLL_CTRL */ +#define CLK_CPB_ARMPLL_B 0 + +/* PTPPLL_PLL_CTRL */ +#define CLK_PTPPLL 0 + +#endif /* _DT_BINDINGS_CLK_MT8196_H */ diff --git a/include/dt-bindings/reset/mediatek,mt8196-resets.h b/include/dt-bindings/reset/mediatek,mt8196-resets.h new file mode 100644 index 000000000000..46ced0850d91 --- /dev/null +++ b/include/dt-bindings/reset/mediatek,mt8196-resets.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2025 Collabora Ltd. + * Author: AngeloGioacchino Del Regno + */ + +#ifndef _DT_BINDINGS_RESET_CONTROLLER_MT8196 +#define _DT_BINDINGS_RESET_CONTROLLER_MT8196 + +/* PEXTP0 resets */ +#define MT8196_PEXTP0_RST0_PCIE0_MAC 0 +#define MT8196_PEXTP0_RST0_PCIE0_PHY 1 + +/* PEXTP1 resets */ +#define MT8196_PEXTP1_RST0_PCIE1_MAC 0 +#define MT8196_PEXTP1_RST0_PCIE1_PHY 1 +#define MT8196_PEXTP1_RST0_PCIE2_MAC 2 +#define MT8196_PEXTP1_RST0_PCIE2_PHY 3 + +/* UFS resets */ +#define MT8196_UFSAO_RST0_UFS_MPHY 0 +#define MT8196_UFSAO_RST1_UFS_UNIPRO 1 +#define MT8196_UFSAO_RST1_UFS_CRYPTO 2 +#define MT8196_UFSAO_RST1_UFSHCI 3 + +#endif /* _DT_BINDINGS_RESET_CONTROLLER_MT8196 */ -- cgit v1.2.3 From 49f6c8b74d9a20c0dd16bd75c93b981a3d420a37 Mon Sep 17 00:00:00 2001 From: Gabriel Fernandez Date: Wed, 25 Jun 2025 11:07:24 +0200 Subject: dt-bindings: stm32: add STM32MP21 clocks and reset bindings Adds clock and reset binding entries for STM32MP21 SoC family. Signed-off-by: Nicolas Le Bayon Reviewed-by: Conor Dooley Signed-off-by: Gabriel Fernandez Signed-off-by: Stephen Boyd --- .../bindings/clock/st,stm32mp21-rcc.yaml | 199 ++++++++++ include/dt-bindings/clock/st,stm32mp21-rcc.h | 426 +++++++++++++++++++++ include/dt-bindings/reset/st,stm32mp21-rcc.h | 138 +++++++ 3 files changed, 763 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/st,stm32mp21-rcc.yaml create mode 100644 include/dt-bindings/clock/st,stm32mp21-rcc.h create mode 100644 include/dt-bindings/reset/st,stm32mp21-rcc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/st,stm32mp21-rcc.yaml b/Documentation/devicetree/bindings/clock/st,stm32mp21-rcc.yaml new file mode 100644 index 000000000000..4368063c6709 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/st,stm32mp21-rcc.yaml @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/st,stm32mp21-rcc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: STM32MP21 Reset Clock Controller + +maintainers: + - Gabriel Fernandez + +description: | + The RCC hardware block is both a reset and a clock controller. + RCC makes also power management (resume/suspend). + + See also: + include/dt-bindings/clock/st,stm32mp21-rcc.h + include/dt-bindings/reset/st,stm32mp21-rcc.h + +properties: + compatible: + enum: + - st,stm32mp21-rcc + + reg: + maxItems: 1 + + '#clock-cells': + const: 1 + + '#reset-cells': + const: 1 + + clocks: + items: + - description: CK_SCMI_HSE High Speed External oscillator (8 to 48 MHz) + - description: CK_SCMI_HSI High Speed Internal oscillator (~ 64 MHz) + - description: CK_SCMI_MSI Low Power Internal oscillator (~ 4 MHz or ~ 16 MHz) + - description: CK_SCMI_LSE Low Speed External oscillator (32 KHz) + - description: CK_SCMI_LSI Low Speed Internal oscillator (~ 32 KHz) + - description: CK_SCMI_HSE_DIV2 CK_SCMI_HSE divided by 2 (could be gated) + - description: CK_SCMI_ICN_HS_MCU High Speed interconnect bus clock + - description: CK_SCMI_ICN_LS_MCU Low Speed interconnect bus clock + - description: CK_SCMI_ICN_SDMMC SDMMC interconnect bus clock + - description: CK_SCMI_ICN_DDR DDR interconnect bus clock + - description: CK_SCMI_ICN_DISPLAY Display interconnect bus clock + - description: CK_SCMI_ICN_HSL HSL interconnect bus clock + - description: CK_SCMI_ICN_NIC NIC interconnect bus clock + - description: CK_SCMI_FLEXGEN_07 flexgen clock 7 + - description: CK_SCMI_FLEXGEN_08 flexgen clock 8 + - description: CK_SCMI_FLEXGEN_09 flexgen clock 9 + - description: CK_SCMI_FLEXGEN_10 flexgen clock 10 + - description: CK_SCMI_FLEXGEN_11 flexgen clock 11 + - description: CK_SCMI_FLEXGEN_12 flexgen clock 12 + - description: CK_SCMI_FLEXGEN_13 flexgen clock 13 + - description: CK_SCMI_FLEXGEN_14 flexgen clock 14 + - description: CK_SCMI_FLEXGEN_16 flexgen clock 16 + - description: CK_SCMI_FLEXGEN_17 flexgen clock 17 + - description: CK_SCMI_FLEXGEN_18 flexgen clock 18 + - description: CK_SCMI_FLEXGEN_19 flexgen clock 19 + - description: CK_SCMI_FLEXGEN_20 flexgen clock 20 + - description: CK_SCMI_FLEXGEN_21 flexgen clock 21 + - description: CK_SCMI_FLEXGEN_22 flexgen clock 22 + - description: CK_SCMI_FLEXGEN_23 flexgen clock 23 + - description: CK_SCMI_FLEXGEN_24 flexgen clock 24 + - description: CK_SCMI_FLEXGEN_25 flexgen clock 25 + - description: CK_SCMI_FLEXGEN_26 flexgen clock 26 + - description: CK_SCMI_FLEXGEN_27 flexgen clock 27 + - description: CK_SCMI_FLEXGEN_29 flexgen clock 29 + - description: CK_SCMI_FLEXGEN_30 flexgen clock 30 + - description: CK_SCMI_FLEXGEN_31 flexgen clock 31 + - description: CK_SCMI_FLEXGEN_33 flexgen clock 33 + - description: CK_SCMI_FLEXGEN_36 flexgen clock 36 + - description: CK_SCMI_FLEXGEN_37 flexgen clock 37 + - description: CK_SCMI_FLEXGEN_38 flexgen clock 38 + - description: CK_SCMI_FLEXGEN_39 flexgen clock 39 + - description: CK_SCMI_FLEXGEN_40 flexgen clock 40 + - description: CK_SCMI_FLEXGEN_41 flexgen clock 41 + - description: CK_SCMI_FLEXGEN_42 flexgen clock 42 + - description: CK_SCMI_FLEXGEN_43 flexgen clock 43 + - description: CK_SCMI_FLEXGEN_44 flexgen clock 44 + - description: CK_SCMI_FLEXGEN_45 flexgen clock 45 + - description: CK_SCMI_FLEXGEN_46 flexgen clock 46 + - description: CK_SCMI_FLEXGEN_47 flexgen clock 47 + - description: CK_SCMI_FLEXGEN_48 flexgen clock 48 + - description: CK_SCMI_FLEXGEN_50 flexgen clock 50 + - description: CK_SCMI_FLEXGEN_51 flexgen clock 51 + - description: CK_SCMI_FLEXGEN_52 flexgen clock 52 + - description: CK_SCMI_FLEXGEN_53 flexgen clock 53 + - description: CK_SCMI_FLEXGEN_54 flexgen clock 54 + - description: CK_SCMI_FLEXGEN_55 flexgen clock 55 + - description: CK_SCMI_FLEXGEN_56 flexgen clock 56 + - description: CK_SCMI_FLEXGEN_57 flexgen clock 57 + - description: CK_SCMI_FLEXGEN_58 flexgen clock 58 + - description: CK_SCMI_FLEXGEN_61 flexgen clock 61 + - description: CK_SCMI_FLEXGEN_62 flexgen clock 62 + - description: CK_SCMI_FLEXGEN_63 flexgen clock 63 + - description: CK_SCMI_ICN_APB1 Peripheral bridge 1 + - description: CK_SCMI_ICN_APB2 Peripheral bridge 2 + - description: CK_SCMI_ICN_APB3 Peripheral bridge 3 + - description: CK_SCMI_ICN_APB4 Peripheral bridge 4 + - description: CK_SCMI_ICN_APB5 Peripheral bridge 5 + - description: CK_SCMI_ICN_APBDBG Peripheral bridge for debug + - description: CK_SCMI_TIMG1 Peripheral bridge for timer1 + - description: CK_SCMI_TIMG2 Peripheral bridge for timer2 + + access-controllers: + maxItems: 1 + +required: + - compatible + - reg + - '#clock-cells' + - '#reset-cells' + - clocks + +additionalProperties: false + +examples: + - | + #include + + clock-controller@44200000 { + compatible = "st,stm32mp21-rcc"; + reg = <0x44200000 0x10000>; + #clock-cells = <1>; + #reset-cells = <1>; + clocks = <&scmi_clk CK_SCMI_HSE>, + <&scmi_clk CK_SCMI_HSI>, + <&scmi_clk CK_SCMI_MSI>, + <&scmi_clk CK_SCMI_LSE>, + <&scmi_clk CK_SCMI_LSI>, + <&scmi_clk CK_SCMI_HSE_DIV2>, + <&scmi_clk CK_SCMI_ICN_HS_MCU>, + <&scmi_clk CK_SCMI_ICN_LS_MCU>, + <&scmi_clk CK_SCMI_ICN_SDMMC>, + <&scmi_clk CK_SCMI_ICN_DDR>, + <&scmi_clk CK_SCMI_ICN_DISPLAY>, + <&scmi_clk CK_SCMI_ICN_HSL>, + <&scmi_clk CK_SCMI_ICN_NIC>, + <&scmi_clk CK_SCMI_FLEXGEN_07>, + <&scmi_clk CK_SCMI_FLEXGEN_08>, + <&scmi_clk CK_SCMI_FLEXGEN_09>, + <&scmi_clk CK_SCMI_FLEXGEN_10>, + <&scmi_clk CK_SCMI_FLEXGEN_11>, + <&scmi_clk CK_SCMI_FLEXGEN_12>, + <&scmi_clk CK_SCMI_FLEXGEN_13>, + <&scmi_clk CK_SCMI_FLEXGEN_14>, + <&scmi_clk CK_SCMI_FLEXGEN_16>, + <&scmi_clk CK_SCMI_FLEXGEN_17>, + <&scmi_clk CK_SCMI_FLEXGEN_18>, + <&scmi_clk CK_SCMI_FLEXGEN_19>, + <&scmi_clk CK_SCMI_FLEXGEN_20>, + <&scmi_clk CK_SCMI_FLEXGEN_21>, + <&scmi_clk CK_SCMI_FLEXGEN_22>, + <&scmi_clk CK_SCMI_FLEXGEN_23>, + <&scmi_clk CK_SCMI_FLEXGEN_24>, + <&scmi_clk CK_SCMI_FLEXGEN_25>, + <&scmi_clk CK_SCMI_FLEXGEN_26>, + <&scmi_clk CK_SCMI_FLEXGEN_27>, + <&scmi_clk CK_SCMI_FLEXGEN_29>, + <&scmi_clk CK_SCMI_FLEXGEN_30>, + <&scmi_clk CK_SCMI_FLEXGEN_31>, + <&scmi_clk CK_SCMI_FLEXGEN_33>, + <&scmi_clk CK_SCMI_FLEXGEN_36>, + <&scmi_clk CK_SCMI_FLEXGEN_37>, + <&scmi_clk CK_SCMI_FLEXGEN_38>, + <&scmi_clk CK_SCMI_FLEXGEN_39>, + <&scmi_clk CK_SCMI_FLEXGEN_40>, + <&scmi_clk CK_SCMI_FLEXGEN_41>, + <&scmi_clk CK_SCMI_FLEXGEN_42>, + <&scmi_clk CK_SCMI_FLEXGEN_43>, + <&scmi_clk CK_SCMI_FLEXGEN_44>, + <&scmi_clk CK_SCMI_FLEXGEN_45>, + <&scmi_clk CK_SCMI_FLEXGEN_46>, + <&scmi_clk CK_SCMI_FLEXGEN_47>, + <&scmi_clk CK_SCMI_FLEXGEN_48>, + <&scmi_clk CK_SCMI_FLEXGEN_50>, + <&scmi_clk CK_SCMI_FLEXGEN_51>, + <&scmi_clk CK_SCMI_FLEXGEN_52>, + <&scmi_clk CK_SCMI_FLEXGEN_53>, + <&scmi_clk CK_SCMI_FLEXGEN_54>, + <&scmi_clk CK_SCMI_FLEXGEN_55>, + <&scmi_clk CK_SCMI_FLEXGEN_56>, + <&scmi_clk CK_SCMI_FLEXGEN_57>, + <&scmi_clk CK_SCMI_FLEXGEN_58>, + <&scmi_clk CK_SCMI_FLEXGEN_61>, + <&scmi_clk CK_SCMI_FLEXGEN_62>, + <&scmi_clk CK_SCMI_FLEXGEN_63>, + <&scmi_clk CK_SCMI_ICN_APB1>, + <&scmi_clk CK_SCMI_ICN_APB2>, + <&scmi_clk CK_SCMI_ICN_APB3>, + <&scmi_clk CK_SCMI_ICN_APB4>, + <&scmi_clk CK_SCMI_ICN_APB5>, + <&scmi_clk CK_SCMI_ICN_APBDBG>, + <&scmi_clk CK_SCMI_TIMG1>, + <&scmi_clk CK_SCMI_TIMG2>; + }; +... diff --git a/include/dt-bindings/clock/st,stm32mp21-rcc.h b/include/dt-bindings/clock/st,stm32mp21-rcc.h new file mode 100644 index 000000000000..054b785f2796 --- /dev/null +++ b/include/dt-bindings/clock/st,stm32mp21-rcc.h @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +/* + * Copyright (C) STMicroelectronics 2025 - All Rights Reserved + * Author: Gabriel Fernandez + */ + +#ifndef _DT_BINDINGS_STM32MP21_CLKS_H_ +#define _DT_BINDINGS_STM32MP21_CLKS_H_ + +/* INTERNAL/EXTERNAL OSCILLATORS */ +#define HSI_CK 0 +#define HSE_CK 1 +#define MSI_CK 2 +#define LSI_CK 3 +#define LSE_CK 4 +#define I2S_CK 5 +#define RTC_CK 6 +#define SPDIF_CK_SYMB 7 + +/* PLL CLOCKS */ +#define PLL1_CK 8 +#define PLL2_CK 9 +#define PLL4_CK 10 +#define PLL5_CK 11 +#define PLL6_CK 12 +#define PLL7_CK 13 +#define PLL8_CK 14 + +#define CK_CPU1 15 + +/* APB DIV CLOCKS */ +#define CK_ICN_APB1 16 +#define CK_ICN_APB2 17 +#define CK_ICN_APB3 18 +#define CK_ICN_APB4 19 +#define CK_ICN_APB5 20 +#define CK_ICN_APBDBG 21 + +/* GLOBAL TIMER */ +#define TIMG1_CK 22 +#define TIMG2_CK 23 + +/* FLEXGEN CLOCKS */ +#define CK_ICN_HS_MCU 24 +#define CK_ICN_SDMMC 25 +#define CK_ICN_DDR 26 +#define CK_ICN_DISPLAY 27 +#define CK_ICN_HSL 28 +#define CK_ICN_NIC 29 +#define CK_ICN_VID 30 +#define CK_FLEXGEN_07 31 +#define CK_FLEXGEN_08 32 +#define CK_FLEXGEN_09 33 +#define CK_FLEXGEN_10 34 +#define CK_FLEXGEN_11 35 +#define CK_FLEXGEN_12 36 +#define CK_FLEXGEN_13 37 +#define CK_FLEXGEN_14 38 +#define CK_FLEXGEN_15 39 +#define CK_FLEXGEN_16 40 +#define CK_FLEXGEN_17 41 +#define CK_FLEXGEN_18 42 +#define CK_FLEXGEN_19 43 +#define CK_FLEXGEN_20 44 +#define CK_FLEXGEN_21 45 +#define CK_FLEXGEN_22 46 +#define CK_FLEXGEN_23 47 +#define CK_FLEXGEN_24 48 +#define CK_FLEXGEN_25 49 +#define CK_FLEXGEN_26 50 +#define CK_FLEXGEN_27 51 +#define CK_FLEXGEN_28 52 +#define CK_FLEXGEN_29 53 +#define CK_FLEXGEN_30 54 +#define CK_FLEXGEN_31 55 +#define CK_FLEXGEN_32 56 +#define CK_FLEXGEN_33 57 +#define CK_FLEXGEN_34 58 +#define CK_FLEXGEN_35 59 +#define CK_FLEXGEN_36 60 +#define CK_FLEXGEN_37 61 +#define CK_FLEXGEN_38 62 +#define CK_FLEXGEN_39 63 +#define CK_FLEXGEN_40 64 +#define CK_FLEXGEN_41 65 +#define CK_FLEXGEN_42 66 +#define CK_FLEXGEN_43 67 +#define CK_FLEXGEN_44 68 +#define CK_FLEXGEN_45 69 +#define CK_FLEXGEN_46 70 +#define CK_FLEXGEN_47 71 +#define CK_FLEXGEN_48 72 +#define CK_FLEXGEN_49 73 +#define CK_FLEXGEN_50 74 +#define CK_FLEXGEN_51 75 +#define CK_FLEXGEN_52 76 +#define CK_FLEXGEN_53 77 +#define CK_FLEXGEN_54 78 +#define CK_FLEXGEN_55 79 +#define CK_FLEXGEN_56 80 +#define CK_FLEXGEN_57 81 +#define CK_FLEXGEN_58 82 +#define CK_FLEXGEN_59 83 +#define CK_FLEXGEN_60 84 +#define CK_FLEXGEN_61 85 +#define CK_FLEXGEN_62 86 +#define CK_FLEXGEN_63 87 + +/* LOW SPEED MCU CLOCK */ +#define CK_ICN_LS_MCU 88 + +#define CK_BUS_STM 89 +#define CK_BUS_FMC 90 +#define CK_BUS_ETH1 91 +#define CK_BUS_ETH2 92 +#define CK_BUS_DDRPHYC 93 +#define CK_BUS_SYSCPU1 94 +#define CK_BUS_HPDMA1 95 +#define CK_BUS_HPDMA2 96 +#define CK_BUS_HPDMA3 97 +#define CK_BUS_ADC1 98 +#define CK_BUS_ADC2 99 +#define CK_BUS_IPCC1 100 +#define CK_BUS_DCMIPSSI 101 +#define CK_BUS_CRC 102 +#define CK_BUS_MDF1 103 +#define CK_BUS_BKPSRAM 104 +#define CK_BUS_HASH1 105 +#define CK_BUS_HASH2 106 +#define CK_BUS_RNG1 107 +#define CK_BUS_RNG2 108 +#define CK_BUS_CRYP1 109 +#define CK_BUS_CRYP2 110 +#define CK_BUS_SAES 111 +#define CK_BUS_PKA 112 +#define CK_BUS_GPIOA 113 +#define CK_BUS_GPIOB 114 +#define CK_BUS_GPIOC 115 +#define CK_BUS_GPIOD 116 +#define CK_BUS_GPIOE 117 +#define CK_BUS_GPIOF 118 +#define CK_BUS_GPIOG 119 +#define CK_BUS_GPIOH 120 +#define CK_BUS_GPIOI 121 +#define CK_BUS_GPIOZ 122 +#define CK_BUS_RTC 124 +#define CK_BUS_LPUART1 125 +#define CK_BUS_LPTIM3 126 +#define CK_BUS_LPTIM4 127 +#define CK_BUS_LPTIM5 128 +#define CK_BUS_TIM2 129 +#define CK_BUS_TIM3 130 +#define CK_BUS_TIM4 131 +#define CK_BUS_TIM5 132 +#define CK_BUS_TIM6 133 +#define CK_BUS_TIM7 134 +#define CK_BUS_TIM10 135 +#define CK_BUS_TIM11 136 +#define CK_BUS_TIM12 137 +#define CK_BUS_TIM13 138 +#define CK_BUS_TIM14 139 +#define CK_BUS_LPTIM1 140 +#define CK_BUS_LPTIM2 141 +#define CK_BUS_SPI2 142 +#define CK_BUS_SPI3 143 +#define CK_BUS_SPDIFRX 144 +#define CK_BUS_USART2 145 +#define CK_BUS_USART3 146 +#define CK_BUS_UART4 147 +#define CK_BUS_UART5 148 +#define CK_BUS_I2C1 149 +#define CK_BUS_I2C2 150 +#define CK_BUS_I2C3 151 +#define CK_BUS_I3C1 152 +#define CK_BUS_I3C2 153 +#define CK_BUS_I3C3 154 +#define CK_BUS_TIM1 155 +#define CK_BUS_TIM8 156 +#define CK_BUS_TIM15 157 +#define CK_BUS_TIM16 158 +#define CK_BUS_TIM17 159 +#define CK_BUS_SAI1 160 +#define CK_BUS_SAI2 161 +#define CK_BUS_SAI3 162 +#define CK_BUS_SAI4 163 +#define CK_BUS_USART1 164 +#define CK_BUS_USART6 165 +#define CK_BUS_UART7 166 +#define CK_BUS_FDCAN 167 +#define CK_BUS_SPI1 168 +#define CK_BUS_SPI4 169 +#define CK_BUS_SPI5 170 +#define CK_BUS_SPI6 171 +#define CK_BUS_BSEC 172 +#define CK_BUS_IWDG1 173 +#define CK_BUS_IWDG2 174 +#define CK_BUS_IWDG3 175 +#define CK_BUS_IWDG4 176 +#define CK_BUS_WWDG1 177 +#define CK_BUS_VREF 178 +#define CK_BUS_DTS 179 +#define CK_BUS_SERC 180 +#define CK_BUS_HDP 181 +#define CK_BUS_DDRPERFM 182 +#define CK_BUS_OTG 183 +#define CK_BUS_LTDC 184 +#define CK_BUS_CSI 185 +#define CK_BUS_DCMIPP 186 +#define CK_BUS_DDRC 187 +#define CK_BUS_DDRCFG 188 +#define CK_BUS_STGEN 189 +#define CK_SYSDBG 190 +#define CK_KER_TIM2 191 +#define CK_KER_TIM3 192 +#define CK_KER_TIM4 193 +#define CK_KER_TIM5 194 +#define CK_KER_TIM6 195 +#define CK_KER_TIM7 196 +#define CK_KER_TIM10 197 +#define CK_KER_TIM11 198 +#define CK_KER_TIM12 199 +#define CK_KER_TIM13 200 +#define CK_KER_TIM14 201 +#define CK_KER_TIM1 202 +#define CK_KER_TIM8 203 +#define CK_KER_TIM15 204 +#define CK_KER_TIM16 205 +#define CK_KER_TIM17 206 +#define CK_BUS_SYSRAM 207 +#define CK_BUS_RETRAM 208 +#define CK_BUS_OSPI1 209 +#define CK_BUS_OTFD1 210 +#define CK_BUS_SRAM1 211 +#define CK_BUS_SDMMC1 212 +#define CK_BUS_SDMMC2 213 +#define CK_BUS_SDMMC3 214 +#define CK_BUS_DDR 215 +#define CK_BUS_RISAF4 216 +#define CK_BUS_USBHOHCI 217 +#define CK_BUS_USBHEHCI 218 +#define CK_KER_LPTIM1 219 +#define CK_KER_LPTIM2 220 +#define CK_KER_USART2 221 +#define CK_KER_UART4 222 +#define CK_KER_USART3 223 +#define CK_KER_UART5 224 +#define CK_KER_SPI2 225 +#define CK_KER_SPI3 226 +#define CK_KER_SPDIFRX 227 +#define CK_KER_I2C1 228 +#define CK_KER_I2C2 229 +#define CK_KER_I3C1 230 +#define CK_KER_I3C2 231 +#define CK_KER_I2C3 232 +#define CK_KER_I3C3 233 +#define CK_KER_SPI1 234 +#define CK_KER_SPI4 235 +#define CK_KER_SPI5 236 +#define CK_KER_SPI6 237 +#define CK_KER_USART1 238 +#define CK_KER_USART6 239 +#define CK_KER_UART7 240 +#define CK_KER_MDF1 241 +#define CK_KER_SAI1 242 +#define CK_KER_SAI2 243 +#define CK_KER_SAI3 244 +#define CK_KER_SAI4 245 +#define CK_KER_FDCAN 246 +#define CK_KER_CSI 247 +#define CK_KER_CSITXESC 248 +#define CK_KER_CSIPHY 249 +#define CK_KER_STGEN 250 +#define CK_KER_USB2PHY2EN 251 +#define CK_KER_LPUART1 252 +#define CK_KER_LPTIM3 253 +#define CK_KER_LPTIM4 254 +#define CK_KER_LPTIM5 255 +#define CK_KER_TSDBG 256 +#define CK_KER_TPIU 257 +#define CK_BUS_ETR 258 +#define CK_BUS_SYSATB 259 +#define CK_KER_ADC1 260 +#define CK_KER_ADC2 261 +#define CK_KER_OSPI1 262 +#define CK_KER_FMC 263 +#define CK_KER_SDMMC1 264 +#define CK_KER_SDMMC2 265 +#define CK_KER_SDMMC3 266 +#define CK_KER_ETH1 267 +#define CK_KER_ETH2 268 +#define CK_KER_ETH1PTP 269 +#define CK_KER_ETH2PTP 270 +#define CK_KER_USB2PHY1 271 +#define CK_KER_USB2PHY2 272 +#define CK_MCO1 273 +#define CK_MCO2 274 +#define CK_KER_DTS 275 +#define CK_ETH1_RX 276 +#define CK_ETH1_TX 277 +#define CK_ETH1_MAC 278 +#define CK_ETH2_RX 279 +#define CK_ETH2_TX 280 +#define CK_ETH2_MAC 281 +#define CK_ETH1_STP 282 +#define CK_ETH2_STP 283 +#define CK_KER_LTDC 284 +#define HSE_DIV2_CK 285 +#define CK_DBGMCU 286 +#define CK_DAP 287 +#define CK_KER_ETR 288 +#define CK_KER_STM 289 + +#define CK_SCMI_ICN_HS_MCU 0 +#define CK_SCMI_ICN_SDMMC 1 +#define CK_SCMI_ICN_DDR 2 +#define CK_SCMI_ICN_DISPLAY 3 +#define CK_SCMI_ICN_HSL 4 +#define CK_SCMI_ICN_NIC 5 +#define CK_SCMI_FLEXGEN_07 7 +#define CK_SCMI_FLEXGEN_08 8 +#define CK_SCMI_FLEXGEN_09 9 +#define CK_SCMI_FLEXGEN_10 10 +#define CK_SCMI_FLEXGEN_11 11 +#define CK_SCMI_FLEXGEN_12 12 +#define CK_SCMI_FLEXGEN_13 13 +#define CK_SCMI_FLEXGEN_14 14 +#define CK_SCMI_FLEXGEN_15 15 +#define CK_SCMI_FLEXGEN_16 16 +#define CK_SCMI_FLEXGEN_17 17 +#define CK_SCMI_FLEXGEN_18 18 +#define CK_SCMI_FLEXGEN_19 19 +#define CK_SCMI_FLEXGEN_20 20 +#define CK_SCMI_FLEXGEN_21 21 +#define CK_SCMI_FLEXGEN_22 22 +#define CK_SCMI_FLEXGEN_23 23 +#define CK_SCMI_FLEXGEN_24 24 +#define CK_SCMI_FLEXGEN_25 25 +#define CK_SCMI_FLEXGEN_26 26 +#define CK_SCMI_FLEXGEN_27 27 +#define CK_SCMI_FLEXGEN_28 28 +#define CK_SCMI_FLEXGEN_29 29 +#define CK_SCMI_FLEXGEN_30 30 +#define CK_SCMI_FLEXGEN_31 31 +#define CK_SCMI_FLEXGEN_32 32 +#define CK_SCMI_FLEXGEN_33 33 +#define CK_SCMI_FLEXGEN_34 34 +#define CK_SCMI_FLEXGEN_35 35 +#define CK_SCMI_FLEXGEN_36 36 +#define CK_SCMI_FLEXGEN_37 37 +#define CK_SCMI_FLEXGEN_38 38 +#define CK_SCMI_FLEXGEN_39 39 +#define CK_SCMI_FLEXGEN_40 40 +#define CK_SCMI_FLEXGEN_41 41 +#define CK_SCMI_FLEXGEN_42 42 +#define CK_SCMI_FLEXGEN_43 43 +#define CK_SCMI_FLEXGEN_44 44 +#define CK_SCMI_FLEXGEN_45 45 +#define CK_SCMI_FLEXGEN_46 46 +#define CK_SCMI_FLEXGEN_47 47 +#define CK_SCMI_FLEXGEN_48 48 +#define CK_SCMI_FLEXGEN_49 49 +#define CK_SCMI_FLEXGEN_50 50 +#define CK_SCMI_FLEXGEN_51 51 +#define CK_SCMI_FLEXGEN_52 52 +#define CK_SCMI_FLEXGEN_53 53 +#define CK_SCMI_FLEXGEN_54 54 +#define CK_SCMI_FLEXGEN_55 55 +#define CK_SCMI_FLEXGEN_56 56 +#define CK_SCMI_FLEXGEN_57 57 +#define CK_SCMI_FLEXGEN_58 58 +#define CK_SCMI_FLEXGEN_59 59 +#define CK_SCMI_FLEXGEN_60 60 +#define CK_SCMI_FLEXGEN_61 61 +#define CK_SCMI_FLEXGEN_62 62 +#define CK_SCMI_FLEXGEN_63 63 +#define CK_SCMI_ICN_LS_MCU 64 +#define CK_SCMI_HSE 65 +#define CK_SCMI_LSE 66 +#define CK_SCMI_HSI 67 +#define CK_SCMI_LSI 68 +#define CK_SCMI_MSI 69 +#define CK_SCMI_HSE_DIV2 70 +#define CK_SCMI_CPU1 71 +#define CK_SCMI_SYSCPU1 72 +#define CK_SCMI_PLL2 73 +#define CK_SCMI_RTC 74 +#define CK_SCMI_RTCCK 75 +#define CK_SCMI_ICN_APB1 76 +#define CK_SCMI_ICN_APB2 77 +#define CK_SCMI_ICN_APB3 78 +#define CK_SCMI_ICN_APB4 79 +#define CK_SCMI_ICN_APB5 80 +#define CK_SCMI_ICN_APBDBG 81 +#define CK_SCMI_TIMG1 82 +#define CK_SCMI_TIMG2 83 +#define CK_SCMI_BKPSRAM 84 +#define CK_SCMI_BSEC 85 +#define CK_SCMI_BUS_ETR 86 +#define CK_SCMI_FMC 87 +#define CK_SCMI_GPIOA 88 +#define CK_SCMI_GPIOB 89 +#define CK_SCMI_GPIOC 90 +#define CK_SCMI_GPIOD 91 +#define CK_SCMI_GPIOE 92 +#define CK_SCMI_GPIOF 93 +#define CK_SCMI_GPIOG 94 +#define CK_SCMI_GPIOH 95 +#define CK_SCMI_GPIOI 96 +#define CK_SCMI_GPIOZ 97 +#define CK_SCMI_HPDMA1 98 +#define CK_SCMI_HPDMA2 99 +#define CK_SCMI_HPDMA3 100 +#define CK_SCMI_IPCC1 101 +#define CK_SCMI_RETRAM 102 +#define CK_SCMI_SRAM1 103 +#define CK_SCMI_SYSRAM 104 +#define CK_SCMI_OSPI1 105 +#define CK_SCMI_TPIU 106 +#define CK_SCMI_SYSDBG 107 +#define CK_SCMI_SYSATB 108 +#define CK_SCMI_TSDBG 109 +#define CK_SCMI_BUS_STM 110 +#define CK_SCMI_KER_STM 111 +#define CK_SCMI_KER_ETR 112 + +#endif /* _DT_BINDINGS_STM32MP21_CLKS_H_ */ diff --git a/include/dt-bindings/reset/st,stm32mp21-rcc.h b/include/dt-bindings/reset/st,stm32mp21-rcc.h new file mode 100644 index 000000000000..6463bd73d025 --- /dev/null +++ b/include/dt-bindings/reset/st,stm32mp21-rcc.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +/* + * Copyright (C) STMicroelectronics 2025 - All Rights Reserved + * Author: Gabriel Fernandez + */ + +#ifndef _DT_BINDINGS_STM32MP21_RESET_H_ +#define _DT_BINDINGS_STM32MP21_RESET_H_ + +#define TIM1_R 0 +#define TIM2_R 1 +#define TIM3_R 2 +#define TIM4_R 3 +#define TIM5_R 4 +#define TIM6_R 5 +#define TIM7_R 6 +#define TIM8_R 7 +#define TIM10_R 8 +#define TIM11_R 9 +#define TIM12_R 10 +#define TIM13_R 11 +#define TIM14_R 12 +#define TIM15_R 13 +#define TIM16_R 14 +#define TIM17_R 15 +#define LPTIM1_R 16 +#define LPTIM2_R 17 +#define LPTIM3_R 18 +#define LPTIM4_R 19 +#define LPTIM5_R 20 +#define SPI1_R 21 +#define SPI2_R 22 +#define SPI3_R 23 +#define SPI4_R 24 +#define SPI5_R 25 +#define SPI6_R 26 +#define SPDIFRX_R 27 +#define USART1_R 28 +#define USART2_R 29 +#define USART3_R 30 +#define UART4_R 31 +#define UART5_R 32 +#define USART6_R 33 +#define UART7_R 34 +#define LPUART1_R 35 +#define I2C1_R 36 +#define I2C2_R 37 +#define I2C3_R 38 +#define SAI1_R 39 +#define SAI2_R 40 +#define SAI3_R 41 +#define SAI4_R 42 +#define MDF1_R 43 +#define FDCAN_R 44 +#define HDP_R 45 +#define ADC1_R 46 +#define ADC2_R 47 +#define ETH1_R 48 +#define ETH2_R 49 +#define USBH_R 50 +#define USB2PHY1_R 51 +#define USB2PHY2_R 52 +#define SDMMC1_R 53 +#define SDMMC1DLL_R 54 +#define SDMMC2_R 55 +#define SDMMC2DLL_R 56 +#define SDMMC3_R 57 +#define SDMMC3DLL_R 58 +#define LTDC_R 59 +#define CSI_R 60 +#define DCMIPP_R 61 +#define DCMIPSSI_R 62 +#define WWDG1_R 63 +#define VREF_R 64 +#define DTS_R 65 +#define CRC_R 66 +#define SERC_R 67 +#define I3C1_R 68 +#define I3C2_R 69 +#define I3C3_R 70 +#define IWDG2_KER_R 71 +#define IWDG4_KER_R 72 +#define RNG1_R 73 +#define RNG2_R 74 +#define PKA_R 75 +#define SAES_R 76 +#define HASH1_R 77 +#define HASH2_R 78 +#define CRYP1_R 79 +#define CRYP2_R 80 +#define OSPI1_R 81 +#define OSPI1DLL_R 82 +#define OTG_R 83 +#define FMC_R 84 +#define DBG_R 85 +#define GPIOA_R 86 +#define GPIOB_R 87 +#define GPIOC_R 88 +#define GPIOD_R 89 +#define GPIOE_R 90 +#define GPIOF_R 91 +#define GPIOG_R 92 +#define GPIOH_R 93 +#define GPIOI_R 94 +#define GPIOZ_R 95 +#define HPDMA1_R 96 +#define HPDMA2_R 97 +#define HPDMA3_R 98 +#define IPCC1_R 99 +#define C2_HOLDBOOT_R 100 +#define C1_HOLDBOOT_R 101 +#define C1_R 102 +#define C1P1POR_R 103 +#define C1P1_R 104 +#define C2_R 105 +#define SYS_R 106 +#define VSW_R 107 +#define C1MS_R 108 +#define DDRCP_R 109 +#define DDRCAPB_R 110 +#define DDRPHYCAPB_R 111 +#define DDRCFG_R 112 +#define DDR_R 113 +#define DDRPERFM_R 114 +#define IWDG1_SYS_R 116 +#define IWDG2_SYS_R 117 +#define IWDG3_SYS_R 118 +#define IWDG4_SYS_R 119 + +#define RST_SCMI_C1_R 0 +#define RST_SCMI_C2_R 1 +#define RST_SCMI_C1_HOLDBOOT_R 2 +#define RST_SCMI_C2_HOLDBOOT_R 3 +#define RST_SCMI_FMC 4 +#define RST_SCMI_OSPI1 5 +#define RST_SCMI_OSPI1DLL 6 + +#endif /* _DT_BINDINGS_STM32MP21_RESET_H_ */ -- cgit v1.2.3 From 793e6b74806eea39da26f2fb7e4b640608d9598d Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Fri, 19 Sep 2025 14:26:42 +0000 Subject: dt-bindings: clock: loongson2: Add Loongson-2K0300 compatible Document the clock controller shipped in Loongson-2K0300 SoC, which generates various clock signals for SoC peripherals. Differing from previous generations of SoCs, LS2K0300 requires a 120MHz external clock input. Signed-off-by: Yao Zi Reviewed-by: Krzysztof Kozlowski Reviewed-by: Yanteng Si Signed-off-by: Stephen Boyd --- .../bindings/clock/loongson,ls2k-clk.yaml | 18 +++++++++-- include/dt-bindings/clock/loongson,ls2k-clk.h | 36 ++++++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/loongson,ls2k-clk.yaml b/Documentation/devicetree/bindings/clock/loongson,ls2k-clk.yaml index 4f79cdb417ab..c07ad1f85857 100644 --- a/Documentation/devicetree/bindings/clock/loongson,ls2k-clk.yaml +++ b/Documentation/devicetree/bindings/clock/loongson,ls2k-clk.yaml @@ -16,6 +16,7 @@ description: | properties: compatible: enum: + - loongson,ls2k0300-clk - loongson,ls2k0500-clk - loongson,ls2k-clk # This is for Loongson-2K1000 - loongson,ls2k2000-clk @@ -24,8 +25,7 @@ properties: maxItems: 1 clocks: - items: - - description: 100m ref + maxItems: 1 clock-names: items: @@ -38,11 +38,23 @@ properties: ID in its "clocks" phandle cell. See include/dt-bindings/clock/loongson,ls2k-clk.h for the full list of Loongson-2 SoC clock IDs. +allOf: + - if: + properties: + compatible: + contains: + const: loongson,ls2k0300-clk + then: + properties: + clock-names: false + else: + required: + - clock-names + required: - compatible - reg - clocks - - clock-names - '#clock-cells' additionalProperties: false diff --git a/include/dt-bindings/clock/loongson,ls2k-clk.h b/include/dt-bindings/clock/loongson,ls2k-clk.h index 4279ba595f1e..8cbb86b2cf1e 100644 --- a/include/dt-bindings/clock/loongson,ls2k-clk.h +++ b/include/dt-bindings/clock/loongson,ls2k-clk.h @@ -43,4 +43,40 @@ #define LOONGSON2_I2S_CLK 33 #define LOONGSON2_MISC_CLK 34 +#define LS2K0300_CLK_STABLE 0 +#define LS2K0300_NODE_PLL 1 +#define LS2K0300_DDR_PLL 2 +#define LS2K0300_PIX_PLL 3 +#define LS2K0300_CLK_THSENS 4 +#define LS2K0300_CLK_NODE_DIV 5 +#define LS2K0300_CLK_NODE_PLL_GATE 6 +#define LS2K0300_CLK_NODE_SCALE 7 +#define LS2K0300_CLK_NODE_GATE 8 +#define LS2K0300_CLK_GMAC_DIV 9 +#define LS2K0300_CLK_GMAC_GATE 10 +#define LS2K0300_CLK_I2S_DIV 11 +#define LS2K0300_CLK_I2S_SCALE 12 +#define LS2K0300_CLK_I2S_GATE 13 +#define LS2K0300_CLK_DDR_DIV 14 +#define LS2K0300_CLK_DDR_GATE 15 +#define LS2K0300_CLK_NET_DIV 16 +#define LS2K0300_CLK_NET_GATE 17 +#define LS2K0300_CLK_DEV_DIV 18 +#define LS2K0300_CLK_DEV_GATE 19 +#define LS2K0300_CLK_PIX_DIV 20 +#define LS2K0300_CLK_PIX_PLL_GATE 21 +#define LS2K0300_CLK_PIX_SCALE 22 +#define LS2K0300_CLK_PIX_GATE 23 +#define LS2K0300_CLK_GMACBP_DIV 24 +#define LS2K0300_CLK_GMACBP_GATE 25 +#define LS2K0300_CLK_USB_SCALE 26 +#define LS2K0300_CLK_USB_GATE 27 +#define LS2K0300_CLK_APB_SCALE 28 +#define LS2K0300_CLK_APB_GATE 29 +#define LS2K0300_CLK_BOOT_SCALE 30 +#define LS2K0300_CLK_BOOT_GATE 31 +#define LS2K0300_CLK_SDIO_SCALE 32 +#define LS2K0300_CLK_SDIO_GATE 33 +#define LS2K0300_CLK_GMAC_IN 34 + #endif -- cgit v1.2.3 From 8ea304cff08f0605b425a3fb494ad47175214ea3 Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Wed, 17 Sep 2025 10:05:37 +0800 Subject: dt-bindings: clock: ast2700: modify soc0/1 clock define -add SOC0_CLK_AHBMUX: add SOC0_CLK_AHBMUX for ahb clock source divide. mpll-> ahb_mux -> div_table -> clk_ahb hpll-> -new add clock: SOC0_CLK_MPHYSRC: UFS MPHY clock source. SOC0_CLK_U2PHY_REFCLKSRC: USB2.0 phy clock reference source. SOC1_CLK_I3C: I3C clock source. Signed-off-by: Ryan Chen Acked-by: Krzysztof Kozlowski Signed-off-by: Stephen Boyd --- include/dt-bindings/clock/aspeed,ast2700-scu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/aspeed,ast2700-scu.h b/include/dt-bindings/clock/aspeed,ast2700-scu.h index 63021af3caf5..bacf712e8e04 100644 --- a/include/dt-bindings/clock/aspeed,ast2700-scu.h +++ b/include/dt-bindings/clock/aspeed,ast2700-scu.h @@ -68,6 +68,9 @@ #define SCU0_CLK_GATE_UFSCLK 53 #define SCU0_CLK_GATE_EMMCCLK 54 #define SCU0_CLK_GATE_RVAS1CLK 55 +#define SCU0_CLK_U2PHY_REFCLKSRC 56 +#define SCU0_CLK_AHBMUX 57 +#define SCU0_CLK_MPHYSRC 58 /* SOC1 clk */ #define SCU1_CLKIN 0 @@ -159,5 +162,6 @@ #define SCU1_CLK_GATE_PORTCUSB2CLK 84 #define SCU1_CLK_GATE_PORTDUSB2CLK 85 #define SCU1_CLK_GATE_LTPI1TXCLK 86 +#define SCU1_CLK_I3C 87 #endif -- cgit v1.2.3 From 1e338f4d99e6814ede16bad1db1cc463aad8032c Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Sun, 10 Aug 2025 17:57:45 +0500 Subject: kasan: introduce ARCH_DEFER_KASAN and unify static key across modes Patch series "kasan: unify kasan_enabled() and remove arch-specific implementations", v6. This patch series addresses the fragmentation in KASAN initialization across architectures by introducing a unified approach that eliminates duplicate static keys and arch-specific kasan_arch_is_ready() implementations. The core issue is that different architectures have inconsistent approaches to KASAN readiness tracking: - PowerPC, LoongArch, and UML arch, each implement own kasan_arch_is_ready() - Only HW_TAGS mode had a unified static key (kasan_flag_enabled) - Generic and SW_TAGS modes relied on arch-specific solutions or always-on behavior This patch (of 2): Introduce CONFIG_ARCH_DEFER_KASAN to identify architectures [1] that need to defer KASAN initialization until shadow memory is properly set up, and unify the static key infrastructure across all KASAN modes. [1] PowerPC, UML, LoongArch selects ARCH_DEFER_KASAN. The core issue is that different architectures haveinconsistent approaches to KASAN readiness tracking: - PowerPC, LoongArch, and UML arch, each implement own kasan_arch_is_ready() - Only HW_TAGS mode had a unified static key (kasan_flag_enabled) - Generic and SW_TAGS modes relied on arch-specific solutions or always-on behavior This patch addresses the fragmentation in KASAN initialization across architectures by introducing a unified approach that eliminates duplicate static keys and arch-specific kasan_arch_is_ready() implementations. Let's replace kasan_arch_is_ready() with existing kasan_enabled() check, which examines the static key being enabled if arch selects ARCH_DEFER_KASAN or has HW_TAGS mode support. For other arch, kasan_enabled() checks the enablement during compile time. Now KASAN users can use a single kasan_enabled() check everywhere. Link: https://lkml.kernel.org/r/20250810125746.1105476-1-snovitoll@gmail.com Link: https://lkml.kernel.org/r/20250810125746.1105476-2-snovitoll@gmail.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217049 Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Christophe Leroy Reviewed-by: Ritesh Harjani (IBM) #powerpc Cc: Alexander Gordeev Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Baoquan He Cc: David Gow Cc: Dmitriy Vyukov Cc: Heiko Carstens Cc: Huacai Chen Cc: Marco Elver Cc: Qing Zhang Cc: Sabyrzhan Tasbolatov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/kasan.h | 7 ------- arch/loongarch/mm/kasan_init.c | 8 +++----- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/kasan.h | 12 ------------ arch/powerpc/mm/kasan/init_32.c | 2 +- arch/powerpc/mm/kasan/init_book3e_64.c | 2 +- arch/powerpc/mm/kasan/init_book3s_64.c | 6 +----- arch/um/Kconfig | 1 + arch/um/include/asm/kasan.h | 5 ++--- arch/um/kernel/mem.c | 13 ++++++++++--- include/linux/kasan-enabled.h | 32 +++++++++++++++++++++++--------- include/linux/kasan.h | 6 ++++++ lib/Kconfig.kasan | 12 ++++++++++++ mm/kasan/common.c | 17 +++++++++++++---- mm/kasan/generic.c | 19 +++++++++++++++---- mm/kasan/hw_tags.c | 9 +-------- mm/kasan/kasan.h | 8 +++++++- mm/kasan/shadow.c | 12 ++++++------ mm/kasan/sw_tags.c | 1 + mm/kasan/tags.c | 2 +- 21 files changed, 106 insertions(+), 70 deletions(-) (limited to 'include') diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f0abc38c40ac..e449e3fcecf9 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -9,6 +9,7 @@ config LOONGARCH select ACPI_PPTT if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_BINFMT_ELF_STATE + select ARCH_NEEDS_DEFER_KASAN select ARCH_DISABLE_KASAN_INLINE select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE diff --git a/arch/loongarch/include/asm/kasan.h b/arch/loongarch/include/asm/kasan.h index 62f139a9c87d..0e50e5b5e056 100644 --- a/arch/loongarch/include/asm/kasan.h +++ b/arch/loongarch/include/asm/kasan.h @@ -66,7 +66,6 @@ #define XKPRANGE_WC_SHADOW_OFFSET (KASAN_SHADOW_START + XKPRANGE_WC_KASAN_OFFSET) #define XKVRANGE_VC_SHADOW_OFFSET (KASAN_SHADOW_START + XKVRANGE_VC_KASAN_OFFSET) -extern bool kasan_early_stage; extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; #define kasan_mem_to_shadow kasan_mem_to_shadow @@ -75,12 +74,6 @@ void *kasan_mem_to_shadow(const void *addr); #define kasan_shadow_to_mem kasan_shadow_to_mem const void *kasan_shadow_to_mem(const void *shadow_addr); -#define kasan_arch_is_ready kasan_arch_is_ready -static __always_inline bool kasan_arch_is_ready(void) -{ - return !kasan_early_stage; -} - #define addr_has_metadata addr_has_metadata static __always_inline bool addr_has_metadata(const void *addr) { diff --git a/arch/loongarch/mm/kasan_init.c b/arch/loongarch/mm/kasan_init.c index d2681272d8f0..170da98ad4f5 100644 --- a/arch/loongarch/mm/kasan_init.c +++ b/arch/loongarch/mm/kasan_init.c @@ -40,11 +40,9 @@ static pgd_t kasan_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); #define __pte_none(early, pte) (early ? pte_none(pte) : \ ((pte_val(pte) & _PFN_MASK) == (unsigned long)__pa(kasan_early_shadow_page))) -bool kasan_early_stage = true; - void *kasan_mem_to_shadow(const void *addr) { - if (!kasan_arch_is_ready()) { + if (!kasan_enabled()) { return (void *)(kasan_early_shadow_page); } else { unsigned long maddr = (unsigned long)addr; @@ -298,7 +296,8 @@ void __init kasan_init(void) kasan_populate_early_shadow(kasan_mem_to_shadow((void *)VMALLOC_START), kasan_mem_to_shadow((void *)KFENCE_AREA_END)); - kasan_early_stage = false; + /* Enable KASAN here before kasan_mem_to_shadow(). */ + kasan_init_generic(); /* Populate the linear mapping */ for_each_mem_range(i, &pa_start, &pa_end) { @@ -329,5 +328,4 @@ void __init kasan_init(void) /* At this point kasan is fully initialized. Enable error messages */ init_task.kasan_depth = 0; - pr_info("KernelAddressSanitizer initialized.\n"); } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 93402a1d9c9f..4730c676b6bf 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -122,6 +122,7 @@ config PPC # Please keep this list sorted alphabetically. # select ARCH_32BIT_OFF_T if PPC32 + select ARCH_NEEDS_DEFER_KASAN if PPC_RADIX_MMU select ARCH_DISABLE_KASAN_INLINE if PPC_RADIX_MMU select ARCH_DMA_DEFAULT_COHERENT if !NOT_COHERENT_CACHE select ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index b5bbb94c51f6..957a57c1db58 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -53,18 +53,6 @@ #endif #ifdef CONFIG_KASAN -#ifdef CONFIG_PPC_BOOK3S_64 -DECLARE_STATIC_KEY_FALSE(powerpc_kasan_enabled_key); - -static __always_inline bool kasan_arch_is_ready(void) -{ - if (static_branch_likely(&powerpc_kasan_enabled_key)) - return true; - return false; -} - -#define kasan_arch_is_ready kasan_arch_is_ready -#endif void kasan_early_init(void); void kasan_mmu_init(void); diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c index 03666d790a53..1d083597464f 100644 --- a/arch/powerpc/mm/kasan/init_32.c +++ b/arch/powerpc/mm/kasan/init_32.c @@ -165,7 +165,7 @@ void __init kasan_init(void) /* At this point kasan is fully initialized. Enable error messages */ init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); + kasan_init_generic(); } void __init kasan_late_init(void) diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c index 60c78aac0f63..0d3a73d6d4b0 100644 --- a/arch/powerpc/mm/kasan/init_book3e_64.c +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -127,7 +127,7 @@ void __init kasan_init(void) /* Enable error messages */ init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); + kasan_init_generic(); } void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c index 7d959544c077..dcafa641804c 100644 --- a/arch/powerpc/mm/kasan/init_book3s_64.c +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -19,8 +19,6 @@ #include #include -DEFINE_STATIC_KEY_FALSE(powerpc_kasan_enabled_key); - static void __init kasan_init_phys_region(void *start, void *end) { unsigned long k_start, k_end, k_cur; @@ -92,11 +90,9 @@ void __init kasan_init(void) */ memset(kasan_early_shadow_page, 0, PAGE_SIZE); - static_branch_inc(&powerpc_kasan_enabled_key); - /* Enable error messages */ init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); + kasan_init_generic(); } void __init kasan_early_init(void) { } diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 9083bfdb7735..1d4def0db841 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -5,6 +5,7 @@ menu "UML-specific options" config UML bool default y + select ARCH_NEEDS_DEFER_KASAN if STATIC_LINK select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CPU_FINALIZE_INIT diff --git a/arch/um/include/asm/kasan.h b/arch/um/include/asm/kasan.h index f97bb1f7b851..b54a4e937fd1 100644 --- a/arch/um/include/asm/kasan.h +++ b/arch/um/include/asm/kasan.h @@ -24,10 +24,9 @@ #ifdef CONFIG_KASAN void kasan_init(void); -extern int kasan_um_is_ready; -#ifdef CONFIG_STATIC_LINK -#define kasan_arch_is_ready() (kasan_um_is_ready) +#if defined(CONFIG_STATIC_LINK) && defined(CONFIG_KASAN_INLINE) +#error UML does not work in KASAN_INLINE mode with STATIC_LINK enabled! #endif #else static inline void kasan_init(void) { } diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 76bec7de81b5..32e3b1972dc1 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -21,10 +21,10 @@ #include #include #include +#include #ifdef CONFIG_KASAN -int kasan_um_is_ready; -void kasan_init(void) +void __init kasan_init(void) { /* * kasan_map_memory will map all of the required address space and @@ -32,7 +32,11 @@ void kasan_init(void) */ kasan_map_memory((void *)KASAN_SHADOW_START, KASAN_SHADOW_SIZE); init_task.kasan_depth = 0; - kasan_um_is_ready = true; + /* + * Since kasan_init() is called before main(), + * KASAN is initialized but the enablement is deferred after + * jump_label_init(). See arch_mm_preinit(). + */ } static void (*kasan_init_ptr)(void) @@ -58,6 +62,9 @@ static unsigned long brk_end; void __init arch_mm_preinit(void) { + /* Safe to call after jump_label_init(). Enables KASAN. */ + kasan_init_generic(); + /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); diff --git a/include/linux/kasan-enabled.h b/include/linux/kasan-enabled.h index 6f612d69ea0c..9eca967d8526 100644 --- a/include/linux/kasan-enabled.h +++ b/include/linux/kasan-enabled.h @@ -4,32 +4,46 @@ #include -#ifdef CONFIG_KASAN_HW_TAGS - +#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS) +/* + * Global runtime flag for KASAN modes that need runtime control. + * Used by ARCH_DEFER_KASAN architectures and HW_TAGS mode. + */ DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); +/* + * Runtime control for shadow memory initialization or HW_TAGS mode. + * Uses static key for architectures that need deferred KASAN or HW_TAGS. + */ static __always_inline bool kasan_enabled(void) { return static_branch_likely(&kasan_flag_enabled); } -static inline bool kasan_hw_tags_enabled(void) +static inline void kasan_enable(void) { - return kasan_enabled(); + static_branch_enable(&kasan_flag_enabled); } - -#else /* CONFIG_KASAN_HW_TAGS */ - -static inline bool kasan_enabled(void) +#else +/* For architectures that can enable KASAN early, use compile-time check. */ +static __always_inline bool kasan_enabled(void) { return IS_ENABLED(CONFIG_KASAN); } +static inline void kasan_enable(void) {} +#endif /* CONFIG_ARCH_DEFER_KASAN || CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_HW_TAGS +static inline bool kasan_hw_tags_enabled(void) +{ + return kasan_enabled(); +} +#else static inline bool kasan_hw_tags_enabled(void) { return false; } - #endif /* CONFIG_KASAN_HW_TAGS */ #endif /* LINUX_KASAN_ENABLED_H */ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index fe5ce9215821..b509a8d36949 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -543,6 +543,12 @@ void kasan_report_async(void); #endif /* CONFIG_KASAN_HW_TAGS */ +#ifdef CONFIG_KASAN_GENERIC +void __init kasan_init_generic(void); +#else +static inline void kasan_init_generic(void) { } +#endif + #ifdef CONFIG_KASAN_SW_TAGS void __init kasan_init_sw_tags(void); #else diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index f82889a830fa..a4bb610a7a6f 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -19,6 +19,18 @@ config ARCH_DISABLE_KASAN_INLINE Disables both inline and stack instrumentation. Selected by architectures that do not support these instrumentation types. +config ARCH_NEEDS_DEFER_KASAN + bool + +config ARCH_DEFER_KASAN + def_bool y + depends on KASAN && ARCH_NEEDS_DEFER_KASAN + help + Architectures should select this if they need to defer KASAN + initialization until shadow memory is properly set up. This + enables runtime control via static keys. Otherwise, KASAN uses + compile-time constants for better performance. + config CC_HAS_KASAN_GENERIC def_bool $(cc-option, -fsanitize=kernel-address) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 9142964ab9c9..e3765931a31f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -32,6 +32,15 @@ #include "kasan.h" #include "../slab.h" +#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS) +/* + * Definition of the unified static key declared in kasan-enabled.h. + * This provides consistent runtime enable/disable across KASAN modes. + */ +DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); +EXPORT_SYMBOL_GPL(kasan_flag_enabled); +#endif + struct slab *kasan_addr_to_slab(const void *addr) { if (virt_addr_valid(addr)) @@ -246,7 +255,7 @@ static inline void poison_slab_object(struct kmem_cache *cache, void *object, bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, unsigned long ip) { - if (!kasan_arch_is_ready() || is_kfence_address(object)) + if (is_kfence_address(object)) return false; return check_slab_allocation(cache, object, ip); } @@ -254,7 +263,7 @@ bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, bool still_accessible) { - if (!kasan_arch_is_ready() || is_kfence_address(object)) + if (is_kfence_address(object)) return false; /* @@ -293,7 +302,7 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, static inline bool check_page_allocation(void *ptr, unsigned long ip) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return false; if (ptr != page_address(virt_to_head_page(ptr))) { @@ -522,7 +531,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) return true; } - if (is_kfence_address(ptr) || !kasan_arch_is_ready()) + if (is_kfence_address(ptr)) return true; slab = folio_slab(folio); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index d54e89f8c3e7..b413c46b3e04 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -36,6 +36,17 @@ #include "kasan.h" #include "../slab.h" +/* + * Initialize Generic KASAN and enable runtime checks. + * This should be called from arch kasan_init() once shadow memory is ready. + */ +void __init kasan_init_generic(void) +{ + kasan_enable(); + + pr_info("KernelAddressSanitizer initialized (generic)\n"); +} + /* * All functions below always inlined so compiler could * perform better optimizations in each of __asan_loadX/__assn_storeX @@ -165,7 +176,7 @@ static __always_inline bool check_region_inline(const void *addr, size_t size, bool write, unsigned long ret_ip) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return true; if (unlikely(size == 0)) @@ -193,7 +204,7 @@ bool kasan_byte_accessible(const void *addr) { s8 shadow_byte; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return true; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); @@ -495,7 +506,7 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; /* Check if free meta is valid. */ @@ -562,7 +573,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_save_track(&alloc_meta->alloc_track, flags); } -void kasan_save_free_info(struct kmem_cache *cache, void *object) +void __kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9a6927394b54..c8289a3feabf 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -45,13 +45,6 @@ static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; -/* - * Whether KASAN is enabled at all. - * The value remains false until KASAN is initialized by kasan_init_hw_tags(). - */ -DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); -EXPORT_SYMBOL(kasan_flag_enabled); - /* * Whether the selected mode is synchronous, asynchronous, or asymmetric. * Defaults to KASAN_MODE_SYNC. @@ -260,7 +253,7 @@ void __init kasan_init_hw_tags(void) kasan_init_tags(); /* KASAN is now initialized, enable it. */ - static_branch_enable(&kasan_flag_enabled); + kasan_enable(); pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", kasan_mode_info(), diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 129178be5e64..8a9d8a6ea717 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -398,7 +398,13 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags); void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack); void kasan_save_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); -void kasan_save_free_info(struct kmem_cache *cache, void *object); + +void __kasan_save_free_info(struct kmem_cache *cache, void *object); +static inline void kasan_save_free_info(struct kmem_cache *cache, void *object) +{ + if (kasan_enabled()) + __kasan_save_free_info(cache, object); +} #ifdef CONFIG_KASAN_GENERIC bool kasan_quarantine_put(struct kmem_cache *cache, void *object); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 11d472a5c4e8..5d2a876035d6 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -125,7 +125,7 @@ void kasan_poison(const void *addr, size_t size, u8 value, bool init) { void *shadow_start, *shadow_end; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; /* @@ -150,7 +150,7 @@ EXPORT_SYMBOL_GPL(kasan_poison); #ifdef CONFIG_KASAN_GENERIC void kasan_poison_last_granule(const void *addr, size_t size) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; if (size & KASAN_GRANULE_MASK) { @@ -408,7 +408,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mas unsigned long shadow_start, shadow_end; int ret; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return 0; if (!is_vmalloc_or_module_addr((void *)addr)) @@ -583,7 +583,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -634,7 +634,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. */ - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return (void *)start; if (!is_vmalloc_or_module_addr(start)) @@ -659,7 +659,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; if (!is_vmalloc_or_module_addr(start)) diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index b9382b5b6a37..c75741a74602 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -44,6 +44,7 @@ void __init kasan_init_sw_tags(void) per_cpu(prng_state, cpu) = (u32)get_cycles(); kasan_init_tags(); + kasan_enable(); pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", str_on_off(kasan_stack_collection_enabled())); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index d65d48b85f90..b9f31293622b 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -142,7 +142,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) save_stack_info(cache, object, flags, false); } -void kasan_save_free_info(struct kmem_cache *cache, void *object) +void __kasan_save_free_info(struct kmem_cache *cache, void *object) { save_stack_info(cache, object, 0, true); } -- cgit v1.2.3 From 2ccd9fecd9163f168761d4398564c81554f636ef Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 29 Aug 2025 17:15:27 +0100 Subject: mm: remove unused zpool layer With zswap using zsmalloc directly, there are no more in-tree users of this code. Remove it. With zpool gone, zsmalloc is now always a simple dependency and no longer something the user needs to configure. Hide CONFIG_ZSMALLOC from the user and have zswap and zram pull it in as needed. Link: https://lkml.kernel.org/r/20250829162212.208258-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: SeongJae Park Acked-by: Yosry Ahmed Cc: Chengming Zhou Cc: Nhat Pham Cc: Vitaly Wool Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/zswap.rst | 33 +-- Documentation/core-api/mm-api.rst | 1 - Documentation/driver-api/crypto/iaa/iaa-crypto.rst | 2 - MAINTAINERS | 2 - arch/loongarch/configs/loongson3_defconfig | 1 - include/linux/zpool.h | 86 ------ mm/Kconfig | 49 +-- mm/Makefile | 1 - mm/zpool.c | 328 --------------------- mm/zsmalloc.c | 79 ----- tools/testing/selftests/zram/README | 1 - 11 files changed, 23 insertions(+), 560 deletions(-) delete mode 100644 include/linux/zpool.h delete mode 100644 mm/zpool.c (limited to 'include') diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index fd3370aa43fe..283d77217c6f 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -53,26 +53,17 @@ Zswap receives pages for compression from the swap subsystem and is able to evict pages from its own compressed pool on an LRU basis and write them back to the backing swap device in the case that the compressed pool is full. -Zswap makes use of zpool for the managing the compressed memory pool. Each -allocation in zpool is not directly accessible by address. Rather, a handle is +Zswap makes use of zsmalloc for the managing the compressed memory pool. Each +allocation in zsmalloc is not directly accessible by address. Rather, a handle is returned by the allocation routine and that handle must be mapped before being accessed. The compressed memory pool grows on demand and shrinks as compressed -pages are freed. The pool is not preallocated. By default, a zpool -of type selected in ``CONFIG_ZSWAP_ZPOOL_DEFAULT`` Kconfig option is created, -but it can be overridden at boot time by setting the ``zpool`` attribute, -e.g. ``zswap.zpool=zsmalloc``. It can also be changed at runtime using the sysfs -``zpool`` attribute, e.g.:: - - echo zsmalloc > /sys/module/zswap/parameters/zpool - -The zsmalloc type zpool has a complex compressed page storage method, and it -can achieve great storage densities. +pages are freed. The pool is not preallocated. When a swap page is passed from swapout to zswap, zswap maintains a mapping -of the swap entry, a combination of the swap type and swap offset, to the zpool -handle that references that compressed swap page. This mapping is achieved -with a red-black tree per swap type. The swap offset is the search key for the -tree nodes. +of the swap entry, a combination of the swap type and swap offset, to the +zsmalloc handle that references that compressed swap page. This mapping is +achieved with a red-black tree per swap type. The swap offset is the search +key for the tree nodes. During a page fault on a PTE that is a swap entry, the swapin code calls the zswap load function to decompress the page into the page allocated by the page @@ -96,11 +87,11 @@ attribute, e.g.:: echo lzo > /sys/module/zswap/parameters/compressor -When the zpool and/or compressor parameter is changed at runtime, any existing -compressed pages are not modified; they are left in their own zpool. When a -request is made for a page in an old zpool, it is uncompressed using its -original compressor. Once all pages are removed from an old zpool, the zpool -and its compressor are freed. +When the compressor parameter is changed at runtime, any existing compressed +pages are not modified; they are left in their own pool. When a request is +made for a page in an old pool, it is uncompressed using its original +compressor. Once all pages are removed from an old pool, the pool and its +compressor are freed. Some of the pages in zswap are same-value filled pages (i.e. contents of the page have same value or repetitive pattern). These pages include zero-filled diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 5063179cfc70..68193a4cfcf5 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -118,7 +118,6 @@ More Memory Management Functions .. kernel-doc:: mm/memremap.c .. kernel-doc:: mm/hugetlb.c .. kernel-doc:: mm/swap.c -.. kernel-doc:: mm/zpool.c .. kernel-doc:: mm/memcontrol.c .. #kernel-doc:: mm/memory-tiers.c (build warnings) .. kernel-doc:: mm/shmem.c diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst index 8e50b900d51c..f815d4fd8372 100644 --- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst +++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst @@ -476,7 +476,6 @@ Use the following commands to enable zswap:: # echo 0 > /sys/module/zswap/parameters/enabled # echo 50 > /sys/module/zswap/parameters/max_pool_percent # echo deflate-iaa > /sys/module/zswap/parameters/compressor - # echo zsmalloc > /sys/module/zswap/parameters/zpool # echo 1 > /sys/module/zswap/parameters/enabled # echo 100 > /proc/sys/vm/swappiness # echo never > /sys/kernel/mm/transparent_hugepage/enabled @@ -625,7 +624,6 @@ the 'fixed' compression mode:: echo 0 > /sys/module/zswap/parameters/enabled echo 50 > /sys/module/zswap/parameters/max_pool_percent echo deflate-iaa > /sys/module/zswap/parameters/compressor - echo zsmalloc > /sys/module/zswap/parameters/zpool echo 1 > /sys/module/zswap/parameters/enabled echo 100 > /proc/sys/vm/swappiness diff --git a/MAINTAINERS b/MAINTAINERS index 9344c33c52e1..a7e123ddf05a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27879,9 +27879,7 @@ R: Chengming Zhou L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/zswap.rst -F: include/linux/zpool.h F: include/linux/zswap.h -F: mm/zpool.c F: mm/zswap.c F: tools/testing/selftests/cgroup/test_zswap.c diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 34eaee0384c9..2b8df0e9e42a 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -106,7 +106,6 @@ CONFIG_CMDLINE_PARTITION=y CONFIG_IOSCHED_BFQ=y CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m -CONFIG_ZPOOL=y CONFIG_ZSWAP=y CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD=y CONFIG_ZSMALLOC=y diff --git a/include/linux/zpool.h b/include/linux/zpool.h deleted file mode 100644 index 369ef068fad8..000000000000 --- a/include/linux/zpool.h +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * zpool memory storage api - * - * Copyright (C) 2014 Dan Streetman - * - * This is a common frontend for the zswap compressed memory storage - * implementations. - */ - -#ifndef _ZPOOL_H_ -#define _ZPOOL_H_ - -struct zpool; - -bool zpool_has_pool(char *type); - -struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp); - -const char *zpool_get_type(struct zpool *pool); - -void zpool_destroy_pool(struct zpool *pool); - -int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid); - -void zpool_free(struct zpool *pool, unsigned long handle); - -void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, - void *local_copy); - -void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, - void *handle_mem); - -void zpool_obj_write(struct zpool *zpool, unsigned long handle, - void *handle_mem, size_t mem_len); - -u64 zpool_get_total_pages(struct zpool *pool); - - -/** - * struct zpool_driver - driver implementation for zpool - * @type: name of the driver. - * @list: entry in the list of zpool drivers. - * @create: create a new pool. - * @destroy: destroy a pool. - * @malloc: allocate mem from a pool. - * @free: free mem from a pool. - * @sleep_mapped: whether zpool driver can sleep during map. - * @map: map a handle. - * @unmap: unmap a handle. - * @total_size: get total size of a pool. - * - * This is created by a zpool implementation and registered - * with zpool. - */ -struct zpool_driver { - char *type; - struct module *owner; - atomic_t refcount; - struct list_head list; - - void *(*create)(const char *name, gfp_t gfp); - void (*destroy)(void *pool); - - int (*malloc)(void *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid); - void (*free)(void *pool, unsigned long handle); - - void *(*obj_read_begin)(void *pool, unsigned long handle, - void *local_copy); - void (*obj_read_end)(void *pool, unsigned long handle, - void *handle_mem); - void (*obj_write)(void *pool, unsigned long handle, - void *handle_mem, size_t mem_len); - - u64 (*total_pages)(void *pool); -}; - -void zpool_register_driver(struct zpool_driver *driver); - -int zpool_unregister_driver(struct zpool_driver *driver); - -bool zpool_can_sleep_mapped(struct zpool *pool); - -#endif diff --git a/mm/Kconfig b/mm/Kconfig index 4108bcd96784..b971d35c43c3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -9,9 +9,6 @@ menu "Memory Management options" config ARCH_NO_SWAP bool -config ZPOOL - bool - menuconfig SWAP bool "Support for paging of anonymous memory (swap)" depends on MMU && BLOCK && !ARCH_NO_SWAP @@ -26,7 +23,7 @@ config ZSWAP bool "Compressed cache for swap pages" depends on SWAP select CRYPTO - select ZPOOL + select ZSMALLOC help A lightweight compressed cache for swap pages. It takes pages that are in the process of being swapped out and attempts to @@ -125,45 +122,18 @@ config ZSWAP_COMPRESSOR_DEFAULT default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD default "" -choice - prompt "Default allocator" - depends on ZSWAP - default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU - help - Selects the default allocator for the compressed cache for - swap pages. - The default is 'zbud' for compatibility, however please do - read the description of each of the allocators below before - making a right choice. - - The selection made here can be overridden by using the kernel - command line 'zswap.zpool=' option. +config ZSMALLOC + tristate -config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - bool "zsmalloc" - select ZSMALLOC - help - Use the zsmalloc allocator as the default allocator. -endchoice +if ZSMALLOC -config ZSWAP_ZPOOL_DEFAULT - string - depends on ZSWAP - default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - default "" +menu "Zsmalloc allocator options" + depends on ZSMALLOC -config ZSMALLOC - tristate - prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM) - depends on MMU - help - zsmalloc is a slab-based memory allocator designed to store - pages of various compression levels efficiently. It achieves - the highest storage density with the least amount of fragmentation. +comment "Zsmalloc is a common backend allocator for zswap & zram" config ZSMALLOC_STAT bool "Export zsmalloc statistics" - depends on ZSMALLOC select DEBUG_FS help This option enables code in the zsmalloc to collect various @@ -175,7 +145,6 @@ config ZSMALLOC_CHAIN_SIZE int "Maximum number of physical pages per-zspage" default 8 range 4 16 - depends on ZSMALLOC help This option sets the upper limit on the number of physical pages that a zmalloc page (zspage) can consist of. The optimal zspage @@ -190,6 +159,10 @@ config ZSMALLOC_CHAIN_SIZE For more information, see zsmalloc documentation. +endmenu + +endif + menu "Slab allocator options" config SLUB diff --git a/mm/Makefile b/mm/Makefile index ef54aa615d9d..21abb3353550 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -115,7 +115,6 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o -obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o diff --git a/mm/zpool.c b/mm/zpool.c deleted file mode 100644 index 0a71d03369f1..000000000000 --- a/mm/zpool.c +++ /dev/null @@ -1,328 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * zpool memory storage api - * - * Copyright (C) 2014 Dan Streetman - * - * This is a common frontend for memory storage pool implementations. - * Typically, this is used to store compressed memory. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include - -struct zpool { - struct zpool_driver *driver; - void *pool; -}; - -static LIST_HEAD(drivers_head); -static DEFINE_SPINLOCK(drivers_lock); - -/** - * zpool_register_driver() - register a zpool implementation. - * @driver: driver to register - */ -void zpool_register_driver(struct zpool_driver *driver) -{ - spin_lock(&drivers_lock); - atomic_set(&driver->refcount, 0); - list_add(&driver->list, &drivers_head); - spin_unlock(&drivers_lock); -} -EXPORT_SYMBOL(zpool_register_driver); - -/** - * zpool_unregister_driver() - unregister a zpool implementation. - * @driver: driver to unregister. - * - * Module usage counting is used to prevent using a driver - * while/after unloading, so if this is called from module - * exit function, this should never fail; if called from - * other than the module exit function, and this returns - * failure, the driver is in use and must remain available. - */ -int zpool_unregister_driver(struct zpool_driver *driver) -{ - int ret = 0, refcount; - - spin_lock(&drivers_lock); - refcount = atomic_read(&driver->refcount); - WARN_ON(refcount < 0); - if (refcount > 0) - ret = -EBUSY; - else - list_del(&driver->list); - spin_unlock(&drivers_lock); - - return ret; -} -EXPORT_SYMBOL(zpool_unregister_driver); - -/* this assumes @type is null-terminated. */ -static struct zpool_driver *zpool_get_driver(const char *type) -{ - struct zpool_driver *driver; - - spin_lock(&drivers_lock); - list_for_each_entry(driver, &drivers_head, list) { - if (!strcmp(driver->type, type)) { - bool got = try_module_get(driver->owner); - - if (got) - atomic_inc(&driver->refcount); - spin_unlock(&drivers_lock); - return got ? driver : NULL; - } - } - - spin_unlock(&drivers_lock); - return NULL; -} - -static void zpool_put_driver(struct zpool_driver *driver) -{ - atomic_dec(&driver->refcount); - module_put(driver->owner); -} - -/** - * zpool_has_pool() - Check if the pool driver is available - * @type: The type of the zpool to check (e.g. zsmalloc) - * - * This checks if the @type pool driver is available. This will try to load - * the requested module, if needed, but there is no guarantee the module will - * still be loaded and available immediately after calling. If this returns - * true, the caller should assume the pool is available, but must be prepared - * to handle the @zpool_create_pool() returning failure. However if this - * returns false, the caller should assume the requested pool type is not - * available; either the requested pool type module does not exist, or could - * not be loaded, and calling @zpool_create_pool() with the pool type will - * fail. - * - * The @type string must be null-terminated. - * - * Returns: true if @type pool is available, false if not - */ -bool zpool_has_pool(char *type) -{ - struct zpool_driver *driver = zpool_get_driver(type); - - if (!driver) { - request_module("zpool-%s", type); - driver = zpool_get_driver(type); - } - - if (!driver) - return false; - - zpool_put_driver(driver); - return true; -} -EXPORT_SYMBOL(zpool_has_pool); - -/** - * zpool_create_pool() - Create a new zpool - * @type: The type of the zpool to create (e.g. zsmalloc) - * @name: The name of the zpool (e.g. zram0, zswap) - * @gfp: The GFP flags to use when allocating the pool. - * - * This creates a new zpool of the specified type. The gfp flags will be - * used when allocating memory, if the implementation supports it. If the - * ops param is NULL, then the created zpool will not be evictable. - * - * Implementations must guarantee this to be thread-safe. - * - * The @type and @name strings must be null-terminated. - * - * Returns: New zpool on success, NULL on failure. - */ -struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp) -{ - struct zpool_driver *driver; - struct zpool *zpool; - - pr_debug("creating pool type %s\n", type); - - driver = zpool_get_driver(type); - - if (!driver) { - request_module("zpool-%s", type); - driver = zpool_get_driver(type); - } - - if (!driver) { - pr_err("no driver for type %s\n", type); - return NULL; - } - - zpool = kmalloc(sizeof(*zpool), gfp); - if (!zpool) { - pr_err("couldn't create zpool - out of memory\n"); - zpool_put_driver(driver); - return NULL; - } - - zpool->driver = driver; - zpool->pool = driver->create(name, gfp); - - if (!zpool->pool) { - pr_err("couldn't create %s pool\n", type); - zpool_put_driver(driver); - kfree(zpool); - return NULL; - } - - pr_debug("created pool type %s\n", type); - - return zpool; -} - -/** - * zpool_destroy_pool() - Destroy a zpool - * @zpool: The zpool to destroy. - * - * Implementations must guarantee this to be thread-safe, - * however only when destroying different pools. The same - * pool should only be destroyed once, and should not be used - * after it is destroyed. - * - * This destroys an existing zpool. The zpool should not be in use. - */ -void zpool_destroy_pool(struct zpool *zpool) -{ - pr_debug("destroying pool type %s\n", zpool->driver->type); - - zpool->driver->destroy(zpool->pool); - zpool_put_driver(zpool->driver); - kfree(zpool); -} - -/** - * zpool_get_type() - Get the type of the zpool - * @zpool: The zpool to check - * - * This returns the type of the pool. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: The type of zpool. - */ -const char *zpool_get_type(struct zpool *zpool) -{ - return zpool->driver->type; -} - -/** - * zpool_malloc() - Allocate memory - * @zpool: The zpool to allocate from. - * @size: The amount of memory to allocate. - * @gfp: The GFP flags to use when allocating memory. - * @handle: Pointer to the handle to set - * @nid: The preferred node id. - * - * This allocates the requested amount of memory from the pool. - * The gfp flags will be used when allocating memory, if the - * implementation supports it. The provided @handle will be - * set to the allocated object handle. The allocation will - * prefer the NUMA node specified by @nid. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: 0 on success, negative value on error. - */ -int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid) -{ - return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid); -} - -/** - * zpool_free() - Free previously allocated memory - * @zpool: The zpool that allocated the memory. - * @handle: The handle to the memory to free. - * - * This frees previously allocated memory. This does not guarantee - * that the pool will actually free memory, only that the memory - * in the pool will become available for use by the pool. - * - * Implementations must guarantee this to be thread-safe, - * however only when freeing different handles. The same - * handle should only be freed once, and should not be used - * after freeing. - */ -void zpool_free(struct zpool *zpool, unsigned long handle) -{ - zpool->driver->free(zpool->pool, handle); -} - -/** - * zpool_obj_read_begin() - Start reading from a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @local_copy: A local buffer to use if needed. - * - * This starts a read operation of a previously allocated handle. The passed - * @local_copy buffer may be used if needed by copying the memory into. - * zpool_obj_read_end() MUST be called after the read is completed to undo any - * actions taken (e.g. release locks). - * - * Returns: A pointer to the handle memory to be read, if @local_copy is used, - * the returned pointer is @local_copy. - */ -void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, - void *local_copy) -{ - return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy); -} - -/** - * zpool_obj_read_end() - Finish reading from a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @handle_mem: The pointer returned by zpool_obj_read_begin() - * - * Finishes a read operation previously started by zpool_obj_read_begin(). - */ -void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, - void *handle_mem) -{ - zpool->driver->obj_read_end(zpool->pool, handle, handle_mem); -} - -/** - * zpool_obj_write() - Write to a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @handle_mem: The memory to copy from into the handle. - * @mem_len: The length of memory to be written. - * - */ -void zpool_obj_write(struct zpool *zpool, unsigned long handle, - void *handle_mem, size_t mem_len) -{ - zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len); -} - -/** - * zpool_get_total_pages() - The total size of the pool - * @zpool: The zpool to check - * - * This returns the total size in pages of the pool. - * - * Returns: Total size of the zpool in pages. - */ -u64 zpool_get_total_pages(struct zpool *zpool) -{ - return zpool->driver->total_pages(zpool->pool); -} - -MODULE_AUTHOR("Dan Streetman "); -MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 153783d49d34..5bf832f9c05c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include "zpdesc.h" @@ -433,78 +432,6 @@ static void record_obj(unsigned long handle, unsigned long obj) *(unsigned long *)handle = obj; } -/* zpool driver */ - -#ifdef CONFIG_ZPOOL - -static void *zs_zpool_create(const char *name, gfp_t gfp) -{ - /* - * Ignore global gfp flags: zs_malloc() may be invoked from - * different contexts and its caller must provide a valid - * gfp mask. - */ - return zs_create_pool(name); -} - -static void zs_zpool_destroy(void *pool) -{ - zs_destroy_pool(pool); -} - -static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid) -{ - *handle = zs_malloc(pool, size, gfp, nid); - - if (IS_ERR_VALUE(*handle)) - return PTR_ERR((void *)*handle); - return 0; -} -static void zs_zpool_free(void *pool, unsigned long handle) -{ - zs_free(pool, handle); -} - -static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle, - void *local_copy) -{ - return zs_obj_read_begin(pool, handle, local_copy); -} - -static void zs_zpool_obj_read_end(void *pool, unsigned long handle, - void *handle_mem) -{ - zs_obj_read_end(pool, handle, handle_mem); -} - -static void zs_zpool_obj_write(void *pool, unsigned long handle, - void *handle_mem, size_t mem_len) -{ - zs_obj_write(pool, handle, handle_mem, mem_len); -} - -static u64 zs_zpool_total_pages(void *pool) -{ - return zs_get_total_pages(pool); -} - -static struct zpool_driver zs_zpool_driver = { - .type = "zsmalloc", - .owner = THIS_MODULE, - .create = zs_zpool_create, - .destroy = zs_zpool_destroy, - .malloc = zs_zpool_malloc, - .free = zs_zpool_free, - .obj_read_begin = zs_zpool_obj_read_begin, - .obj_read_end = zs_zpool_obj_read_end, - .obj_write = zs_zpool_obj_write, - .total_pages = zs_zpool_total_pages, -}; - -MODULE_ALIAS("zpool-zsmalloc"); -#endif /* CONFIG_ZPOOL */ - static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc) { return PagePrivate(zpdesc_page(zpdesc)); @@ -2248,9 +2175,6 @@ static int __init zs_init(void) { int rc __maybe_unused; -#ifdef CONFIG_ZPOOL - zpool_register_driver(&zs_zpool_driver); -#endif #ifdef CONFIG_COMPACTION rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc); if (rc) @@ -2262,9 +2186,6 @@ static int __init zs_init(void) static void __exit zs_exit(void) { -#ifdef CONFIG_ZPOOL - zpool_unregister_driver(&zs_zpool_driver); -#endif #ifdef CONFIG_COMPACTION set_movable_ops(NULL, PGTY_zsmalloc); #endif diff --git a/tools/testing/selftests/zram/README b/tools/testing/selftests/zram/README index 110b34834a6f..82921c75681c 100644 --- a/tools/testing/selftests/zram/README +++ b/tools/testing/selftests/zram/README @@ -14,7 +14,6 @@ Statistics for individual zram devices are exported through sysfs nodes at Kconfig required: CONFIG_ZRAM=y CONFIG_CRYPTO_LZ4=y -CONFIG_ZPOOL=y CONFIG_ZSMALLOC=y ZRAM Testcases -- cgit v1.2.3 From 0bf2edf041dcb0b304a8dbda8c699771d5a245d2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:27 +0200 Subject: mm/page_alloc: reject unreasonable folio/compound page sizes in alloc_contig_range_noprof() Let's reject them early, which in turn makes folio_alloc_gigantic() reject them properly. To avoid converting from order to nr_pages, let's just add MAX_FOLIO_ORDER and calculate MAX_FOLIO_NR_PAGES based on that. While at it, let's just make the order a "const unsigned order". Link: https://lkml.kernel.org/r/20250901150359.867252-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: SeongJae Park Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- mm/page_alloc.c | 10 +++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 00c8a54127d3..77737cbf2216 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2055,11 +2055,13 @@ static inline long folio_nr_pages(const struct folio *folio) /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) +#define MAX_FOLIO_ORDER PUD_ORDER #else -#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES +#define MAX_FOLIO_ORDER MAX_PAGE_ORDER #endif +#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) + /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0873d640f26c..54dbb6f0d14e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6838,6 +6838,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) int alloc_contig_range_noprof(unsigned long start, unsigned long end, acr_flags_t alloc_flags, gfp_t gfp_mask) { + const unsigned int order = ilog2(end - start); unsigned long outer_start, outer_end; int ret = 0; @@ -6855,6 +6856,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, PB_ISOLATE_MODE_CMA_ALLOC : PB_ISOLATE_MODE_OTHER; + /* + * In contrast to the buddy, we allow for orders here that exceed + * MAX_PAGE_ORDER, so we must manually make sure that we are not + * exceeding the maximum folio order. + */ + if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER)) + return -EINVAL; + gfp_mask = current_gfp_context(gfp_mask); if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) return -EINVAL; @@ -6952,7 +6961,6 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, free_contig_range(end, outer_end - end); } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { struct page *head = pfn_to_page(start); - int order = ilog2(end - start); check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); -- cgit v1.2.3 From 4751c39eee0c3fcc742aa7d7242ce2b78faa3606 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:32 +0200 Subject: mm: limit folio/compound page sizes in problematic kernel configs Let's limit the maximum folio size in problematic kernel config where the memmap is allocated per memory section (SPARSEMEM without SPARSEMEM_VMEMMAP) to a single memory section. Currently, only a single architectures supports ARCH_HAS_GIGANTIC_PAGE but not SPARSEMEM_VMEMMAP: sh. Fortunately, the biggest hugetlb size sh supports is 64 MiB (HUGETLB_PAGE_SIZE_64MB) and the section size is at least 64 MiB (SECTION_SIZE_BITS == 26), so their use case is not degraded. As folios and memory sections are naturally aligned to their order-2 size in memory, consequently a single folio can no longer span multiple memory sections on these problematic kernel configs. nth_page() is no longer required when operating within a single compound page / folio. Link: https://lkml.kernel.org/r/20250901150359.867252-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 77737cbf2216..2dee79fa2efc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2053,11 +2053,25 @@ static inline long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -/* Only hugetlbfs can allocate folios larger than MAX_ORDER */ -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#define MAX_FOLIO_ORDER PUD_ORDER -#else +#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) +/* + * We don't expect any folios that exceed buddy sizes (and consequently + * memory sections). + */ #define MAX_FOLIO_ORDER MAX_PAGE_ORDER +#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/* + * Only pages within a single memory section are guaranteed to be + * contiguous. By limiting folios to a single memory section, all folio + * pages are guaranteed to be contiguous. + */ +#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT +#else +/* + * There is no real limit on the folio size. We limit them to the maximum we + * currently expect (e.g., hugetlb, dax). + */ +#define MAX_FOLIO_ORDER PUD_ORDER #endif #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) -- cgit v1.2.3 From 73b3294b1152e94c1971a735b8db8c7503fd97a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:33 +0200 Subject: mm: simplify folio_page() and folio_page_idx() Now that a single folio/compound page can no longer span memory sections in problematic kernel configurations, we can stop using nth_page() in folio_page() and folio_page_idx(). While at it, turn both macros into static inline functions and add kernel doc for folio_page_idx(). Link: https://lkml.kernel.org/r/20250901150359.867252-13-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++-- include/linux/page-flags.h | 5 ++++- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2dee79fa2efc..f6880e3225c5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -210,10 +210,8 @@ extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) -#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) -#define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ @@ -225,6 +223,20 @@ extern unsigned long sysctl_admin_reserve_kbytes; /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) +/** + * folio_page_idx - Return the number of a page in a folio. + * @folio: The folio. + * @page: The folio page. + * + * This function expects that the page is actually part of the folio. + * The returned number is relative to the start of the folio. + */ +static inline unsigned long folio_page_idx(const struct folio *folio, + const struct page *page) +{ + return page - &folio->page; +} + static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d53a86e68c89..a88b61eec3f8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -316,7 +316,10 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ -#define folio_page(folio, n) nth_page(&(folio)->page, n) +static inline struct page *folio_page(struct folio *folio, unsigned long n) +{ + return &folio->page + n; +} static __always_inline int PageTail(const struct page *page) { -- cgit v1.2.3 From 6972706f95926838f9bd3ec2b2393c034bdb85ba Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:43 +0200 Subject: mm/cma: refuse handing out non-contiguous page ranges Let's disallow handing out PFN ranges with non-contiguous pages, so we can remove the nth-page usage in __cma_alloc(), and so any callers don't have to worry about that either when wanting to blindly iterate pages. This is really only a problem in configs with SPARSEMEM but without SPARSEMEM_VMEMMAP, and only when we would cross memory sections in some cases. Will this cause harm? Probably not, because it's mostly 32bit that does not support SPARSEMEM_VMEMMAP. If this ever becomes a problem we could look into allocating the memmap for the memory sections spanned by a single CMA region in one go from memblock. [david@redhat.com: we can have NUMMU configs with SPARSEMEM enabled] Link: https://lkml.kernel.org/r/6ec933b1-b3f7-41c0-95d8-e518bb87375e@redhat.com Link: https://lkml.kernel.org/r/20250901150359.867252-23-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Alexandru Elisei Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++++ mm/cma.c | 39 ++++++++++++++++++++++++--------------- mm/util.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f6880e3225c5..2ca1eb2db63e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -209,9 +209,15 @@ extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +bool page_range_contiguous(const struct page *page, unsigned long nr_pages); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #else #define nth_page(page,n) ((page) + (n)) +static inline bool page_range_contiguous(const struct page *page, + unsigned long nr_pages) +{ + return true; +} #endif /* to align the pointer to the (next) page boundary */ diff --git a/mm/cma.c b/mm/cma.c index e56ec64d0567..813e6dc7b095 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -780,10 +780,8 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, unsigned long count, unsigned int align, struct page **pagep, gfp_t gfp) { - unsigned long mask, offset; - unsigned long pfn = -1; - unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; + unsigned long start, pfn, mask, offset; int ret = -EBUSY; struct page *page = NULL; @@ -795,7 +793,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, if (bitmap_count > bitmap_maxno) goto out; - for (;;) { + for (start = 0; ; start = bitmap_no + mask + 1) { spin_lock_irq(&cma->lock); /* * If the request is larger than the available number @@ -812,6 +810,22 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, spin_unlock_irq(&cma->lock); break; } + + pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); + page = pfn_to_page(pfn); + + /* + * Do not hand out page ranges that are not contiguous, so + * callers can just iterate the pages without having to worry + * about these corner cases. + */ + if (!page_range_contiguous(page, count)) { + spin_unlock_irq(&cma->lock); + pr_warn_ratelimited("%s: %s: skipping incompatible area [0x%lx-0x%lx]", + __func__, cma->name, pfn, pfn + count - 1); + continue; + } + bitmap_set(cmr->bitmap, bitmap_no, bitmap_count); cma->available_count -= count; /* @@ -821,29 +835,24 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, */ spin_unlock_irq(&cma->lock); - pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma->alloc_mutex); ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); - if (ret == 0) { - page = pfn_to_page(pfn); + if (!ret) break; - } cma_clear_bitmap(cma, cmr, pfn, count); if (ret != -EBUSY) break; pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n", - __func__, pfn, pfn_to_page(pfn)); + __func__, pfn, page); - trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), - count, align); - /* try again with a bit different memory target */ - start = bitmap_no + mask + 1; + trace_cma_alloc_busy_retry(cma->name, pfn, page, count, align); } out: - *pagep = page; + if (!ret) + *pagep = page; return ret; } @@ -882,7 +891,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, */ if (page) { for (i = 0; i < count; i++) - page_kasan_tag_reset(nth_page(page, i)); + page_kasan_tag_reset(page + i); } if (ret && !(gfp & __GFP_NOWARN)) { diff --git a/mm/util.c b/mm/util.c index d235b74f7aff..4b9d40c71286 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1281,3 +1281,38 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); } #endif /* CONFIG_MMU */ + +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/** + * page_range_contiguous - test whether the page range is contiguous + * @page: the start of the page range. + * @nr_pages: the number of pages in the range. + * + * Test whether the page range is contiguous, such that they can be iterated + * naively, corresponding to iterating a contiguous PFN range. + * + * This function should primarily only be used for debug checks, or when + * working with page ranges that are not naturally contiguous (e.g., pages + * within a folio are). + * + * Returns true if contiguous, otherwise false. + */ +bool page_range_contiguous(const struct page *page, unsigned long nr_pages) +{ + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + + /* + * The memmap is allocated per memory section, so no need to check + * within the first section. However, we need to check each other + * spanned memory section once, making sure the first page in a + * section could similarly be reached by just iterating pages. + */ + for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION); + pfn < end_pfn; pfn += PAGES_PER_SECTION) + if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn))) + return false; + return true; +} +#endif -- cgit v1.2.3 From 80e7bb74d4ff24725f0ddb1c72d8de45a3d975f6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:45 +0200 Subject: scatterlist: disallow non-contigous page ranges in a single SG entry The expectation is that there is currently no user that would pass in non-contigous page ranges: no allocator, not even VMA, will hand these out. The only problematic part would be if someone would provide a range obtained directly from memblock, or manually merge problematic ranges. If we find such cases, we should fix them to create separate SG entries. Let's check in sg_set_page() that this is really the case. No need to check in sg_set_folio(), as pages in a folio are guaranteed to be contiguous. As sg_set_page() gets inlined into modules, we have to export the page_range_contiguous() helper -- use EXPORT_SYMBOL, there is nothing special about this helper such that we would want to enforce GPL-only modules. We can now drop the nth_page() usage in sg_page_iter_page(). Link: https://lkml.kernel.org/r/20250901150359.867252-25-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Marek Szyprowski Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 3 ++- mm/util.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 6f8a4965f9b9..29f6ceb98d74 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -158,6 +158,7 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { + VM_WARN_ON_ONCE(!page_range_contiguous(page, ALIGN(len + offset, PAGE_SIZE) / PAGE_SIZE)); sg_assign_page(sg, page); sg->offset = offset; sg->length = len; @@ -600,7 +601,7 @@ void __sg_page_iter_start(struct sg_page_iter *piter, */ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) { - return nth_page(sg_page(piter->sg), piter->sg_pgoffset); + return sg_page(piter->sg) + piter->sg_pgoffset; } /** diff --git a/mm/util.c b/mm/util.c index 4b9d40c71286..e29d3310e26b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1315,4 +1315,5 @@ bool page_range_contiguous(const struct page *page, unsigned long nr_pages) return false; return true; } +EXPORT_SYMBOL(page_range_contiguous); #endif -- cgit v1.2.3 From ce00897b94bc5c62fab962625efcf1ab824d3688 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:54 +0200 Subject: crypto: remove nth_page() usage within SG entry It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Link: https://lkml.kernel.org/r/20250901150359.867252-34-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Acked-by: Herbert Xu Cc: "David S. Miller" Signed-off-by: Andrew Morton --- crypto/ahash.c | 4 ++-- crypto/scompress.c | 8 ++++---- include/crypto/scatterwalk.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/crypto/ahash.c b/crypto/ahash.c index a227793d2c5b..dfb4f5476428 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -88,7 +88,7 @@ static int hash_walk_new_entry(struct crypto_hash_walk *walk) sg = walk->sg; walk->offset = sg->offset; - walk->pg = nth_page(sg_page(walk->sg), (walk->offset >> PAGE_SHIFT)); + walk->pg = sg_page(walk->sg) + (walk->offset >> PAGE_SHIFT); walk->offset = offset_in_page(walk->offset); walk->entrylen = sg->length; @@ -226,7 +226,7 @@ int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc) if (!IS_ENABLED(CONFIG_HIGHMEM)) return crypto_shash_digest(desc, data, nbytes, req->result); - page = nth_page(page, offset >> PAGE_SHIFT); + page += offset >> PAGE_SHIFT; offset = offset_in_page(offset); if (nbytes > (unsigned int)PAGE_SIZE - offset) diff --git a/crypto/scompress.c b/crypto/scompress.c index c651e7f2197a..1a7ed8ae65b0 100644 --- a/crypto/scompress.c +++ b/crypto/scompress.c @@ -198,7 +198,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) } else return -ENOSYS; - dpage = nth_page(dpage, doff / PAGE_SIZE); + dpage += doff / PAGE_SIZE; doff = offset_in_page(doff); n = (dlen - 1) / PAGE_SIZE; @@ -220,12 +220,12 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) } else break; - spage = nth_page(spage, soff / PAGE_SIZE); + spage = spage + soff / PAGE_SIZE; soff = offset_in_page(soff); n = (slen - 1) / PAGE_SIZE; n += (offset_in_page(slen - 1) + soff) / PAGE_SIZE; - if (PageHighMem(nth_page(spage, n)) && + if (PageHighMem(spage + n) && size_add(soff, slen) > PAGE_SIZE) break; src = kmap_local_page(spage) + soff; @@ -270,7 +270,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) if (dlen <= PAGE_SIZE) break; dlen -= PAGE_SIZE; - dpage = nth_page(dpage, 1); + dpage++; } } diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h index 15ab743f68c8..83d14376ff2b 100644 --- a/include/crypto/scatterwalk.h +++ b/include/crypto/scatterwalk.h @@ -159,7 +159,7 @@ static inline void scatterwalk_map(struct scatter_walk *walk) if (IS_ENABLED(CONFIG_HIGHMEM)) { struct page *page; - page = nth_page(base_page, offset >> PAGE_SHIFT); + page = base_page + (offset >> PAGE_SHIFT); offset = offset_in_page(offset); addr = kmap_local_page(page) + offset; } else { @@ -259,7 +259,7 @@ static inline void scatterwalk_done_dst(struct scatter_walk *walk, end += (offset_in_page(offset) + offset_in_page(nbytes) + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = start; i < end; i++) - flush_dcache_page(nth_page(base_page, i)); + flush_dcache_page(base_page + i); } scatterwalk_advance(walk, nbytes); } -- cgit v1.2.3 From d5170ce4d71b3843613ee1840bca50ad71c3671e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:57 +0200 Subject: block: update comment of "struct bio_vec" regarding nth_page() Ever since commit 858c708d9efb ("block: move the bi_size update out of __bio_try_merge_page"), page_is_mergeable() no longer exists, and the logic in bvec_try_merge_page() is now a simple page pointer comparison. Link: https://lkml.kernel.org/r/20250901150359.867252-37-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/bvec.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 0a80e1f9aa20..3fc0efa0825b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -22,11 +22,8 @@ struct page; * @bv_len: Number of bytes in the address range. * @bv_offset: Start of the address range relative to the start of @bv_page. * - * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len: - * - * nth_page(@bv_page, n) == @bv_page + n - * - * This holds because page_is_mergeable() checks the above property. + * All pages within a bio_vec starting from @bv_page are contiguous and + * can simply be iterated (see bvec_advance()). */ struct bio_vec { struct page *bv_page; -- cgit v1.2.3 From 84efbefa26df36a845a9210ee962aa6866f99bb7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:58 +0200 Subject: mm: remove nth_page() Now that all users are gone, let's remove it. Link: https://lkml.kernel.org/r/20250901150359.867252-38-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- tools/testing/scatterlist/linux/mm.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ca1eb2db63e..b26ca8b2162d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -210,9 +210,7 @@ extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) bool page_range_contiguous(const struct page *page, unsigned long nr_pages); -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #else -#define nth_page(page,n) ((page) + (n)) static inline bool page_range_contiguous(const struct page *page, unsigned long nr_pages) { diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h index 5bd9e6e80625..121ae78d6e88 100644 --- a/tools/testing/scatterlist/linux/mm.h +++ b/tools/testing/scatterlist/linux/mm.h @@ -51,7 +51,6 @@ static inline unsigned long page_to_phys(struct page *page) #define page_to_pfn(page) ((unsigned long)(page) / PAGE_SIZE) #define pfn_to_page(pfn) (void *)((pfn) * PAGE_SIZE) -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define __min(t1, t2, min1, min2, x, y) ({ \ t1 min1 = (x); \ -- cgit v1.2.3 From 4a25f995bd59843a898b531bb3e472d710ef9439 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 10 Sep 2025 21:39:56 +0800 Subject: mm: hugetlb: directly pass order when allocate a hugetlb folio Use order instead of struct hstate to remove huge_page_order() call from all hugetlb folio allocation, also order_is_gigantic() is added to check whether it is a gigantic order. Link: https://lkml.kernel.org/r/20250910133958.301467-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Oscar Salvador Reviewed-by: Sidhartha Kumar Reviewed-by: Jane Chu Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ++++++- mm/hugetlb.c | 29 ++++++++++++++--------------- mm/hugetlb_cma.c | 3 +-- mm/hugetlb_cma.h | 6 +++--- 4 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 526d27e88b3b..8e63e46b8e1f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -788,9 +788,14 @@ static inline unsigned huge_page_shift(struct hstate *h) return h->order + PAGE_SHIFT; } +static inline bool order_is_gigantic(unsigned int order) +{ + return order > MAX_PAGE_ORDER; +} + static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) > MAX_PAGE_ORDER; + return order_is_gigantic(huge_page_order(h)); } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ef6284ec85b6..7f33e4a158c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1473,17 +1473,16 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #ifdef CONFIG_CONTIG_ALLOC -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct folio *folio; - int order = huge_page_order(h); bool retried = false; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: - folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask); + folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask); if (!folio) { if (hugetlb_cma_exclusive_alloc()) return NULL; @@ -1506,16 +1505,16 @@ retry: } #else /* !CONFIG_CONTIG_ALLOC */ -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, + nodemask_t *nodemask) { return NULL; } #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, + nodemask_t *nodemask) { return NULL; } @@ -1926,11 +1925,9 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) return NULL; } -static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) +static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask, + int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { - int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; @@ -1980,11 +1977,13 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, nodemask_t *node_alloc_noretry) { struct folio *folio; + int order = huge_page_order(h); - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + if (order_is_gigantic(order)) + folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask); else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); + folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask, + node_alloc_noretry); if (folio) init_new_hugetlb_folio(h, folio); return folio; @@ -2872,7 +2871,7 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ - if (folio_order(folio) > MAX_PAGE_ORDER) + if (order_is_gigantic(folio_order(folio))) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index f58ef4969e7a..e8e4dc7182d5 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -26,11 +26,10 @@ void hugetlb_cma_free_folio(struct folio *folio) } -struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask, +struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { int node; - int order = huge_page_order(h); struct folio *folio = NULL; if (hugetlb_cma[nid]) diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h index f7d7fb9880a2..2c2ec8a7e134 100644 --- a/mm/hugetlb_cma.h +++ b/mm/hugetlb_cma.h @@ -4,7 +4,7 @@ #ifdef CONFIG_CMA void hugetlb_cma_free_folio(struct folio *folio); -struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask, +struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask); struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact); @@ -18,8 +18,8 @@ static inline void hugetlb_cma_free_folio(struct folio *folio) { } -static inline struct folio *hugetlb_cma_alloc_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nodemask) +static inline struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { return NULL; } -- cgit v1.2.3 From 8eccb066f28747e966bda716cb90dbca13b78032 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:10 +0200 Subject: mm: constify shmem related test functions for improved const-correctness Patch series "mm: establish const-correctness for pointer parameters", v6. This series is to improved const-correctness in the low-level memory-management subsystem, which provides a basis for further constification further up the call stack (e.g. filesystems). I started this work when I tried to constify the Ceph filesystem code, but found that to be impossible because many "mm" functions accept non-const pointers, even though they modify nothing. This patch (of 12): We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-1-max.kellermann@ionos.com Link: https://lkml.kernel.org/r/20250901205021.3573313-2-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- include/linux/shmem_fs.h | 4 ++-- mm/shmem.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index b26ca8b2162d..45a47b555499 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -995,11 +995,11 @@ static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ -bool vma_is_shmem(struct vm_area_struct *vma); -bool vma_is_anon_shmem(struct vm_area_struct *vma); +bool vma_is_shmem(const struct vm_area_struct *vma); +bool vma_is_anon_shmem(const struct vm_area_struct *vma); #else -static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } -static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } +static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 6d0f9c599ff7..0e47465ef0fd 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -99,9 +99,9 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM -bool shmem_mapping(struct address_space *mapping); +bool shmem_mapping(const struct address_space *mapping); #else -static inline bool shmem_mapping(struct address_space *mapping) +static inline bool shmem_mapping(const struct address_space *mapping) { return false; } diff --git a/mm/shmem.c b/mm/shmem.c index 640fecc42f60..2df26f4d6e60 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -275,18 +275,18 @@ static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; -bool shmem_mapping(struct address_space *mapping) +bool shmem_mapping(const struct address_space *mapping) { return mapping->a_ops == &shmem_aops; } EXPORT_SYMBOL_GPL(shmem_mapping); -bool vma_is_anon_shmem(struct vm_area_struct *vma) +bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } -bool vma_is_shmem(struct vm_area_struct *vma) +bool vma_is_shmem(const struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } -- cgit v1.2.3 From 7c3e97ac0d75306d9d03de575c9878f8fd9efe3b Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:11 +0200 Subject: mm: constify pagemap related test/getter functions For improved const-correctness. We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-3-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 55 +++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f0dfdfb13cd9..0d66a252b06f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -140,7 +140,7 @@ static inline int inode_drain_writes(struct inode *inode) return filemap_write_and_wait(inode->i_mapping); } -static inline bool mapping_empty(struct address_space *mapping) +static inline bool mapping_empty(const struct address_space *mapping) { return xa_empty(&mapping->i_pages); } @@ -166,7 +166,7 @@ static inline bool mapping_empty(struct address_space *mapping) * refcount and the referenced bit, which will be elevated or set in * the process of adding new cache pages to an inode. */ -static inline bool mapping_shrinkable(struct address_space *mapping) +static inline bool mapping_shrinkable(const struct address_space *mapping) { void *head; @@ -267,7 +267,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping) clear_bit(AS_UNEVICTABLE, &mapping->flags); } -static inline bool mapping_unevictable(struct address_space *mapping) +static inline bool mapping_unevictable(const struct address_space *mapping) { return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags); } @@ -277,7 +277,7 @@ static inline void mapping_set_exiting(struct address_space *mapping) set_bit(AS_EXITING, &mapping->flags); } -static inline int mapping_exiting(struct address_space *mapping) +static inline int mapping_exiting(const struct address_space *mapping) { return test_bit(AS_EXITING, &mapping->flags); } @@ -287,7 +287,7 @@ static inline void mapping_set_no_writeback_tags(struct address_space *mapping) set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } -static inline int mapping_use_writeback_tags(struct address_space *mapping) +static inline int mapping_use_writeback_tags(const struct address_space *mapping) { return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } @@ -333,7 +333,7 @@ static inline void mapping_set_inaccessible(struct address_space *mapping) set_bit(AS_INACCESSIBLE, &mapping->flags); } -static inline bool mapping_inaccessible(struct address_space *mapping) +static inline bool mapping_inaccessible(const struct address_space *mapping) { return test_bit(AS_INACCESSIBLE, &mapping->flags); } @@ -343,18 +343,18 @@ static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_ set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline bool mapping_writeback_may_deadlock_on_reclaim(struct address_space *mapping) +static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct address_space *mapping) { return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline gfp_t mapping_gfp_mask(struct address_space * mapping) +static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; } /* Restricts the given gfp_mask to what the mapping allows. */ -static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, +static inline gfp_t mapping_gfp_constraint(const struct address_space *mapping, gfp_t gfp_mask) { return mapping_gfp_mask(mapping) & gfp_mask; @@ -477,7 +477,7 @@ mapping_min_folio_order(const struct address_space *mapping) } static inline unsigned long -mapping_min_folio_nrpages(struct address_space *mapping) +mapping_min_folio_nrpages(const struct address_space *mapping) { return 1UL << mapping_min_folio_order(mapping); } @@ -491,7 +491,7 @@ mapping_min_folio_nrpages(struct address_space *mapping) * new folio to the page cache and need to know what index to give it, * call this function. */ -static inline pgoff_t mapping_align_index(struct address_space *mapping, +static inline pgoff_t mapping_align_index(const struct address_space *mapping, pgoff_t index) { return round_down(index, mapping_min_folio_nrpages(mapping)); @@ -501,7 +501,7 @@ static inline pgoff_t mapping_align_index(struct address_space *mapping, * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. */ -static inline bool mapping_large_folio_support(struct address_space *mapping) +static inline bool mapping_large_folio_support(const struct address_space *mapping) { /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON, @@ -516,7 +516,7 @@ static inline size_t mapping_max_folio_size(const struct address_space *mapping) return PAGE_SIZE << mapping_max_folio_order(mapping); } -static inline int filemap_nr_thps(struct address_space *mapping) +static inline int filemap_nr_thps(const struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS return atomic_read(&mapping->nr_thps); @@ -930,7 +930,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, * * Return: The index of the folio which follows this folio in the file. */ -static inline pgoff_t folio_next_index(struct folio *folio) +static inline pgoff_t folio_next_index(const struct folio *folio) { return folio->index + folio_nr_pages(folio); } @@ -959,7 +959,7 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) * e.g., shmem did not move this folio to the swap cache. * Return: true or false. */ -static inline bool folio_contains(struct folio *folio, pgoff_t index) +static inline bool folio_contains(const struct folio *folio, pgoff_t index) { VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); return index - folio->index < folio_nr_pages(folio); @@ -1036,13 +1036,13 @@ static inline loff_t page_offset(struct page *page) /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ -static inline pgoff_t folio_pgoff(struct folio *folio) +static inline pgoff_t folio_pgoff(const struct folio *folio) { return folio->index; } -static inline pgoff_t linear_page_index(struct vm_area_struct *vma, - unsigned long address) +static inline pgoff_t linear_page_index(const struct vm_area_struct *vma, + const unsigned long address) { pgoff_t pgoff; pgoff = (address - vma->vm_start) >> PAGE_SHIFT; @@ -1462,7 +1462,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. */ -static inline loff_t readahead_pos(struct readahead_control *rac) +static inline loff_t readahead_pos(const struct readahead_control *rac) { return (loff_t)rac->_index * PAGE_SIZE; } @@ -1471,7 +1471,7 @@ static inline loff_t readahead_pos(struct readahead_control *rac) * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ -static inline size_t readahead_length(struct readahead_control *rac) +static inline size_t readahead_length(const struct readahead_control *rac) { return rac->_nr_pages * PAGE_SIZE; } @@ -1480,7 +1480,7 @@ static inline size_t readahead_length(struct readahead_control *rac) * readahead_index - The index of the first page in this readahead request. * @rac: The readahead request. */ -static inline pgoff_t readahead_index(struct readahead_control *rac) +static inline pgoff_t readahead_index(const struct readahead_control *rac) { return rac->_index; } @@ -1489,7 +1489,7 @@ static inline pgoff_t readahead_index(struct readahead_control *rac) * readahead_count - The number of pages in this readahead request. * @rac: The readahead request. */ -static inline unsigned int readahead_count(struct readahead_control *rac) +static inline unsigned int readahead_count(const struct readahead_control *rac) { return rac->_nr_pages; } @@ -1498,12 +1498,12 @@ static inline unsigned int readahead_count(struct readahead_control *rac) * readahead_batch_length - The number of bytes in the current batch. * @rac: The readahead request. */ -static inline size_t readahead_batch_length(struct readahead_control *rac) +static inline size_t readahead_batch_length(const struct readahead_control *rac) { return rac->_batch_count * PAGE_SIZE; } -static inline unsigned long dir_pages(struct inode *inode) +static inline unsigned long dir_pages(const struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -1517,8 +1517,8 @@ static inline unsigned long dir_pages(struct inode *inode) * Return: the number of bytes in the folio up to EOF, * or -EFAULT if the folio was truncated. */ -static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, - struct inode *inode) +static inline ssize_t folio_mkwrite_check_truncate(const struct folio *folio, + const struct inode *inode) { loff_t size = i_size_read(inode); pgoff_t index = size >> PAGE_SHIFT; @@ -1549,7 +1549,8 @@ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, * Return: The number of filesystem blocks covered by this folio. */ static inline -unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio) +unsigned int i_blocks_per_folio(const struct inode *inode, + const struct folio *folio) { return folio_size(folio) >> inode->i_blkbits; } -- cgit v1.2.3 From 959b0886256b6896b44633e0e07c5464169087c1 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:12 +0200 Subject: mm: constify zone related test/getter functions For improved const-correctness. We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-4-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f3272ef5131b..6c4eae96160d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1104,7 +1104,7 @@ static inline unsigned long promo_wmark_pages(const struct zone *z) return wmark_pages(z, WMARK_PROMO); } -static inline unsigned long zone_managed_pages(struct zone *zone) +static inline unsigned long zone_managed_pages(const struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } @@ -1128,12 +1128,12 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } -static inline bool zone_is_initialized(struct zone *zone) +static inline bool zone_is_initialized(const struct zone *zone) { return zone->initialized; } -static inline bool zone_is_empty(struct zone *zone) +static inline bool zone_is_empty(const struct zone *zone) { return zone->spanned_pages == 0; } @@ -1273,7 +1273,7 @@ static inline bool folio_is_zone_movable(const struct folio *folio) * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ -static inline bool zone_intersects(struct zone *zone, +static inline bool zone_intersects(const struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) @@ -1581,12 +1581,12 @@ static inline int local_memory_node(int node_id) { return node_id; }; #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE -static inline bool zone_is_zone_device(struct zone *zone) +static inline bool zone_is_zone_device(const struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else -static inline bool zone_is_zone_device(struct zone *zone) +static inline bool zone_is_zone_device(const struct zone *zone) { return false; } @@ -1598,19 +1598,19 @@ static inline bool zone_is_zone_device(struct zone *zone) * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). */ -static inline bool managed_zone(struct zone *zone) +static inline bool managed_zone(const struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ -static inline bool populated_zone(struct zone *zone) +static inline bool populated_zone(const struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA -static inline int zone_to_nid(struct zone *zone) +static inline int zone_to_nid(const struct zone *zone) { return zone->node; } @@ -1620,7 +1620,7 @@ static inline void zone_set_nid(struct zone *zone, int nid) zone->node = nid; } #else -static inline int zone_to_nid(struct zone *zone) +static inline int zone_to_nid(const struct zone *zone) { return 0; } @@ -1647,7 +1647,7 @@ static inline int is_highmem_idx(enum zone_type idx) * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ -static inline int is_highmem(struct zone *zone) +static inline int is_highmem(const struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } @@ -1713,12 +1713,12 @@ static inline struct zone *zonelist_zone(struct zoneref *zoneref) return zoneref->zone; } -static inline int zonelist_zone_idx(struct zoneref *zoneref) +static inline int zonelist_zone_idx(const struct zoneref *zoneref) { return zoneref->zone_idx; } -static inline int zonelist_node_idx(struct zoneref *zoneref) +static inline int zonelist_node_idx(const struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } @@ -2021,7 +2021,7 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section) return (struct page *)map; } -static inline int present_section(struct mem_section *section) +static inline int present_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } @@ -2031,12 +2031,12 @@ static inline int present_section_nr(unsigned long nr) return present_section(__nr_to_section(nr)); } -static inline int valid_section(struct mem_section *section) +static inline int valid_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } -static inline int early_section(struct mem_section *section) +static inline int early_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } @@ -2046,27 +2046,27 @@ static inline int valid_section_nr(unsigned long nr) return valid_section(__nr_to_section(nr)); } -static inline int online_section(struct mem_section *section) +static inline int online_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE -static inline int online_device_section(struct mem_section *section) +static inline int online_device_section(const struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else -static inline int online_device_section(struct mem_section *section) +static inline int online_device_section(const struct mem_section *section) { return 0; } #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT -static inline int preinited_vmemmap_section(struct mem_section *section) +static inline int preinited_vmemmap_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); @@ -2076,7 +2076,7 @@ void sparse_vmemmap_init_nid_early(int nid); void sparse_vmemmap_init_nid_late(int nid); #else -static inline int preinited_vmemmap_section(struct mem_section *section) +static inline int preinited_vmemmap_section(const struct mem_section *section) { return 0; } -- cgit v1.2.3 From b119fb0927738f150cbd179d23d08057dccd75c1 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:13 +0200 Subject: fs: constify mapping related test functions for improved const-correctness We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-5-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..0783c5d05d3f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -537,7 +537,7 @@ struct address_space { /* * Returns true if any of the pages in the mapping are marked with the tag. */ -static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) +static inline bool mapping_tagged(const struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } @@ -585,7 +585,7 @@ static inline void i_mmap_assert_write_locked(struct address_space *mapping) /* * Might pages of this file be mapped into userspace? */ -static inline int mapping_mapped(struct address_space *mapping) +static inline int mapping_mapped(const struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } @@ -599,7 +599,7 @@ static inline int mapping_mapped(struct address_space *mapping) * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ -static inline int mapping_writably_mapped(struct address_space *mapping) +static inline int mapping_writably_mapped(const struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } -- cgit v1.2.3 From 4680092f8ccb4406e771a6b1a2c0243ebd40bab7 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:14 +0200 Subject: mm: constify process_shares_mm() for improved const-correctness This function only reads from the pointer arguments. Local (loop) variables are also annotated with `const` to clarify that these will not be written to. Link: https://lkml.kernel.org/r/20250901205021.3573313-6-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/oom_kill.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 45a47b555499..b3b63058e1a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3872,7 +3872,7 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ -extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); +bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm); void drop_slab(void); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 17650f0b516e..58bd4cf71d52 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -490,12 +490,12 @@ static bool oom_killer_disabled __read_mostly; * task's threads: if one of those is using this mm then this task was also * using it. */ -bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm) { - struct task_struct *t; + const struct task_struct *t; for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); + const struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm) return t_mm == mm; } -- cgit v1.2.3 From 0bf25cfc9e795ab302ee23550fdeebd2aeedf800 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:15 +0200 Subject: mm, s390: constify mapping related test/getter functions For improved const-correctness. We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. (Even though seemingly unrelated, this also constifies the pointer parameter of mmap_is_legacy() in arch/s390/mm/mmap.c because a copy of the function exists in mm/util.c.) Link: https://lkml.kernel.org/r/20250901205021.3573313-7-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/s390/mm/mmap.c | 2 +- include/linux/mm.h | 6 +++--- include/linux/pagemap.h | 2 +- mm/util.c | 10 +++++----- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 547104ccc22a..e188cb6d4946 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -27,7 +27,7 @@ static unsigned long stack_maxrandom_size(void) return STACK_RND_MASK << PAGE_SHIFT; } -static inline int mmap_is_legacy(struct rlimit *rlim_stack) +static inline int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; diff --git a/include/linux/mm.h b/include/linux/mm.h index b3b63058e1a3..221e98bb7689 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1002,7 +1002,7 @@ static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; } #endif -int vma_is_stack_for_current(struct vm_area_struct *vma); +int vma_is_stack_for_current(const struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } @@ -2617,7 +2617,7 @@ void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, - struct task_struct *task, bool bypass_rlim); + const struct task_struct *task, bool bypass_rlim); struct kvec; struct page *get_dump_page(unsigned long addr, int *locked); @@ -3380,7 +3380,7 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) /* mmap.c */ -extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); +extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0d66a252b06f..aec4a11565bc 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -545,7 +545,7 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -struct address_space *folio_mapping(struct folio *); +struct address_space *folio_mapping(const struct folio *folio); /** * folio_flush_mapping - Find the file mapping this folio belongs to. diff --git a/mm/util.c b/mm/util.c index e29d3310e26b..391f6e7daf83 100644 --- a/mm/util.c +++ b/mm/util.c @@ -315,7 +315,7 @@ void *memdup_user_nul(const void __user *src, size_t len) EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ -int vma_is_stack_for_current(struct vm_area_struct *vma) +int vma_is_stack_for_current(const struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; @@ -410,7 +410,7 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -static int mmap_is_legacy(struct rlimit *rlim_stack) +static int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; @@ -504,7 +504,7 @@ EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout); * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, - struct task_struct *task, bool bypass_rlim) + const struct task_struct *task, bool bypass_rlim) { unsigned long locked_vm, limit; int ret = 0; @@ -688,7 +688,7 @@ struct anon_vma *folio_anon_vma(const struct folio *folio) * You can call this for folios which aren't in the swap cache or page * cache and it will return NULL. */ -struct address_space *folio_mapping(struct folio *folio) +struct address_space *folio_mapping(const struct folio *folio) { struct address_space *mapping; @@ -926,7 +926,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; unsigned long bytes_failed; -- cgit v1.2.3 From a955cca37288fe37cc1cde8d291e02717c8a7409 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:17 +0200 Subject: mm: constify arch_pick_mmap_layout() for improved const-correctness This function only reads from the rlimit pointer (but writes to the mm_struct pointer which is kept without `const`). All callees are already const-ified or (internal functions) are being constified by this patch. Link: https://lkml.kernel.org/r/20250901205021.3573313-9-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/s390/mm/mmap.c | 4 ++-- arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/x86/mm/mmap.c | 6 +++--- include/linux/sched/mm.h | 4 ++-- mm/util.c | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index e188cb6d4946..197c1d9497a7 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -47,7 +47,7 @@ static unsigned long mmap_base_legacy(unsigned long rnd) } static inline unsigned long mmap_base(unsigned long rnd, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; @@ -169,7 +169,7 @@ check_asce_limit: * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 785e9909340f..55faf2effa46 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -294,7 +294,7 @@ static unsigned long mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = mmap_rnd(); unsigned long gap; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 708f85dc9380..82f3a987f7cf 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -80,7 +80,7 @@ unsigned long arch_mmap_rnd(void) } static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; @@ -110,7 +110,7 @@ static unsigned long mmap_legacy_base(unsigned long rnd, */ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, unsigned long random_factor, unsigned long task_size, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { *legacy_base = mmap_legacy_base(random_factor, task_size); if (mmap_is_legacy()) @@ -119,7 +119,7 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, *base = mmap_base(random_factor, task_size, rlim_stack); } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { if (mmap_is_legacy()) mm_flags_clear(MMF_TOPDOWN, mm); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2201da0afecc..0232d983b715 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -178,7 +178,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif extern void arch_pick_mmap_layout(struct mm_struct *mm, - struct rlimit *rlim_stack); + const struct rlimit *rlim_stack); unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, @@ -211,7 +211,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, - struct rlimit *rlim_stack) {} + const struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) diff --git a/mm/util.c b/mm/util.c index 391f6e7daf83..732a2dfcaec7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -431,7 +431,7 @@ static int mmap_is_legacy(const struct rlimit *rlim_stack) #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP / 6 * 5) -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) +static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack) { #ifdef CONFIG_STACK_GROWSUP /* @@ -462,7 +462,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) #endif } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -478,7 +478,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm_flags_clear(MMF_TOPDOWN, mm); -- cgit v1.2.3 From 89bf840b84bb53393436426cd4acd80604bd26fd Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:18 +0200 Subject: mm: constify ptdesc_pmd_pts_count() and folio_get_private() These functions from mm_types.h are trivial getters that should never write to the given pointers. Link: https://lkml.kernel.org/r/20250901205021.3573313-10-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d934a3a5b443..275e8060d918 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -632,7 +632,7 @@ static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) atomic_dec(&ptdesc->pt_share_count); } -static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +static inline int ptdesc_pmd_pts_count(const struct ptdesc *ptdesc) { return atomic_read(&ptdesc->pt_share_count); } @@ -660,7 +660,7 @@ static inline void set_page_private(struct page *page, unsigned long private) page->private = private; } -static inline void *folio_get_private(struct folio *folio) +static inline void *folio_get_private(const struct folio *folio) { return folio->private; } -- cgit v1.2.3 From f346a9473a2fbbab785d1733d475160f1fc54e5a Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:19 +0200 Subject: mm: constify various inline functions for improved const-correctness We select certain test functions plus folio_migrate_refs() from mm_inline.h which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. One exception is the function folio_migrate_refs() which does write to the "new" folio pointer; there, only the "old" folio pointer is being constified; only its "flags" field is read, but nothing written. Link: https://lkml.kernel.org/r/20250901205021.3573313-11-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 150302b4a905..d6c1011b38f2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -25,7 +25,7 @@ * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. */ -static inline int folio_is_file_lru(struct folio *folio) +static inline int folio_is_file_lru(const struct folio *folio) { return !folio_test_swapbacked(folio); } @@ -84,7 +84,7 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio) * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ -static __always_inline enum lru_list folio_lru_list(struct folio *folio) +static __always_inline enum lru_list folio_lru_list(const struct folio *folio) { enum lru_list lru; @@ -141,7 +141,7 @@ static inline int lru_tier_from_refs(int refs, bool workingset) return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } -static inline int folio_lru_refs(struct folio *folio) +static inline int folio_lru_refs(const struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags.f); @@ -154,14 +154,14 @@ static inline int folio_lru_refs(struct folio *folio) return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } -static inline int folio_lru_gen(struct folio *folio) +static inline int folio_lru_gen(const struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags.f); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } -static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) +static inline bool lru_gen_is_active(const struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; @@ -217,12 +217,13 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } -static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, +static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec, + const struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); - struct lru_gen_folio *lrugen = &lruvec->lrugen; + const struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ @@ -302,7 +303,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return true; } -static inline void folio_migrate_refs(struct folio *new, struct folio *old) +static inline void folio_migrate_refs(struct folio *new, const struct folio *old) { unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK; @@ -330,7 +331,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return false; } -static inline void folio_migrate_refs(struct folio *new, struct folio *old) +static inline void folio_migrate_refs(struct folio *new, const struct folio *old) { } @@ -508,7 +509,7 @@ static inline void dec_tlb_flush_pending(struct mm_struct *mm) atomic_dec(&mm->tlb_flush_pending); } -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +static inline bool mm_tlb_flush_pending(const struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that @@ -521,7 +522,7 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending); } -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +static inline bool mm_tlb_flush_nested(const struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL @@ -605,7 +606,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, return false; } -static inline bool vma_has_recency(struct vm_area_struct *vma) +static inline bool vma_has_recency(const struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; -- cgit v1.2.3 From da0045587d59d4ffd7710fa45cea51e5a48453a4 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:20 +0200 Subject: mm: constify assert/test functions in mm.h For improved const-correctness. We select certain assert and test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-12-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 221e98bb7689..a6bfa46937a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -719,7 +719,7 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } -static inline void assert_fault_locked(struct vm_fault *vmf) +static inline void assert_fault_locked(const struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); @@ -732,7 +732,7 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } -static inline void assert_fault_locked(struct vm_fault *vmf) +static inline void assert_fault_locked(const struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } @@ -875,7 +875,7 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) vma->vm_end >= vma->vm_mm->start_stack; } -static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) +static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -889,7 +889,7 @@ static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) return false; } -static inline bool vma_is_foreign(struct vm_area_struct *vma) +static inline bool vma_is_foreign(const struct vm_area_struct *vma) { if (!current->mm) return true; @@ -900,7 +900,7 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma) return false; } -static inline bool vma_is_accessible(struct vm_area_struct *vma) +static inline bool vma_is_accessible(const struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } @@ -911,7 +911,7 @@ static inline bool is_shared_maywrite(vm_flags_t vm_flags) (VM_SHARED | VM_MAYWRITE); } -static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) { return is_shared_maywrite(vma->vm_flags); } @@ -1855,7 +1855,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) } #ifdef CONFIG_MMU -static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) +static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot) { return pfn_pte(page_to_pfn(page), pgprot); } @@ -1870,7 +1870,7 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) +static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot) { return pfn_pte(folio_pfn(folio), pgprot); } @@ -1886,7 +1886,7 @@ static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) +static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot) { return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); } @@ -1902,7 +1902,7 @@ static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pud_t folio_mk_pud(struct folio *folio, pgprot_t pgprot) +static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot) { return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot)); } @@ -3520,7 +3520,7 @@ struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) return mtree_load(&mm->mm_mt, addr); } -static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) +static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma) { if (vma->vm_flags & VM_GROWSDOWN) return stack_guard_gap; @@ -3532,7 +3532,7 @@ static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) return 0; } -static inline unsigned long vm_start_gap(struct vm_area_struct *vma) +static inline unsigned long vm_start_gap(const struct vm_area_struct *vma) { unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; @@ -3543,7 +3543,7 @@ static inline unsigned long vm_start_gap(struct vm_area_struct *vma) return vm_start; } -static inline unsigned long vm_end_gap(struct vm_area_struct *vma) +static inline unsigned long vm_end_gap(const struct vm_area_struct *vma) { unsigned long vm_end = vma->vm_end; @@ -3555,7 +3555,7 @@ static inline unsigned long vm_end_gap(struct vm_area_struct *vma) return vm_end; } -static inline unsigned long vma_pages(struct vm_area_struct *vma) +static inline unsigned long vma_pages(const struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } @@ -3572,7 +3572,7 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } -static inline bool range_in_vma(struct vm_area_struct *vma, +static inline bool range_in_vma(const struct vm_area_struct *vma, unsigned long start, unsigned long end) { return (vma && vma->vm_start <= start && end <= vma->vm_end); @@ -3688,7 +3688,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. */ -static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, +static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma, unsigned int flags) { /* @@ -3818,7 +3818,7 @@ static inline bool debug_guardpage_enabled(void) return static_branch_unlikely(&_debug_guardpage_enabled); } -static inline bool page_is_guard(struct page *page) +static inline bool page_is_guard(const struct page *page) { if (!debug_guardpage_enabled()) return false; @@ -3849,7 +3849,7 @@ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } -static inline bool page_is_guard(struct page *page) { return false; } +static inline bool page_is_guard(const struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -3931,7 +3931,7 @@ void vmemmap_free(unsigned long start, unsigned long end, #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ if (altmap) @@ -3945,7 +3945,7 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap, altmap->alloc -= nr_pfns; } #else -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) { return 0; } -- cgit v1.2.3 From a847b17009ec271514b269c90320a3893cd9b667 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:21 +0200 Subject: mm: constify highmem related functions for improved const-correctness Lots of functions in mm/highmem.c do not write to the given pointers and do not call functions that take non-const pointers and can therefore be constified. This includes functions like kunmap() which might be implemented in a way that writes to the pointer (e.g. to update reference counters or mapping fields), but currently are not. kmap() on the other hand cannot be made const because it calls set_page_address() which is non-const in some architectures/configurations. [akpm@linux-foundation.org: "fix" folio_page() build failure] Link: https://lkml.kernel.org/r/20250901205021.3573313-13-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/arm/include/asm/highmem.h | 6 +++--- arch/xtensa/include/asm/highmem.h | 2 +- include/linux/highmem-internal.h | 36 ++++++++++++++++++------------------ include/linux/highmem.h | 8 ++++---- include/linux/page-flags.h | 4 ++-- mm/highmem.c | 10 +++++----- 6 files changed, 33 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index b4b66220952d..bdb209e002a4 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -46,9 +46,9 @@ extern pte_t *pkmap_page_table; #endif #ifdef ARCH_NEEDS_KMAP_HIGH_GET -extern void *kmap_high_get(struct page *page); +extern void *kmap_high_get(const struct page *page); -static inline void *arch_kmap_local_high_get(struct page *page) +static inline void *arch_kmap_local_high_get(const struct page *page) { if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !cache_is_vivt()) return NULL; @@ -57,7 +57,7 @@ static inline void *arch_kmap_local_high_get(struct page *page) #define arch_kmap_local_high_get arch_kmap_local_high_get #else /* ARCH_NEEDS_KMAP_HIGH_GET */ -static inline void *kmap_high_get(struct page *page) +static inline void *kmap_high_get(const struct page *page) { return NULL; } diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 34b8b620e7f1..b55235f4adac 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -29,7 +29,7 @@ #if DCACHE_WAY_SIZE > PAGE_SIZE #define get_pkmap_color get_pkmap_color -static inline int get_pkmap_color(struct page *page) +static inline int get_pkmap_color(const struct page *page) { return DCACHE_ALIAS(page_to_phys(page)); } diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 36053c3d6d64..0574c21ca45d 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -7,7 +7,7 @@ */ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); -void *__kmap_local_page_prot(struct page *page, pgprot_t prot); +void *__kmap_local_page_prot(const struct page *page, pgprot_t prot); void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); @@ -33,7 +33,7 @@ static inline void kmap_flush_tlb(unsigned long addr) { } #endif void *kmap_high(struct page *page); -void kunmap_high(struct page *page); +void kunmap_high(const struct page *page); void __kmap_flush_unused(void); struct page *__kmap_to_page(void *addr); @@ -50,7 +50,7 @@ static inline void *kmap(struct page *page) return addr; } -static inline void kunmap(struct page *page) +static inline void kunmap(const struct page *page) { might_sleep(); if (!PageHighMem(page)) @@ -68,12 +68,12 @@ static inline void kmap_flush_unused(void) __kmap_flush_unused(); } -static inline void *kmap_local_page(struct page *page) +static inline void *kmap_local_page(const struct page *page) { return __kmap_local_page_prot(page, kmap_prot); } -static inline void *kmap_local_page_try_from_panic(struct page *page) +static inline void *kmap_local_page_try_from_panic(const struct page *page) { if (!PageHighMem(page)) return page_address(page); @@ -81,13 +81,13 @@ static inline void *kmap_local_page_try_from_panic(struct page *page) return NULL; } -static inline void *kmap_local_folio(struct folio *folio, size_t offset) +static inline void *kmap_local_folio(const struct folio *folio, size_t offset) { - struct page *page = folio_page(folio, offset / PAGE_SIZE); + const struct page *page = folio_page(folio, offset / PAGE_SIZE); return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; } -static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); } @@ -102,7 +102,7 @@ static inline void __kunmap_local(const void *vaddr) kunmap_local_indexed(vaddr); } -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); @@ -113,7 +113,7 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return __kmap_local_page_prot(page, prot); } -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic(const struct page *page) { return kmap_atomic_prot(page, kmap_prot); } @@ -173,32 +173,32 @@ static inline void *kmap(struct page *page) return page_address(page); } -static inline void kunmap_high(struct page *page) { } +static inline void kunmap_high(const struct page *page) { } static inline void kmap_flush_unused(void) { } -static inline void kunmap(struct page *page) +static inline void kunmap(const struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } -static inline void *kmap_local_page(struct page *page) +static inline void *kmap_local_page(const struct page *page) { return page_address(page); } -static inline void *kmap_local_page_try_from_panic(struct page *page) +static inline void *kmap_local_page_try_from_panic(const struct page *page) { return page_address(page); } -static inline void *kmap_local_folio(struct folio *folio, size_t offset) +static inline void *kmap_local_folio(const struct folio *folio, size_t offset) { return folio_address(folio) + offset; } -static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot) { return kmap_local_page(page); } @@ -215,7 +215,7 @@ static inline void __kunmap_local(const void *addr) #endif } -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic(const struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); @@ -225,7 +225,7 @@ static inline void *kmap_atomic(struct page *page) return page_address(page); } -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot) { return kmap_atomic(page); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6234f316468c..105cc4c00cc3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -43,7 +43,7 @@ static inline void *kmap(struct page *page); * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ -static inline void kunmap(struct page *page); +static inline void kunmap(const struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address @@ -93,7 +93,7 @@ static inline void kmap_flush_unused(void); * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ -static inline void *kmap_local_page(struct page *page); +static inline void *kmap_local_page(const struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage @@ -129,7 +129,7 @@ static inline void *kmap_local_page(struct page *page); * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ -static inline void *kmap_local_folio(struct folio *folio, size_t offset); +static inline void *kmap_local_folio(const struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! @@ -176,7 +176,7 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ -static inline void *kmap_atomic(struct page *page); +static inline void *kmap_atomic(const struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a88b61eec3f8..568011930e35 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -316,9 +316,9 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ -static inline struct page *folio_page(struct folio *folio, unsigned long n) +static inline struct page *folio_page(const struct folio *folio, unsigned long n) { - return &folio->page + n; + return (struct page *)(&folio->page + n); } static __always_inline int PageTail(const struct page *page) diff --git a/mm/highmem.c b/mm/highmem.c index ef3189b36cad..b5c8e4c2d5d4 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -61,7 +61,7 @@ static inline int kmap_local_calc_idx(int idx) /* * Determine color of virtual address where the page should be mapped. */ -static inline unsigned int get_pkmap_color(struct page *page) +static inline unsigned int get_pkmap_color(const struct page *page) { return 0; } @@ -334,7 +334,7 @@ EXPORT_SYMBOL(kmap_high); * * This can be called from any context. */ -void *kmap_high_get(struct page *page) +void *kmap_high_get(const struct page *page) { unsigned long vaddr, flags; @@ -356,7 +356,7 @@ void *kmap_high_get(struct page *page) * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called * only from user context. */ -void kunmap_high(struct page *page) +void kunmap_high(const struct page *page) { unsigned long vaddr; unsigned long nr; @@ -508,7 +508,7 @@ static inline void kmap_local_idx_pop(void) #endif #ifndef arch_kmap_local_high_get -static inline void *arch_kmap_local_high_get(struct page *page) +static inline void *arch_kmap_local_high_get(const struct page *page) { return NULL; } @@ -572,7 +572,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) } EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); -void *__kmap_local_page_prot(struct page *page, pgprot_t prot) +void *__kmap_local_page_prot(const struct page *page, pgprot_t prot) { void *kmap; -- cgit v1.2.3 From 9fd53c8122271d9fe8b687f50a9bdf5588d41d0b Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 11 Jul 2025 13:55:09 +0800 Subject: mm/filemap: align last_index to folio size On XFS systems with pagesize=4K, blocksize=16K, and CONFIG_TRANSPARENT_HUGEPAGE enabled, We observed the following readahead behaviors: # echo 3 > /proc/sys/vm/drop_caches # dd if=test of=/dev/null bs=64k count=1 # ./tools/mm/page-types -r -L -f /mnt/xfs/test foffset offset flags 0 136d4c __RU_l_________H______t_________________F_1 1 136d4d __RU_l__________T_____t_________________F_1 2 136d4e __RU_l__________T_____t_________________F_1 3 136d4f __RU_l__________T_____t_________________F_1 ... c 136bb8 __RU_l_________H______t_________________F_1 d 136bb9 __RU_l__________T_____t_________________F_1 e 136bba __RU_l__________T_____t_________________F_1 f 136bbb __RU_l__________T_____t_________________F_1 <-- first read 10 13c2cc ___U_l_________H______t______________I__F_1 <-- readahead flag 11 13c2cd ___U_l__________T_____t______________I__F_1 12 13c2ce ___U_l__________T_____t______________I__F_1 13 13c2cf ___U_l__________T_____t______________I__F_1 ... 1c 1405d4 ___U_l_________H______t_________________F_1 1d 1405d5 ___U_l__________T_____t_________________F_1 1e 1405d6 ___U_l__________T_____t_________________F_1 1f 1405d7 ___U_l__________T_____t_________________F_1 [ra_size = 32, req_count = 16, async_size = 16] # echo 3 > /proc/sys/vm/drop_caches # dd if=test of=/dev/null bs=60k count=1 # ./page-types -r -L -f /mnt/xfs/test foffset offset flags 0 136048 __RU_l_________H______t_________________F_1 ... c 110a40 __RU_l_________H______t_________________F_1 d 110a41 __RU_l__________T_____t_________________F_1 e 110a42 __RU_l__________T_____t_________________F_1 <-- first read f 110a43 __RU_l__________T_____t_________________F_1 <-- first readahead flag 10 13e7a8 ___U_l_________H______t_________________F_1 ... 20 137a00 ___U_l_________H______t_______P______I__F_1 <-- second readahead flag (20 - 2f) 21 137a01 ___U_l__________T_____t_______P______I__F_1 ... 3f 10d4af ___U_l__________T_____t_______P_________F_1 [first readahead: ra_size = 32, req_count = 15, async_size = 17] When reading 64k data (same for 61-63k range, where last_index is page-aligned in filemap_get_pages()), 128k readahead is triggered via page_cache_sync_ra() and the PG_readahead flag is set on the next folio (the one containing 0x10 page). When reading 60k data, 128k readahead is also triggered via page_cache_sync_ra(). However, in this case the readahead flag is set on the 0xf page. Although the requested read size (req_count) is 60k, the actual read will be aligned to folio size (64k), which triggers the readahead flag and initiates asynchronous readahead via page_cache_async_ra(). This results in two readahead operations totaling 256k. The root cause is that when the requested size is smaller than the actual read size (due to folio alignment), it triggers asynchronous readahead. By changing last_index alignment from page size to folio size, we ensure the requested size matches the actual read size, preventing the case where a single read operation triggers two readahead operations. After applying the patch: # echo 3 > /proc/sys/vm/drop_caches # dd if=test of=/dev/null bs=60k count=1 # ./page-types -r -L -f /mnt/xfs/test foffset offset flags 0 136d4c __RU_l_________H______t_________________F_1 1 136d4d __RU_l__________T_____t_________________F_1 2 136d4e __RU_l__________T_____t_________________F_1 3 136d4f __RU_l__________T_____t_________________F_1 ... c 136bb8 __RU_l_________H______t_________________F_1 d 136bb9 __RU_l__________T_____t_________________F_1 e 136bba __RU_l__________T_____t_________________F_1 <-- first read f 136bbb __RU_l__________T_____t_________________F_1 10 13c2cc ___U_l_________H______t______________I__F_1 <-- readahead flag 11 13c2cd ___U_l__________T_____t______________I__F_1 12 13c2ce ___U_l__________T_____t______________I__F_1 13 13c2cf ___U_l__________T_____t______________I__F_1 ... 1c 1405d4 ___U_l_________H______t_________________F_1 1d 1405d5 ___U_l__________T_____t_________________F_1 1e 1405d6 ___U_l__________T_____t_________________F_1 1f 1405d7 ___U_l__________T_____t_________________F_1 [ra_size = 32, req_count = 16, async_size = 16] The same phenomenon will occur when reading from 49k to 64k. Set the readahead flag to the next folio. Because the minimum order of folio in address_space equals the block size (at least in xfs and bcachefs that already support bs > ps), having request_count aligned to block size will not cause overread. [klarasmodin@gmail.com: fix overflow on 32-bit] Link: https://lkml.kernel.org/r/yru7qf5gvyzccq5ohhpylvxug5lr5tf54omspbjh4sm6pcdb2r@fpjgj2pxw7va [akpm@linux-foundation.org: update it for Max's constification efforts] Link: https://lkml.kernel.org/r/20250711055509.91587-1-youling.tang@linux.dev Co-developed-by: Chi Zhiling Signed-off-by: Chi Zhiling Signed-off-by: Youling Tang Signed-off-by: Klara Modin Reviewed-by: Ryan Roberts Reviewed-by: Jan Kara Cc: Matthew Wilcox (Oracle) Cc: Youling Tang Cc: David Hildenbrand Cc: Klara Modin Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 6 ++++++ mm/filemap.c | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index aec4a11565bc..185644e288ea 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -482,6 +482,12 @@ mapping_min_folio_nrpages(const struct address_space *mapping) return 1UL << mapping_min_folio_order(mapping); } +static inline unsigned long +mapping_min_folio_nrbytes(const struct address_space *mapping) +{ + return mapping_min_folio_nrpages(mapping) << PAGE_SHIFT; +} + /** * mapping_align_index() - Align index for this mapping. * @mapping: The address_space. diff --git a/mm/filemap.c b/mm/filemap.c index cd9387b0a5b5..344ab106c21c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2600,8 +2600,9 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, unsigned int flags; int err = 0; - /* "last_index" is the index of the page beyond the end of the read */ - last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); + /* "last_index" is the index of the folio beyond the end of the read */ + last_index = round_up(iocb->ki_pos + count, + mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT; retry: if (fatal_signal_pending(current)) return -EINTR; -- cgit v1.2.3 From 94326d3130b5e78a35265bbf7822148372b39231 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 3 Sep 2025 20:10:39 +0100 Subject: mm: remove mlock_count from struct page All users now use folio->mlock_count so we can remove this element of struct page. Move the useful comments over to struct folio. Link: https://lkml.kernel.org/r/20250903191041.1630338-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 275e8060d918..ff2b4e13215f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -94,14 +94,6 @@ struct page { union { struct list_head lru; - /* Or, for the Unevictable "LRU list" slot */ - struct { - /* Always even, to negate PageTail */ - void *__filler; - /* Count page's or folio's mlocks */ - unsigned int mlock_count; - }; - /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; @@ -391,7 +383,9 @@ struct folio { union { struct list_head lru; /* private: avoid cluttering the output */ + /* For the Unevictable "LRU list" slot */ struct { + /* Avoid compound_head */ void *__filler; /* public: */ unsigned int mlock_count; -- cgit v1.2.3 From da939ef4c494246bc2102ecb628bbcc71d650410 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 2 Sep 2025 08:35:11 +0000 Subject: rust: maple_tree: add MapleTree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Add Rust abstraction for Maple Trees", v3. This will be used in the Tyr driver [1] to allocate from the GPU's VA space that is not owned by userspace, but by the kernel, for kernel GPU mappings. Danilo tells me that in nouveau, the maple tree is used for keeping track of "VM regions" on top of GPUVM, and that he will most likely end up doing the same in the Rust Nova driver as well. These abstractions intentionally do not expose any way to make use of external locking. You are required to use the internal spinlock. For now, we do not support loads that only utilize rcu for protection. This contains some parts taken from Andrew Ballance's RFC [2] from April. However, it has also been reworked significantly compared to that RFC taking the use-cases in Tyr into account. This patch (of 3): The maple tree will be used in the Tyr driver to allocate and keep track of GPU allocations created internally (i.e. not by userspace). It will likely also be used in the Nova driver eventually. This adds the simplest methods for additional and removal that do not require any special care with respect to concurrency. This implementation is based on the RFC by Andrew but with significant changes to simplify the implementation. [ojeda@kernel.org: fix intra-doc links] Link: https://lkml.kernel.org/r/20250910140212.997771-1-ojeda@kernel.org Link: https://lkml.kernel.org/r/20250902-maple-tree-v3-0-fb5c8958fb1e@google.com Link: https://lkml.kernel.org/r/20250902-maple-tree-v3-1-fb5c8958fb1e@google.com Link: https://lore.kernel.org/r/20250627-tyr-v1-1-cb5f4c6ced46@collabora.com [1] Link: https://lore.kernel.org/r/20250405060154.1550858-1-andrewjballance@gmail.com [2] Co-developed-by: Andrew Ballance Signed-off-by: Andrew Ballance Signed-off-by: Alice Ryhl Reviewed-by: Danilo Krummrich Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Boqun Feng Cc: Daniel Almeida Cc: Gary Guo Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Miguel Ojeda Cc: Trevor Gross Signed-off-by: Andrew Morton --- MAINTAINERS | 4 + include/linux/maple_tree.h | 3 + rust/helpers/helpers.c | 1 + rust/helpers/maple_tree.c | 8 ++ rust/kernel/lib.rs | 1 + rust/kernel/maple_tree.rs | 349 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 366 insertions(+) create mode 100644 rust/helpers/maple_tree.c create mode 100644 rust/kernel/maple_tree.rs (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index a7e123ddf05a..68d29f0220fc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14672,6 +14672,8 @@ F: net/mctp/ MAPLE TREE M: Liam R. Howlett +R: Alice Ryhl +R: Andrew Ballance L: maple-tree@lists.infradead.org L: linux-mm@kvack.org S: Supported @@ -14680,6 +14682,8 @@ F: include/linux/maple_tree.h F: include/trace/events/maple_tree.h F: lib/maple_tree.c F: lib/test_maple_tree.c +F: rust/helpers/maple_tree.c +F: rust/kernel/maple_tree.rs F: tools/testing/radix-tree/maple.c F: tools/testing/shared/linux/maple_tree.h diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 41e633264e51..05730171d201 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -481,6 +481,9 @@ struct ma_wr_state { #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) +/* + * When changing MA_STATE, remember to also change rust/kernel/maple_tree.rs + */ #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 7cf7fe95e41d..c5d42e0f7ce6 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -26,6 +26,7 @@ #include "io.c" #include "jump_label.c" #include "kunit.c" +#include "maple_tree.c" #include "mm.c" #include "mutex.c" #include "of.c" diff --git a/rust/helpers/maple_tree.c b/rust/helpers/maple_tree.c new file mode 100644 index 000000000000..1dd9ac84a13f --- /dev/null +++ b/rust/helpers/maple_tree.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +void rust_helper_mt_init_flags(struct maple_tree *mt, unsigned int flags) +{ + mt_init_flags(mt, flags); +} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index ed53169e795c..6b0a5689669f 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -96,6 +96,7 @@ pub mod jump_label; #[cfg(CONFIG_KUNIT)] pub mod kunit; pub mod list; +pub mod maple_tree; pub mod miscdevice; pub mod mm; #[cfg(CONFIG_NET)] diff --git a/rust/kernel/maple_tree.rs b/rust/kernel/maple_tree.rs new file mode 100644 index 000000000000..319772878b89 --- /dev/null +++ b/rust/kernel/maple_tree.rs @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Maple trees. +//! +//! C header: [`include/linux/maple_tree.h`](srctree/include/linux/maple_tree.h) +//! +//! Reference: + +use core::{ + marker::PhantomData, + ops::{Bound, RangeBounds}, + ptr, +}; + +use kernel::{ + alloc::Flags, + error::to_result, + prelude::*, + types::{ForeignOwnable, Opaque}, +}; + +/// A maple tree optimized for storing non-overlapping ranges. +/// +/// # Invariants +/// +/// Each range in the maple tree owns an instance of `T`. +#[pin_data(PinnedDrop)] +#[repr(transparent)] +pub struct MapleTree { + #[pin] + tree: Opaque, + _p: PhantomData, +} + +#[inline] +fn to_maple_range(range: impl RangeBounds) -> Option<(usize, usize)> { + let first = match range.start_bound() { + Bound::Included(start) => *start, + Bound::Excluded(start) => start.checked_add(1)?, + Bound::Unbounded => 0, + }; + + let last = match range.end_bound() { + Bound::Included(end) => *end, + Bound::Excluded(end) => end.checked_sub(1)?, + Bound::Unbounded => usize::MAX, + }; + + if last < first { + return None; + } + + Some((first, last)) +} + +impl MapleTree { + /// Create a new maple tree. + /// + /// The tree will use the regular implementation with a higher branching factor, rather than + /// the allocation tree. + #[inline] + pub fn new() -> impl PinInit { + pin_init!(MapleTree { + // SAFETY: This initializes a maple tree into a pinned slot. The maple tree will be + // destroyed in Drop before the memory location becomes invalid. + tree <- Opaque::ffi_init(|slot| unsafe { bindings::mt_init_flags(slot, 0) }), + _p: PhantomData, + }) + } + + /// Insert the value at the given index. + /// + /// # Errors + /// + /// If the maple tree already contains a range using the given index, then this call will + /// return an [`InsertErrorKind::Occupied`]. It may also fail if memory allocation fails. + /// + /// # Examples + /// + /// ``` + /// use kernel::maple_tree::{InsertErrorKind, MapleTree}; + /// + /// let tree = KBox::pin_init(MapleTree::>::new(), GFP_KERNEL)?; + /// + /// let ten = KBox::new(10, GFP_KERNEL)?; + /// let twenty = KBox::new(20, GFP_KERNEL)?; + /// let the_answer = KBox::new(42, GFP_KERNEL)?; + /// + /// // These calls will succeed. + /// tree.insert(100, ten, GFP_KERNEL)?; + /// tree.insert(101, twenty, GFP_KERNEL)?; + /// + /// // This will fail because the index is already in use. + /// assert_eq!( + /// tree.insert(100, the_answer, GFP_KERNEL).unwrap_err().cause, + /// InsertErrorKind::Occupied, + /// ); + /// # Ok::<_, Error>(()) + /// ``` + #[inline] + pub fn insert(&self, index: usize, value: T, gfp: Flags) -> Result<(), InsertError> { + self.insert_range(index..=index, value, gfp) + } + + /// Insert a value to the specified range, failing on overlap. + /// + /// This accepts the usual types of Rust ranges using the `..` and `..=` syntax for exclusive + /// and inclusive ranges respectively. The range must not be empty, and must not overlap with + /// any existing range. + /// + /// # Errors + /// + /// If the maple tree already contains an overlapping range, then this call will return an + /// [`InsertErrorKind::Occupied`]. It may also fail if memory allocation fails or if the + /// requested range is invalid (e.g. empty). + /// + /// # Examples + /// + /// ``` + /// use kernel::maple_tree::{InsertErrorKind, MapleTree}; + /// + /// let tree = KBox::pin_init(MapleTree::>::new(), GFP_KERNEL)?; + /// + /// let ten = KBox::new(10, GFP_KERNEL)?; + /// let twenty = KBox::new(20, GFP_KERNEL)?; + /// let the_answer = KBox::new(42, GFP_KERNEL)?; + /// let hundred = KBox::new(100, GFP_KERNEL)?; + /// + /// // Insert the value 10 at the indices 100 to 499. + /// tree.insert_range(100..500, ten, GFP_KERNEL)?; + /// + /// // Insert the value 20 at the indices 500 to 1000. + /// tree.insert_range(500..=1000, twenty, GFP_KERNEL)?; + /// + /// // This will fail due to overlap with the previous range on index 1000. + /// assert_eq!( + /// tree.insert_range(1000..1200, the_answer, GFP_KERNEL).unwrap_err().cause, + /// InsertErrorKind::Occupied, + /// ); + /// + /// // When using .. to specify the range, you must be careful to ensure that the range is + /// // non-empty. + /// assert_eq!( + /// tree.insert_range(72..72, hundred, GFP_KERNEL).unwrap_err().cause, + /// InsertErrorKind::InvalidRequest, + /// ); + /// # Ok::<_, Error>(()) + /// ``` + pub fn insert_range(&self, range: R, value: T, gfp: Flags) -> Result<(), InsertError> + where + R: RangeBounds, + { + let Some((first, last)) = to_maple_range(range) else { + return Err(InsertError { + value, + cause: InsertErrorKind::InvalidRequest, + }); + }; + + let ptr = T::into_foreign(value); + + // SAFETY: The tree is valid, and we are passing a pointer to an owned instance of `T`. + let res = to_result(unsafe { + bindings::mtree_insert_range(self.tree.get(), first, last, ptr, gfp.as_raw()) + }); + + if let Err(err) = res { + // SAFETY: As `mtree_insert_range` failed, it is safe to take back ownership. + let value = unsafe { T::from_foreign(ptr) }; + + let cause = if err == ENOMEM { + InsertErrorKind::AllocError(kernel::alloc::AllocError) + } else if err == EEXIST { + InsertErrorKind::Occupied + } else { + InsertErrorKind::InvalidRequest + }; + Err(InsertError { value, cause }) + } else { + Ok(()) + } + } + + /// Erase the range containing the given index. + /// + /// # Examples + /// + /// ``` + /// use kernel::maple_tree::MapleTree; + /// + /// let tree = KBox::pin_init(MapleTree::>::new(), GFP_KERNEL)?; + /// + /// let ten = KBox::new(10, GFP_KERNEL)?; + /// let twenty = KBox::new(20, GFP_KERNEL)?; + /// + /// tree.insert_range(100..500, ten, GFP_KERNEL)?; + /// tree.insert(67, twenty, GFP_KERNEL)?; + /// + /// assert_eq!(tree.erase(67).map(|v| *v), Some(20)); + /// assert_eq!(tree.erase(275).map(|v| *v), Some(10)); + /// + /// // The previous call erased the entire range, not just index 275. + /// assert!(tree.erase(127).is_none()); + /// # Ok::<_, Error>(()) + /// ``` + #[inline] + pub fn erase(&self, index: usize) -> Option { + // SAFETY: `self.tree` contains a valid maple tree. + let ret = unsafe { bindings::mtree_erase(self.tree.get(), index) }; + + // SAFETY: If the pointer is not null, then we took ownership of a valid instance of `T` + // from the tree. + unsafe { T::try_from_foreign(ret) } + } + + /// Free all `T` instances in this tree. + /// + /// # Safety + /// + /// This frees Rust data referenced by the maple tree without removing it from the maple tree, + /// leaving it in an invalid state. The caller must ensure that this invalid state cannot be + /// observed by the end-user. + unsafe fn free_all_entries(self: Pin<&mut Self>) { + // SAFETY: The caller provides exclusive access to the entire maple tree, so we have + // exclusive access to the entire maple tree despite not holding the lock. + let mut ma_state = unsafe { MaState::new_raw(self.into_ref().get_ref(), 0, usize::MAX) }; + + loop { + // This uses the raw accessor because we're destroying pointers without removing them + // from the maple tree, which is only valid because this is the destructor. + let ptr = ma_state.mas_find_raw(usize::MAX); + if ptr.is_null() { + break; + } + // SAFETY: By the type invariants, this pointer references a valid value of type `T`. + // By the safety requirements, it is okay to free it without removing it from the maple + // tree. + drop(unsafe { T::from_foreign(ptr) }); + } + } +} + +#[pinned_drop] +impl PinnedDrop for MapleTree { + #[inline] + fn drop(mut self: Pin<&mut Self>) { + // We only iterate the tree if the Rust value has a destructor. + if core::mem::needs_drop::() { + // SAFETY: Other than the below `mtree_destroy` call, the tree will not be accessed + // after this call. + unsafe { self.as_mut().free_all_entries() }; + } + + // SAFETY: The tree is valid, and will not be accessed after this call. + unsafe { bindings::mtree_destroy(self.tree.get()) }; + } +} + +/// A helper type used for navigating a [`MapleTree`]. +/// +/// # Invariants +/// +/// For the duration of `'tree`: +/// +/// * The `ma_state` references a valid `MapleTree`. +/// * The `ma_state` has read/write access to the tree. +pub struct MaState<'tree, T: ForeignOwnable> { + state: bindings::ma_state, + _phantom: PhantomData<&'tree mut MapleTree>, +} + +impl<'tree, T: ForeignOwnable> MaState<'tree, T> { + /// Initialize a new `MaState` with the given tree. + /// + /// # Safety + /// + /// The caller must ensure that this `MaState` has read/write access to the maple tree. + #[inline] + unsafe fn new_raw(mt: &'tree MapleTree, first: usize, end: usize) -> Self { + // INVARIANT: + // * Having a reference ensures that the `MapleTree` is valid for `'tree`. + // * The caller ensures that we have read/write access. + Self { + state: bindings::ma_state { + tree: mt.tree.get(), + index: first, + last: end, + node: ptr::null_mut(), + status: bindings::maple_status_ma_start, + min: 0, + max: usize::MAX, + alloc: ptr::null_mut(), + mas_flags: 0, + store_type: bindings::store_type_wr_invalid, + ..Default::default() + }, + _phantom: PhantomData, + } + } + + #[inline] + fn as_raw(&mut self) -> *mut bindings::ma_state { + &raw mut self.state + } + + #[inline] + fn mas_find_raw(&mut self, max: usize) -> *mut c_void { + // SAFETY: By the type invariants, the `ma_state` is active and we have read/write access + // to the tree. + unsafe { bindings::mas_find(self.as_raw(), max) } + } +} + +/// Error type for failure to insert a new value. +pub struct InsertError { + /// The value that could not be inserted. + pub value: T, + /// The reason for the failure to insert. + pub cause: InsertErrorKind, +} + +/// The reason for the failure to insert. +#[derive(PartialEq, Eq, Copy, Clone, Debug)] +pub enum InsertErrorKind { + /// There is already a value in the requested range. + Occupied, + /// Failure to allocate memory. + AllocError(kernel::alloc::AllocError), + /// The insertion request was invalid. + InvalidRequest, +} + +impl From for Error { + #[inline] + fn from(kind: InsertErrorKind) -> Error { + match kind { + InsertErrorKind::Occupied => EEXIST, + InsertErrorKind::AllocError(kernel::alloc::AllocError) => ENOMEM, + InsertErrorKind::InvalidRequest => EINVAL, + } + } +} + +impl From> for Error { + #[inline] + fn from(insert_err: InsertError) -> Error { + Error::from(insert_err.cause) + } +} -- cgit v1.2.3 From a488ba3124c82d704963fcd760fe653df1987b13 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Fri, 5 Sep 2025 17:00:12 +0200 Subject: huge_memory: return -EINVAL in folio split functions when THP is disabled split_huge_page_to_list_[to_order](), split_huge_page() and try_folio_split() return 0 on success and error codes on failure. When THP is disabled, these functions return 0 indicating success even though an error code should be returned as it is not possible to split a folio when THP is disabled. Make all these functions return -EINVAL to indicate failure instead of 0. As large folios depend on CONFIG_THP, issue warning as this function should not be called without a large folio. Link: https://lkml.kernel.org/r/20250905150012.93714-1-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509051753.riCeG7LC-lkp@intel.com/ Acked-by: David Hildenbrand Acked-by: Zi Yan Acked-by: Kiryl Shutsemau Reviewed-by: Lorenzo Stoakes Reviewed-by: Barry Song Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 29ef70022da1..f327d62fc985 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -588,22 +588,26 @@ static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { - return 0; + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; } static inline int split_huge_page(struct page *page) { - return 0; + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; } static inline int split_folio_to_list(struct folio *folio, struct list_head *list) { - return 0; + VM_WARN_ON_ONCE_FOLIO(1, folio); + return -EINVAL; } static inline int try_folio_split(struct folio *folio, struct page *page, struct list_head *list) { - return 0; + VM_WARN_ON_ONCE_FOLIO(1, folio); + return -EINVAL; } static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} -- cgit v1.2.3 From 614d850efda98e4455e4f2b55e64864f68a4e370 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 4 Sep 2025 08:59:26 +1000 Subject: mm/memremap: remove unused get_dev_pagemap() parameter GUP no longer uses get_dev_pagemap(). As it was the only user of the get_dev_pagemap() pgmap caching feature it can be removed. Link: https://lkml.kernel.org/r/20250903225926.34702-2-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Cc: John Hubbard Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/memremap.h | 6 ++---- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/memremap.c | 22 ++++------------------ 4 files changed, 8 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index aa1b6aa877a0..e5951ba12a28 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -211,8 +211,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); -struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap); +struct dev_pagemap *get_dev_pagemap(unsigned long pfn); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); @@ -234,8 +233,7 @@ static inline void devm_memunmap_pages(struct device *dev, { } -static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap) +static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn) { return NULL; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2a95b41e0535..6d9134e3d115 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2194,7 +2194,7 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; if (pfn_valid(pfn)) { - pgmap = get_dev_pagemap(pfn, NULL); + pgmap = get_dev_pagemap(pfn); put_ref_page(pfn, flags); if (pgmap) { res = memory_failure_dev_pagemap(pfn, flags, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 74318c787715..883b8e4d51ba 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -375,7 +375,7 @@ struct page *pfn_to_online_page(unsigned long pfn) * the section may be 'offline' but 'valid'. Only * get_dev_pagemap() can determine sub-section online status. */ - pgmap = get_dev_pagemap(pfn, NULL); + pgmap = get_dev_pagemap(pfn); put_dev_pagemap(pgmap); /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ diff --git a/mm/memremap.c b/mm/memremap.c index a2d4bb88f64b..46cb1b0b6f72 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -153,14 +153,14 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, "altmap not supported for multiple ranges\n")) return -EINVAL; - conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start)); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); return -ENOMEM; } - conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end)); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); @@ -397,26 +397,12 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages); /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn * @pfn: page frame number to lookup page_map - * @pgmap: optional known pgmap that already has a reference - * - * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap - * is non-NULL but does not cover @pfn the reference to it will be released. */ -struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap) +struct dev_pagemap *get_dev_pagemap(unsigned long pfn) { + struct dev_pagemap *pgmap; resource_size_t phys = PFN_PHYS(pfn); - /* - * In the cached case we're already holding a live reference. - */ - if (pgmap) { - if (phys >= pgmap->range.start && phys <= pgmap->range.end) - return pgmap; - put_dev_pagemap(pgmap); - } - - /* fall back to slow path lookup */ rcu_read_lock(); pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref)) -- cgit v1.2.3 From 4522aed4fffbbd18ab3581d733d0572d45780d07 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:51 +0800 Subject: mm, swap: rename and move some swap cluster definition and helpers No feature change, move cluster related definitions and helpers to mm/swap.h, also tidy up and add a "swap_" prefix for cluster lock/unlock helpers, so they can be used outside of swap files. And while at it, add kerneldoc. Link: https://lkml.kernel.org/r/20250916160100.31545-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Reviewed-by: Barry Song Acked-by: Chris Li Acked-by: David Hildenbrand Suggested-by: Chris Li Acked-by: Nhat Pham Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/swap.h | 34 ------------------ mm/swap.h | 70 +++++++++++++++++++++++++++++++++++++ mm/swapfile.c | 97 ++++++++++++++++------------------------------------ 3 files changed, 99 insertions(+), 102 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index a2bb20841616..78cc48a65512 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -235,40 +235,6 @@ enum { /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ -/* - * We use this to track usage of a cluster. A cluster is a block of swap disk - * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All - * free clusters are organized into a list. We fetch an entry from the list to - * get a free cluster. - * - * The flags field determines if a cluster is free. This is - * protected by cluster lock. - */ -struct swap_cluster_info { - spinlock_t lock; /* - * Protect swap_cluster_info fields - * other than list, and swap_info_struct->swap_map - * elements corresponding to the swap cluster. - */ - u16 count; - u8 flags; - u8 order; - struct list_head list; -}; - -/* All on-list cluster must have a non-zero flag. */ -enum swap_cluster_flags { - CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ - CLUSTER_FLAG_FREE, - CLUSTER_FLAG_NONFULL, - CLUSTER_FLAG_FRAG, - /* Clusters with flags above are allocatable */ - CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, - CLUSTER_FLAG_FULL, - CLUSTER_FLAG_DISCARD, - CLUSTER_FLAG_MAX, -}; - /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the diff --git a/mm/swap.h b/mm/swap.h index 7d868f8de696..138b5197c35e 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -7,10 +7,80 @@ struct swap_iocb; extern int page_cluster; +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER HPAGE_PMD_NR +#define swap_entry_order(order) (order) +#else +#define SWAPFILE_CLUSTER 256 +#define swap_entry_order(order) 0 +#endif + +/* + * We use this to track usage of a cluster. A cluster is a block of swap disk + * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * + * The flags field determines if a cluster is free. This is + * protected by cluster lock. + */ +struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields + * other than list, and swap_info_struct->swap_map + * elements corresponding to the swap cluster. + */ + u16 count; + u8 flags; + u8 order; + struct list_head list; +}; + +/* All on-list cluster must have a non-zero flag. */ +enum swap_cluster_flags { + CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ + CLUSTER_FLAG_FREE, + CLUSTER_FLAG_NONFULL, + CLUSTER_FLAG_FRAG, + /* Clusters with flags above are allocatable */ + CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, + CLUSTER_FLAG_FULL, + CLUSTER_FLAG_DISCARD, + CLUSTER_FLAG_MAX, +}; + #ifdef CONFIG_SWAP #include /* for swp_offset */ #include /* for bio_end_io_t */ +static inline struct swap_cluster_info *swp_offset_cluster( + struct swap_info_struct *si, pgoff_t offset) +{ + return &si->cluster_info[offset / SWAPFILE_CLUSTER]; +} + +/** + * swap_cluster_lock - Lock and return the swap cluster of given offset. + * @si: swap device the cluster belongs to. + * @offset: the swap entry offset, pointing to a valid slot. + * + * Context: The caller must ensure the offset is in the valid range and + * protect the swap device with reference count or locks. + */ +static inline struct swap_cluster_info *swap_cluster_lock( + struct swap_info_struct *si, unsigned long offset) +{ + struct swap_cluster_info *ci = swp_offset_cluster(si, offset); + + spin_lock(&ci->lock); + return ci; +} + +static inline void swap_cluster_unlock(struct swap_cluster_info *ci) +{ + spin_unlock(&ci->lock); +} + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; diff --git a/mm/swapfile.c b/mm/swapfile.c index c3c3364cb42e..700e07cb1cbd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -58,9 +58,6 @@ static void swap_entries_free(struct swap_info_struct *si, static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); -static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset); -static inline void unlock_cluster(struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -258,9 +255,9 @@ again: * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. */ - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); need_reclaim = swap_only_has_cache(si, offset, nr_pages); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -385,19 +382,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -#ifdef CONFIG_THP_SWAP -#define SWAPFILE_CLUSTER HPAGE_PMD_NR - -#define swap_entry_order(order) (order) -#else -#define SWAPFILE_CLUSTER 256 - -/* - * Define swap_entry_order() as constant to let compiler to optimize - * out some code if !CONFIG_THP_SWAP - */ -#define swap_entry_order(order) 0 -#endif #define LATENCY_LIMIT 256 static inline bool cluster_is_empty(struct swap_cluster_info *info) @@ -425,34 +409,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } -static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, - unsigned long offset) -{ - return &si->cluster_info[offset / SWAPFILE_CLUSTER]; -} - static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { return cluster_index(si, ci) * SWAPFILE_CLUSTER; } -static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset) -{ - struct swap_cluster_info *ci; - - ci = offset_to_cluster(si, offset); - spin_lock(&ci->lock); - - return ci; -} - -static inline void unlock_cluster(struct swap_cluster_info *ci) -{ - spin_unlock(&ci->lock); -} - static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) @@ -808,7 +770,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, } out: relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (si->flags & SWP_SOLIDSTATE) { this_cpu_write(percpu_swap_cluster.offset[order], next); this_cpu_write(percpu_swap_cluster.si[order], si); @@ -875,7 +837,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (ci->flags == CLUSTER_FLAG_NONE) relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (to_scan <= 0) break; } @@ -914,7 +876,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (offset == SWAP_ENTRY_INVALID) goto new_cluster; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); /* Cluster could have been used by another order */ if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) @@ -922,7 +884,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o found = alloc_swap_scan_cluster(si, ci, offset, order, usage); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } if (found) goto done; @@ -1203,7 +1165,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (!si || !offset || !get_swap_device_info(si)) return false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); @@ -1211,7 +1173,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (found) *entry = swp_entry(si->type, found); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } put_swap_device(si); @@ -1479,14 +1441,14 @@ static void swap_entries_put_cache(struct swap_info_struct *si, unsigned long offset = swp_offset(entry); struct swap_cluster_info *ci; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, nr)) + ci = swap_cluster_lock(si, offset); + if (swap_only_has_cache(si, offset, nr)) { swap_entries_free(si, ci, entry, nr); - else { + } else { for (int i = 0; i < nr; i++, entry.val++) swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); } - unlock_cluster(ci); + swap_cluster_unlock(ci); } static bool swap_entries_put_map(struct swap_info_struct *si, @@ -1504,7 +1466,7 @@ static bool swap_entries_put_map(struct swap_info_struct *si, if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { goto locked_fallback; } @@ -1513,21 +1475,20 @@ static bool swap_entries_put_map(struct swap_info_struct *si, else for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; fallback: - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); locked_fallback: for (i = 0; i < nr; i++, entry.val++) { count = swap_entry_put_locked(si, ci, entry, 1); if (count == SWAP_HAS_CACHE) has_cache = true; } - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; - } /* @@ -1577,7 +1538,7 @@ static void swap_entries_free(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ - VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(ci != swp_offset_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); @@ -1652,9 +1613,9 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) struct swap_cluster_info *ci; int count; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster(ci); + swap_cluster_unlock(ci); return !!count; } @@ -1677,7 +1638,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1700,7 +1661,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return count; } @@ -1715,7 +1676,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; @@ -1728,7 +1689,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return ret; } @@ -2662,8 +2623,8 @@ static void wait_for_allocation(struct swap_info_struct *si) BUG_ON(si->flags & SWP_WRITEOK); for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { - ci = lock_cluster(si, offset); - unlock_cluster(ci); + ci = swap_cluster_lock(si, offset); + swap_cluster_unlock(ci); } } @@ -3579,7 +3540,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3634,7 +3595,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return err; } @@ -3733,7 +3694,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); @@ -3793,7 +3754,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) out_unlock_cont: spin_unlock(&si->cont_lock); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); put_swap_device(si); outer: if (page) -- cgit v1.2.3 From 0fcf8ef4fdab8e5c91d1bce39c7fe6565974ffad Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:52 +0800 Subject: mm, swap: tidy up swap device and cluster info helpers swp_swap_info is the most commonly used helper for retrieving swap info. It has an internal check that may lead to a NULL return value, but almost none of its caller checks the return value, making the internal check pointless. In fact, most of these callers already ensured the entry is valid and never expect a NULL value. Tidy this up and improve the function names. If the caller can make sure the swap entry/type is valid and the device is pinned, use the new introduced __swap_entry_to_info/__swap_type_to_info instead. They have more debug sanity checks and lower overhead as they are inlined. Callers that may expect a NULL value should use swap_entry_to_info/swap_type_to_info instead. No feature change. The rearranged codes should have had no effect, or they should have been hitting NULL de-ref bugs already. Only some new sanity checks are added so potential issues may show up in debug build. The new helpers will be frequently used with swap table later when working with swap cache folios. A locked swap cache folio ensures the entries are valid and stable so these helpers are very helpful. Link: https://lkml.kernel.org/r/20250916160100.31545-8-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Barry Song Acked-by: David Hildenbrand Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ------ mm/page_io.c | 12 ++++++------ mm/swap.h | 38 +++++++++++++++++++++++++++++++++----- mm/swap_state.c | 4 ++-- mm/swapfile.c | 37 +++++++++++++++++++------------------ 5 files changed, 60 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 78cc48a65512..762f8db0e811 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -479,7 +479,6 @@ extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); @@ -492,11 +491,6 @@ static inline void put_swap_device(struct swap_info_struct *si) } #else /* CONFIG_SWAP */ -static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return NULL; -} - static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; diff --git a/mm/page_io.c b/mm/page_io.c index a2056a5ecb13..3c342db77ce3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -204,7 +204,7 @@ static bool is_folio_zero_filled(struct folio *folio) static void swap_zeromap_folio_set(struct folio *folio) { struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio); - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); int nr_pages = folio_nr_pages(folio); swp_entry_t entry; unsigned int i; @@ -223,7 +223,7 @@ static void swap_zeromap_folio_set(struct folio *folio) static void swap_zeromap_folio_clear(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); swp_entry_t entry; unsigned int i; @@ -374,7 +374,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_iocb *sio = swap_plug ? *swap_plug : NULL; - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct file *swap_file = sis->swap_file; loff_t pos = swap_dev_pos(folio->swap); @@ -446,7 +446,7 @@ static void swap_writepage_bdev_async(struct folio *folio, void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); /* @@ -537,7 +537,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_iocb *sio = NULL; loff_t pos = swap_dev_pos(folio->swap); @@ -608,7 +608,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO; bool workingset = folio_test_workingset(folio); unsigned long pflags; diff --git a/mm/swap.h b/mm/swap.h index 138b5197c35e..30b1039c27fe 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -15,6 +15,8 @@ extern int page_cluster; #define swap_entry_order(order) 0 #endif +extern struct swap_info_struct *swap_info[]; + /* * We use this to track usage of a cluster. A cluster is a block of swap disk * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All @@ -53,9 +55,29 @@ enum swap_cluster_flags { #include /* for swp_offset */ #include /* for bio_end_io_t */ -static inline struct swap_cluster_info *swp_offset_cluster( +/* + * Callers of all helpers below must ensure the entry, type, or offset is + * valid, and protect the swap device with reference count or locks. + */ +static inline struct swap_info_struct *__swap_type_to_info(int type) +{ + struct swap_info_struct *si; + + si = READ_ONCE(swap_info[type]); /* rcu_dereference() */ + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + return si; +} + +static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) +{ + return __swap_type_to_info(swp_type(entry)); +} + +static inline struct swap_cluster_info *__swap_offset_to_cluster( struct swap_info_struct *si, pgoff_t offset) { + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + VM_WARN_ON_ONCE(offset >= si->max); return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } @@ -70,8 +92,9 @@ static inline struct swap_cluster_info *swp_offset_cluster( static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, unsigned long offset) { - struct swap_cluster_info *ci = swp_offset_cluster(si, offset); + struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ spin_lock(&ci->lock); return ci; } @@ -170,7 +193,7 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, static inline unsigned int folio_swap_flags(struct folio *folio) { - return swp_swap_info(folio->swap)->flags; + return __swap_entry_to_info(folio->swap)->flags; } /* @@ -181,7 +204,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio) static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, bool *is_zeromap) { - struct swap_info_struct *sis = swp_swap_info(entry); + struct swap_info_struct *sis = __swap_entry_to_info(entry); unsigned long start = swp_offset(entry); unsigned long end = start + max_nr; bool first_bit; @@ -200,7 +223,7 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); int i; @@ -219,6 +242,11 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) #else /* CONFIG_SWAP */ struct swap_iocb; +static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) +{ + return NULL; +} + static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 9225d6b695ad..0ad4f3b41f1b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -336,7 +336,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); struct folio *folio; struct folio *new_folio = NULL; struct folio *result = NULL; @@ -560,7 +560,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; diff --git a/mm/swapfile.c b/mm/swapfile.c index 700e07cb1cbd..6f7a8c98d14d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -102,7 +102,7 @@ static PLIST_HEAD(swap_active_head); static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -124,14 +124,20 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { .lock = INIT_LOCAL_LOCK(), }; -static struct swap_info_struct *swap_type_to_swap_info(int type) +/* May return NULL on invalid type, caller must check for NULL return */ +static struct swap_info_struct *swap_type_to_info(int type) { if (type >= MAX_SWAPFILES) return NULL; - return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } +/* May return NULL on invalid entry, caller must check for NULL return */ +static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) +{ + return swap_type_to_info(swp_type(entry)); +} + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ @@ -342,7 +348,7 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) sector_t swap_folio_sector(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; @@ -1300,7 +1306,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (data_race(!(si->flags & SWP_USED))) @@ -1415,7 +1421,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) @@ -1538,7 +1544,7 @@ static void swap_entries_free(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ - VM_BUG_ON(ci != swp_offset_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); @@ -1596,7 +1602,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); return swap_count(si->swap_map[offset]); @@ -1827,7 +1833,7 @@ out: swp_entry_t get_swap_page_of_type(int type) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); unsigned long offset; swp_entry_t entry = {0}; @@ -1908,7 +1914,7 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) @@ -2837,7 +2843,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (type = 0; (si = swap_type_to_swap_info(type)); type++) { + for (type = 0; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) @@ -2858,7 +2864,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) type = si->type + 1; ++(*pos); - for (; (si = swap_type_to_swap_info(type)); type++) { + for (; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; @@ -3531,7 +3537,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) unsigned char has_cache; int err, i; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { pr_err("%s%08lx\n", Bad_file, entry.val); return -EINVAL; @@ -3646,11 +3652,6 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) swap_entries_put_cache(si, entry, nr); } -struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return swap_type_to_swap_info(swp_type(entry)); -} - /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's -- cgit v1.2.3 From 8578e0c00dcf0c58fbc32d4904ecaf8e802a6590 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:56 +0800 Subject: mm, swap: use the swap table for the swap cache and switch API Introduce basic swap table infrastructures, which are now just a fixed-sized flat array inside each swap cluster, with access wrappers. Each cluster contains a swap table of 512 entries. Each table entry is an opaque atomic long. It could be in 3 types: a shadow type (XA_VALUE), a folio type (pointer), or NULL. In this first step, it only supports storing a folio or shadow, and it is a drop-in replacement for the current swap cache. Convert all swap cache users to use the new sets of APIs. Chris Li has been suggesting using a new infrastructure for swap cache for better performance, and that idea combined well with the swap table as the new backing structure. Now the lock contention range is reduced to 2M clusters, which is much smaller than the 64M address_space. And we can also drop the multiple address_space design. All the internal works are done with swap_cache_get_* helpers. Swap cache lookup is still lock-less like before, and the helper's contexts are same with original swap cache helpers. They still require a pin on the swap device to prevent the backing data from being freed. Swap cache updates are now protected by the swap cluster lock instead of the XArray lock. This is mostly handled internally, but new __swap_cache_* helpers require the caller to lock the cluster. So, a few new cluster access and locking helpers are also introduced. A fully cluster-based unified swap table can be implemented on top of this to take care of all count tracking and synchronization work, with dynamic allocation. It should reduce the memory usage while making the performance even better. Link: https://lkml.kernel.org/r/20250916160100.31545-12-ryncsn@gmail.com Co-developed-by: Chris Li Signed-off-by: Chris Li Signed-off-by: Kairui Song Acked-by: Chris Li Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + include/linux/swap.h | 2 - mm/huge_memory.c | 13 ++- mm/migrate.c | 19 +++- mm/shmem.c | 8 +- mm/swap.h | 154 +++++++++++++++++++++------ mm/swap_state.c | 293 ++++++++++++++++++++++----------------------------- mm/swap_table.h | 97 +++++++++++++++++ mm/swapfile.c | 100 +++++++++++++----- mm/vmscan.c | 20 ++-- 10 files changed, 459 insertions(+), 248 deletions(-) create mode 100644 mm/swap_table.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 3d113bfc3c82..4c8bbf70a3c7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16232,6 +16232,7 @@ F: include/linux/swapops.h F: mm/page_io.c F: mm/swap.c F: mm/swap.h +F: mm/swap_table.h F: mm/swap_state.c F: mm/swapfile.c diff --git a/include/linux/swap.h b/include/linux/swap.h index 762f8db0e811..e818fbade1e2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -480,8 +480,6 @@ extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct backing_dev_info; -extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); -extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4c66e358685b..a9fc7a09167a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3720,7 +3720,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - struct address_space *swap_cache = NULL; + struct swap_cluster_info *ci = NULL; struct lruvec *lruvec; int expected_refs; @@ -3764,8 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto fail; } - swap_cache = swap_address_space(folio->swap); - xa_lock(&swap_cache->i_pages); + ci = swap_cluster_get_and_lock(folio); } /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ @@ -3797,8 +3796,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, * Anonymous folio with swap cache. * NOTE: shmem in swap cache is not supported yet. */ - if (swap_cache) { - __swap_cache_replace_folio(folio, new_folio); + if (ci) { + __swap_cache_replace_folio(ci, folio, new_folio); continue; } @@ -3833,8 +3832,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, unlock_page_lruvec(lruvec); - if (swap_cache) - xa_unlock(&swap_cache->i_pages); + if (ci) + swap_cluster_unlock(ci); } else { spin_unlock(&ds_queue->split_queue_lock); ret = -EAGAIN; diff --git a/mm/migrate.c b/mm/migrate.c index c69cc13db692..aee61a980374 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -563,6 +563,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int expected_count) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + struct swap_cluster_info *ci = NULL; struct zone *oldzone, *newzone; int dirty; long nr = folio_nr_pages(folio); @@ -591,9 +592,16 @@ static int __folio_migrate_mapping(struct address_space *mapping, oldzone = folio_zone(folio); newzone = folio_zone(newfolio); - xas_lock_irq(&xas); + if (folio_test_swapcache(folio)) + ci = swap_cluster_get_and_lock_irq(folio); + else + xas_lock_irq(&xas); + if (!folio_ref_freeze(folio, expected_count)) { - xas_unlock_irq(&xas); + if (ci) + swap_cluster_unlock_irq(ci); + else + xas_unlock_irq(&xas); return -EAGAIN; } @@ -624,7 +632,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, } if (folio_test_swapcache(folio)) - __swap_cache_replace_folio(folio, newfolio); + __swap_cache_replace_folio(ci, folio, newfolio); else xas_store(&xas, newfolio); @@ -635,8 +643,11 @@ static int __folio_migrate_mapping(struct address_space *mapping, */ folio_ref_unfreeze(folio, expected_count - nr); - xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ + if (ci) + swap_cluster_unlock(ci); + else + xas_unlock(&xas); /* * If moved to a different zone then also account diff --git a/mm/shmem.c b/mm/shmem.c index bbfbbc1bc4d6..cf0171a72e47 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2083,9 +2083,9 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index, struct vm_area_struct *vma) { + struct swap_cluster_info *ci; struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; - struct address_space *swap_mapping = swap_address_space(entry); int nr_pages = folio_nr_pages(old); int error = 0; @@ -2116,12 +2116,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - xa_lock_irq(&swap_mapping->i_pages); - __swap_cache_replace_folio(old, new); + ci = swap_cluster_get_and_lock_irq(old); + __swap_cache_replace_folio(ci, old, new); mem_cgroup_replace_folio(old, new); shmem_update_stats(new, nr_pages); shmem_update_stats(old, -nr_pages); - xa_unlock_irq(&swap_mapping->i_pages); + swap_cluster_unlock_irq(ci); folio_add_lru(new); *foliop = new; diff --git a/mm/swap.h b/mm/swap.h index fe579c81c6c4..742db4d46d23 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -2,6 +2,7 @@ #ifndef _MM_SWAP_H #define _MM_SWAP_H +#include /* for atomic_long_t */ struct mempolicy; struct swap_iocb; @@ -35,6 +36,7 @@ struct swap_cluster_info { u16 count; u8 flags; u8 order; + atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */ struct list_head list; }; @@ -55,6 +57,11 @@ enum swap_cluster_flags { #include /* for swp_offset */ #include /* for bio_end_io_t */ +static inline unsigned int swp_cluster_offset(swp_entry_t entry) +{ + return swp_offset(entry) % SWAPFILE_CLUSTER; +} + /* * Callers of all helpers below must ensure the entry, type, or offset is * valid, and protect the swap device with reference count or locks. @@ -81,6 +88,25 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster( return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } +static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry) +{ + return __swap_offset_to_cluster(__swap_entry_to_info(entry), + swp_offset(entry)); +} + +static __always_inline struct swap_cluster_info *__swap_cluster_lock( + struct swap_info_struct *si, unsigned long offset, bool irq) +{ + struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + if (irq) + spin_lock_irq(&ci->lock); + else + spin_lock(&ci->lock); + return ci; +} + /** * swap_cluster_lock - Lock and return the swap cluster of given offset. * @si: swap device the cluster belongs to. @@ -92,11 +118,49 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster( static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, unsigned long offset) { - struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + return __swap_cluster_lock(si, offset, false); +} - VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ - spin_lock(&ci->lock); - return ci; +static inline struct swap_cluster_info *__swap_cluster_get_and_lock( + const struct folio *folio, bool irq) +{ + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + return __swap_cluster_lock(__swap_entry_to_info(folio->swap), + swp_offset(folio->swap), irq); +} + +/* + * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries. + * @folio: The folio. + * + * This locks and returns the swap cluster that contains a folio's swap + * entries. The swap entries of a folio are always in one single cluster. + * The folio has to be locked so its swap entries won't change and the + * cluster won't be freed. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * Return: Pointer to the swap cluster. + */ +static inline struct swap_cluster_info *swap_cluster_get_and_lock( + const struct folio *folio) +{ + return __swap_cluster_get_and_lock(folio, false); +} + +/* + * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries. + * @folio: The folio. + * + * Same as swap_cluster_get_and_lock but also disable IRQ. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * Return: Pointer to the swap cluster. + */ +static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( + const struct folio *folio) +{ + return __swap_cluster_get_and_lock(folio, true); } static inline void swap_cluster_unlock(struct swap_cluster_info *ci) @@ -104,6 +168,11 @@ static inline void swap_cluster_unlock(struct swap_cluster_info *ci) spin_unlock(&ci->lock); } +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) +{ + spin_unlock_irq(&ci->lock); +} + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; @@ -123,10 +192,11 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) #define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) -extern struct address_space *swapper_spaces[]; -#define swap_address_space(entry) \ - (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ - >> SWAP_ADDRESS_SPACE_SHIFT]) +extern struct address_space swap_space; +static inline struct address_space *swap_address_space(swp_entry_t entry) +{ + return &swap_space; +} /* * Return the swap device position of the swap entry. @@ -136,15 +206,6 @@ static inline loff_t swap_dev_pos(swp_entry_t entry) return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; } -/* - * Return the swap cache index of the swap entry. - */ -static inline pgoff_t swap_cache_index(swp_entry_t entry) -{ - BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK); - return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; -} - /** * folio_matches_swap_entry - Check if a folio matches a given swap entry. * @folio: The folio. @@ -180,14 +241,14 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, */ struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadow); +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow); void swap_cache_del_folio(struct folio *folio); -void __swap_cache_del_folio(struct folio *folio, - swp_entry_t entry, void *shadow); -void __swap_cache_replace_folio(struct folio *old, struct folio *new); -void swap_cache_clear_shadow(int type, unsigned long begin, - unsigned long end); +/* Below helpers require the caller to lock and pass in the swap cluster. */ +void __swap_cache_del_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry, void *shadow); +void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new); +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents); void show_swap_cache_info(void); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); @@ -255,6 +316,32 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) #else /* CONFIG_SWAP */ struct swap_iocb; +static inline struct swap_cluster_info *swap_cluster_lock( + struct swap_info_struct *si, pgoff_t offset, bool irq) +{ + return NULL; +} + +static inline struct swap_cluster_info *swap_cluster_get_and_lock( + struct folio *folio) +{ + return NULL; +} + +static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( + struct folio *folio) +{ + return NULL; +} + +static inline void swap_cluster_unlock(struct swap_cluster_info *ci) +{ +} + +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) +{ +} + static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) { return NULL; @@ -272,11 +359,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) return NULL; } -static inline pgoff_t swap_cache_index(swp_entry_t entry) -{ - return 0; -} - static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) { return false; @@ -323,21 +405,21 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -static inline int swap_cache_add_folio(swp_entry_t entry, struct folio *folio, - gfp_t gfp, void **shadow) +static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow) { - return -EINVAL; } static inline void swap_cache_del_folio(struct folio *folio) { } -static inline void __swap_cache_del_folio(struct folio *folio, swp_entry_t entry, void *shadow) +static inline void __swap_cache_del_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry, void *shadow) { } -static inline void __swap_cache_replace_folio(struct folio *old, struct folio *new) +static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new) { } @@ -371,8 +453,10 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) */ static inline pgoff_t folio_index(struct folio *folio) { +#ifdef CONFIG_SWAP if (unlikely(folio_test_swapcache(folio))) - return swap_cache_index(folio->swap); + return swp_offset(folio->swap); +#endif return folio->index; } diff --git a/mm/swap_state.c b/mm/swap_state.c index d1f5b8fa52fc..2558a648d671 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -23,6 +23,7 @@ #include #include #include "internal.h" +#include "swap_table.h" #include "swap.h" /* @@ -36,8 +37,10 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; -static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; +struct address_space swap_space __read_mostly = { + .a_ops = &swap_aops, +}; + static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_ORDER_CEILING 5 @@ -83,11 +86,20 @@ void show_swap_cache_info(void) */ struct folio *swap_cache_get_folio(swp_entry_t entry) { - struct folio *folio = filemap_get_folio(swap_address_space(entry), - swap_cache_index(entry)); - if (IS_ERR(folio)) - return NULL; - return folio; + unsigned long swp_tb; + struct folio *folio; + + for (;;) { + swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (!swp_tb_is_folio(swp_tb)) + return NULL; + folio = swp_tb_to_folio(swp_tb); + if (likely(folio_try_get(folio))) + return folio; + } + + return NULL; } /** @@ -100,13 +112,13 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) */ void *swap_cache_get_shadow(swp_entry_t entry) { - struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swap_cache_index(entry); - void *shadow; + unsigned long swp_tb; + + swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (swp_tb_is_shadow(swp_tb)) + return swp_tb_to_shadow(swp_tb); - shadow = xa_load(&address_space->i_pages, idx); - if (xa_is_value(shadow)) - return shadow; return NULL; } @@ -119,61 +131,48 @@ void *swap_cache_get_shadow(swp_entry_t entry) * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. - * The caller also needs to mark the corresponding swap_map slots with - * SWAP_HAS_CACHE to avoid race or conflict. - * Return: Returns 0 on success, error code otherwise. + * The caller also needs to update the corresponding swap_map slots with + * SWAP_HAS_CACHE bit to avoid race or conflict. */ -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadowp) +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) { - struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swap_cache_index(entry); - XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); - unsigned long i, nr = folio_nr_pages(folio); - void *old; - - xas_set_update(&xas, workingset_update_node); - - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); + void *shadow = NULL; + unsigned long old_tb, new_tb; + struct swap_cluster_info *ci; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); + + new_tb = folio_to_swp_tb(folio); + ci_start = swp_cluster_offset(entry); + ci_end = ci_start + nr_pages; + ci_off = ci_start; + ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + do { + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + if (swp_tb_is_shadow(old_tb)) + shadow = swp_tb_to_shadow(old_tb); + } while (++ci_off < ci_end); - folio_ref_add(folio, nr); + folio_ref_add(folio, nr_pages); folio_set_swapcache(folio); folio->swap = entry; + swap_cluster_unlock(ci); - do { - xas_lock_irq(&xas); - xas_create_range(&xas); - if (xas_error(&xas)) - goto unlock; - for (i = 0; i < nr; i++) { - VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); - if (shadowp) { - old = xas_load(&xas); - if (xa_is_value(old)) - *shadowp = old; - } - xas_store(&xas, folio); - xas_next(&xas); - } - address_space->nrpages += nr; - __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); -unlock: - xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp)); - - if (!xas_error(&xas)) - return 0; + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); - folio_clear_swapcache(folio); - folio_ref_sub(folio, nr); - return xas_error(&xas); + if (shadowp) + *shadowp = shadow; } /** * __swap_cache_del_folio - Removes a folio from the swap cache. + * @ci: The locked swap cluster. * @folio: The folio. * @entry: The first swap entry that the folio corresponds to. * @shadow: shadow value to be filled in the swap cache. @@ -181,34 +180,36 @@ unlock: * Removes a folio from the swap cache and fills a shadow in place. * This won't put the folio's refcount. The caller has to do that. * - * Context: Caller must hold the xa_lock, ensure the folio is - * locked and in the swap cache, using the index of @entry. + * Context: Caller must ensure the folio is locked and in the swap cache + * using the index of @entry, and lock the cluster that holds the entries. */ -void __swap_cache_del_folio(struct folio *folio, +void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { - struct address_space *address_space = swap_address_space(entry); - int i; - long nr = folio_nr_pages(folio); - pgoff_t idx = swap_cache_index(entry); - XA_STATE(xas, &address_space->i_pages, idx); - - xas_set_update(&xas, workingset_update_node); - - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); - VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); - - for (i = 0; i < nr; i++) { - void *entry = xas_store(&xas, shadow); - VM_BUG_ON_PAGE(entry != folio, entry); - xas_next(&xas); - } + unsigned long old_tb, new_tb; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); + + new_tb = shadow_swp_to_tb(shadow); + ci_start = swp_cluster_offset(entry); + ci_end = ci_start + nr_pages; + ci_off = ci_start; + do { + /* If shadow is NULL, we sets an empty shadow */ + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || + swp_tb_to_folio(old_tb) != folio); + } while (++ci_off < ci_end); + folio->swap.val = 0; folio_clear_swapcache(folio); - address_space->nrpages -= nr; - __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); + node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); } /** @@ -223,12 +224,12 @@ void __swap_cache_del_folio(struct folio *folio, */ void swap_cache_del_folio(struct folio *folio) { + struct swap_cluster_info *ci; swp_entry_t entry = folio->swap; - struct address_space *address_space = swap_address_space(entry); - xa_lock_irq(&address_space->i_pages); - __swap_cache_del_folio(folio, entry, NULL); - xa_unlock_irq(&address_space->i_pages); + ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + __swap_cache_del_folio(ci, folio, entry, NULL); + swap_cluster_unlock(ci); put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); @@ -236,6 +237,7 @@ void swap_cache_del_folio(struct folio *folio) /** * __swap_cache_replace_folio - Replace a folio in the swap cache. + * @ci: The locked swap cluster. * @old: The old folio to be replaced. * @new: The new folio. * @@ -244,65 +246,62 @@ void swap_cache_del_folio(struct folio *folio) * entries. Replacement will take the new folio's swap entry value as * the starting offset to override all slots covered by the new folio. * - * Context: Caller must ensure both folios are locked, also lock the - * swap address_space that holds the old folio to avoid races. + * Context: Caller must ensure both folios are locked, and lock the + * cluster that holds the old folio to be replaced. */ -void __swap_cache_replace_folio(struct folio *old, struct folio *new) +void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new) { swp_entry_t entry = new->swap; unsigned long nr_pages = folio_nr_pages(new); - unsigned long offset = swap_cache_index(entry); - unsigned long end = offset + nr_pages; - - XA_STATE(xas, &swap_address_space(entry)->i_pages, offset); + unsigned int ci_off = swp_cluster_offset(entry); + unsigned int ci_end = ci_off + nr_pages; + unsigned long old_tb, new_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ + new_tb = folio_to_swp_tb(new); do { - WARN_ON_ONCE(xas_store(&xas, new) != old); - xas_next(&xas); - } while (++offset < end); + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); + } while (++ci_off < ci_end); + + /* + * If the old folio is partially replaced (e.g., splitting a large + * folio, the old folio is shrunk, and new split sub folios replace + * the shrunk part), ensure the new folio doesn't overlap it. + */ + if (IS_ENABLED(CONFIG_DEBUG_VM) && + folio_order(old) != folio_order(new)) { + ci_off = swp_cluster_offset(old->swap); + ci_end = ci_off + folio_nr_pages(old); + while (ci_off++ < ci_end) + WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); + } } /** * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. - * @type: Indicates the swap device. - * @begin: Beginning offset of the range. - * @end: Ending offset of the range. + * @entry: The starting index entry. + * @nr_ents: How many slots need to be cleared. * - * Context: Caller must ensure the range is valid and hold a reference to - * the swap device. + * Context: Caller must ensure the range is valid, all in one single cluster, + * not occupied by any folio, and lock the cluster. */ -void swap_cache_clear_shadow(int type, unsigned long begin, - unsigned long end) +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) { - unsigned long curr = begin; - void *old; - - for (;;) { - swp_entry_t entry = swp_entry(type, curr); - unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; - struct address_space *address_space = swap_address_space(entry); - XA_STATE(xas, &address_space->i_pages, index); - - xas_set_update(&xas, workingset_update_node); - - xa_lock_irq(&address_space->i_pages); - xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { - if (!xa_is_value(old)) - continue; - xas_store(&xas, NULL); - } - xa_unlock_irq(&address_space->i_pages); + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + unsigned int ci_off = swp_cluster_offset(entry), ci_end; + unsigned long old; - /* search the next swapcache until we meet end */ - curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); - if (curr > end) - break; - } + ci_end = ci_off + nr_ents; + do { + old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); + WARN_ON_ONCE(swp_tb_is_folio(old)); + } while (++ci_off < ci_end); } /* @@ -482,10 +481,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; - /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (swap_cache_add_folio(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) - goto fail_unlock; - + swap_cache_add_folio(new_folio, entry, &shadow); memcg1_swapin(entry, 1); if (shadow) @@ -677,41 +673,6 @@ skip: return folio; } -int init_swap_address_space(unsigned int type, unsigned long nr_pages) -{ - struct address_space *spaces, *space; - unsigned int i, nr; - - nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); - spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); - if (!spaces) - return -ENOMEM; - for (i = 0; i < nr; i++) { - space = spaces + i; - xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); - atomic_set(&space->i_mmap_writable, 0); - space->a_ops = &swap_aops; - /* swap cache doesn't use writeback related tags */ - mapping_set_no_writeback_tags(space); - } - nr_swapper_spaces[type] = nr; - swapper_spaces[type] = spaces; - - return 0; -} - -void exit_swap_address_space(unsigned int type) -{ - int i; - struct address_space *spaces = swapper_spaces[type]; - - for (i = 0; i < nr_swapper_spaces[type]; i++) - VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); - kvfree(spaces); - nr_swapper_spaces[type] = 0; - swapper_spaces[type] = NULL; -} - static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, unsigned long *end) { @@ -884,7 +845,7 @@ static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; -static int __init swap_init_sysfs(void) +static int __init swap_init(void) { int err; struct kobject *swap_kobj; @@ -899,11 +860,13 @@ static int __init swap_init_sysfs(void) pr_err("failed to register swap group\n"); goto delete_obj; } + /* Swap cache writeback is LRU based, no tags for it */ + mapping_set_no_writeback_tags(&swap_space); return 0; delete_obj: kobject_put(swap_kobj); return err; } -subsys_initcall(swap_init_sysfs); +subsys_initcall(swap_init); #endif diff --git a/mm/swap_table.h b/mm/swap_table.h new file mode 100644 index 000000000000..e1f7cc009701 --- /dev/null +++ b/mm/swap_table.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_SWAP_TABLE_H +#define _MM_SWAP_TABLE_H + +#include "swap.h" + +/* + * A swap table entry represents the status of a swap slot on a swap + * (physical or virtual) device. The swap table in each cluster is a + * 1:1 map of the swap slots in this cluster. + * + * Each swap table entry could be a pointer (folio), a XA_VALUE + * (shadow), or NULL. + */ + +/* + * Helpers for casting one type of info into a swap table entry. + */ +static inline unsigned long null_to_swp_tb(void) +{ + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t)); + return 0; +} + +static inline unsigned long folio_to_swp_tb(struct folio *folio) +{ + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); + return (unsigned long)folio; +} + +static inline unsigned long shadow_swp_to_tb(void *shadow) +{ + BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != + BITS_PER_BYTE * sizeof(unsigned long)); + VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); + return (unsigned long)shadow; +} + +/* + * Helpers for swap table entry type checking. + */ +static inline bool swp_tb_is_null(unsigned long swp_tb) +{ + return !swp_tb; +} + +static inline bool swp_tb_is_folio(unsigned long swp_tb) +{ + return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb); +} + +static inline bool swp_tb_is_shadow(unsigned long swp_tb) +{ + return xa_is_value((void *)swp_tb); +} + +/* + * Helpers for retrieving info from swap table. + */ +static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_folio(swp_tb)); + return (void *)swp_tb; +} + +static inline void *swp_tb_to_shadow(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); + return (void *)swp_tb; +} + +/* + * Helpers for accessing or modifying the swap table of a cluster, + * the swap cluster must be locked. + */ +static inline void __swap_table_set(struct swap_cluster_info *ci, + unsigned int off, unsigned long swp_tb) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + atomic_long_set(&ci->table[off], swp_tb); +} + +static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, + unsigned int off, unsigned long swp_tb) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + /* Ordering is guaranteed by cluster lock, relax */ + return atomic_long_xchg_relaxed(&ci->table[off], swp_tb); +} + +static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, + unsigned int off) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + return atomic_long_read(&ci->table[off]); +} +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 51f781c43537..b183e96be289 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -46,6 +46,7 @@ #include #include #include +#include "swap_table.h" #include "internal.h" #include "swap.h" @@ -421,6 +422,34 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si, return cluster_index(si, ci) * SWAPFILE_CLUSTER; } +static int swap_cluster_alloc_table(struct swap_cluster_info *ci) +{ + WARN_ON(ci->table); + ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL); + if (!ci->table) + return -ENOMEM; + return 0; +} + +static void swap_cluster_free_table(struct swap_cluster_info *ci) +{ + unsigned int ci_off; + unsigned long swp_tb; + + if (!ci->table) + return; + + for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) { + swp_tb = __swap_table_get(ci, ci_off); + if (!swp_tb_is_null(swp_tb)) + pr_err_once("swap: unclean swap space on swapoff: 0x%lx", + swp_tb); + } + + kfree(ci->table); + ci->table = NULL; +} + static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) @@ -703,6 +732,26 @@ static bool cluster_scan_range(struct swap_info_struct *si, return true; } +/* + * Currently, the swap table is not used for count tracking, just + * do a sanity check here to ensure nothing leaked, so the swap + * table should be empty upon freeing. + */ +static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, + unsigned int start, unsigned int nr) +{ + unsigned int ci_off = start % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr; + unsigned long swp_tb; + + if (IS_ENABLED(CONFIG_DEBUG_VM)) { + do { + swp_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + } while (++ci_off < ci_end); + } +} + static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) @@ -722,6 +771,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster ci->order = order; memset(si->swap_map + start, usage, nr_pages); + swap_cluster_assert_table_empty(ci, start, nr_pages); swap_range_alloc(si, nr_pages); ci->count += nr_pages; @@ -1124,7 +1174,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } - swap_cache_clear_shadow(si->type, begin, end); + __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 @@ -1281,16 +1331,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) if (!entry.val) return -ENOMEM; - /* - * XArray node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - if (swap_cache_add_folio(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) - goto out_free; + swap_cache_add_folio(folio, entry, NULL); return 0; @@ -1556,6 +1597,7 @@ static void swap_entries_free(struct swap_info_struct *si, mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); + swap_cluster_assert_table_empty(ci, offset, nr_pages); if (!ci->count) free_cluster(si, ci); @@ -2634,6 +2676,18 @@ static void wait_for_allocation(struct swap_info_struct *si) } } +static void free_cluster_info(struct swap_cluster_info *cluster_info, + unsigned long maxpages) +{ + int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + + if (!cluster_info) + return; + for (i = 0; i < nr_clusters; i++) + swap_cluster_free_table(&cluster_info[i]); + kvfree(cluster_info); +} + /* * Called after swap device's reference count is dead, so * neither scan nor allocation will use it. @@ -2768,12 +2822,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; cluster_info = p->cluster_info; + free_cluster_info(cluster_info, p->max); + p->max = 0; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); @@ -2784,10 +2839,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); - exit_swap_address_space(p->type); inode = mapping->host; @@ -3171,8 +3224,11 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, if (!cluster_info) goto err; - for (i = 0; i < nr_clusters; i++) + for (i = 0; i < nr_clusters; i++) { spin_lock_init(&cluster_info[i].lock); + if (swap_cluster_alloc_table(&cluster_info[i])) + goto err_free; + } if (!(si->flags & SWP_SOLIDSTATE)) { si->global_cluster = kmalloc(sizeof(*si->global_cluster), @@ -3233,9 +3289,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, } return cluster_info; - err_free: - kvfree(cluster_info); + free_cluster_info(cluster_info, maxpages); err: return ERR_PTR(err); } @@ -3429,13 +3484,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - error = init_swap_address_space(si->type, maxpages); - if (error) - goto bad_swap_unlock_inode; - error = zswap_swapon(si->type, maxpages); if (error) - goto free_swap_address_space; + goto bad_swap_unlock_inode; /* * Flush any pending IO and dirty mappings before we start using this @@ -3470,8 +3521,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto out; free_swap_zswap: zswap_swapoff(si->type); -free_swap_address_space: - exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: @@ -3486,7 +3535,8 @@ bad_swap: spin_unlock(&swap_lock); vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); + if (cluster_info) + free_cluster_info(cluster_info, maxpages); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) diff --git a/mm/vmscan.c b/mm/vmscan.c index c79c6806560b..e170c12e2065 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -730,13 +730,18 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, { int refcount; void *shadow = NULL; + struct swap_cluster_info *ci; BUG_ON(!folio_test_locked(folio)); BUG_ON(mapping != folio_mapping(folio)); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + ci = swap_cluster_get_and_lock_irq(folio); + } else { spin_lock(&mapping->host->i_lock); - xa_lock_irq(&mapping->i_pages); + xa_lock_irq(&mapping->i_pages); + } + /* * The non racy check for a busy folio. * @@ -776,9 +781,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __swap_cache_del_folio(folio, swap, shadow); + __swap_cache_del_folio(ci, folio, swap, shadow); memcg1_swapout(folio, swap); - xa_unlock_irq(&mapping->i_pages); + swap_cluster_unlock_irq(ci); put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); @@ -816,9 +821,12 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, return 1; cannot_free: - xa_unlock_irq(&mapping->i_pages); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + swap_cluster_unlock_irq(ci); + } else { + xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); + } return 0; } -- cgit v1.2.3 From 6106864b878e1ce5ecab4b8ffffff85e9ec69b78 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 2 Sep 2025 08:36:11 +0000 Subject: maple_tree: remove lockdep_map_p typedef Having the ma_external_lock field exist when CONFIG_LOCKDEP=n isn't used anywhere, so just get rid of it. This also avoids generating a typedef called lockdep_map_p that could overlap with typedefs in other header files. Link: https://lkml.kernel.org/r/20250902-maple-lockdep-p-v1-1-3ae5a398a379@google.com Signed-off-by: Alice Ryhl Reviewed-by: Danilo Krummrich Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 05730171d201..47f9002ae92d 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -194,7 +194,6 @@ enum store_type { #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP -typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) @@ -207,7 +206,6 @@ typedef struct lockdep_map *lockdep_map_p; #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else -typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) @@ -230,8 +228,10 @@ typedef struct { /* nothing */ } lockdep_map_p; */ struct maple_tree { union { - spinlock_t ma_lock; - lockdep_map_p ma_external_lock; + spinlock_t ma_lock; +#ifdef CONFIG_LOCKDEP + struct lockdep_map *ma_external_lock; +#endif }; unsigned int ma_flags; void __rcu *ma_root; -- cgit v1.2.3 From 522abd92279a8ea55bcc687f77697d4c0aaba6c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Sep 2025 18:11:00 +0100 Subject: ptdesc: convert __page_flags to pt_flags Patch series "Some ptdesc cleanups". The first two patches here are preparation for splitting struct ptdesc from struct page and struct folio. I think their only dependency is on the memdesc_flags_t patches from August which is in mm-new. The third patch is just something I noticed while working on the code. This patch (of 3): Use the new memdesc_flags_t type to show that these are the same bits as page/folio/slab and thesefore have the zone/node/section information in them. Remove a use of ptdesc_folio() by converting pagetable_is_reserved() to use test_bit() directly. Link: https://lkml.kernel.org/r/20250908171104.2409217-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250908171104.2409217-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 ++++++- include/linux/mm_types.h | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a6bfa46937a8..8dd71392eba7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2934,6 +2934,11 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU */ +enum pt_flags { + PT_reserved = PG_reserved, + /* High bits are used for zone/node/section */ +}; + static inline struct ptdesc *virt_to_ptdesc(const void *x) { return page_ptdesc(virt_to_page(x)); @@ -2951,7 +2956,7 @@ static inline void *ptdesc_address(const struct ptdesc *pt) static inline bool pagetable_is_reserved(struct ptdesc *pt) { - return folio_test_reserved(ptdesc_folio(pt)); + return test_bit(PT_reserved, &pt->pt_flags.f); } /** diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ff2b4e13215f..f048dc80646e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -524,7 +524,7 @@ FOLIO_MATCH(compound_head, _head_3); /** * struct ptdesc - Memory descriptor for page tables. - * @__page_flags: Same as page flags. Powerpc only. + * @pt_flags: enum pt_flags plus zone/node/section. * @pt_rcu_head: For freeing page table pages. * @pt_list: List of used page tables. Used for s390 gmap shadow pages * (which are not linked into the user page tables) and x86 @@ -546,7 +546,7 @@ FOLIO_MATCH(compound_head, _head_3); * understanding of the issues. */ struct ptdesc { - unsigned long __page_flags; + memdesc_flags_t pt_flags; union { struct rcu_head pt_rcu_head; @@ -584,7 +584,7 @@ struct ptdesc { #define TABLE_MATCH(pg, pt) \ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) -TABLE_MATCH(flags, __page_flags); +TABLE_MATCH(flags, pt_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); -- cgit v1.2.3 From f0c92726e89f5c6c092526787465617a68af154f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Sep 2025 18:11:01 +0100 Subject: ptdesc: remove references to folios from __pagetable_ctor() and pagetable_dtor() In preparation for splitting struct ptdesc from struct page and struct folio, remove mentions of struct folio from these functions. Introduce ptdesc_nr_pages() to avoid using lruvec_stat_add/sub_folio() Link: https://lkml.kernel.org/r/20250908171104.2409217-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8dd71392eba7..25f56e209ec8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2097,9 +2097,9 @@ static inline long folio_nr_pages(const struct folio *folio) * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ -static inline long compound_nr(struct page *page) +static inline long compound_nr(const struct page *page) { - struct folio *folio = (struct folio *)page; + const struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags.f)) return 1; @@ -3066,21 +3066,26 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ +static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc) +{ + return compound_nr(ptdesc_page(ptdesc)); +} + static inline void __pagetable_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __SetPageTable(ptdesc_page(ptdesc)); + mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc)); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); + __ClearPageTable(ptdesc_page(ptdesc)); + mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc)); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) -- cgit v1.2.3 From 90ec2df9dd31653ceac4a35d2440b108bdf27550 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Sep 2025 18:11:02 +0100 Subject: ptdesc: remove ptdesc_to_virt() This has the same effect as ptdesc_address() so convert the callers to use that and delete the function. Add kernel-doc for ptdesc_address(). Link: https://lkml.kernel.org/r/20250908171104.2409217-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- arch/arm/mm/mmu.c | 2 +- arch/s390/mm/pgalloc.c | 6 +++--- include/linux/mm.h | 11 ++++++----- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index edb7f56b7c91..8bac96e205ac 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -737,7 +737,7 @@ static void *__init late_alloc(unsigned long sz) if (!ptdesc || !pagetable_pte_ctor(NULL, ptdesc)) BUG(); - return ptdesc_to_virt(ptdesc); + return ptdesc_address(ptdesc); } static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr, diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d2f6f1f6d2fc..f56ee9aeac83 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -21,7 +21,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (!ptdesc) return NULL; - table = ptdesc_to_virt(ptdesc); + table = ptdesc_address(ptdesc); __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } @@ -119,7 +119,7 @@ struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) ptdesc = pagetable_alloc(GFP_KERNEL, 0); if (ptdesc) { - table = (u64 *)ptdesc_to_virt(ptdesc); + table = (u64 *)ptdesc_address(ptdesc); __arch_set_page_dat(table, 1); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); @@ -146,7 +146,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) pagetable_free(ptdesc); return NULL; } - table = ptdesc_to_virt(ptdesc); + table = ptdesc_address(ptdesc); __arch_set_page_dat(table, 1); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); diff --git a/include/linux/mm.h b/include/linux/mm.h index 25f56e209ec8..da6e0abad2cb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2944,11 +2944,12 @@ static inline struct ptdesc *virt_to_ptdesc(const void *x) return page_ptdesc(virt_to_page(x)); } -static inline void *ptdesc_to_virt(const struct ptdesc *pt) -{ - return page_to_virt(ptdesc_page(pt)); -} - +/** + * ptdesc_address - Virtual address of page table. + * @pt: Page table descriptor. + * + * Return: The first byte of the page table described by @pt. + */ static inline void *ptdesc_address(const struct ptdesc *pt) { return folio_address(ptdesc_folio(pt)); -- cgit v1.2.3 From 602837268999912b3c0e0db21b67818ffbde7141 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 8 Sep 2025 16:55:34 +0200 Subject: readahead: add trace points Add a couple of trace points to make debugging readahead logic easier. [jack@suse.cz: v2] Link: https://lkml.kernel.org/r/20250909145849.5090-2-jack@suse.cz Link: https://lkml.kernel.org/r/20250908145533.31528-2-jack@suse.cz Signed-off-by: Jan Kara Tested-by: Pankaj Raghav Signed-off-by: Andrew Morton --- include/trace/events/readahead.h | 132 +++++++++++++++++++++++++++++++++++++++ mm/readahead.c | 8 +++ 2 files changed, 140 insertions(+) create mode 100644 include/trace/events/readahead.h (limited to 'include') diff --git a/include/trace/events/readahead.h b/include/trace/events/readahead.h new file mode 100644 index 000000000000..0997ac5eceab --- /dev/null +++ b/include/trace/events/readahead.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM readahead + +#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_READAHEAD_H + +#include +#include +#include +#include +#include + +TRACE_EVENT(page_cache_ra_unbounded, + TP_PROTO(struct inode *inode, pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_size), + + TP_ARGS(inode, index, nr_to_read, lookahead_size), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(pgoff_t, index) + __field(unsigned long, nr_to_read) + __field(unsigned long, lookahead_size) + ), + + TP_fast_assign( + __entry->i_ino = inode->i_ino; + __entry->s_dev = inode->i_sb->s_dev; + __entry->index = index; + __entry->nr_to_read = nr_to_read; + __entry->lookahead_size = lookahead_size; + ), + + TP_printk( + "dev=%d:%d ino=%lx index=%lu nr_to_read=%lu lookahead_size=%lu", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, + __entry->index, __entry->nr_to_read, __entry->lookahead_size + ) +); + +TRACE_EVENT(page_cache_ra_order, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra), + + TP_ARGS(inode, index, ra), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(pgoff_t, index) + __field(unsigned int, order) + __field(unsigned int, size) + __field(unsigned int, async_size) + __field(unsigned int, ra_pages) + ), + + TP_fast_assign( + __entry->i_ino = inode->i_ino; + __entry->s_dev = inode->i_sb->s_dev; + __entry->index = index; + __entry->order = ra->order; + __entry->size = ra->size; + __entry->async_size = ra->async_size; + __entry->ra_pages = ra->ra_pages; + ), + + TP_printk( + "dev=%d:%d ino=%lx index=%lu order=%u size=%u async_size=%u ra_pages=%u", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, + __entry->index, __entry->order, __entry->size, + __entry->async_size, __entry->ra_pages + ) +); + +DECLARE_EVENT_CLASS(page_cache_ra_op, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, + unsigned long req_count), + + TP_ARGS(inode, index, ra, req_count), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(pgoff_t, index) + __field(unsigned int, order) + __field(unsigned int, size) + __field(unsigned int, async_size) + __field(unsigned int, ra_pages) + __field(unsigned int, mmap_miss) + __field(loff_t, prev_pos) + __field(unsigned long, req_count) + ), + + TP_fast_assign( + __entry->i_ino = inode->i_ino; + __entry->s_dev = inode->i_sb->s_dev; + __entry->index = index; + __entry->order = ra->order; + __entry->size = ra->size; + __entry->async_size = ra->async_size; + __entry->ra_pages = ra->ra_pages; + __entry->mmap_miss = ra->mmap_miss; + __entry->prev_pos = ra->prev_pos; + __entry->req_count = req_count; + ), + + TP_printk( + "dev=%d:%d ino=%lx index=%lu req_count=%lu order=%u size=%u async_size=%u ra_pages=%u mmap_miss=%u prev_pos=%lld", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, + __entry->index, __entry->req_count, __entry->order, + __entry->size, __entry->async_size, __entry->ra_pages, + __entry->mmap_miss, __entry->prev_pos + ) +); + +DEFINE_EVENT(page_cache_ra_op, page_cache_sync_ra, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, + unsigned long req_count), + TP_ARGS(inode, index, ra, req_count) +); + +DEFINE_EVENT(page_cache_ra_op, page_cache_async_ra, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, + unsigned long req_count), + TP_ARGS(inode, index, ra, req_count) +); + +#endif /* _TRACE_FILEMAP_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/readahead.c b/mm/readahead.c index 406756d34309..3a4b5d58eeb6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -129,6 +129,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "internal.h" /* @@ -225,6 +228,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, */ unsigned int nofs = memalloc_nofs_save(); + trace_page_cache_ra_unbounded(mapping->host, index, nr_to_read, + lookahead_size); filemap_invalidate_lock_shared(mapping); index = mapping_align_index(mapping, index); @@ -470,6 +475,7 @@ void page_cache_ra_order(struct readahead_control *ractl, gfp_t gfp = readahead_gfp_mask(mapping); unsigned int new_order = ra->order; + trace_page_cache_ra_order(mapping->host, start, ra); if (!mapping_large_folio_support(mapping)) { ra->order = 0; goto fallback; @@ -554,6 +560,7 @@ void page_cache_sync_ra(struct readahead_control *ractl, unsigned long max_pages, contig_count; pgoff_t prev_index, miss; + trace_page_cache_sync_ra(ractl->mapping->host, index, ra, req_count); /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced @@ -638,6 +645,7 @@ void page_cache_async_ra(struct readahead_control *ractl, if (folio_test_writeback(folio)) return; + trace_page_cache_async_ra(ractl->mapping->host, index, ra, req_count); folio_clear_readahead(folio); if (blk_cgroup_congested()) -- cgit v1.2.3 From e7a5f249e6db3b41a4618763e0f840639f3578f4 Mon Sep 17 00:00:00 2001 From: Chanwon Park Date: Mon, 8 Sep 2025 19:04:10 +0900 Subject: mm: re-enable kswapd when memory pressure subsides or demotion is toggled If kswapd fails to reclaim pages from a node MAX_RECLAIM_RETRIES in a row, kswapd on that node gets disabled. That is, the system won't wakeup kswapd for that node until page reclamation is observed at least once. That reclamation is mostly done by direct reclaim, which in turn enables kswapd back. However, on systems with CXL memory nodes, workloads with high anon page usage can disable kswapd indefinitely, without triggering direct reclaim. This can be reproduced with following steps: numa node 0 (32GB memory, 48 CPUs) numa node 2~5 (512GB CXL memory, 128GB each) (numa node 1 is disabled) swap space 8GB 1) Set /sys/kernel/mm/demotion_enabled to 0. 2) Set /proc/sys/kernel/numa_balancing to 0. 3) Run a process that allocates and random accesses 500GB of anon pages. 4) Let the process exit normally. During 3), free memory on node 0 gets lower than low watermark, and kswapd runs and depletes swap space. Then, kswapd fails consecutively and gets disabled. Allocation afterwards happens on CXL memory, so node 0 never gains more memory pressure to trigger direct reclaim. After 4), kswapd on node 0 remains disabled, and tasks running on that node are unable to swap. If you turn on NUMA_BALANCING_MEMORY_TIERING and demotion now, it won't work properly since kswapd is disabled. To mitigate this problem, reset kswapd_failures to 0 on following conditions: a) ZONE_BELOW_HIGH bit of a zone in hopeless node with a fallback memory node gets cleared. b) demotion_enabled is changed from false to true. Rationale for a): ZONE_BELOW_HIGH bit being cleared might be a sign that the node may be reclaimable afterwards. This won't help much if the memory-hungry process keeps running without freeing anything, but at least the node will go back to reclaimable state when the process exits. Rationale for b): When demotion_enabled is false, kswapd can only reclaim anon pages by swapping them out to swap space. If demotion_enabled is turned on, kswapd can demote anon pages to another node for reclaiming. So, the original failure count for determining reclaimability is no longer valid. Since kswapd_failures resets may be missed by ++ operation, it is changed from int to atomic_t. [akpm@linux-foundation.org: tweak whitespace] Link: https://lkml.kernel.org/r/aL6qGi69jWXfPc4D@pcw-MS-7D22 Signed-off-by: Chanwon Park Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- mm/memory-tiers.c | 12 ++++++++++++ mm/page_alloc.c | 29 ++++++++++++++++++++++------- mm/show_mem.c | 3 ++- mm/vmscan.c | 14 +++++++------- mm/vmstat.c | 2 +- 6 files changed, 45 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6c4eae96160d..7fb7331c5725 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1440,7 +1440,7 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_highest_zoneidx; - int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 0382b6942b8b..0ea5c13f10a2 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -942,11 +942,23 @@ static ssize_t demotion_enabled_store(struct kobject *kobj, const char *buf, size_t count) { ssize_t ret; + bool before = numa_demotion_enabled; ret = kstrtobool(buf, &numa_demotion_enabled); if (ret) return ret; + /* + * Reset kswapd_failures statistics. They may no longer be + * valid since the policy for kswapd has changed. + */ + if (before == false && numa_demotion_enabled == true) { + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + atomic_set(&pgdat->kswapd_failures, 0); + } + return count; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df6df302d0c5..6ff9f17d5f4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2860,14 +2860,29 @@ static void free_frozen_page_commit(struct zone *zone, */ return; } + high = nr_pcp_high(pcp, zone, batch, free_high); - if (pcp->count >= high) { - free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), - pcp, pindex); - if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && - zone_watermark_ok(zone, 0, high_wmark_pages(zone), - ZONE_MOVABLE, 0)) - clear_bit(ZONE_BELOW_HIGH, &zone->flags); + if (pcp->count < high) + return; + + free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), + pcp, pindex); + if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && + zone_watermark_ok(zone, 0, high_wmark_pages(zone), + ZONE_MOVABLE, 0)) { + struct pglist_data *pgdat = zone->zone_pgdat; + clear_bit(ZONE_BELOW_HIGH, &zone->flags); + + /* + * Assume that memory pressure on this node is gone and may be + * in a reclaimable state. If a memory fallback node exists, + * direct reclaim may not have been triggered, causing a + * 'hopeless node' to stay in that state for a while. Let + * kswapd work again by resetting kswapd_failures. + */ + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES && + next_memory_node(pgdat->node_id) < MAX_NUMNODES) + atomic_set(&pgdat->kswapd_failures, 0); } } diff --git a/mm/show_mem.c b/mm/show_mem.c index 90a9a37116e7..3a4b5207635d 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -278,7 +278,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES), + str_yes_no(atomic_read(&pgdat->kswapd_failures) >= + MAX_RECLAIM_RETRIES), K(node_page_state(pgdat, NR_BALLOON_PAGES))); } diff --git a/mm/vmscan.c b/mm/vmscan.c index e170c12e2065..b2fc8b626d3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -518,7 +518,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; /* @@ -5101,7 +5101,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * blk_finish_plug(&plug); done: if (sc->nr_reclaimed > reclaimed) - pgdat->kswapd_failures = 0; + atomic_set(&pgdat->kswapd_failures, 0); } /****************************************************************************** @@ -6180,7 +6180,7 @@ again: * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) - pgdat->kswapd_failures = 0; + atomic_set(&pgdat->kswapd_failures, 0); else if (sc->cache_trim_mode) sc->cache_trim_mode_failed = 1; } @@ -6492,7 +6492,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) int i; bool wmark_ok; - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { @@ -6902,7 +6902,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { @@ -7170,7 +7170,7 @@ restart: } if (!sc.nr_reclaimed) - pgdat->kswapd_failures++; + atomic_inc(&pgdat->kswapd_failures); out: clear_reclaim_active(pgdat, highest_zoneidx); @@ -7429,7 +7429,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; /* Hopeless node, leave it to direct reclaim if possible */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* diff --git a/mm/vmstat.c b/mm/vmstat.c index e522decf6a72..bb09c032eecf 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1848,7 +1848,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n node_unreclaimable: %u" "\n start_pfn: %lu", - pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, + atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn); seq_putc(m, '\n'); } -- cgit v1.2.3 From 032c31127f27acb1b8152b512830ecef04ed2ebc Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 9 Sep 2025 13:13:57 -0700 Subject: mm: vm_event_item: explicit #include for THREAD_SIZE This header uses THREAD_SIZE, which is provided by the thread_info.h header but is not included in this header. Depending on the #include ordering in other files, this can produce preprocessor errors. Link: https://lkml.kernel.org/r/20250909201419.827638-1-briannorris@chromium.org Signed-off-by: Brian Norris Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9e15a088ba38..92f80b4d69a6 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -2,6 +2,8 @@ #ifndef VM_EVENT_ITEM_H_INCLUDED #define VM_EVENT_ITEM_H_INCLUDED +#include + #ifdef CONFIG_ZONE_DMA #define DMA_ZONE(xx) xx##_DMA, #else -- cgit v1.2.3 From fa17bcd5f65ed702df001579cca8c885fa6bf3e7 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Tue, 26 Aug 2025 11:37:21 -0400 Subject: mm: make folio page count functions return unsigned As raised by Andrew [1], a folio/compound page never spans a negative number of pages. Consequently, let's use "unsigned long" instead of "long" consistently for folio_nr_pages(), folio_large_nr_pages() and compound_nr(). Using "unsigned long" as return value is fine, because even "(long)-folio_nr_pages()" will keep on working as expected. Using "unsigned int" instead would actually break these use cases. This patch takes the first step changing these to return unsigned long (and making drm_gem_get_pages() use the new types instead of replacing min()). In the future, we might want to make more callers of these functions to consistently use "unsigned long". Link: https://lore.kernel.org/linux-mm/20250503182858.5a02729fcffd6d4723afcfc2@linux-foundation.org/ Link: https://lkml.kernel.org/r/20250826153721.GA23292@cathedrallabs.org Link: https://lore.kernel.org/linux-mm/20250503182858.5a02729fcffd6d4723afcfc2@linux-foundation.org/ [1] Signed-off-by: Aristeu Rozanski Suggested-by: Andrew Morton Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Simona Vetter Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- drivers/gpu/drm/drm_gem.c | 4 ++-- include/linux/mm.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 6a44351e58b7..5dbcf91ff73c 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -621,7 +621,7 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj) struct page **pages; struct folio *folio; struct folio_batch fbatch; - long i, j, npages; + unsigned long i, j, npages; if (WARN_ON(!obj->filp)) return ERR_PTR(-EINVAL); @@ -645,7 +645,7 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj) i = 0; while (i < npages) { - long nr; + unsigned long nr; folio = shmem_read_folio_gfp(mapping, i, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) diff --git a/include/linux/mm.h b/include/linux/mm.h index da6e0abad2cb..8f5b4df9b166 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1018,12 +1018,12 @@ static inline unsigned int folio_large_order(const struct folio *folio) } #ifdef NR_PAGES_IN_LARGE_FOLIO -static inline long folio_large_nr_pages(const struct folio *folio) +static inline unsigned long folio_large_nr_pages(const struct folio *folio) { return folio->_nr_pages; } #else -static inline long folio_large_nr_pages(const struct folio *folio) +static inline unsigned long folio_large_nr_pages(const struct folio *folio) { return 1L << folio_large_order(folio); } @@ -2062,7 +2062,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone, * * Return: A positive power of two. */ -static inline long folio_nr_pages(const struct folio *folio) +static inline unsigned long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; @@ -2097,7 +2097,7 @@ static inline long folio_nr_pages(const struct folio *folio) * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ -static inline long compound_nr(const struct page *page) +static inline unsigned long compound_nr(const struct page *page) { const struct folio *folio = (struct folio *)page; -- cgit v1.2.3 From 3a37469e5ac004905e4125bf60b43cc5216e83dc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Sep 2025 15:29:17 +0100 Subject: mm: constify compound_order() and page_size() Patch series "Small cleanups". These small cleanups can be applied now to reduce conflicts during the next merge window. They're all from various efforts to split struct page from other memdescs. Thanks to Vlastimil for the suggestion. This patch (of 3): These functions do not modify their arguments. Telling the compiler this may improve code generation, and allows us to pass const arguments from other functions. Link: https://lkml.kernel.org/r/20250910142923.2465470-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250910142923.2465470-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f5b4df9b166..fcb1e72eea40 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1036,9 +1036,9 @@ static inline unsigned long folio_large_nr_pages(const struct folio *folio) * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ -static inline unsigned int compound_order(struct page *page) +static inline unsigned int compound_order(const struct page *page) { - struct folio *folio = (struct folio *)page; + const struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags.f)) return 0; @@ -1256,7 +1256,7 @@ int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. */ -static inline unsigned long page_size(struct page *page) +static inline unsigned long page_size(const struct page *page) { return PAGE_SIZE << compound_order(page); } -- cgit v1.2.3 From 9d003dec972563efb8ce14c9962af3652d0e201d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Sep 2025 15:29:19 +0100 Subject: mm: remove page->order We already use page->private for storing the order of a page while it's in the buddy allocator system; extend that to also storing the order while it's in the pcp_llist. Link: https://lkml.kernel.org/r/20250910142923.2465470-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Alexei Starovoitov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 +++----- mm/page_alloc.c | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f048dc80646e..6920c816f6c6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -97,10 +97,7 @@ struct page { /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; - struct { - struct llist_node pcp_llist; - unsigned int order; - }; + struct llist_node pcp_llist; }; struct address_space *mapping; union { @@ -111,7 +108,8 @@ struct page { * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. * Used for swp_entry_t if swapcache flag set. - * Indicates order in the buddy system if PageBuddy. + * Indicates order in the buddy system if PageBuddy + * or on pcp_llist. */ unsigned long private; }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf38d499e045..2bfab96c207f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1520,7 +1520,7 @@ static void add_page_to_zone_llist(struct zone *zone, struct page *page, unsigned int order) { /* Remember the order */ - page->order = order; + page->private = order; /* Add the page to the free list */ llist_add(&page->pcp_llist, &zone->trylock_free_pages); } @@ -1549,7 +1549,7 @@ static void free_one_page(struct zone *zone, struct page *page, llnode = llist_del_all(llhead); llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { - unsigned int p_order = p->order; + unsigned int p_order = p->private; split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags); __count_vm_events(PGFREE, 1 << p_order); -- cgit v1.2.3 From d02ac836e4d6bdfd7d44927d01a4cd048ad4aba8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 13 Sep 2025 17:03:39 -0700 Subject: include/linux/pgtable.h: convert arch_enter_lazy_mmu_mode() and friends to static inlines For all the usual reasons, plus a new one. Calling (void)arch_enter_lazy_mmu_mode(); deservedly blows up. Cc: Balbir Singh Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94249e671a7e..32e8457ad535 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -232,9 +232,9 @@ static inline int pmd_dirty(pmd_t pmd) * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() do {} while (0) -#define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_flush_lazy_mmu_mode() do {} while (0) +static inline void arch_enter_lazy_mmu_mode(void) {} +static inline void arch_leave_lazy_mmu_mode(void) {} +static inline void arch_flush_lazy_mmu_mode(void) {} #endif #ifndef pte_batch_hint -- cgit v1.2.3 From 59d4d36158ba3cdbce141d8e9261eea154d4c441 Mon Sep 17 00:00:00 2001 From: zhongjinji Date: Tue, 16 Sep 2025 00:29:45 +0800 Subject: mm/oom_kill: thaw the entire OOM victim process Patch series "Improvements to Victim Process Thawing and OOM Reaper Traversal Order", v10. This patch series focuses on optimizing victim process thawing and refining the traversal order of the OOM reaper. Since __thaw_task() is used to thaw a single thread of the victim, thawing only one thread cannot guarantee the exit of the OOM victim when it is frozen. Patch 1 thaw the entire process of the OOM victim to ensure that OOM victims are able to terminate themselves. Even if the oom_reaper is delayed, patch 2 is still beneficial for reaping processes with a large address space footprint, and it also greatly improves process_mrelease. This patch (of 10): OOM killer is a mechanism that selects and kills processes when the system runs out of memory to reclaim resources and keep the system stable. But the oom victim cannot terminate on its own when it is frozen, even if the OOM victim task is thawed through __thaw_task(). This is because __thaw_task() can only thaw a single OOM victim thread, and cannot thaw the entire OOM victim process. In addition, freezing_slow_path() determines whether a task is an OOM victim by checking the task's TIF_MEMDIE flag. When a task is identified as an OOM victim, the freezer bypasses both PM freezing and cgroup freezing states to thaw it. Historically, TIF_MEMDIE was a "this is the oom victim & it has access to memory reserves" flag in the past. It has that thread vs. process problems and tsk_is_oom_victim was introduced later to get rid of them and other issues as well as the guarantee that we can identify the oom victim's mm reliably for other oom_reaper. Therefore, thaw_process() is introduced to unfreeze all threads within the OOM victim process, ensuring that every thread is properly thawed. The freezer now uses tsk_is_oom_victim() to determine OOM victim status, allowing all victim threads to be unfrozen as necessary. With this change, the entire OOM victim process will be thawed when an OOM event occurs, ensuring that the victim can terminate on its own. Link: https://lkml.kernel.org/r/20250915162946.5515-1-zhongjinji@honor.com Link: https://lkml.kernel.org/r/20250915162946.5515-2-zhongjinji@honor.com Signed-off-by: zhongjinji Reviewed-by: Suren Baghdasaryan Acked-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Liam R. Howlett Cc: David Rientjes Cc: Len Brown Cc: Lorenzo Stoakes Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- include/linux/freezer.h | 2 ++ kernel/freezer.c | 20 +++++++++++++++++++- mm/oom_kill.c | 10 +++++----- 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index b303472255be..32884c9721e5 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -47,6 +47,7 @@ extern int freeze_processes(void); extern int freeze_kernel_threads(void); extern void thaw_processes(void); extern void thaw_kernel_threads(void); +extern void thaw_process(struct task_struct *p); static inline bool try_to_freeze(void) { @@ -80,6 +81,7 @@ static inline int freeze_processes(void) { return -ENOSYS; } static inline int freeze_kernel_threads(void) { return -ENOSYS; } static inline void thaw_processes(void) {} static inline void thaw_kernel_threads(void) {} +static inline void thaw_process(struct task_struct *p) {} static inline bool try_to_freeze(void) { return false; } diff --git a/kernel/freezer.c b/kernel/freezer.c index 6a96149aede9..ddc11a8bd2ea 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -10,6 +10,7 @@ #include #include #include +#include #include /* total number of freezing conditions in effect */ @@ -40,7 +41,7 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; - if (test_tsk_thread_flag(p, TIF_MEMDIE)) + if (tsk_is_oom_victim(p)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) @@ -206,6 +207,23 @@ void __thaw_task(struct task_struct *p) wake_up_state(p, TASK_FROZEN); } +/* + * thaw_process - Thaw a frozen process + * @p: the process to be thawed + * + * Iterate over all threads of @p and call __thaw_task() on each. + */ +void thaw_process(struct task_struct *p) +{ + struct task_struct *t; + + rcu_read_lock(); + for_each_thread(p, t) { + __thaw_task(t); + } + rcu_read_unlock(); +} + /** * set_freezable - make %current freezable * diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 58bd4cf71d52..22caef65f1d0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -772,12 +772,12 @@ static void mark_oom_victim(struct task_struct *tsk) mmgrab(tsk->signal->oom_mm); /* - * Make sure that the task is woken up from uninterruptible sleep - * if it is frozen because OOM killer wouldn't be able to free - * any memory and livelock. freezing_slow_path will tell the freezer - * that TIF_MEMDIE tasks should be ignored. + * Make sure that the process is woken up from uninterruptible sleep + * if it is frozen because OOM killer wouldn't be able to free any + * memory and livelock. The freezer will thaw the tasks that are OOM + * victims regardless of the PM freezing and cgroup freezing states. */ - __thaw_task(tsk); + thaw_process(tsk); atomic_inc(&oom_victims); cred = get_task_cred(tsk); trace_mark_victim(tsk, cred->uid.val); -- cgit v1.2.3 From b9e2f58ffb84afcbba7e66f96ca14f98e0e88f26 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 15 Sep 2025 16:02:24 -0700 Subject: alloc_tag: mark inaccurate allocation counters in /proc/allocinfo output While rare, memory allocation profiling can contain inaccurate counters if slab object extension vector allocation fails. That allocation might succeed later but prior to that, slab allocations that would have used that object extension vector will not be accounted for. To indicate incorrect counters, "accurate:no" marker is appended to the call site line in the /proc/allocinfo output. Bump up /proc/allocinfo version to reflect the change in the file format and update documentation. Example output with invalid counters: allocinfo - version: 2.0 0 0 arch/x86/kernel/kdebugfs.c:105 func:create_setup_data_nodes 0 0 arch/x86/kernel/alternative.c:2090 func:alternatives_smp_module_add 0 0 arch/x86/kernel/alternative.c:127 func:__its_alloc accurate:no 0 0 arch/x86/kernel/fpu/regset.c:160 func:xstateregs_set 0 0 arch/x86/kernel/fpu/xstate.c:1590 func:fpstate_realloc 0 0 arch/x86/kernel/cpu/aperfmperf.c:379 func:arch_enable_hybrid_capacity_scale 0 0 arch/x86/kernel/cpu/amd_cache_disable.c:258 func:init_amd_l3_attrs 49152 48 arch/x86/kernel/cpu/mce/core.c:2709 func:mce_device_create accurate:no 32768 1 arch/x86/kernel/cpu/mce/genpool.c:132 func:mce_gen_pool_create 0 0 arch/x86/kernel/cpu/mce/amd.c:1341 func:mce_threshold_create_device [surenb@google.com: document new "accurate:no" marker] Fixes: 39d117e04d15 ("alloc_tag: mark inaccurate allocation counters in /proc/allocinfo output") [akpm@linux-foundation.org: simplification per Usama, reflow text] [akpm@linux-foundation.org: add newline to prevent docs warning, per Randy] Link: https://lkml.kernel.org/r/20250915230224.4115531-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Usama Arif Acked-by: Johannes Weiner Cc: David Rientjes Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Roman Gushchin Cc: Sourav Panda Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 13 +++++++++++++ include/linux/alloc_tag.h | 12 ++++++++++++ include/linux/codetag.h | 5 ++++- lib/alloc_tag.c | 4 +++- mm/slub.c | 2 ++ 5 files changed, 34 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 915a3e44bc12..6ffd04c736db 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1009,6 +1009,19 @@ number, module (if originates from a loadable module) and the function calling the allocation. The number of bytes allocated and number of calls at each location are reported. The first line indicates the version of the file, the second line is the header listing fields in the file. +If file version is 2.0 or higher then each line may contain additional +: pairs representing extra information about the call site. +For example if the counters are not accurate, the line will be appended with +"accurate:no" pair. + +Supported markers in v2: +accurate:no + + Absolute values of the counters in this line are not accurate + because of the failure to allocate memory to track some of the + allocations made at this location. Deltas in these counters are + accurate, therefore counters can be used to track allocation size + and count changes. Example output. diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 9ef2633e2c08..d40ac39bfbe8 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -221,6 +221,16 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) ref->ct = NULL; } +static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag) +{ + tag->ct.flags |= CODETAG_FLAG_INACCURATE; +} + +static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag) +{ + return !!(tag->ct.flags & CODETAG_FLAG_INACCURATE); +} + #define alloc_tag_record(p) ((p) = current->alloc_tag) #else /* CONFIG_MEM_ALLOC_PROFILING */ @@ -230,6 +240,8 @@ static inline bool mem_alloc_profiling_enabled(void) { return false; } static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) {} static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} +static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag) {} +static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag) { return false; } #define alloc_tag_record(p) do {} while (0) #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/include/linux/codetag.h b/include/linux/codetag.h index 457ed8fd3214..8ea2a5f7c98a 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -16,13 +16,16 @@ struct module; #define CODETAG_SECTION_START_PREFIX "__start_" #define CODETAG_SECTION_STOP_PREFIX "__stop_" +/* codetag flags */ +#define CODETAG_FLAG_INACCURATE (1 << 0) + /* * An instance of this structure is created in a special ELF section at every * code location being tagged. At runtime, the special section is treated as * an array of these. */ struct codetag { - unsigned int flags; /* used in later patches */ + unsigned int flags; unsigned int lineno; const char *modname; const char *function; diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index f79217427d81..3ef702e6b69a 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -80,7 +80,7 @@ static void allocinfo_stop(struct seq_file *m, void *arg) static void print_allocinfo_header(struct seq_buf *buf) { /* Output format version, so we can change it. */ - seq_buf_printf(buf, "allocinfo - version: 1.0\n"); + seq_buf_printf(buf, "allocinfo - version: 2.0\n"); seq_buf_printf(buf, "# \n"); } @@ -92,6 +92,8 @@ static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls); codetag_to_text(out, ct); + if (unlikely(alloc_tag_is_inaccurate(tag))) + seq_buf_printf(out, " accurate:no"); seq_buf_putc(out, ' '); seq_buf_putc(out, '\n'); } diff --git a/mm/slub.c b/mm/slub.c index af343ca570b5..9c04f29ee8de 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2143,6 +2143,8 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) */ if (likely(obj_exts)) alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); + else + alloc_tag_set_inaccurate(current->alloc_tag); } static inline void -- cgit v1.2.3 From ab152db3cae520154d572cff32e63de441672454 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:35:05 -0700 Subject: mm/damon/core: implement damon_initialized() function Patch series "mm/damon: define and use DAMON initialization check function". DAMON is initialized in subsystem initialization time, by damon_init(). If DAMON API functions are called before the initialization, the system could crash. Actually such issues happened and were fixed [1] in the past. For the fix, DAMON API callers have updated to check if DAMON is initialized or not, using their own hacks. The hacks are unnecessarily duplicated on every DAMON API callers and therefore it would be difficult to reliably maintain in the long term. Make it reliable and easy to maintain. For this, implement a new DAMON core layer API function that returns if DAMON is successfully initialized. If it returns true, it means DAMON API functions are safe to be used. After the introduction of the new API, update DAMON API callers to use the new function instead of their own hacks. This patch (of 7): If DAMON is tried to be used when it is not yet successfully initialized, the caller could be crashed. DAMON core layer is not providing a reliable way to see if it is successfully initialized and therefore ready to be used, though. As a result, DAMON API callers are implementing their own hacks to see it. The hacks simply assume DAMON should be ready on module init time. It is not reliable as DAMON initialization can indeed fail if KMEM_CACHE() fails, and difficult to maintain as those are duplicates. Implement a core layer API function for better reliability and maintainability to replace the hacks with followup commits. Link: https://lkml.kernel.org/r/20250916033511.116366-2-sj@kernel.org Link: https://lkml.kernel.org/r/20250916033511.116366-2-sj@kernel.org Link: https://lore.kernel.org/20250909022238.2989-1-sj@kernel.org [1] Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index aa7381be388c..cae8c613c5fc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -938,6 +938,7 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs } +bool damon_initialized(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); bool damon_is_running(struct damon_ctx *ctx); diff --git a/mm/damon/core.c b/mm/damon/core.c index 775121ae7a9b..93848b4c6944 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2863,6 +2863,16 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, r->nr_accesses++; } +/** + * damon_initialized() - Return if DAMON is ready to be used. + * + * Return: true if DAMON is ready to be used, false otherwise. + */ +bool damon_initialized(void) +{ + return damon_region_cache != NULL; +} + static int __init damon_init(void) { damon_region_cache = KMEM_CACHE(damon_region, 0); -- cgit v1.2.3 From 473b73222f3d8cc66bcd840bf9c3260619620789 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Mon, 8 Sep 2025 13:20:28 +0530 Subject: mm: drop all references of writable and SCAN_PAGE_RO Now that all actionable outcomes from checking pte_write() are gone, drop the related references. Link: https://lkml.kernel.org/r/20250908075028.38431-3-dev.jain@arm.com Signed-off-by: Dev Jain Acked-by: David Hildenbrand Acked-by: Zi Yan Reviewed-by: Kiryl Shutsemau Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Reviewed-by: Zach O'Keefe Reviewed-by: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Hugh Dickins Cc: Liam Howlett Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 19 ++++++------------- mm/khugepaged.c | 14 +++----------- 2 files changed, 9 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 2305df6cb485..dd94d14a2427 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -19,7 +19,6 @@ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ EM( SCAN_PTE_MAPPED_HUGEPAGE, "pte_mapped_hugepage") \ - EM( SCAN_PAGE_RO, "no_writable_page") \ EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \ EM( SCAN_PAGE_NULL, "page_null") \ EM( SCAN_SCAN_ABORT, "scan_aborted") \ @@ -55,15 +54,14 @@ SCAN_STATUS TRACE_EVENT(mm_khugepaged_scan_pmd, - TP_PROTO(struct mm_struct *mm, struct folio *folio, bool writable, + TP_PROTO(struct mm_struct *mm, struct folio *folio, int referenced, int none_or_zero, int status, int unmapped), - TP_ARGS(mm, folio, writable, referenced, none_or_zero, status, unmapped), + TP_ARGS(mm, folio, referenced, none_or_zero, status, unmapped), TP_STRUCT__entry( __field(struct mm_struct *, mm) __field(unsigned long, pfn) - __field(bool, writable) __field(int, referenced) __field(int, none_or_zero) __field(int, status) @@ -73,17 +71,15 @@ TRACE_EVENT(mm_khugepaged_scan_pmd, TP_fast_assign( __entry->mm = mm; __entry->pfn = folio ? folio_pfn(folio) : -1; - __entry->writable = writable; __entry->referenced = referenced; __entry->none_or_zero = none_or_zero; __entry->status = status; __entry->unmapped = unmapped; ), - TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d", + TP_printk("mm=%p, scan_pfn=0x%lx, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d", __entry->mm, __entry->pfn, - __entry->writable, __entry->referenced, __entry->none_or_zero, __print_symbolic(__entry->status, SCAN_STATUS), @@ -117,15 +113,14 @@ TRACE_EVENT(mm_collapse_huge_page, TRACE_EVENT(mm_collapse_huge_page_isolate, TP_PROTO(struct folio *folio, int none_or_zero, - int referenced, bool writable, int status), + int referenced, int status), - TP_ARGS(folio, none_or_zero, referenced, writable, status), + TP_ARGS(folio, none_or_zero, referenced, status), TP_STRUCT__entry( __field(unsigned long, pfn) __field(int, none_or_zero) __field(int, referenced) - __field(bool, writable) __field(int, status) ), @@ -133,15 +128,13 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, __entry->pfn = folio ? folio_pfn(folio) : -1; __entry->none_or_zero = none_or_zero; __entry->referenced = referenced; - __entry->writable = writable; __entry->status = status; ), - TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s", + TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s", __entry->pfn, __entry->none_or_zero, __entry->referenced, - __entry->writable, __print_symbolic(__entry->status, SCAN_STATUS)) ); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a0f1df2a7ae6..af5f5c80fe4e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -39,7 +39,6 @@ enum scan_result { SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PTE_MAPPED_HUGEPAGE, - SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, @@ -557,7 +556,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, struct folio *folio = NULL; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; - bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { @@ -671,9 +669,6 @@ next: folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; - - if (pte_write(pteval)) - writable = true; } if (unlikely(cc->is_khugepaged && !referenced)) { @@ -681,13 +676,13 @@ next: } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(folio, none_or_zero, - referenced, writable, result); + referenced, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(folio, none_or_zero, - referenced, writable, result); + referenced, result); return result; } @@ -1280,7 +1275,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; - bool writable = false; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1344,8 +1338,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, result = SCAN_PTE_UFFD_WP; goto out_unmap; } - if (pte_write(pteval)) - writable = true; page = vm_normal_page(vma, _address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { @@ -1435,7 +1427,7 @@ out_unmap: *mmap_locked = false; } out: - trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, folio, referenced, none_or_zero, result, unmapped); return result; } -- cgit v1.2.3 From d6d673dd1e92b2bed0096e7e7e9fe5d7e7d2156c Mon Sep 17 00:00:00 2001 From: Ashwini Sahu Date: Mon, 8 Sep 2025 15:26:45 +0530 Subject: uapi: vduse: fix typo in comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a spelling mistake in vduse.h: "regsion" → "region" in the documentation for struct vduse_iova_info. No functional change. Signed-off-by: Ashwini Sahu Message-Id: <20250908095645.610336-1-ashwini@wisig.com> Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/vduse.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 68a627d04afa..10ad71aa00d6 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -237,7 +237,7 @@ struct vduse_iova_umem { * struct vduse_iova_info - information of one IOVA region * @start: start of the IOVA region * @last: last of the IOVA region - * @capability: capability of the IOVA regsion + * @capability: capability of the IOVA region * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of -- cgit v1.2.3 From a05e4e935a6689542d86162b33a484cc704ce39a Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Fri, 29 Aug 2025 17:09:44 +0200 Subject: virtio_config: clarify output parameters This was ambiguous enough for a broken patch (206cc44588f7 ("virtio: reject shm region if length is zero")) to make it into the kernel, so make it clearer. Link: https://lore.kernel.org/r/20250816071600-mutt-send-email-mst@kernel.org/ Signed-off-by: Alyssa Ross Message-Id: <20250829150944.233505-1-hi@alyssa.is> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8bf156dde554..7427b79d6f3d 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -193,14 +193,15 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, } static inline void virtio_get_features(struct virtio_device *vdev, - u64 *features) + u64 *features_out) { if (vdev->config->get_extended_features) { - vdev->config->get_extended_features(vdev, features); + vdev->config->get_extended_features(vdev, features_out); return; } - virtio_features_from_u64(features, vdev->config->get_features(vdev)); + virtio_features_from_u64(features_out, + vdev->config->get_features(vdev)); } /** @@ -326,11 +327,11 @@ int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) static inline bool virtio_get_shm_region(struct virtio_device *vdev, - struct virtio_shm_region *region, u8 id) + struct virtio_shm_region *region_out, u8 id) { if (!vdev->config->get_shm_region) return false; - return vdev->config->get_shm_region(vdev, region, id); + return vdev->config->get_shm_region(vdev, region_out, id); } static inline bool virtio_is_little_endian(struct virtio_device *vdev) -- cgit v1.2.3 From 2ee3a75e42081db3d951c0893f5d654f16d1c0e8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 18 Jul 2025 11:26:15 +1000 Subject: nfsd: discard nfsd_file_get_local() This interface was deprecated by commit e6f7e1487ab5 ("nfs_localio: simplify interface to nfsd for getting nfsd_file") and is now unused. So let's remove it. Signed-off-by: NeilBrown Reviewed-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 21 --------------------- fs/nfsd/filecache.h | 1 - fs/nfsd/localio.c | 1 - include/linux/nfslocalio.h | 1 - 4 files changed, 24 deletions(-) (limited to 'include') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 732abf6b92a5..75bc48031c07 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -391,27 +391,6 @@ nfsd_file_put_local(struct nfsd_file __rcu **pnf) return net; } -/** - * nfsd_file_get_local - get nfsd_file reference and reference to net - * @nf: nfsd_file of which to put the reference - * - * Get reference to both the nfsd_file and nf->nf_net. - */ -struct nfsd_file * -nfsd_file_get_local(struct nfsd_file *nf) -{ - struct net *net = nf->nf_net; - - if (nfsd_net_try_get(net)) { - nf = nfsd_file_get(nf); - if (!nf) - nfsd_net_put(net); - } else { - nf = NULL; - } - return nf; -} - /** * nfsd_file_file - get the backing file of an nfsd_file * @nf: nfsd_file of which to access the backing file. diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 722b26c71e45..24ddf60e8434 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -63,7 +63,6 @@ int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf); -struct nfsd_file *nfsd_file_get_local(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); struct file *nfsd_file_file(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index cb237f1b902a..269fa9391dc4 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -122,7 +122,6 @@ static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_net_put = nfsd_net_put, .nfsd_open_local_fh = nfsd_open_local_fh, .nfsd_file_put_local = nfsd_file_put_local, - .nfsd_file_get_local = nfsd_file_get_local, .nfsd_file_file = nfsd_file_file, }; diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 5c7c92659e73..59ea90bd136b 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -63,7 +63,6 @@ struct nfsd_localio_operations { struct nfsd_file __rcu **pnf, const fmode_t); struct net *(*nfsd_file_put_local)(struct nfsd_file __rcu **); - struct nfsd_file *(*nfsd_file_get_local)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); } ____cacheline_aligned; -- cgit v1.2.3 From c97b737ef8f10f28424822c139e3b22b9e9bcc2b Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Fri, 18 Jul 2025 11:09:56 +0300 Subject: sunrpc: Change ret code of xdr_stream_decode_opaque_fixed Since the opaque is fixed in size, the caller already knows how many bytes were decoded, on success. Thus, xdr_stream_decode_opaque_fixed() doesn't need to return that value. And, xdr_stream_decode_u32 and _u64 both return zero on success. This patch simplifies the caller's error checking to avoid potential integer promotion issues. Suggested-by: Dan Carpenter Signed-off-by: Sergey Bashirov Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 4 ++-- .../sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8a9ec617cf66..8d354015d762 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -721,7 +721,7 @@ xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) * @len: size of buffer pointed to by @ptr * * Return values: - * On success, returns size of object stored in @ptr + * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t @@ -732,7 +732,7 @@ xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) if (unlikely(!p)) return -EBADMSG; xdr_decode_opaque_fixed(p, ptr, len); - return len; + return 0; } /** diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 index 8b4ff08c49e5..bdc7bd24ffb1 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 @@ -13,5 +13,5 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr {% if annotate %} /* (fixed-length opaque) */ {% endif %} - return xdr_stream_decode_opaque_fixed(xdr, ptr, {{ size }}) >= 0; + return xdr_stream_decode_opaque_fixed(xdr, ptr, {{ size }}) == 0; }; -- cgit v1.2.3 From afc5b36e29b95fbd31a60b9630d148857e5e513d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Jul 2025 09:24:32 -0400 Subject: vfs: add ATTR_CTIME_SET flag When ATTR_ATIME_SET and ATTR_MTIME_SET are set in the ia_valid mask, the notify_change() logic takes that to mean that the request should set those values explicitly, and not override them with "now". With the advent of delegated timestamps, similar functionality is needed for the ctime. Add a ATTR_CTIME_SET flag, and use that to indicate that the ctime should be accepted as-is. Also, clean up the if statements to eliminate the extra negatives. In setattr_copy() and setattr_copy_mgtime() use inode_set_ctime_deleg() when ATTR_CTIME_SET is set, instead of basing the decision on ATTR_DELEG. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/attr.c | 44 +++++++++++++++++++------------------------- include/linux/fs.h | 1 + 2 files changed, 20 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/fs/attr.c b/fs/attr.c index 5425c1dbbff9..795f231d00e8 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -286,20 +286,12 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) unsigned int ia_valid = attr->ia_valid; struct timespec64 now; - if (ia_valid & ATTR_CTIME) { - /* - * In the case of an update for a write delegation, we must respect - * the value in ia_ctime and not use the current time. - */ - if (ia_valid & ATTR_DELEG) - now = inode_set_ctime_deleg(inode, attr->ia_ctime); - else - now = inode_set_ctime_current(inode); - } else { - /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */ - WARN_ON_ONCE(ia_valid & ATTR_MTIME); + if (ia_valid & ATTR_CTIME_SET) + now = inode_set_ctime_deleg(inode, attr->ia_ctime); + else if (ia_valid & ATTR_CTIME) + now = inode_set_ctime_current(inode); + else now = current_time(inode); - } if (ia_valid & ATTR_ATIME_SET) inode_set_atime_to_ts(inode, attr->ia_atime); @@ -359,12 +351,11 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) { - if (ia_valid & ATTR_DELEG) - inode_set_ctime_deleg(inode, attr->ia_ctime); - else - inode_set_ctime_to_ts(inode, attr->ia_ctime); - } + + if (ia_valid & ATTR_CTIME_SET) + inode_set_ctime_deleg(inode, attr->ia_ctime); + else if (ia_valid & ATTR_CTIME) + inode_set_ctime_to_ts(inode, attr->ia_ctime); } EXPORT_SYMBOL(setattr_copy); @@ -463,15 +454,18 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, now = current_time(inode); - attr->ia_ctime = now; - if (!(ia_valid & ATTR_ATIME_SET)) - attr->ia_atime = now; - else + if (ia_valid & ATTR_ATIME_SET) attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); - if (!(ia_valid & ATTR_MTIME_SET)) - attr->ia_mtime = now; else + attr->ia_atime = now; + if (ia_valid & ATTR_CTIME_SET) + attr->ia_ctime = timestamp_truncate(attr->ia_ctime, inode); + else + attr->ia_ctime = now; + if (ia_valid & ATTR_MTIME_SET) attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); + else + attr->ia_mtime = now; if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry); diff --git a/include/linux/fs.h b/include/linux/fs.h index 601d036a6c78..74f2bfc51926 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -238,6 +238,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ +#define ATTR_CTIME_SET (1 << 10) #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) -- cgit v1.2.3 From 898374fdd7f06fa4c4a66e8be3135efeae6128d5 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 19 Aug 2025 14:04:02 -0400 Subject: nfsd: unregister with rpcbind when deleting a transport When a listener is added, a part of creation of transport also registers program/port with rpcbind. However, when the listener is removed, while transport goes away, rpcbind still has the entry for that port/type. When deleting the transport, unregister with rpcbind when appropriate. ---v2 created a new xpt_flag XPT_RPCB_UNREG to mark TCP and UDP transport and at xprt destroy send rpcbind unregister if flag set. Suggested-by: Chuck Lever Fixes: d093c9089260 ("nfsd: fix management of listener transports") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_xprt.h | 3 +++ net/sunrpc/svc_xprt.c | 13 +++++++++++++ net/sunrpc/svcsock.c | 2 ++ 3 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 369a89aea186..2b886f7eb295 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -104,6 +104,9 @@ enum { * it has access to. It is NOT counted * in ->sv_tmpcnt. */ + XPT_RPCB_UNREG, /* transport that needs unregistering + * with rpcbind (TCP, UDP) on destroy + */ }; /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 8b1837228799..b800d704d807 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1014,6 +1014,19 @@ static void svc_delete_xprt(struct svc_xprt *xprt) struct svc_serv *serv = xprt->xpt_server; struct svc_deferred_req *dr; + /* unregister with rpcbind for when transport type is TCP or UDP. + */ + if (test_bit(XPT_RPCB_UNREG, &xprt->xpt_flags)) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, + sk_xprt); + struct socket *sock = svsk->sk_sock; + + if (svc_register(serv, xprt->xpt_net, sock->sk->sk_family, + sock->sk->sk_protocol, 0) < 0) + pr_warn("failed to unregister %s with rpcbind\n", + xprt->xpt_class->xcl_name); + } + if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) return; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c0d5a27ba674..7b90abc5cf0e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -836,6 +836,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* data might have come in before data_ready set up */ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { @@ -1350,6 +1351,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) if (sk->sk_state == TCP_LISTEN) { strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { -- cgit v1.2.3 From d73d06dac604043b94a5f18ebb6a69da1b867702 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Aug 2025 10:27:28 -0400 Subject: SUNRPC: Move the svc_rpcb_cleanup() call sites Clean up: because svc_rpcb_cleanup() and svc_xprt_destroy_all() are always invoked in pairs, we can deduplicate code by moving the svc_rpcb_cleanup() call sites into svc_xprt_destroy_all(). Tested-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 6 ++---- fs/nfs/callback.c | 2 +- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 7 ++----- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc.c | 1 - net/sunrpc/svc_xprt.c | 7 ++++++- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e80262a51884..d68afa196535 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -216,8 +216,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); return err; } @@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! net=%x\n", diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 511f80878809..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc6b776fc657..63d52edcad72 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1993,7 +1993,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * remaining listeners and recreate the list. */ if (delete) - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82b0111ac469..7057ddd7a0a8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net) #endif } - svc_xprt_destroy_all(serv, net); - /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are + * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ - svc_rpcb_cleanup(serv, net); - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 2b886f7eb295..da2a2531e110 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -168,7 +168,8 @@ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred); -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net); +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister); void svc_xprt_received(struct svc_xprt *xprt); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index fc70e13b1cb9..cb4010e2dc0c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b800d704d807..6973184ff667 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1115,6 +1115,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1127,7 +1128,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. */ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1137,6 +1139,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); -- cgit v1.2.3 From e0d3bba84ff8b82d4e8820856a7850afb17c14f9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 19 Sep 2025 12:23:25 +0200 Subject: wifi: cfg80211: remove IEEE80211_CHAN_{1,2,4,8,16}MHZ flags These were used by S1G for older chandef representation, but are no longer needed. Clean them up, even if we can't drop them from the userspace API entirely. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 16 +--------------- net/wireless/nl80211.c | 15 --------------- 2 files changed, 1 insertion(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1c041ce7a03b..781624f5913a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -101,16 +101,6 @@ struct wiphy; * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted * on this channel. * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel. - * @IEEE80211_CHAN_1MHZ: 1 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_2MHZ: 2 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted - * on this channel. * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band, * this flag indicates that a 320 MHz channel cannot use this * channel as the control or any of the secondary channels. @@ -152,11 +142,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_20MHZ = BIT(11), IEEE80211_CHAN_NO_10MHZ = BIT(12), IEEE80211_CHAN_NO_HE = BIT(13), - IEEE80211_CHAN_1MHZ = BIT(14), - IEEE80211_CHAN_2MHZ = BIT(15), - IEEE80211_CHAN_4MHZ = BIT(16), - IEEE80211_CHAN_8MHZ = BIT(17), - IEEE80211_CHAN_16MHZ = BIT(18), + /* can use free bits here */ IEEE80211_CHAN_NO_320MHZ = BIT(19), IEEE80211_CHAN_NO_EHT = BIT(20), IEEE80211_CHAN_DFS_CONCURRENT = BIT(21), diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index de34a1d14073..346dfd2bd987 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1280,21 +1280,6 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_HE) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_1MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_2MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_4MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_8MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_16MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) - goto nla_put_failure; if ((chan->flags & IEEE80211_CHAN_NO_320MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_320MHZ)) goto nla_put_failure; -- cgit v1.2.3 From d9a2211dd3aee3ef29fc675f70a1941bc3f4f51f Mon Sep 17 00:00:00 2001 From: Haixu Cui Date: Mon, 8 Sep 2025 17:23:46 +0800 Subject: virtio: Add ID for virtio SPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VIRTIO_ID_SPI definition for virtio SPI. Signed-off-by: Haixu Cui Reviewed-by: Viresh Kumar Reviewed-by: Alex Bennée Link: https://patch.msgid.link/20250908092348.1283552-2-quic_haixcui@quicinc.com Signed-off-by: Mark Brown --- include/uapi/linux/virtio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 7aa2eb766205..6c12db16faa3 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -68,6 +68,7 @@ #define VIRTIO_ID_AUDIO_POLICY 39 /* virtio audio policy */ #define VIRTIO_ID_BT 40 /* virtio bluetooth */ #define VIRTIO_ID_GPIO 41 /* virtio gpio */ +#define VIRTIO_ID_SPI 45 /* virtio spi */ /* * Virtio Transitional IDs -- cgit v1.2.3 From 6a1f3390fafeafe130b8128b3047452b92911a98 Mon Sep 17 00:00:00 2001 From: Haixu Cui Date: Mon, 8 Sep 2025 17:23:47 +0800 Subject: virtio-spi: Add virtio-spi.h Add virtio-spi.h header for virtio SPI. Signed-off-by: Haixu Cui Link: https://patch.msgid.link/20250908092348.1283552-3-quic_haixcui@quicinc.com Signed-off-by: Mark Brown --- MAINTAINERS | 6 ++ include/uapi/linux/virtio_spi.h | 181 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 include/uapi/linux/virtio_spi.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 931e68699ab9..18cd254aa85b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26803,6 +26803,12 @@ S: Maintained F: include/uapi/linux/virtio_snd.h F: sound/virtio/* +VIRTIO SPI DRIVER +M: Haixu Cui +L: virtualization@lists.linux.dev +S: Maintained +F: include/uapi/linux/virtio_spi.h + VIRTUAL BOX GUEST DEVICE DRIVER M: Hans de Goede M: Arnd Bergmann diff --git a/include/uapi/linux/virtio_spi.h b/include/uapi/linux/virtio_spi.h new file mode 100644 index 000000000000..8ab3c970cdd3 --- /dev/null +++ b/include/uapi/linux/virtio_spi.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* + * Copyright (C) 2023 OpenSynergy GmbH + * Copyright (C) 2025 Qualcomm Innovation Center, Inc. All rights reserved. + */ +#ifndef _LINUX_VIRTIO_VIRTIO_SPI_H +#define _LINUX_VIRTIO_VIRTIO_SPI_H + +#include +#include +#include +#include + +/* Sample data on trailing clock edge */ +#define VIRTIO_SPI_CPHA _BITUL(0) +/* Clock is high when IDLE */ +#define VIRTIO_SPI_CPOL _BITUL(1) +/* Chip Select is active high */ +#define VIRTIO_SPI_CS_HIGH _BITUL(2) +/* Transmit LSB first */ +#define VIRTIO_SPI_MODE_LSB_FIRST _BITUL(3) +/* Loopback mode */ +#define VIRTIO_SPI_MODE_LOOP _BITUL(4) + +/** + * struct virtio_spi_config - All config fields are read-only for the + * Virtio SPI driver + * @cs_max_number: maximum number of chipselect the host SPI controller + * supports. + * @cs_change_supported: indicates if the host SPI controller supports to toggle + * chipselect after each transfer in one message: + * 0: unsupported, chipselect will be kept in active state throughout the + * message transaction; + * 1: supported. + * Note: Message here contains a sequence of SPI transfers. + * @tx_nbits_supported: indicates the supported number of bit for writing: + * bit 0: DUAL (2-bit transfer), 1 for supported + * bit 1: QUAD (4-bit transfer), 1 for supported + * bit 2: OCTAL (8-bit transfer), 1 for supported + * other bits are reserved as 0, 1-bit transfer is always supported. + * @rx_nbits_supported: indicates the supported number of bit for reading: + * bit 0: DUAL (2-bit transfer), 1 for supported + * bit 1: QUAD (4-bit transfer), 1 for supported + * bit 2: OCTAL (8-bit transfer), 1 for supported + * other bits are reserved as 0, 1-bit transfer is always supported. + * @bits_per_word_mask: mask indicating which values of bits_per_word are + * supported. If not set, no limitation for bits_per_word. + * @mode_func_supported: indicates the following features are supported or not: + * bit 0-1: CPHA feature + * 0b00: invalid, should support as least one CPHA setting + * 0b01: supports CPHA=0 only + * 0b10: supports CPHA=1 only + * 0b11: supports CPHA=0 and CPHA=1. + * bit 2-3: CPOL feature + * 0b00: invalid, should support as least one CPOL setting + * 0b01: supports CPOL=0 only + * 0b10: supports CPOL=1 only + * 0b11: supports CPOL=0 and CPOL=1. + * bit 4: chipselect active high feature, 0 for unsupported and 1 for + * supported, chipselect active low is supported by default. + * bit 5: LSB first feature, 0 for unsupported and 1 for supported, + * MSB first is supported by default. + * bit 6: loopback mode feature, 0 for unsupported and 1 for supported, + * normal mode is supported by default. + * @max_freq_hz: the maximum clock rate supported in Hz unit, 0 means no + * limitation for transfer speed. + * @max_word_delay_ns: the maximum word delay supported, in nanoseconds. + * A value of 0 indicates that word delay is unsupported. + * Each transfer may consist of a sequence of words. + * @max_cs_setup_ns: the maximum delay supported after chipselect is asserted, + * in ns unit, 0 means delay is not supported to introduce after chipselect is + * asserted. + * @max_cs_hold_ns: the maximum delay supported before chipselect is deasserted, + * in ns unit, 0 means delay is not supported to introduce before chipselect + * is deasserted. + * @max_cs_incative_ns: maximum delay supported after chipselect is deasserted, + * in ns unit, 0 means delay is not supported to introduce after chipselect is + * deasserted. + */ +struct virtio_spi_config { + __u8 cs_max_number; + __u8 cs_change_supported; +#define VIRTIO_SPI_RX_TX_SUPPORT_DUAL _BITUL(0) +#define VIRTIO_SPI_RX_TX_SUPPORT_QUAD _BITUL(1) +#define VIRTIO_SPI_RX_TX_SUPPORT_OCTAL _BITUL(2) + __u8 tx_nbits_supported; + __u8 rx_nbits_supported; + __le32 bits_per_word_mask; +#define VIRTIO_SPI_MF_SUPPORT_CPHA_0 _BITUL(0) +#define VIRTIO_SPI_MF_SUPPORT_CPHA_1 _BITUL(1) +#define VIRTIO_SPI_MF_SUPPORT_CPOL_0 _BITUL(2) +#define VIRTIO_SPI_MF_SUPPORT_CPOL_1 _BITUL(3) +#define VIRTIO_SPI_MF_SUPPORT_CS_HIGH _BITUL(4) +#define VIRTIO_SPI_MF_SUPPORT_LSB_FIRST _BITUL(5) +#define VIRTIO_SPI_MF_SUPPORT_LOOPBACK _BITUL(6) + __le32 mode_func_supported; + __le32 max_freq_hz; + __le32 max_word_delay_ns; + __le32 max_cs_setup_ns; + __le32 max_cs_hold_ns; + __le32 max_cs_inactive_ns; +}; + +/** + * struct spi_transfer_head - virtio SPI transfer descriptor + * @chip_select_id: chipselect index the SPI transfer used. + * @bits_per_word: the number of bits in each SPI transfer word. + * @cs_change: whether to deselect device after finishing this transfer + * before starting the next transfer, 0 means cs keep asserted and + * 1 means cs deasserted then asserted again. + * @tx_nbits: bus width for write transfer. + * 0,1: bus width is 1, also known as SINGLE + * 2 : bus width is 2, also known as DUAL + * 4 : bus width is 4, also known as QUAD + * 8 : bus width is 8, also known as OCTAL + * other values are invalid. + * @rx_nbits: bus width for read transfer. + * 0,1: bus width is 1, also known as SINGLE + * 2 : bus width is 2, also known as DUAL + * 4 : bus width is 4, also known as QUAD + * 8 : bus width is 8, also known as OCTAL + * other values are invalid. + * @reserved: for future use. + * @mode: SPI transfer mode. + * bit 0: CPHA, determines the timing (i.e. phase) of the data + * bits relative to the clock pulses.For CPHA=0, the + * "out" side changes the data on the trailing edge of the + * preceding clock cycle, while the "in" side captures the data + * on (or shortly after) the leading edge of the clock cycle. + * For CPHA=1, the "out" side changes the data on the leading + * edge of the current clock cycle, while the "in" side + * captures the data on (or shortly after) the trailing edge of + * the clock cycle. + * bit 1: CPOL, determines the polarity of the clock. CPOL=0 is a + * clock which idles at 0, and each cycle consists of a pulse + * of 1. CPOL=1 is a clock which idles at 1, and each cycle + * consists of a pulse of 0. + * bit 2: CS_HIGH, if 1, chip select active high, else active low. + * bit 3: LSB_FIRST, determines per-word bits-on-wire, if 0, MSB + * first, else LSB first. + * bit 4: LOOP, loopback mode. + * @freq: the transfer speed in Hz. + * @word_delay_ns: delay to be inserted between consecutive words of a + * transfer, in ns unit. + * @cs_setup_ns: delay to be introduced after CS is asserted, in ns + * unit. + * @cs_delay_hold_ns: delay to be introduced before CS is deasserted + * for each transfer, in ns unit. + * @cs_change_delay_inactive_ns: delay to be introduced after CS is + * deasserted and before next asserted, in ns unit. + */ +struct spi_transfer_head { + __u8 chip_select_id; + __u8 bits_per_word; + __u8 cs_change; + __u8 tx_nbits; + __u8 rx_nbits; + __u8 reserved[3]; + __le32 mode; + __le32 freq; + __le32 word_delay_ns; + __le32 cs_setup_ns; + __le32 cs_delay_hold_ns; + __le32 cs_change_delay_inactive_ns; +}; + +/** + * struct spi_transfer_result - virtio SPI transfer result + * @result: Transfer result code. + * VIRTIO_SPI_TRANS_OK: Transfer successful. + * VIRTIO_SPI_PARAM_ERR: Parameter error. + * VIRTIO_SPI_TRANS_ERR: Transfer error. + */ +struct spi_transfer_result { +#define VIRTIO_SPI_TRANS_OK 0 +#define VIRTIO_SPI_PARAM_ERR 1 +#define VIRTIO_SPI_TRANS_ERR 2 + __u8 result; +}; + +#endif /* #ifndef _LINUX_VIRTIO_VIRTIO_SPI_H */ -- cgit v1.2.3 From 099f942182e3695554cba44e4bafb08a4111b50f Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:20 +0200 Subject: spi: keep track of number of chipselects in spi_device There are several places where we need to iterate over a device's chipselect. To be able to do it efficiently, store the number of chipselects in spi_device, like we do for controllers. Since we now use a device supplied value, add a check to make sure it isn't more than we can support. Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-3-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 29 +++++++++++++++++++++-------- include/linux/spi/spi.h | 4 +++- 2 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index b07d6cdf587c..6598fb862d80 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -586,6 +586,7 @@ struct spi_device *spi_alloc_device(struct spi_controller *ctlr) spi->dev.bus = &spi_bus_type; spi->dev.release = spidev_release; spi->mode = ctlr->buswidth_override_bits; + spi->num_chipselect = 1; device_initialize(&spi->dev); return spi; @@ -635,7 +636,7 @@ static inline int spi_dev_check_cs(struct device *dev, u8 idx_new; cs = spi_get_chipselect(spi, idx); - for (idx_new = new_idx; idx_new < SPI_CS_CNT_MAX; idx_new++) { + for (idx_new = new_idx; idx_new < new_spi->num_chipselect; idx_new++) { cs_new = spi_get_chipselect(new_spi, idx_new); if (is_valid_cs(cs) && is_valid_cs(cs_new) && cs == cs_new) { dev_err(dev, "chipselect %u already in use\n", cs_new); @@ -652,7 +653,7 @@ static int spi_dev_check(struct device *dev, void *data) int status, idx; if (spi->controller == new_spi->controller) { - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { status = spi_dev_check_cs(dev, spi, idx, new_spi, 0); if (status) return status; @@ -674,7 +675,13 @@ static int __spi_add_device(struct spi_device *spi) int status, idx; u8 cs; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (spi->num_chipselect > SPI_CS_CNT_MAX) { + dev_err(dev, "num_cs %d > max %d\n", spi->num_chipselect, + SPI_CS_CNT_MAX); + return -EOVERFLOW; + } + + for (idx = 0; idx < spi->num_chipselect; idx++) { /* Chipselects are numbered 0..max; validate. */ cs = spi_get_chipselect(spi, idx); if (is_valid_cs(cs) && cs >= ctlr->num_chipselect) { @@ -689,7 +696,7 @@ static int __spi_add_device(struct spi_device *spi) * For example, spi->chip_select[0] != spi->chip_select[1] and so on. */ if (!spi_controller_is_target(ctlr)) { - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { status = spi_dev_check_cs(dev, spi, idx, spi, idx + 1); if (status) return status; @@ -717,7 +724,7 @@ static int __spi_add_device(struct spi_device *spi) if (ctlr->cs_gpiods) { u8 cs; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { cs = spi_get_chipselect(spi, idx); if (is_valid_cs(cs)) spi_set_csgpiod(spi, idx, ctlr->cs_gpiods[cs]); @@ -1024,7 +1031,7 @@ static void spi_res_release(struct spi_controller *ctlr, struct spi_message *mes /*-------------------------------------------------------------------------*/ #define spi_for_each_valid_cs(spi, idx) \ - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) \ + for (idx = 0; idx < spi->num_chipselect; idx++) \ if (!(spi->cs_index_mask & BIT(idx))) {} else static inline bool spi_is_last_cs(struct spi_device *spi) @@ -1080,8 +1087,12 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force) trace_spi_set_cs(spi, activate); spi->controller->last_cs_index_mask = spi->cs_index_mask; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) - spi->controller->last_cs[idx] = enable ? spi_get_chipselect(spi, 0) : SPI_INVALID_CS; + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (enable && idx < spi->num_chipselect) + spi->controller->last_cs[idx] = spi_get_chipselect(spi, 0); + else + spi->controller->last_cs[idx] = SPI_INVALID_CS; + } spi->controller->last_cs_mode_high = spi->mode & SPI_CS_HIGH; if (spi->controller->last_cs_mode_high) @@ -2452,6 +2463,8 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, dev_err(&ctlr->dev, "SPI controller doesn't support multi CS\n"); return -EINVAL; } + + spi->num_chipselect = rc; for (idx = 0; idx < rc; idx++) spi_set_chipselect(spi, idx, cs[idx]); diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e9ea43234d9a..49c048277e97 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -170,6 +170,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * two delays will be added up. * @chip_select: Array of physical chipselect, spi->chipselect[i] gives * the corresponding physical CS for logical CS i. + * @num_chipselect: Number of physical chipselects used. * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines * (optional, NULL when not using a GPIO line) @@ -229,6 +230,7 @@ struct spi_device { struct spi_delay cs_inactive; u8 chip_select[SPI_CS_CNT_MAX]; + u8 num_chipselect; /* * Bit mask of the chipselect(s) that the driver need to use from @@ -315,7 +317,7 @@ static inline bool spi_is_csgpiod(struct spi_device *spi) { u8 idx; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { if (spi_get_csgpiod(spi, idx)) return true; } -- cgit v1.2.3 From 08fda410bae41cc8dde9697f9104da525be53153 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:24 +0200 Subject: spi: reduce device chip select limit again The spi chipselect limit SPI_CS_CNT_MAX was raised with commit 2f8c7c3715f2 ("spi: Raise limit on number of chip selects") from 4 to 16 to accommodate spi controllers with more than 4 chip selects, and then later to 24 with commit 96893cdd4760 ("spi: Raise limit on number of chip selects to 24"). Now that we removed SPI_CS_CNT_MAX limiting the chip selects of controllers, we can reduce the amount of chip selects per device again to 4, the original value. Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-7-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 49c048277e97..df4842abbc6f 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -21,7 +21,7 @@ #include /* Max no. of CS supported per spi device */ -#define SPI_CS_CNT_MAX 24 +#define SPI_CS_CNT_MAX 4 struct dma_chan; struct software_node; -- cgit v1.2.3 From e336ab509b43ea601801dfa05b4270023c3ed007 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:25 +0200 Subject: spi: rename SPI_CS_CNT_MAX => SPI_DEVICE_CS_CNT_MAX Rename SPI_CS_CNT_MAX to SPI_DEVICE_CS_CNT_MAX to make it more obvious that this is the max number of CS per device supported, not per controller. Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-8-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 2 +- drivers/spi/spi.c | 14 +++++++------- include/linux/spi/spi.h | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index af253b86f1ab..2ce4fc136a51 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -33,7 +33,7 @@ #define CQSPI_NAME "cadence-qspi" #define CQSPI_MAX_CHIPSELECT 4 -static_assert(CQSPI_MAX_CHIPSELECT <= SPI_CS_CNT_MAX); +static_assert(CQSPI_MAX_CHIPSELECT <= SPI_DEVICE_CS_CNT_MAX); /* Quirks */ #define CQSPI_NEEDS_WR_DELAY BIT(0) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 2eb361e9e44d..2e0647a06890 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -670,9 +670,9 @@ static int __spi_add_device(struct spi_device *spi) int status, idx; u8 cs; - if (spi->num_chipselect > SPI_CS_CNT_MAX) { + if (spi->num_chipselect > SPI_DEVICE_CS_CNT_MAX) { dev_err(dev, "num_cs %d > max %d\n", spi->num_chipselect, - SPI_CS_CNT_MAX); + SPI_DEVICE_CS_CNT_MAX); return -EOVERFLOW; } @@ -699,7 +699,7 @@ static int __spi_add_device(struct spi_device *spi) } /* Initialize unused logical CS as invalid */ - for (idx = spi->num_chipselect; idx < SPI_CS_CNT_MAX; idx++) + for (idx = spi->num_chipselect; idx < SPI_DEVICE_CS_CNT_MAX; idx++) spi_set_chipselect(spi, idx, SPI_INVALID_CS); /* Set the bus ID string */ @@ -1076,7 +1076,7 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force) trace_spi_set_cs(spi, activate); spi->controller->last_cs_index_mask = spi->cs_index_mask; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < SPI_DEVICE_CS_CNT_MAX; idx++) { if (enable && idx < spi->num_chipselect) spi->controller->last_cs[idx] = spi_get_chipselect(spi, 0); else @@ -2354,7 +2354,7 @@ static void of_spi_parse_dt_cs_delay(struct device_node *nc, static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, struct device_node *nc) { - u32 value, cs[SPI_CS_CNT_MAX]; + u32 value, cs[SPI_DEVICE_CS_CNT_MAX]; int rc, idx; /* Mode (clock phase/polarity/etc.) */ @@ -2429,7 +2429,7 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, /* Device address */ rc = of_property_read_variable_u32_array(nc, "reg", &cs[0], 1, - SPI_CS_CNT_MAX); + SPI_DEVICE_CS_CNT_MAX); if (rc < 0) { dev_err(&ctlr->dev, "%pOF has no valid 'reg' property (%d)\n", nc, rc); @@ -3313,7 +3313,7 @@ int spi_register_controller(struct spi_controller *ctlr) } /* Setting last_cs to SPI_INVALID_CS means no chip selected */ - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) + for (idx = 0; idx < SPI_DEVICE_CS_CNT_MAX; idx++) ctlr->last_cs[idx] = SPI_INVALID_CS; status = device_add(&ctlr->dev); diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index df4842abbc6f..cb2c2df31089 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -21,7 +21,7 @@ #include /* Max no. of CS supported per spi device */ -#define SPI_CS_CNT_MAX 4 +#define SPI_DEVICE_CS_CNT_MAX 4 struct dma_chan; struct software_node; @@ -229,7 +229,7 @@ struct spi_device { struct spi_delay cs_hold; struct spi_delay cs_inactive; - u8 chip_select[SPI_CS_CNT_MAX]; + u8 chip_select[SPI_DEVICE_CS_CNT_MAX]; u8 num_chipselect; /* @@ -238,9 +238,9 @@ struct spi_device { * multiple chip selects & memories are connected in parallel * then more than one bit need to be set in cs_index_mask. */ - u32 cs_index_mask : SPI_CS_CNT_MAX; + u32 cs_index_mask : SPI_DEVICE_CS_CNT_MAX; - struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ + struct gpio_desc *cs_gpiod[SPI_DEVICE_CS_CNT_MAX]; /* Chip select gpio desc */ /* * Likely need more hooks for more protocol options affecting how @@ -721,8 +721,8 @@ struct spi_controller { bool auto_runtime_pm; bool fallback; bool last_cs_mode_high; - s8 last_cs[SPI_CS_CNT_MAX]; - u32 last_cs_index_mask : SPI_CS_CNT_MAX; + s8 last_cs[SPI_DEVICE_CS_CNT_MAX]; + u32 last_cs_index_mask : SPI_DEVICE_CS_CNT_MAX; struct completion xfer_completion; size_t max_dma_len; -- cgit v1.2.3 From 96384a34dd15b0e7357a34af5c848d1115a35e62 Mon Sep 17 00:00:00 2001 From: Niranjan H Y Date: Fri, 12 Sep 2025 14:06:22 +0530 Subject: ASoc: tas2783A: machine driver amp utility for TI devices Machine driver amp utility file to initialize and support multiple tas2783a devices are added. Reviewed-by: Bard Liao Signed-off-by: Niranjan H Y -- v5: - removed empty line in soc_sdw_ti_amp.c Link: https://patch.msgid.link/20250912083624.804-3-niranjan.hy@ti.com Signed-off-by: Mark Brown --- include/sound/soc_sdw_utils.h | 8 ++++ sound/soc/sdw_utils/Makefile | 3 +- sound/soc/sdw_utils/soc_sdw_ti_amp.c | 92 ++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 sound/soc/sdw_utils/soc_sdw_ti_amp.c (limited to 'include') diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index 6049a5d0cfcd..3c5e9b2af7f1 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -248,5 +248,13 @@ int asoc_sdw_cs42l43_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_so int asoc_sdw_cs42l43_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_cs_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_maxim_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +/* TI */ +int asoc_sdw_ti_amp_init(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_links, + struct asoc_sdw_codec_info *info, + bool playback); +int asoc_sdw_ti_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +int asoc_sdw_ti_amp_initial_settings(struct snd_soc_card *card, + const char *name_prefix); #endif diff --git a/sound/soc/sdw_utils/Makefile b/sound/soc/sdw_utils/Makefile index daf019113553..a87c53e1a2c1 100644 --- a/sound/soc/sdw_utils/Makefile +++ b/sound/soc/sdw_utils/Makefile @@ -6,5 +6,6 @@ snd-soc-sdw-utils-y := soc_sdw_utils.o soc_sdw_dmic.o soc_sdw_rt_dmic.o \ soc_sdw_bridge_cs35l56.o \ soc_sdw_cs42l42.o soc_sdw_cs42l43.o \ soc_sdw_cs_amp.o \ - soc_sdw_maxim.o + soc_sdw_maxim.o \ + soc_sdw_ti_amp.o obj-$(CONFIG_SND_SOC_SDW_UTILS) += snd-soc-sdw-utils.o diff --git a/sound/soc/sdw_utils/soc_sdw_ti_amp.c b/sound/soc/sdw_utils/soc_sdw_ti_amp.c new file mode 100644 index 000000000000..f0011360ae9b --- /dev/null +++ b/sound/soc/sdw_utils/soc_sdw_ti_amp.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2025 Texas Instruments Inc. + +/* + * soc_sdw_ti_amp - Helpers to handle TI's soundwire based codecs + */ + +#include +#include +#include +#include +#include +#include + +#define TIAMP_SPK_VOLUME_0DB 200 + +int asoc_sdw_ti_amp_initial_settings(struct snd_soc_card *card, + const char *name_prefix) +{ + char *volume_ctl_name; + int ret; + + volume_ctl_name = kasprintf(GFP_KERNEL, "%s Speaker Volume", + name_prefix); + if (!volume_ctl_name) + return -ENOMEM; + + ret = snd_soc_limit_volume(card, volume_ctl_name, + TIAMP_SPK_VOLUME_0DB); + if (ret) + dev_err(card->dev, + "%s update failed %d\n", + volume_ctl_name, ret); + + kfree(volume_ctl_name); + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_ti_amp_initial_settings, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_ti_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, + struct snd_soc_dai *dai) +{ + struct snd_soc_card *card = rtd->card; + char widget_name[16]; + char speaker[16]; + struct snd_soc_dapm_route route = {speaker, NULL, widget_name}; + struct snd_soc_dai *codec_dai; + const char *prefix; + int i, ret = 0; + + for_each_rtd_codec_dais(rtd, i, codec_dai) { + if (!strstr(codec_dai->name, "tas2783")) + continue; + + prefix = codec_dai->component->name_prefix; + if (!strncmp(prefix, "tas2783-1", strlen("tas2783-1"))) { + strscpy(speaker, "Left Spk", sizeof(speaker)); + } else if (!strncmp(prefix, "tas2783-2", strlen("tas2783-2"))) { + strscpy(speaker, "Right Spk", sizeof(speaker)); + } else { + ret = -EINVAL; + dev_err(card->dev, "unhandled prefix %s", prefix); + break; + } + + snprintf(widget_name, sizeof(widget_name), "%s SPK", prefix); + ret = asoc_sdw_ti_amp_initial_settings(card, prefix); + if (ret) + return ret; + + ret = snd_soc_dapm_add_routes(&card->dapm, &route, 1); + if (ret) + return ret; + } + + return ret; +} +EXPORT_SYMBOL_NS(asoc_sdw_ti_spk_rtd_init, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_ti_amp_init(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_links, + struct asoc_sdw_codec_info *info, + bool playback) +{ + if (!playback) + return 0; + + info->amp_num++; + + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_ti_amp_init, "SND_SOC_SDW_UTILS"); -- cgit v1.2.3 From 8535bd38b4d58a3d19bf8e7dfa66e1d8180b316a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:35 +0200 Subject: cgroup: add missing ns_common include Add the missing include of the ns_common header. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup_namespace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h index b7dbf4d623d2..81ccbdee425b 100644 --- a/include/linux/cgroup_namespace.h +++ b/include/linux/cgroup_namespace.h @@ -2,6 +2,8 @@ #ifndef _LINUX_CGROUP_NAMESPACE_H #define _LINUX_CGROUP_NAMESPACE_H +#include + struct cgroup_namespace { struct ns_common ns; struct user_namespace *user_ns; -- cgit v1.2.3 From d7610cb7454bbd8bf6d58f71b0ed57155d3c545f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:36 +0200 Subject: ns: simplify ns_common_init() further Simply derive the ns operations from the namespace type. Acked-by: Thomas Gleixner Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- include/linux/ns_common.h | 30 ++++++++++++++++++++++++++---- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 9 +-------- 9 files changed, 35 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index 271cd6294c8a..d65917ec5544 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4104,9 +4104,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } if (anon) - ret = ns_common_init_inum(new_ns, &mntns_operations, MNT_NS_ANON_INO); + ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); else - ret = ns_common_init(new_ns, &mntns_operations); + ret = ns_common_init(new_ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index aea8528d799a..56492cd9ff8d 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -25,6 +25,17 @@ extern struct time_namespace init_time_ns; extern struct user_namespace init_user_ns; extern struct uts_namespace init_uts_ns; +extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations utsns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -84,10 +95,21 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: &init_user_ns, \ struct uts_namespace *: &init_uts_ns) -#define ns_common_init(__ns, __ops) \ - __ns_common_init(to_ns_common(__ns), __ops, (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) - -#define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) + +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) diff --git a/ipc/namespace.c b/ipc/namespace.c index bd85d1c9d2c2..d89dfd718d2b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -62,7 +62,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_common_init(ns, &ipcns_operations); + err = ns_common_init(ns); if (err) goto fail_free; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 16ead7508371..04c98338ac08 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -27,7 +27,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_common_init(new_ns, &cgroupns_operations); + ret = ns_common_init(new_ns); if (ret) return ERR_PTR(ret); ns_tree_add(new_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 162f5fb63d75..a262a3f19443 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -103,7 +103,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_common_init(ns, &pidns_operations); + err = ns_common_init(ns); if (err) goto out_free_idr; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 7aa4d6fedd49..9f26e61be044 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -97,7 +97,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns->vvar_page) goto fail_free; - err = ns_common_init(ns, &timens_operations); + err = ns_common_init(ns); if (err) goto fail_free_page; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f9df45c46235..e1559e8a8a02 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -126,7 +126,7 @@ int create_user_ns(struct cred *new) ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_common_init(ns, &userns_operations); + ret = ns_common_init(ns); if (ret) goto fail_free; diff --git a/kernel/utsname.c b/kernel/utsname.c index 95d733eb2c98..00001592ad13 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -50,7 +50,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - err = ns_common_init(ns, &utsns_operations); + err = ns_common_init(ns); if (err) goto fail_free; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index d5e3fd819163..bdea7d5fac56 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -400,16 +400,9 @@ static __net_init void preinit_net_sysctl(struct net *net) /* init code that must occur even if setup_net() is not called. */ static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) { - const struct proc_ns_operations *ns_ops; int ret; -#ifdef CONFIG_NET_NS - ns_ops = &netns_operations; -#else - ns_ops = NULL; -#endif - - ret = ns_common_init(net, ns_ops); + ret = ns_common_init(net); if (ret) return ret; -- cgit v1.2.3 From 0950c64ae38661bd97127e9aa0522f1624f82006 Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Mon, 22 Sep 2025 12:26:06 +0000 Subject: workqueue: fix texinfodocs warning for WQ_* flags reference Sphinx emitted a warning during make texinfodocs: WARNING: Inline literal start-string without end-string. This was caused by the trailing '*' in "%WQ_*" being parsed as reStructuredText markup in the kernel-doc comment. Escape the '*' in the comment so that Sphinx treats it as a literal character, resolving the warning. Signed-off-by: Kriish Sharma Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 71a9900c03c7..dabc351cc127 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -502,7 +502,7 @@ void workqueue_softirq_dead(unsigned int cpu); * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means * that the sum of per-node max_active's may be larger than @max_active. * - * For detailed information on %WQ_* flags, please refer to + * For detailed information on %WQ_\* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: -- cgit v1.2.3 From cd875625b475dc4e28ac302ccb3422cc9f678f89 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 18 Sep 2025 17:33:18 -0700 Subject: ptp: document behavior of PTP_STRICT_FLAGS Commit 6138e687c7b6 ("ptp: Introduce strict checking of external time stamp options.") added the PTP_STRICT_FLAGS to the set of flags supported for the external timestamp request ioctl. It is only supported by PTP_EXTTS_REQUEST2, as it was introduced the introduction of the new ioctls. Further, the kernel has always set this flag for PTP_EXTTS_REQUEST2 regardless of whether or not the user requested the behavior. This effectively means that the flag is not useful for userspace. If the user issues a PTP_EXTTS_REQUEST ioctl, the flag is ignored due to not being supported on the old ioctl. If the user issues a PTP_EXTTS_REQUEST2 ioctl, the flag will be set by the kernel regardless of whether the user set the flag in their structure. Add a comment documenting this behavior in the uAPI header file. Signed-off-by: Jacob Keller Reviewed-by: Vadim Fedorenko Acked-by: Richard Cochran Reviewed-by: Kory Maincent Tested-by: James Clark Link: https://patch.msgid.link/20250918-jk-fix-bcm-phy-supported-flags-v1-3-747b60407c9c@intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ptp_clock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 18eefa6d93d6..2c3346e91dbe 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -37,6 +37,9 @@ /* * flag fields valid for the new PTP_EXTTS_REQUEST2 ioctl. + * + * Note: PTP_STRICT_FLAGS is always enabled by the kernel for + * PTP_EXTTS_REQUEST2 regardless of whether it is set by userspace. */ #define PTP_EXTTS_VALID_FLAGS (PTP_ENABLE_FEATURE | \ PTP_RISING_EDGE | \ -- cgit v1.2.3 From 6445bb832dc0ba0ab816e5bd79ef0209cdd46d3a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 19 Sep 2025 08:35:28 +0000 Subject: tcp: Remove osk from __inet_hash() arg. __inet_hash() is called from inet_hash() and inet6_hash with osk NULL. Let's remove the 2nd arg from __inet_hash(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250919083706.1863217-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 2 +- net/ipv4/inet_hashtables.c | 6 +++--- net/ipv6/inet6_hashtables.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index a3b32241c2f2..64bc8870db88 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -289,7 +289,7 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); -int __inet_hash(struct sock *sk, struct sock *osk); +int __inet_hash(struct sock *sk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ef4ccfd46ff6..baee5c075e6c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -739,7 +739,7 @@ static int inet_reuseport_add_sock(struct sock *sk, return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } -int __inet_hash(struct sock *sk, struct sock *osk) +int __inet_hash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; @@ -747,7 +747,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); - inet_ehash_nolisten(sk, osk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); local_bh_enable(); return 0; } @@ -779,7 +779,7 @@ int inet_hash(struct sock *sk) int err = 0; if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); + err = __inet_hash(sk); return err; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index a3a9ea49fee2..64fcd7df0c9a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -374,7 +374,7 @@ int inet6_hash(struct sock *sk) int err = 0; if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); + err = __inet_hash(sk); return err; } -- cgit v1.2.3 From 0ac44301e3bf4f5abc892ab530188ca95c61e59f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 19 Sep 2025 08:35:29 +0000 Subject: tcp: Remove inet6_hash(). inet_hash() and inet6_hash() are exactly the same. Also, we do not need to export inet6_hash(). Let's consolidate the two into __inet_hash() and rename it to inet_hash(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250919083706.1863217-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/inet6_hashtables.h | 2 -- include/net/inet_hashtables.h | 1 - net/ipv4/inet_hashtables.c | 17 +++++------------ net/ipv6/inet6_hashtables.c | 11 ----------- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 6 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 1f985d2012ce..282e29237d93 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -167,8 +167,6 @@ struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *daddr, const __be16 dport, const int dif); -int inet6_hash(struct sock *sk); - static inline bool inet6_match(const struct net *net, const struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 64bc8870db88..b787be651ce7 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -289,7 +289,6 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); -int __inet_hash(struct sock *sk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index baee5c075e6c..efa8a615b868 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -739,12 +739,15 @@ static int inet_reuseport_add_sock(struct sock *sk, return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } -int __inet_hash(struct sock *sk) +int inet_hash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; + if (sk->sk_state == TCP_CLOSE) + return 0; + if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); inet_ehash_nolisten(sk, NULL, NULL); @@ -772,17 +775,7 @@ unlock: return err; } -EXPORT_IPV6_MOD(__inet_hash); - -int inet_hash(struct sock *sk) -{ - int err = 0; - - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk); - - return err; -} +EXPORT_IPV6_MOD(inet_hash); void inet_unhash(struct sock *sk) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 64fcd7df0c9a..5e1da088d8e1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -368,14 +368,3 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); - -int inet6_hash(struct sock *sk) -{ - int err = 0; - - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk); - - return err; -} -EXPORT_SYMBOL_GPL(inet6_hash); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d1e5b2a186fb..9622c2776ade 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2355,7 +2355,7 @@ struct proto tcpv6_prot = { .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet6_hash, + .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, -- cgit v1.2.3 From c9809f03c158f07eaa76c7dd3606fc0a184520f2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:08:58 +0200 Subject: mptcp: pm: netlink: only add server-side attr when true This attribute is a boolean. No need to add it to set it to 'false'. Indeed, the default value when this attribute is not set is naturally 'false'. A few bytes can then be saved by not adding this attribute if the connection is not on the server side. This prepares the future deprecation of its attribute, in favour of a new flag. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-1-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 4 ++-- include/uapi/linux/mptcp_pm.h | 4 ++-- net/mptcp/pm_netlink.c | 4 +++- tools/testing/selftests/net/mptcp/userspace_pm.sh | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index d1b4829b580a..fc47a2931014 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side, [flags]. + dport, [server-side], [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side, [flags]. + dport, [server-side], [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 7359d34da446..bf44a5cf5b5a 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side, [flags]. + * sport, dport, [server-side], [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side, [flags]. + * sport, dport, [server-side], [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 483ddbb9ec40..33a6bf536c02 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -413,7 +413,9 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; - if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) + /* only set when it is the server side */ + if (READ_ONCE(msk->pm.server_side) && + nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) return -EMSGSIZE; if (READ_ONCE(msk->pm.remote_deny_join_id0)) diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 3d45991f24ed..87323942cb8a 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -241,7 +241,7 @@ make_connection() print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1" if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] && - [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] && + [ "${client_serverside:-0}" = 0 ] && [ "${server_serverside:-0}" = 1 ] && [ "${client_nojoin:-0}" = 0 ] && [ "${server_nojoin:-0}" = 1 ] then test_pass -- cgit v1.2.3 From 3d7ae91107b839ffeeb19730a2e2a46e0054bae8 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:08:59 +0200 Subject: mptcp: pm: netlink: announce server-side flag Now that the 'flags' attribute is used, it seems interesting to add one flag for 'server-side', a boolean value. This is duplicating the info from the dedicated 'server-side' attribute, but it will be deprecated in the next commit, and removed in a few versions. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-2-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm_netlink.c | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5fd5b4cf75ca..95d621f6d598 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -32,6 +32,7 @@ #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) #define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) +#define MPTCP_PM_EV_FLAG_SERVER_SIDE _BITUL(1) #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 33a6bf536c02..aa0c73faaa6a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -413,10 +413,13 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; - /* only set when it is the server side */ - if (READ_ONCE(msk->pm.server_side) && - nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) - return -EMSGSIZE; + if (READ_ONCE(msk->pm.server_side)) { + flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE; + + /* only set when it is the server side */ + if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) + return -EMSGSIZE; + } if (READ_ONCE(msk->pm.remote_deny_join_id0)) flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; -- cgit v1.2.3 From 5c967ebb551919661166305c0ff9422e41065c02 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:09:02 +0200 Subject: mptcp: use _BITUL() instead of (1 << x) Simply to use the proper way to declare bits, and to align with all other flags declared in this file. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-5-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 95d621f6d598..15eef878690b 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -34,11 +34,11 @@ #define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) #define MPTCP_PM_EV_FLAG_SERVER_SIDE _BITUL(1) -#define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) -#define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) -#define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) -#define MPTCP_PM_ADDR_FLAG_FULLMESH (1 << 3) -#define MPTCP_PM_ADDR_FLAG_IMPLICIT (1 << 4) +#define MPTCP_PM_ADDR_FLAG_SIGNAL _BITUL(0) +#define MPTCP_PM_ADDR_FLAG_SUBFLOW _BITUL(1) +#define MPTCP_PM_ADDR_FLAG_BACKUP _BITUL(2) +#define MPTCP_PM_ADDR_FLAG_FULLMESH _BITUL(3) +#define MPTCP_PM_ADDR_FLAG_IMPLICIT _BITUL(4) struct mptcp_info { __u8 mptcpi_subflows; -- cgit v1.2.3 From a571f08d3db215dd6ec294d8faac8cc4184bc4e4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 16 Sep 2025 22:46:36 +0100 Subject: net: phy: add phy_interface_copy() Add a helper for copying PHY interface bitmasks. This will be used by the SFP bus code, which will then be moved to phylink in the subsequent patches. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uydVU-000000061W8-2IDT@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7da9e19471c9..d09fc42e61f3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -169,6 +169,11 @@ static inline bool phy_interface_empty(const unsigned long *intf) return bitmap_empty(intf, PHY_INTERFACE_MODE_MAX); } +static inline void phy_interface_copy(unsigned long *d, const unsigned long *s) +{ + bitmap_copy(d, s, PHY_INTERFACE_MODE_MAX); +} + static inline unsigned int phy_interface_weight(const unsigned long *intf) { return bitmap_weight(intf, PHY_INTERFACE_MODE_MAX); -- cgit v1.2.3 From ddae6127afbba46e32af3b31eb7bba939e1fad96 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 16 Sep 2025 22:46:41 +0100 Subject: net: sfp: pre-parse the module support Pre-parse the module support on insert rather than when the upstream requests the data. This will allow more flexible and extensible parsing. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uydVZ-000000061WE-2pXD@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/sfp-bus.c | 80 ++++++++++++++++++++++++++++++++--------------- include/linux/sfp.h | 22 +++++++++++++ 2 files changed, 77 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index f13c00b5b449..35030c527fbe 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -22,7 +22,6 @@ struct sfp_bus { const struct sfp_socket_ops *socket_ops; struct device *sfp_dev; struct sfp *sfp; - const struct sfp_quirk *sfp_quirk; const struct sfp_upstream_ops *upstream_ops; void *upstream; @@ -30,6 +29,8 @@ struct sfp_bus { bool registered; bool started; + + struct sfp_module_caps caps; }; /** @@ -48,6 +49,13 @@ struct sfp_bus { */ int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support) +{ + return bus->caps.port; +} +EXPORT_SYMBOL_GPL(sfp_parse_port); + +static void sfp_module_parse_port(struct sfp_bus *bus, + const struct sfp_eeprom_id *id) { int port; @@ -91,21 +99,18 @@ int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, break; } - if (support) { - switch (port) { - case PORT_FIBRE: - phylink_set(support, FIBRE); - break; + switch (port) { + case PORT_FIBRE: + phylink_set(bus->caps.link_modes, FIBRE); + break; - case PORT_TP: - phylink_set(support, TP); - break; - } + case PORT_TP: + phylink_set(bus->caps.link_modes, TP); + break; } - return port; + bus->caps.port = port; } -EXPORT_SYMBOL_GPL(sfp_parse_port); /** * sfp_may_have_phy() - indicate whether the module may have a PHY @@ -117,8 +122,17 @@ EXPORT_SYMBOL_GPL(sfp_parse_port); */ bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id) { - if (id->base.e1000_base_t) - return true; + return bus->caps.may_have_phy; +} +EXPORT_SYMBOL_GPL(sfp_may_have_phy); + +static void sfp_module_parse_may_have_phy(struct sfp_bus *bus, + const struct sfp_eeprom_id *id) +{ + if (id->base.e1000_base_t) { + bus->caps.may_have_phy = true; + return; + } if (id->base.phys_id != SFF8024_ID_DWDM_SFP) { switch (id->base.extended_cc) { @@ -126,13 +140,13 @@ bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id) case SFF8024_ECC_10GBASE_T_SR: case SFF8024_ECC_5GBASE_T: case SFF8024_ECC_2_5GBASE_T: - return true; + bus->caps.may_have_phy = true; + return; } } - return false; + bus->caps.may_have_phy = false; } -EXPORT_SYMBOL_GPL(sfp_may_have_phy); /** * sfp_parse_support() - Parse the eeprom id for supported link modes @@ -148,8 +162,17 @@ EXPORT_SYMBOL_GPL(sfp_may_have_phy); void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support, unsigned long *interfaces) { + linkmode_or(support, support, bus->caps.link_modes); + phy_interface_copy(interfaces, bus->caps.interfaces); +} +EXPORT_SYMBOL_GPL(sfp_parse_support); + +static void sfp_module_parse_support(struct sfp_bus *bus, + const struct sfp_eeprom_id *id) +{ + unsigned long *interfaces = bus->caps.interfaces; + unsigned long *modes = bus->caps.link_modes; unsigned int br_min, br_nom, br_max; - __ETHTOOL_DECLARE_LINK_MODE_MASK(modes) = { 0, }; /* Decode the bitrate information to MBd */ br_min = br_nom = br_max = 0; @@ -338,13 +361,22 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, phylink_set(modes, Autoneg); phylink_set(modes, Pause); phylink_set(modes, Asym_Pause); +} + +static void sfp_init_module(struct sfp_bus *bus, + const struct sfp_eeprom_id *id, + const struct sfp_quirk *quirk) +{ + memset(&bus->caps, 0, sizeof(bus->caps)); - if (bus->sfp_quirk && bus->sfp_quirk->modes) - bus->sfp_quirk->modes(id, modes, interfaces); + sfp_module_parse_support(bus, id); + sfp_module_parse_port(bus, id); + sfp_module_parse_may_have_phy(bus, id); - linkmode_or(support, support, modes); + if (quirk && quirk->modes) + quirk->modes(id, bus->caps.link_modes, + bus->caps.interfaces); } -EXPORT_SYMBOL_GPL(sfp_parse_support); /** * sfp_select_interface() - Select appropriate phy_interface_t mode @@ -794,7 +826,7 @@ int sfp_module_insert(struct sfp_bus *bus, const struct sfp_eeprom_id *id, const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus); int ret = 0; - bus->sfp_quirk = quirk; + sfp_init_module(bus, id, quirk); if (ops && ops->module_insert) ret = ops->module_insert(bus->upstream, id); @@ -809,8 +841,6 @@ void sfp_module_remove(struct sfp_bus *bus) if (ops && ops->module_remove) ops->module_remove(bus->upstream); - - bus->sfp_quirk = NULL; } EXPORT_SYMBOL_GPL(sfp_module_remove); diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 60c65cea74f6..5fb59cf49882 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -521,6 +521,28 @@ struct ethtool_eeprom; struct ethtool_modinfo; struct sfp_bus; +/** + * struct sfp_module_caps - sfp module capabilities + * @interfaces: bitmap of interfaces that the module may support + * @link_modes: bitmap of ethtool link modes that the module may support + */ +struct sfp_module_caps { + DECLARE_PHY_INTERFACE_MASK(interfaces); + __ETHTOOL_DECLARE_LINK_MODE_MASK(link_modes); + /** + * @may_have_phy: indicate whether the module may have an ethernet PHY + * There is no way to be sure that a module has a PHY as the EEPROM + * doesn't contain this information. When set, this does not mean that + * the module definitely has a PHY. + */ + bool may_have_phy; + /** + * @port: one of ethtool %PORT_* definitions, parsed from the module + * EEPROM, or %PORT_OTHER if the port type is not known. + */ + u8 port; +}; + /** * struct sfp_upstream_ops - upstream operations structure * @attach: called when the sfp socket driver is bound to the upstream -- cgit v1.2.3 From 64fb4a3ae8a5d9c4d27d9f4ae58e38200fc3d44b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 16 Sep 2025 22:46:51 +0100 Subject: net: sfp: provide sfp_get_module_caps() Provide a function to retrieve the current sfp_module_caps structure so that upstreams can get the entire module support in one go. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uydVj-000000061WQ-3q47@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/sfp-bus.c | 6 ++++++ include/linux/sfp.h | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index b77190494b04..e8cf411396fa 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -33,6 +33,12 @@ struct sfp_bus { struct sfp_module_caps caps; }; +const struct sfp_module_caps *sfp_get_module_caps(struct sfp_bus *bus) +{ + return &bus->caps; +} +EXPORT_SYMBOL_GPL(sfp_get_module_caps); + /** * sfp_parse_port() - Parse the EEPROM base ID, setting the port type * @bus: a pointer to the &struct sfp_bus structure for the sfp module diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 5fb59cf49882..9f29fcad52be 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -576,6 +576,7 @@ struct sfp_upstream_ops { }; #if IS_ENABLED(CONFIG_SFP) +const struct sfp_module_caps *sfp_get_module_caps(struct sfp_bus *bus); int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support); bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id); @@ -600,6 +601,12 @@ int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, void sfp_bus_del_upstream(struct sfp_bus *bus); const char *sfp_get_name(struct sfp_bus *bus); #else +static inline const struct sfp_module_caps * +sfp_get_module_caps(struct sfp_bus *bus) +{ + return NULL; +} + static inline int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support) -- cgit v1.2.3 From 9ce138735efcb395974952972aa5dbd1d444ac2c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 16 Sep 2025 22:47:07 +0100 Subject: net: sfp: remove old sfp_parse_* functions Remove the old sfp_parse_*() functions that are now no longer used. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uydVz-000000061Wj-13Yd@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/sfp-bus.c | 54 ----------------------------------------------- include/linux/sfp.h | 25 ---------------------- 2 files changed, 79 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index e8cf411396fa..b945d75966d5 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -39,27 +39,6 @@ const struct sfp_module_caps *sfp_get_module_caps(struct sfp_bus *bus) } EXPORT_SYMBOL_GPL(sfp_get_module_caps); -/** - * sfp_parse_port() - Parse the EEPROM base ID, setting the port type - * @bus: a pointer to the &struct sfp_bus structure for the sfp module - * @id: a pointer to the module's &struct sfp_eeprom_id - * @support: optional pointer to an array of unsigned long for the - * ethtool support mask - * - * Parse the EEPROM identification given in @id, and return one of - * %PORT_TP, %PORT_FIBRE or %PORT_OTHER. If @support is non-%NULL, - * also set the ethtool %ETHTOOL_LINK_MODE_xxx_BIT corresponding with - * the connector type. - * - * If the port type is not known, returns %PORT_OTHER. - */ -int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support) -{ - return bus->caps.port; -} -EXPORT_SYMBOL_GPL(sfp_parse_port); - static void sfp_module_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id) { @@ -118,20 +97,6 @@ static void sfp_module_parse_port(struct sfp_bus *bus, bus->caps.port = port; } -/** - * sfp_may_have_phy() - indicate whether the module may have a PHY - * @bus: a pointer to the &struct sfp_bus structure for the sfp module - * @id: a pointer to the module's &struct sfp_eeprom_id - * - * Parse the EEPROM identification given in @id, and return whether - * this module may have a PHY. - */ -bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id) -{ - return bus->caps.may_have_phy; -} -EXPORT_SYMBOL_GPL(sfp_may_have_phy); - static void sfp_module_parse_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id) { @@ -154,25 +119,6 @@ static void sfp_module_parse_may_have_phy(struct sfp_bus *bus, bus->caps.may_have_phy = false; } -/** - * sfp_parse_support() - Parse the eeprom id for supported link modes - * @bus: a pointer to the &struct sfp_bus structure for the sfp module - * @id: a pointer to the module's &struct sfp_eeprom_id - * @support: pointer to an array of unsigned long for the ethtool support mask - * @interfaces: pointer to an array of unsigned long for phy interface modes - * mask - * - * Parse the EEPROM identification information and derive the supported - * ethtool link modes for the module. - */ -void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support, unsigned long *interfaces) -{ - linkmode_or(support, support, bus->caps.link_modes); - phy_interface_copy(interfaces, bus->caps.interfaces); -} -EXPORT_SYMBOL_GPL(sfp_parse_support); - static void sfp_module_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id) { diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 9f29fcad52be..5c71945a5e4d 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -577,11 +577,6 @@ struct sfp_upstream_ops { #if IS_ENABLED(CONFIG_SFP) const struct sfp_module_caps *sfp_get_module_caps(struct sfp_bus *bus); -int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support); -bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id); -void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support, unsigned long *interfaces); phy_interface_t sfp_select_interface(struct sfp_bus *bus, const unsigned long *link_modes); @@ -607,26 +602,6 @@ sfp_get_module_caps(struct sfp_bus *bus) return NULL; } -static inline int sfp_parse_port(struct sfp_bus *bus, - const struct sfp_eeprom_id *id, - unsigned long *support) -{ - return PORT_OTHER; -} - -static inline bool sfp_may_have_phy(struct sfp_bus *bus, - const struct sfp_eeprom_id *id) -{ - return false; -} - -static inline void sfp_parse_support(struct sfp_bus *bus, - const struct sfp_eeprom_id *id, - unsigned long *support, - unsigned long *interfaces) -{ -} - static inline phy_interface_t sfp_select_interface(struct sfp_bus *bus, const unsigned long *link_modes) { -- cgit v1.2.3 From 17b14d235f58155a05cd9371e4559361ca3c67da Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:49 +0000 Subject: net: move sk_uid and sk_protocol to sock_read_tx sk_uid and sk_protocol are read from inet6_csk_route_socket() for each TCP transmit. Also read from udpv6_sendmsg(), udp_sendmsg() and others. Move them to sock_read_tx for better cache locality. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 6 +++--- net/core/sock.c | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index ee95081b0c0b..66c2f396b57d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -492,6 +492,9 @@ struct sock { long sk_sndtimeo; u32 sk_priority; u32 sk_mark; + kuid_t sk_uid; + u16 sk_protocol; + u16 sk_type; struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; #ifdef CONFIG_SOCK_VALIDATE_XMIT @@ -517,15 +520,12 @@ struct sock { sk_no_check_tx : 1, sk_no_check_rx : 1; u8 sk_shutdown; - u16 sk_type; - u16 sk_protocol; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; - kuid_t sk_uid; unsigned long sk_ino; spinlock_t sk_peer_lock; int sk_bind_phc; diff --git a/net/core/sock.c b/net/core/sock.c index 21742da19e45..ad79efde4476 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4471,6 +4471,8 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); -- cgit v1.2.3 From 9303c3ced111803dcd1aa36a778f290977935ca5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:50 +0000 Subject: net: move sk->sk_err_soft and sk->sk_sndbuf sk->sk_sndbuf is read-mostly in tx path, so move it from sock_write_tx group to more appropriate sock_read_tx. sk->sk_err_soft was not identified previously, but is used from tcp_ack(). Move it to sock_write_tx group for better cache locality. Also change tcp_ack() to clear sk->sk_err_soft only if needed. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 ++-- net/core/sock.c | 3 ++- net/ipv4/tcp_input.c | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 66c2f396b57d..b4fefeea0213 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -467,7 +467,7 @@ struct sock { __cacheline_group_begin(sock_write_tx); int sk_write_pending; atomic_t sk_omem_alloc; - int sk_sndbuf; + int sk_err_soft; int sk_wmem_queued; refcount_t sk_wmem_alloc; @@ -507,6 +507,7 @@ struct sock { unsigned int sk_gso_max_size; gfp_t sk_allocation; u32 sk_txhash; + int sk_sndbuf; u8 sk_pacing_shift; bool sk_use_task_frag; __cacheline_group_end(sock_read_tx); @@ -523,7 +524,6 @@ struct sock { unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; - int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; unsigned long sk_ino; diff --git a/net/core/sock.c b/net/core/sock.c index ad79efde4476..dc03d4b5909a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4452,7 +4452,7 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); @@ -4479,6 +4479,7 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9fdc6ce25eb1..f93d48d98d5d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4085,7 +4085,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* We passed data and got it acked, remove any soft error * log. Something worked... */ - WRITE_ONCE(sk->sk_err_soft, 0); + if (READ_ONCE(sk->sk_err_soft)) + WRITE_ONCE(sk->sk_err_soft, 0); WRITE_ONCE(icsk->icsk_probes_out, 0); tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) -- cgit v1.2.3 From 1b44d700023e77dd92821e7811db825e75a1a394 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:52 +0000 Subject: tcp: move tcp->rcv_tstamp to tcp_sock_write_txrx group tcp_ack() writes this field, it belongs to tcp_sock_write_txrx. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/tcp_sock.rst | 2 +- include/linux/tcp.h | 4 ++-- net/ipv4/tcp.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index d4dc01800945..429df29fba8b 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -26,7 +26,7 @@ u64 bytes_acked read_w u32 dsack_dups u32 snd_una read_mostly read_write tcp_wnd_end,tcp_urg_mode,tcp_minshall_check,tcp_cwnd_validate(tx);tcp_ack,tcp_may_update_window,tcp_clean_rtx_queue(write),tcp_ack_tstamp(rx) u32 snd_sml read_write tcp_minshall_check,tcp_minshall_update -u32 rcv_tstamp read_mostly tcp_ack +u32 rcv_tstamp read_write read_write tcp_ack void * tcp_clean_acked read_mostly tcp_ack u32 lsndtime read_write tcp_slow_start_after_idle_check,tcp_event_data_sent u32 last_oow_ack_time diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3ca5ed02de6d..1e6c2ded22c9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -238,7 +238,6 @@ struct tcp_sock { /* RX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_rx); u32 copied_seq; /* Head of yet unread data */ - u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 snd_wl1; /* Sequence for window update */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 rttvar_us; /* smoothed mdev_max */ @@ -246,13 +245,13 @@ struct tcp_sock { u16 advmss; /* Advertised MSS */ u16 urg_data; /* Saved octet of OOB data and control flags */ u32 lost; /* Total data packets lost incl. rexmits */ + u32 snd_ssthresh; /* Slow start size threshold */ struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; #if defined(CONFIG_TLS_DEVICE) void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); #endif - u32 snd_ssthresh; /* Slow start size threshold */ u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_rx); @@ -319,6 +318,7 @@ struct tcp_sock { */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ + u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ /* * Options received (usually on last packet, some only on SYN packets). */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5932dba3bd71..721287ca3328 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5114,7 +5114,6 @@ static void __init tcp_struct_check(void) /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); @@ -5164,6 +5163,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); /* RX read-write hotpath cache lines */ -- cgit v1.2.3 From 969904dcd77dbb0a773d66cddaa59eccc6415d03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:53 +0000 Subject: tcp: move recvmsg_inq to tcp_sock_read_txrx Fill a hole in tcp_sock_read_txrx, instead of possibly wasting a cache line. Note that tcp_recvmsg_locked() is also reading tp->repair, so this removes one cache line miss in tcp recvmsg(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/tcp_sock.rst | 2 +- include/linux/tcp.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 429df29fba8b..c2138619b995 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -57,7 +57,7 @@ u8:1 is_sack_reneg read_m u8:2 fastopen_client_fail u8:4 nonagle read_write tcp_skb_entail,tcp_push_pending_frames u8:1 thin_lto -u8:1 recvmsg_inq +u8:1 recvmsg_inq read_mostly tcp_recvmsg u8:1 repair read_mostly tcp_write_xmit u8:1 frto u8 repair_queue diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1e6c2ded22c9..c1d7fce251d7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -232,7 +232,8 @@ struct tcp_sock { repair : 1, tcp_usec_ts : 1, /* TSval values in usec */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ - is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ + recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_txrx); /* RX read-mostly hotpath cache lines */ @@ -252,7 +253,6 @@ struct tcp_sock { #if defined(CONFIG_TLS_DEVICE) void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); #endif - u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_rx); /* TX read-write hotpath cache lines */ -- cgit v1.2.3 From a105ea47a4e855d24ebf65f1c5fb907162e7b8cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:54 +0000 Subject: tcp: move tcp_clean_acked to tcp_sock_read_tx group tp->tcp_clean_acked is fetched in tx path when snd_una is updated. This field thus belongs to tcp_sock_read_tx group. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/tcp_sock.rst | 2 +- include/linux/tcp.h | 6 +++--- net/ipv4/tcp.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index c2138619b995..26f32dbcf6ec 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -27,7 +27,7 @@ u32 dsack_dups u32 snd_una read_mostly read_write tcp_wnd_end,tcp_urg_mode,tcp_minshall_check,tcp_cwnd_validate(tx);tcp_ack,tcp_may_update_window,tcp_clean_rtx_queue(write),tcp_ack_tstamp(rx) u32 snd_sml read_write tcp_minshall_check,tcp_minshall_update u32 rcv_tstamp read_write read_write tcp_ack -void * tcp_clean_acked read_mostly tcp_ack +void * tcp_clean_acked read_mostly tcp_ack u32 lsndtime read_write tcp_slow_start_after_idle_check,tcp_event_data_sent u32 last_oow_ack_time u32 compressed_ack_rcv_nxt diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c1d7fce251d7..3f282130c863 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -215,6 +215,9 @@ struct tcp_sock { u16 gso_segs; /* Max number of segs per GSO packet */ /* from STCP, retrans queue hinting */ struct sk_buff *retransmit_skb_hint; +#if defined(CONFIG_TLS_DEVICE) + void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); +#endif __cacheline_group_end(tcp_sock_read_tx); /* TXRX read-mostly hotpath cache lines */ @@ -250,9 +253,6 @@ struct tcp_sock { struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; -#if defined(CONFIG_TLS_DEVICE) - void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); -#endif __cacheline_group_end(tcp_sock_read_rx); /* TX read-write hotpath cache lines */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 721287ca3328..7949d16506a4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5101,6 +5101,9 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); +#if IS_ENABLED(CONFIG_TLS_DEVICE) + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked); +#endif /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); @@ -5124,9 +5127,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); -#if IS_ENABLED(CONFIG_TLS_DEVICE) - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked); -#endif /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); -- cgit v1.2.3 From 31c4511bbb0c75c525b6e4f4fe4167f2e9d3b05c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:55 +0000 Subject: tcp: move mtu_info to remove two 32bit holes This removes 8bytes waste on 64bit builds. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3f282130c863..20b8c6e21fef 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -448,6 +448,9 @@ struct tcp_sock { * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ + u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG + * while socket was owned by user. + */ u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans * Total data bytes retransmitted */ @@ -494,9 +497,6 @@ struct tcp_sock { u32 probe_seq_end; } mtu_probe; u32 plb_rehash; /* PLB-triggered rehash attempts */ - u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG - * while socket was owned by user. - */ #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif -- cgit v1.2.3 From 649091ef597bb7de34dd8ceea39bbc4252970558 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:56 +0000 Subject: tcp: reclaim 8 bytes in struct request_sock_queue synflood_warned had to be u32 for xchg(), but ensuring atomicity is not really needed. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/request_sock.h | 2 +- net/ipv4/tcp_input.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 6a5ec1418e85..cd4d4cf71d0d 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -185,8 +185,8 @@ struct fastopen_queue { struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; + u8 synflood_warned; - u32 synflood_warned; atomic_t qlen; atomic_t young; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f93d48d98d5d..79d5252ed6cc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7282,8 +7282,8 @@ static bool tcp_syn_flood_action(struct sock *sk, const char *proto) #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 && - xchg(&queue->synflood_warned, 1) == 0) { + if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) { + WRITE_ONCE(queue->synflood_warned, 1); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", proto, inet6_rcv_saddr(sk), -- cgit v1.2.3 From 349271568303695f0ac3563af153d2b4542f6986 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 21 Sep 2025 18:01:16 +0200 Subject: bpf: Implement signature verification for BPF programs This patch extends the BPF_PROG_LOAD command by adding three new fields to `union bpf_attr` in the user-space API: - signature: A pointer to the signature blob. - signature_size: The size of the signature blob. - keyring_id: The serial number of a loaded kernel keyring (e.g., the user or session keyring) containing the trusted public keys. When a BPF program is loaded with a signature, the kernel: 1. Retrieves the trusted keyring using the provided `keyring_id`. 2. Verifies the supplied signature against the BPF program's instruction buffer. 3. If the signature is valid and was generated by a key in the trusted keyring, the program load proceeds. 4. If no signature is provided, the load proceeds as before, allowing for backward compatibility. LSMs can chose to restrict unsigned programs and implement a security policy. 5. If signature verification fails for any reason, the program is not loaded. Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250921160120.9711-2-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- crypto/asymmetric_keys/pkcs7_verify.c | 1 + include/linux/verification.h | 1 + include/uapi/linux/bpf.h | 10 ++++++++ kernel/bpf/helpers.c | 2 +- kernel/bpf/syscall.c | 45 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 10 ++++++++ tools/lib/bpf/bpf.c | 2 +- 7 files changed, 68 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index f0d4ff3c20a8..6d6475e3a9bf 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -429,6 +429,7 @@ int pkcs7_verify(struct pkcs7_message *pkcs7, /* Authattr presence checked in parser */ break; case VERIFYING_UNSPECIFIED_SIGNATURE: + case VERIFYING_BPF_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid unspecified sig (not pkcs7-data)\n"); return -EKEYREJECTED; diff --git a/include/linux/verification.h b/include/linux/verification.h index 4f3022d081c3..dec7f2beabfd 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -36,6 +36,7 @@ enum key_being_used_for { VERIFYING_KEY_SIGNATURE, VERIFYING_KEY_SELF_SIGNATURE, VERIFYING_UNSPECIFIED_SIGNATURE, + VERIFYING_BPF_SIGNATURE, NR__KEY_BEING_USED_FOR }; #ifdef CONFIG_SYSTEM_DATA_VERIFICATION diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0987b52d5648..f3b173e48b0f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1611,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. + */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ef4ede8bb74f..969f63f8ca28 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3898,7 +3898,7 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + VERIFYING_BPF_SIGNATURE, NULL, NULL); #else return -EOPNOTSUPP; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf7173b1bb83..8a3c3d26f6e2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2785,8 +2786,44 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, + bool is_kernel) +{ + bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); + struct bpf_dynptr_kern sig_ptr, insns_ptr; + struct bpf_key *key = NULL; + void *sig; + int err = 0; + + if (system_keyring_id_check(attr->keyring_id) == 0) + key = bpf_lookup_system_key(attr->keyring_id); + else + key = bpf_lookup_user_key(attr->keyring_id, 0); + + if (!key) + return -EINVAL; + + sig = kvmemdup_bpfptr(usig, attr->signature_size); + if (IS_ERR(sig)) { + bpf_key_put(key); + return -ENOMEM; + } + + bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, + attr->signature_size); + bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, + prog->len * sizeof(struct bpf_insn)); + + err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, + (struct bpf_dynptr *)&sig_ptr, key); + + bpf_key_put(key); + kvfree(sig); + return err; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt +#define BPF_PROG_LOAD_LAST_FIELD keyring_id static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { @@ -2950,6 +2987,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; + if (attr->signature) { + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + if (err) + goto free_prog; + } + prog->orig_prog = NULL; prog->jited = 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0987b52d5648..f3b173e48b0f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1611,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. + */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 19ad7bcf0c2f..339b19797237 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -240,7 +240,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insn_cnt, struct bpf_prog_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, fd_array_cnt); + const size_t attr_sz = offsetofend(union bpf_attr, keyring_id); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; -- cgit v1.2.3 From 39f17c707454290900b608ee5a200b5db9245626 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 14 Sep 2025 13:09:08 +0200 Subject: sched/task.h: fix the wrong comment on task_lock() nesting with tasklist_lock The ancient comment above task_lock() states that it can be nested outside of read_lock(&tasklist_lock), but this is no longer true: CPU_0 CPU_1 CPU_2 task_lock() read_lock(tasklist) write_lock_irq(tasklist) read_lock(tasklist) task_lock() Unless CPU_0 calls read_lock() in IRQ context, queued_read_lock_slowpath() won't get the lock immediately, it will spin waiting for the pending writer on CPU_2, resulting in a deadlock. Link: https://lkml.kernel.org/r/20250914110908.GA18769@redhat.com Signed-off-by: Oleg Nesterov Cc: Christian Brauner Cc: Jiri Slaby Cc: Mateusz Guzik Signed-off-by: Andrew Morton --- include/linux/sched/task.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ea41795a352b..8ff98b18b24b 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -210,9 +210,8 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * - * Nests both inside and outside of read_lock(&tasklist_lock). - * It must not be nested with write_lock_irq(&tasklist_lock), - * neither inside nor outside. + * Nests inside of read_lock(&tasklist_lock). It must not be nested with + * write_lock_irq(&tasklist_lock), neither inside nor outside. */ static inline void task_lock(struct task_struct *p) { -- cgit v1.2.3 From af6703838ecb1513efdd2502a8f7bb6472c5ce96 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 3 Sep 2025 18:48:41 +0100 Subject: mm: specify separate file and vm_file params in vm_area_desc Patch series "mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare()", v2. As part of the efforts to eliminate the problematic f_op->mmap callback, a new callback - f_op->mmap_prepare was provided. While we are converting these callbacks, we must deal with 'stacked' filesystems and drivers - those which in their own f_op->mmap callback invoke an inner f_op->mmap callback. To accomodate for this, a compatibility layer is provided that, via vfs_mmap(), detects if f_op->mmap_prepare is provided and if so, generates a vm_area_desc containing the VMA's metadata and invokes the call. So far, we have provided desc->file equal to vma->vm_file. However this is not necessarily valid, especially in the case of stacked drivers which wish to assign a new file after the inner hook is invoked. To account for this, we adjust vm_area_desc to have both file and vm_file fields. The .vm_file field is strictly set to vma->vm_file (or in the case of a new mapping, what will become vma->vm_file). However, .file is set to whichever file vfs_mmap() is invoked with when using the compatibilty layer. Therefore, if the VMA's file needs to be updated in .mmap_prepare, desc->vm_file should be assigned, whilst desc->file should be read. No current f_op->mmap_prepare users assign desc->file so this is safe to do. This makes the .mmap_prepare callback in the context of a stacked filesystem or driver completely consistent with the existing .mmap implementations. While we're here, we do a few small cleanups, and ensure that we const-ify things correctly in the vm_area_desc struct to avoid hooks accidentally trying to assign fields they should not. This patch (of 2): Stacked filesystems and drivers may invoke mmap hooks with a struct file pointer that differs from the overlying file. We will make this functionality possible in a subsequent patch. In order to prepare for this, let's update vm_area_struct to separately provide desc->file and desc->vm_file parameters. The desc->file parameter is the file that the hook is expected to operate upon, and is not assignable (though the hok may wish to e.g. update the file's accessed time for instance). The desc->vm_file defaults to what will become vma->vm_file and is what the hook must reassign should it wish to change the VMA"s vma->vm_file. For now we keep desc->file, vm_file the same to remain consistent. No f_op->mmap_prepare() callback sets a new vma->vm_file currently, so this is safe to change. While we're here, make the mm_struct desc->mm pointers at immutable as well as the desc->mm field itself. As part of this change, also update the single hook which this would otherwise break - mlock_future_ok(), invoked by secretmem_mmap_prepare()). We additionally update set_vma_from_desc() to compare fields in a more logical fashion, checking the (possibly) user-modified fields as the first operand against the existing value as the second one. Additionally, update VMA tests to accommodate changes. Link: https://lkml.kernel.org/r/cover.1756920635.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/3fa15a861bb7419f033d22970598aa61850ea267.1756920635.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++-- mm/internal.h | 4 ++-- mm/mmap.c | 2 +- mm/util.c | 14 ++++++++++++-- mm/vma.c | 5 +++-- mm/vma.h | 28 ++++------------------------ tools/testing/vma/vma_internal.h | 28 ++++++++++++++++++---------- 7 files changed, 43 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6920c816f6c6..965dedb3ccfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -777,13 +777,14 @@ struct pfnmap_track_ctx { */ struct vm_area_desc { /* Immutable state. */ - struct mm_struct *mm; + const struct mm_struct *const mm; + struct file *const file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; - struct file *file; + struct file *vm_file; vm_flags_t vm_flags; pgprot_t page_prot; diff --git a/mm/internal.h b/mm/internal.h index c4657ffd342e..63e3ec8d63be 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -962,8 +962,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); -extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, - unsigned long bytes); +bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, + unsigned long bytes); /* * NOTE: This function can't tell whether the folio is "fully mapped" in the diff --git a/mm/mmap.c b/mm/mmap.c index 7a057e0e8da9..5fd3b80fda1d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -225,7 +225,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, +bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; diff --git a/mm/util.c b/mm/util.c index 732a2dfcaec7..215ecd0214b7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1161,10 +1161,20 @@ EXPORT_SYMBOL(flush_dcache_folio); */ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc; + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = vma->vm_file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + }; int err; - err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); + err = file->f_op->mmap_prepare(&desc); if (err) return err; set_vma_from_desc(vma, &desc); diff --git a/mm/vma.c b/mm/vma.c index 3b12c7579831..abe0da33c844 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2572,11 +2572,12 @@ static int call_mmap_prepare(struct mmap_state *map) int err; struct vm_area_desc desc = { .mm = map->mm, + .file = map->file, .start = map->addr, .end = map->end, .pgoff = map->pgoff, - .file = map->file, + .vm_file = map->file, .vm_flags = map->vm_flags, .page_prot = map->page_prot, }; @@ -2588,7 +2589,7 @@ static int call_mmap_prepare(struct mmap_state *map) /* Update fields permitted to be changed. */ map->pgoff = desc.pgoff; - map->file = desc.file; + map->file = desc.vm_file; map->vm_flags = desc.vm_flags; map->page_prot = desc.page_prot; /* User-defined fields. */ diff --git a/mm/vma.h b/mm/vma.h index bcdc261c5b15..9183fe549009 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -222,31 +222,11 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } - /* - * Temporary helper functions for file systems which wrap an invocation of + * Temporary helper function for stacked mmap handlers which specify * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare(). */ - -static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, - struct vm_area_desc *desc) -{ - desc->mm = vma->vm_mm; - desc->start = vma->vm_start; - desc->end = vma->vm_end; - - desc->pgoff = vma->vm_pgoff; - desc->file = vma->vm_file; - desc->vm_flags = vma->vm_flags; - desc->page_prot = vma->vm_page_prot; - - desc->vm_ops = NULL; - desc->private_data = NULL; - - return desc; -} - static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { @@ -258,9 +238,9 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma, /* Mutable fields. Populated with initial state. */ vma->vm_pgoff = desc->pgoff; - if (vma->vm_file != desc->file) - vma_set_file(vma, desc->file); - if (vma->vm_flags != desc->vm_flags) + if (desc->vm_file != vma->vm_file) + vma_set_file(vma, desc->vm_file); + if (desc->vm_flags != vma->vm_flags) vm_flags_set(vma, desc->vm_flags); vma->vm_page_prot = desc->page_prot; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 437d2a1013be..6dcbeaa9f9a0 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -283,13 +283,14 @@ struct vm_area_struct; */ struct vm_area_desc { /* Immutable state. */ - struct mm_struct *mm; + const struct mm_struct *const mm; + struct file *const file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; - struct file *file; + struct file *vm_file; vm_flags_t vm_flags; pgprot_t page_prot; @@ -1299,8 +1300,8 @@ static inline bool capable(int cap) return true; } -static inline bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, - unsigned long bytes) +static inline bool mlock_future_ok(const struct mm_struct *mm, + vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; @@ -1465,16 +1466,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, - struct vm_area_desc *desc); - -static int compat_vma_mmap_prepare(struct file *file, +static inline int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc; + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = vma->vm_file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + }; int err; - err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); + err = file->f_op->mmap_prepare(&desc); if (err) return err; set_vma_from_desc(vma, &desc); -- cgit v1.2.3 From f7a741c53b712542aedd9382f215fbe969f8a580 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 3 Sep 2025 18:48:42 +0100 Subject: mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare() In commit bb666b7c2707 ("mm: add mmap_prepare() compatibility layer for nested file systems") we introduced the ability for stacked drivers and file systems to correctly invoke the f_op->mmap_prepare() handler from an f_op->mmap() handler via a compatibility layer implemented in compat_vma_mmap_prepare(). This populates vm_area_desc fields according to those found in the (not yet fully initialised) VMA passed to f_op->mmap(). However this function implicitly assumes that the struct file which we are operating upon is equal to vma->vm_file. This is not a safe assumption in all cases. The only really sane situation in which this matters would be something like e.g. i915_gem_dmabuf_mmap() which invokes vfs_mmap() against obj->base.filp: ret = vfs_mmap(obj->base.filp, vma); if (ret) return ret; And then sets the VMA's file to this, should the mmap operation succeed: vma_set_file(vma, obj->base.filp); That is - it is the file that is intended to back the VMA mapping. This is not an issue currently, as so far we have only implemented f_op->mmap_prepare() handlers for some file systems and internal mm uses, and the only stacked f_op->mmap() operations that can be performed upon these are those in backing_file_mmap() and coda_file_mmap(), both of which use vma->vm_file. However, moving forward, as we convert drivers to using f_op->mmap_prepare(), this will become a problem. Resolve this issue by explicitly setting desc->file to the provided file parameter and update callers accordingly. Callers are expected to read desc->file and update desc->vm_file - the former will be the file provided by the caller (if stacked, this may differ from vma->vm_file). If the caller needs to differentiate between the two they therefore now can. While we are here, also provide a variant of compat_vma_mmap_prepare() that operates against a pointer to any file_operations struct and does not assume that the file_operations struct we are interested in is file->f_op. This function is __compat_vma_mmap_prepare() and we invoke it from compat_vma_mmap_prepare() so that we share code between the two functions. This is important, because some drivers provide hooks in a separate struct, for instance struct drm_device provides an fops field for this purpose. Also update the VMA selftests accordingly. Link: https://lkml.kernel.org/r/dd0c72df8a33e8ffaa243eeb9b01010b670610e9.1756920635.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Christian Brauner Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Cc: Al Viro Cc: David Hildenbrand Cc: Jan Kara Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 ++ mm/util.c | 62 +++++++++++++++++++++++++--------------- tools/testing/vma/vma_internal.h | 12 ++++++-- 3 files changed, 50 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0783c5d05d3f..594bd4d0521e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2279,6 +2279,8 @@ static inline bool can_mmap_file(struct file *file) return true; } +int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma); int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/mm/util.c b/mm/util.c index 215ecd0214b7..6c1d64ed0221 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1133,17 +1133,51 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif +/** + * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare() + * for details. This is the same operation, only with a specific file operations + * struct which may or may not be the same as vma->vm_file->f_op. + * @f_op: The file operations whose .mmap_prepare() hook is specified. + * @file: The file which backs or will back the mapping. + * @vma: The VMA to apply the .mmap_prepare() hook to. + * Returns: 0 on success or error. + */ +int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + }; + int err; + + err = f_op->mmap_prepare(&desc); + if (err) + return err; + set_vma_from_desc(vma, &desc); + + return 0; +} +EXPORT_SYMBOL(__compat_vma_mmap_prepare); + /** * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an - * existing VMA - * @file: The file which possesss an f_op->mmap_prepare() hook + * existing VMA. + * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain - * 'wrapper' file systems invoke a nested mmap hook of an underlying file. + * stacked filesystems invoke a nested mmap hook of an underlying file. * * Until all filesystems are converted to use .mmap_prepare(), we must be - * conservative and continue to invoke these 'wrapper' filesystems using the + * conservative and continue to invoke these stacked filesystems using the * deprecated .mmap() hook. * * However we have a problem if the underlying file system possesses an @@ -1161,25 +1195,7 @@ EXPORT_SYMBOL(flush_dcache_folio); */ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = vma->vm_file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vm_flags = vma->vm_flags, - .page_prot = vma->vm_page_prot, - }; - int err; - - err = file->f_op->mmap_prepare(&desc); - if (err) - return err; - set_vma_from_desc(vma, &desc); - - return 0; + return __compat_vma_mmap_prepare(file->f_op, file, vma); } EXPORT_SYMBOL(compat_vma_mmap_prepare); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 6dcbeaa9f9a0..07167446dcf4 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1466,8 +1466,8 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int compat_vma_mmap_prepare(struct file *file, - struct vm_area_struct *vma) +static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, @@ -1482,7 +1482,7 @@ static inline int compat_vma_mmap_prepare(struct file *file, }; int err; - err = file->f_op->mmap_prepare(&desc); + err = f_op->mmap_prepare(&desc); if (err) return err; set_vma_from_desc(vma, &desc); @@ -1490,6 +1490,12 @@ static inline int compat_vma_mmap_prepare(struct file *file, return 0; } +static inline int compat_vma_mmap_prepare(struct file *file, + struct vm_area_struct *vma) +{ + return __compat_vma_mmap_prepare(file->f_op, file, vma); +} + /* Did the driver provide valid mmap hook configuration? */ static inline bool can_mmap_file(struct file *file) { -- cgit v1.2.3 From ef9f603fd3d4b7937f2cdbce40e47df0a54b2a55 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 22 Sep 2025 11:02:31 -0600 Subject: io_uring/cmd: drop unused res2 param from io_uring_cmd_done() Commit 79525b51acc1 ("io_uring: fix nvme's 32b cqes on mixed cq") split out a separate io_uring_cmd_done32() helper for ->uring_cmd() implementations that return 32-byte CQEs. The res2 value passed to io_uring_cmd_done() is now unused because __io_uring_cmd_done() ignores it when is_cqe32 is passed as false. So drop the parameter from io_uring_cmd_done() to simplify the callers and clarify that it's not possible to return an extra value beyond the 32-bit CQE result. Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- block/ioctl.c | 2 +- drivers/block/ublk_drv.c | 6 +++--- fs/btrfs/ioctl.c | 2 +- fs/fuse/dev_uring.c | 8 ++++---- include/linux/io_uring/cmd.h | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/block/ioctl.c b/block/ioctl.c index f7b0006ca45d..c9ea8e53871e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -776,7 +776,7 @@ static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) if (bic->res == -EAGAIN && bic->nowait) io_uring_cmd_issue_blocking(cmd); else - io_uring_cmd_done(cmd, bic->res, 0, issue_flags); + io_uring_cmd_done(cmd, bic->res, issue_flags); } static void bio_cmd_bio_end_io(struct bio *bio) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 99abd67b708b..48c409d1e1bb 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1188,7 +1188,7 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req); /* tell ublksrv one io request is coming */ - io_uring_cmd_done(cmd, res, 0, issue_flags); + io_uring_cmd_done(cmd, res, issue_flags); } #define UBLK_REQUEUE_DELAY_MS 3 @@ -1805,7 +1805,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, spin_unlock(&ubq->cancel_lock); if (!done) - io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags); + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); } /* @@ -2452,7 +2452,7 @@ static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); if (ret != -EIOCBQUEUED) - io_uring_cmd_done(cmd, ret, 0, issue_flags); + io_uring_cmd_done(cmd, ret, issue_flags); } static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e13de2bdcbf..168d84421a78 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4685,7 +4685,7 @@ out: btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - io_uring_cmd_done(cmd, ret, 0, issue_flags); + io_uring_cmd_done(cmd, ret, issue_flags); add_rchar(current, ret); for (index = 0; index < priv->nr_pages; index++) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 249b210becb1..a30c44234a4e 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -351,7 +351,7 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) spin_unlock(&queue->lock); if (cmd) - io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); + io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED); if (req) fuse_uring_stop_fuse_req_end(req); @@ -518,7 +518,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, if (need_cmd_done) { /* no queue lock to avoid lock order issues */ - io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); + io_uring_cmd_done(cmd, -ENOTCONN, issue_flags); } } @@ -733,7 +733,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, list_move_tail(&ent->list, &queue->ent_in_userspace); spin_unlock(&queue->lock); - io_uring_cmd_done(cmd, 0, 0, issue_flags); + io_uring_cmd_done(cmd, 0, issue_flags); return 0; } @@ -1200,7 +1200,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, ent->cmd = NULL; spin_unlock(&queue->lock); - io_uring_cmd_done(cmd, ret, 0, issue_flags); + io_uring_cmd_done(cmd, ret, issue_flags); } /* diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 02d50f08f668..7509025b4071 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -160,9 +160,9 @@ static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd) } static inline void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, - u64 res2, unsigned issue_flags) + unsigned issue_flags) { - return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, false); + return __io_uring_cmd_done(ioucmd, ret, 0, issue_flags, false); } static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret, -- cgit v1.2.3 From 7c7da8aa3fd69b77e8efaa14b80bddd948547739 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:37:09 +0900 Subject: can: dev: turn can_set_static_ctrlmode() into a non-inline function can_set_static_ctrlmode() is declared as a static inline. But it is only called in the probe function of the devices and so does not really benefit from any kind of optimization. Transform it into a "normal" function by moving it to drivers/net/can/dev/dev.c Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-can-fix-mtu-v3-2-581bde113f52@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 21 +++++++++++++++++++++ include/linux/can/dev.h | 23 ++--------------------- 2 files changed, 23 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 99b78cbb2252..02bfed37cc93 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -347,6 +347,27 @@ int can_change_mtu(struct net_device *dev, int new_mtu) } EXPORT_SYMBOL_GPL(can_change_mtu); +/* helper to define static CAN controller features at device creation time */ +int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) +{ + struct can_priv *priv = netdev_priv(dev); + + /* alloc_candev() succeeded => netdev_priv() is valid at this point */ + if (priv->ctrlmode_supported & static_mode) { + netdev_warn(dev, + "Controller features can not be supported and static at the same time\n"); + return -EINVAL; + } + priv->ctrlmode = static_mode; + + /* override MTU which was set by default in can_setup()? */ + if (static_mode & CAN_CTRLMODE_FD) + dev->mtu = CANFD_MTU; + + return 0; +} +EXPORT_SYMBOL_GPL(can_set_static_ctrlmode); + /* generic implementation of netdev_ops::ndo_eth_ioctl for CAN devices * supporting hardware timestamps */ diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 9a92cbe5b2cb..5dc58360c2d7 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -125,27 +125,6 @@ static inline s32 can_get_relative_tdco(const struct can_priv *priv) return (s32)priv->fd.tdc.tdco - sample_point_in_tc; } -/* helper to define static CAN controller features at device creation time */ -static inline int __must_check can_set_static_ctrlmode(struct net_device *dev, - u32 static_mode) -{ - struct can_priv *priv = netdev_priv(dev); - - /* alloc_candev() succeeded => netdev_priv() is valid at this point */ - if (priv->ctrlmode_supported & static_mode) { - netdev_warn(dev, - "Controller features can not be supported and static at the same time\n"); - return -EINVAL; - } - priv->ctrlmode = static_mode; - - /* override MTU which was set by default in can_setup()? */ - if (static_mode & CAN_CTRLMODE_FD) - dev->mtu = CANFD_MTU; - - return 0; -} - static inline u32 can_get_static_ctrlmode(struct can_priv *priv) { return priv->ctrlmode & ~priv->ctrlmode_supported; @@ -188,6 +167,8 @@ struct can_priv *safe_candev_priv(struct net_device *dev); int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); int can_change_mtu(struct net_device *dev, int new_mtu); +int __must_check can_set_static_ctrlmode(struct net_device *dev, + u32 static_mode); int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd); int can_ethtool_op_get_ts_info_hwts(struct net_device *dev, struct kernel_ethtool_ts_info *info); -- cgit v1.2.3 From d57f4b874946e997be52f5ebb5e0e1dad368c16f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 17 Sep 2025 15:22:04 +0200 Subject: tcp: Update bind bucket state on port release Today, once an inet_bind_bucket enters a state where fastreuse >= 0 or fastreuseport >= 0 after a socket is explicitly bound to a port, it remains in that state until all sockets are removed and the bucket is destroyed. In this state, the bucket is skipped during ephemeral port selection in connect(). For applications using a reduced ephemeral port range (IP_LOCAL_PORT_RANGE socket option), this can cause faster port exhaustion since blocked buckets are excluded from reuse. The reason the bucket state isn't updated on port release is unclear. Possibly a performance trade-off to avoid scanning bucket owners, or just an oversight. Fix it by recalculating the bucket state when a socket releases a port. To limit overhead, each inet_bind2_bucket stores its own (fastreuse, fastreuseport) state. On port release, only the relevant port-addr bucket is scanned, and the overall state is derived from these. Signed-off-by: Jakub Sitnicki Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250917-update-bind-bucket-state-on-unhash-v5-1-57168b661b47@cloudflare.com Signed-off-by: Paolo Abeni --- include/net/inet_connection_sock.h | 5 +++-- include/net/inet_hashtables.h | 2 ++ include/net/inet_timewait_sock.h | 3 ++- include/net/sock.h | 4 ++++ net/ipv4/inet_connection_sock.c | 12 +++++++---- net/ipv4/inet_hashtables.c | 44 +++++++++++++++++++++++++++++++++++++- net/ipv4/inet_timewait_sock.c | 1 + 7 files changed, 63 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 0737d8e178dd..b4b886647607 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -316,8 +316,9 @@ int inet_csk_listen_start(struct sock *sk); void inet_csk_listen_stop(struct sock *sk); /* update the fast reuse flag when adding a socket */ -void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, - struct sock *sk); +void inet_csk_update_fastreuse(const struct sock *sk, + struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2); struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index b787be651ce7..ac05a52d9e13 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -108,6 +108,8 @@ struct inet_bind2_bucket { struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; + signed char fastreuse; + signed char fastreuseport; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 3a31c74c9e15..63a644ff30de 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -70,7 +70,8 @@ struct inet_timewait_sock { unsigned int tw_transparent : 1, tw_flowlabel : 20, tw_usec_ts : 1, - tw_pad : 2, /* 2 bits hole */ + tw_connect_bind : 1, + tw_pad : 1, /* 1 bit hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; diff --git a/include/net/sock.h b/include/net/sock.h index b4fefeea0213..8c5b64f41ab7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1494,6 +1494,10 @@ static inline int __sk_prot_rehash(struct sock *sk) #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 +/** + * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time + */ +#define SOCK_CONNECT_BIND 16 struct socket_alloc { struct socket socket; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 142ff8d86fc2..cdd1e12aac8c 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -423,7 +423,7 @@ success: } static inline int sk_reuseport_match(struct inet_bind_bucket *tb, - struct sock *sk) + const struct sock *sk) { if (tb->fastreuseport <= 0) return 0; @@ -453,8 +453,9 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, ipv6_only_sock(sk), true, false); } -void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, - struct sock *sk) +void inet_csk_update_fastreuse(const struct sock *sk, + struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2) { bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; @@ -501,6 +502,9 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, tb->fastreuseport = 0; } } + + tb2->fastreuse = tb->fastreuse; + tb2->fastreuseport = tb->fastreuseport; } /* Obtain a reference to a local port for the given sock, @@ -582,7 +586,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) } success: - inet_csk_update_fastreuse(tb, sk); + inet_csk_update_fastreuse(sk, tb, tb2); if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, tb2, port); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4eb933f56fe6..b7024e3d9ac3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -58,6 +58,14 @@ static u32 sk_ehashfn(const struct sock *sk) sk->sk_daddr, sk->sk_dport); } +static bool sk_is_connect_bind(const struct sock *sk) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return inet_twsk(sk)->tw_connect_bind; + else + return sk->sk_userlocks & SOCK_CONNECT_BIND; +} + /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. @@ -87,10 +95,22 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, */ void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { + const struct inet_bind2_bucket *tb2; + if (hlist_empty(&tb->bhash2)) { hlist_del_rcu(&tb->node); kfree_rcu(tb, rcu); + return; + } + + if (tb->fastreuse == -1 && tb->fastreuseport == -1) + return; + hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { + if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) + return; } + tb->fastreuse = -1; + tb->fastreuseport = -1; } bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, @@ -121,6 +141,8 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, #else tb2->rcv_saddr = sk->sk_rcv_saddr; #endif + tb2->fastreuse = 0; + tb2->fastreuseport = 0; INIT_HLIST_HEAD(&tb2->owners); hlist_add_head(&tb2->node, &head->chain); hlist_add_head(&tb2->bhash_node, &tb->bhash2); @@ -143,11 +165,23 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { + const struct sock *sk; + if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); + return; } + + if (tb->fastreuse == -1 && tb->fastreuseport == -1) + return; + sk_for_each_bound(sk, &tb->owners) { + if (!sk_is_connect_bind(sk)) + return; + } + tb->fastreuse = -1; + tb->fastreuseport = -1; } static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, @@ -191,6 +225,7 @@ static void __inet_put_port(struct sock *sk) tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; + sk->sk_userlocks &= ~SOCK_CONNECT_BIND; spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { @@ -277,7 +312,7 @@ bhash2_find: } } if (update_fastreuse) - inet_csk_update_fastreuse(tb, child); + inet_csk_update_fastreuse(child, tb, tb2); inet_bind_hash(child, tb, tb2, port); spin_unlock(&head2->lock); spin_unlock(&head->lock); @@ -950,6 +985,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, if (!tb2) { tb2 = new_tb2; inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); + if (sk_is_connect_bind(sk)) { + tb2->fastreuse = -1; + tb2->fastreuseport = -1; + } } inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); @@ -1120,6 +1159,8 @@ ok: head2, tb, sk); if (!tb2) goto error; + tb2->fastreuse = -1; + tb2->fastreuseport = -1; } /* Here we want to add a little bit of randomness to the next source @@ -1132,6 +1173,7 @@ ok: /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); + sk->sk_userlocks |= SOCK_CONNECT_BIND; if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2ca2912f61f4..e1a86130f038 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -208,6 +208,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); + tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND); tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); -- cgit v1.2.3 From 91daac8a6893c65e18f194946ad3ad9df5e9de8d Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Tue, 16 Sep 2025 08:10:07 +0200 Subject: genirq/msi: Remove msi_post_free() The only user of msi_post_free() - powerpc/pseries - has been changed to use msi_teardown(). Remove this unused callback. Signed-off-by: Nam Cao Reviewed-by: Thomas Gleixner Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250916061007.964005-1-namcao@linutronix.de --- include/linux/msi.h | 4 ---- kernel/irq/msi.c | 3 --- 2 files changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/msi.h b/include/linux/msi.h index e5e86a8529fb..faac634ac230 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -431,8 +431,6 @@ struct msi_domain_info; * function. * @domain_free_irqs: Optional function to override the default free * function. - * @msi_post_free: Optional function which is invoked after freeing - * all interrupts. * @msi_translate: Optional translate callback to support the odd wire to * MSI bridges, e.g. MBIGEN * @@ -473,8 +471,6 @@ struct msi_domain_ops { struct device *dev, int nvec); void (*domain_free_irqs)(struct irq_domain *domain, struct device *dev); - void (*msi_post_free)(struct irq_domain *domain, - struct device *dev); int (*msi_translate)(struct irq_domain *domain, struct irq_fwspec *fwspec, irq_hw_number_t *hwirq, unsigned int *type); }; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9b09ad3f9914..e7ad99254841 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1644,9 +1644,6 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) else __msi_domain_free_irqs(dev, domain, ctrl); - if (ops->msi_post_free) - ops->msi_post_free(domain, dev); - if (info->flags & MSI_FLAG_FREE_MSI_DESCS) msi_domain_free_descs(dev, ctrl); } -- cgit v1.2.3 From 884eee8e43f3072db4111178c98b9aa5c57bcf92 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:47 +0200 Subject: net/smc: Remove error handling of unregister_dmb() smcd_buf_free() calls smc_ism_unregister_dmb(lgr->smcd, buf_desc) and then unconditionally frees buf_desc. Remove the cleaning up of fields of buf_desc in smc_ism_unregister_dmb(), because it is not helpful. This removes the only usage of ISM_ERROR from the smc module. So move it to drivers/s390/net/ism.h. Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Reviewed-by: Dust Li Link: https://patch.msgid.link/20250918110500.1731261-2-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/s390/net/ism.h | 1 + include/net/smc.h | 2 -- net/smc/smc_ism.c | 14 +++++--------- net/smc/smc_ism.h | 3 ++- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 047fa6101555..b5b03db52fce 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -10,6 +10,7 @@ #include #define UTIL_STR_LEN 16 +#define ISM_ERROR 0xFFFF /* * Do not use the first word of the DMB bits to ensure 8 byte aligned access. diff --git a/include/net/smc.h b/include/net/smc.h index db84e4e35080..a9c023dd1380 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -44,8 +44,6 @@ struct smcd_dmb { #define ISM_RESERVED_VLANID 0x1FFF -#define ISM_ERROR 0xFFFF - struct smcd_dev; struct smcd_gid { diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index a58ffb7a0610..fca01b95b65a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -205,13 +205,13 @@ out: return rc; } -int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +void smc_ism_unregister_dmb(struct smcd_dev *smcd, + struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; - int rc = 0; if (!dmb_desc->dma_addr) - return rc; + return; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; @@ -219,13 +219,9 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - rc = smcd->ops->unregister_dmb(smcd, &dmb); - if (!rc || rc == ISM_ERROR) { - dmb_desc->cpu_addr = NULL; - dmb_desc->dma_addr = 0; - } + smcd->ops->unregister_dmb(smcd, &dmb); - return rc; + return; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 6763133dd8d0..765aa8fae6fa 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -47,7 +47,8 @@ int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); -int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +void smc_ism_unregister_dmb(struct smcd_dev *dev, + struct smc_buf_desc *dmb_desc); bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd); int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc); -- cgit v1.2.3 From 35758b0032c056cdff3e8f5a70669cb3e2c8d0e4 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:49 +0200 Subject: dibs: Create drivers/dibs Create the file structure for a 'DIBS - Direct Internal Buffer Sharing' shim layer that will provide generic functionality and declarations for dibs device drivers and dibs clients. Following patches will add functionality. Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-4-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- MAINTAINERS | 7 +++++++ drivers/Makefile | 1 + drivers/dibs/Kconfig | 12 ++++++++++++ drivers/dibs/Makefile | 7 +++++++ drivers/dibs/dibs_main.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/dibs.h | 42 ++++++++++++++++++++++++++++++++++++++++++ net/Kconfig | 1 + 7 files changed, 107 insertions(+) create mode 100644 drivers/dibs/Kconfig create mode 100644 drivers/dibs/Makefile create mode 100644 drivers/dibs/dibs_main.c create mode 100644 include/linux/dibs.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index a8a770714101..ecc55fae5f9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7132,6 +7132,13 @@ L: linux-gpio@vger.kernel.org S: Maintained F: drivers/gpio/gpio-gpio-mm.c +DIBS (DIRECT INTERNAL BUFFER SHARING) +M: Alexandra Winter +L: netdev@vger.kernel.org +S: Supported +F: drivers/dibs/ +F: include/linux/dibs.h + DIGITEQ AUTOMOTIVE MGB4 V4L2 DRIVER M: Martin Tuma L: linux-media@vger.kernel.org diff --git a/drivers/Makefile b/drivers/Makefile index b5749cf67044..a104163b1353 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -195,4 +195,5 @@ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ obj-$(CONFIG_DPLL) += dpll/ +obj-$(CONFIG_DIBS) += dibs/ obj-$(CONFIG_S390) += s390/ diff --git a/drivers/dibs/Kconfig b/drivers/dibs/Kconfig new file mode 100644 index 000000000000..09c12f6838ad --- /dev/null +++ b/drivers/dibs/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +config DIBS + tristate "DIBS support" + default n + help + Direct Internal Buffer Sharing (DIBS) + A communication method that uses common physical (internal) memory + for synchronous direct access into a remote buffer. + + Select this option to provide the abstraction layer between + dibs devices and dibs clients like the SMC protocol. + The module name is dibs. diff --git a/drivers/dibs/Makefile b/drivers/dibs/Makefile new file mode 100644 index 000000000000..825dec431bfc --- /dev/null +++ b/drivers/dibs/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# DIBS class module +# + +dibs-y += dibs_main.o +obj-$(CONFIG_DIBS) += dibs.o diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c new file mode 100644 index 000000000000..68e189932fcf --- /dev/null +++ b/drivers/dibs/dibs_main.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DIBS - Direct Internal Buffer Sharing + * + * Implementation of the DIBS class module + * + * Copyright IBM Corp. 2025 + */ +#define KMSG_COMPONENT "dibs" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include + +MODULE_DESCRIPTION("Direct Internal Buffer Sharing class"); +MODULE_LICENSE("GPL"); + +/* use an array rather a list for fast mapping: */ +static struct dibs_client *clients[MAX_DIBS_CLIENTS]; +static u8 max_client; + +static int __init dibs_init(void) +{ + memset(clients, 0, sizeof(clients)); + max_client = 0; + + return 0; +} + +static void __exit dibs_exit(void) +{ +} + +module_init(dibs_init); +module_exit(dibs_exit); diff --git a/include/linux/dibs.h b/include/linux/dibs.h new file mode 100644 index 000000000000..3f4175aaa732 --- /dev/null +++ b/include/linux/dibs.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Direct Internal Buffer Sharing + * + * Definitions for the DIBS module + * + * Copyright IBM Corp. 2025 + */ +#ifndef _DIBS_H +#define _DIBS_H + +/* DIBS - Direct Internal Buffer Sharing - concept + * ----------------------------------------------- + * In the case of multiple system sharing the same hardware, dibs fabrics can + * provide dibs devices to these systems. The systems use dibs devices of the + * same fabric to communicate via dmbs (Direct Memory Buffers). Each dmb has + * exactly one owning local dibs device and one remote using dibs device, that + * is authorized to write into this dmb. This access control is provided by the + * dibs fabric. + * + * Because the access to the dmb is based on access to physical memory, it is + * lossless and synchronous. The remote devices can directly access any offset + * of the dmb. + * + * Dibs fabrics, dibs devices and dmbs are identified by tokens and ids. + * Dibs fabric id is unique within the same hardware (with the exception of the + * dibs loopback fabric), dmb token is unique within the same fabric, dibs + * device gids are guaranteed to be unique within the same fabric and + * statistically likely to be globally unique. The exchange of these tokens and + * ids between the systems is not part of the dibs concept. + * + * The dibs layer provides an abstraction between dibs device drivers and dibs + * clients. + */ + +#define MAX_DIBS_CLIENTS 8 + +struct dibs_client { + const char *name; +}; + +#endif /* _DIBS_H */ diff --git a/net/Kconfig b/net/Kconfig index 4b563aea4c23..1d3f757d4b07 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -88,6 +88,7 @@ source "net/tls/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" +source "drivers/dibs/Kconfig" source "net/xdp/Kconfig" config NET_HANDSHAKE -- cgit v1.2.3 From d324a2ca3f8efd57f5839aa2690554a5cbb3586f Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:50 +0200 Subject: dibs: Register smc as dibs_client Formally register smc as dibs client. Functionality will be moved by follow-on patches from ism_client to dibs_client until eventually ism_client can be removed. As DIBS is only a shim layer without any dependencies, we can depend SMC on DIBS without adding indirect dependencies. A follow-on patch will remove dependency of SMC on ISM. Signed-off-by: Alexandra Winter Reviewed-by: Julian Ruess Link: https://patch.msgid.link/20250918110500.1731261-5-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + drivers/dibs/dibs_main.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/dibs.h | 23 +++++++++++++++++++++++ net/smc/Kconfig | 2 +- net/smc/smc_ism.c | 6 ++++++ 6 files changed, 67 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 5e616bc988ac..7bc54f053a3b 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -120,6 +120,7 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_DIBS=y CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 094599cdaf4d..4bf6f3311f7d 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -111,6 +111,7 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_DIBS=y CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index 68e189932fcf..a5d2be9c3246 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -20,6 +20,41 @@ MODULE_LICENSE("GPL"); /* use an array rather a list for fast mapping: */ static struct dibs_client *clients[MAX_DIBS_CLIENTS]; static u8 max_client; +static DEFINE_MUTEX(clients_lock); + +int dibs_register_client(struct dibs_client *client) +{ + int i, rc = -ENOSPC; + + mutex_lock(&clients_lock); + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { + if (!clients[i]) { + clients[i] = client; + client->id = i; + if (i == max_client) + max_client++; + rc = 0; + break; + } + } + mutex_unlock(&clients_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(dibs_register_client); + +int dibs_unregister_client(struct dibs_client *client) +{ + int rc = 0; + + mutex_lock(&clients_lock); + clients[client->id] = NULL; + if (client->id + 1 == max_client) + max_client--; + mutex_unlock(&clients_lock); + return rc; +} +EXPORT_SYMBOL_GPL(dibs_unregister_client); static int __init dibs_init(void) { diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 3f4175aaa732..7bedeaf52c1b 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -33,10 +33,33 @@ * clients. */ +/* DIBS client + * ----------- + */ #define MAX_DIBS_CLIENTS 8 struct dibs_client { + /* client name for logging and debugging purposes */ const char *name; + /* client index - provided and used by dibs layer */ + u8 id; }; +/* Functions to be called by dibs clients: + */ +/** + * dibs_register_client() - register a client with dibs layer + * @client: this client + * + * Return: zero on success. + */ +int dibs_register_client(struct dibs_client *client); +/** + * dibs_unregister_client() - unregister a client with dibs layer + * @client: this client + * + * Return: zero on success. + */ +int dibs_unregister_client(struct dibs_client *client); + #endif /* _DIBS_H */ diff --git a/net/smc/Kconfig b/net/smc/Kconfig index ba5e6a2dd2fd..40dd60c1d23f 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config SMC tristate "SMC socket protocol family" - depends on INET && INFINIBAND + depends on INET && INFINIBAND && DIBS depends on m || ISM != m help SMC-R provides a "sockets over RDMA" solution making use of diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 503a9f93b392..a7a965e3c0ce 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -18,6 +18,7 @@ #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" +#include "linux/dibs.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -42,6 +43,9 @@ static struct ism_client smc_ism_client = { .handle_irq = smcd_handle_irq, }; #endif +static struct dibs_client smc_dibs_client = { + .name = "SMC-D", +}; static void smc_ism_create_system_eid(void) { @@ -623,11 +627,13 @@ int smc_ism_init(void) #if IS_ENABLED(CONFIG_ISM) rc = ism_register_client(&smc_ism_client); #endif + rc = dibs_register_client(&smc_dibs_client); return rc; } void smc_ism_exit(void) { + dibs_unregister_client(&smc_dibs_client); #if IS_ENABLED(CONFIG_ISM) ism_unregister_client(&smc_ism_client); #endif -- cgit v1.2.3 From 269726968f95ebc00e3a47f91eebd6818991d6fa Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:51 +0200 Subject: dibs: Register ism as dibs device Register ism devices with the dibs layer. Follow-on patches will move functionality to the dibs layer. As DIBS is only a shim layer without any dependencies, we can depend ISM on DIBS without adding indirect dependencies. A follow-on patch will remove implication of SMC by ISM. Define struct dibs_dev. Follow-on patches will move more content into dibs_dev. The goal of follow-on patches is that ism_dev will only contain fields that are special for this device driver. The same concept will apply to other dibs device drivers. Define dibs_dev_alloc(), dibs_dev_add() and dibs_dev_del() to be called by dibs device drivers and call them from ism_drv.c Use ism_dev.dibs for a pointer to dibs_dev. Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-6-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_main.c | 38 +++++++++++++++++++++ drivers/s390/net/Kconfig | 2 +- drivers/s390/net/ism.h | 1 + drivers/s390/net/ism_drv.c | 83 +++++++++++++++++++++++++++++----------------- include/linux/dibs.h | 38 +++++++++++++++++++++ include/linux/ism.h | 1 + 6 files changed, 131 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index a5d2be9c3246..2f420e077417 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -21,6 +22,15 @@ MODULE_LICENSE("GPL"); static struct dibs_client *clients[MAX_DIBS_CLIENTS]; static u8 max_client; static DEFINE_MUTEX(clients_lock); +struct dibs_dev_list { + struct list_head list; + struct mutex mutex; /* protects dibs device list */ +}; + +static struct dibs_dev_list dibs_dev_list = { + .list = LIST_HEAD_INIT(dibs_dev_list.list), + .mutex = __MUTEX_INITIALIZER(dibs_dev_list.mutex), +}; int dibs_register_client(struct dibs_client *client) { @@ -56,6 +66,34 @@ int dibs_unregister_client(struct dibs_client *client) } EXPORT_SYMBOL_GPL(dibs_unregister_client); +struct dibs_dev *dibs_dev_alloc(void) +{ + struct dibs_dev *dibs; + + dibs = kzalloc(sizeof(*dibs), GFP_KERNEL); + + return dibs; +} +EXPORT_SYMBOL_GPL(dibs_dev_alloc); + +int dibs_dev_add(struct dibs_dev *dibs) +{ + mutex_lock(&dibs_dev_list.mutex); + list_add(&dibs->list, &dibs_dev_list.list); + mutex_unlock(&dibs_dev_list.mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(dibs_dev_add); + +void dibs_dev_del(struct dibs_dev *dibs) +{ + mutex_lock(&dibs_dev_list.mutex); + list_del_init(&dibs->list); + mutex_unlock(&dibs_dev_list.mutex); +} +EXPORT_SYMBOL_GPL(dibs_dev_del); + static int __init dibs_init(void) { memset(clients, 0, sizeof(clients)); diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index 2b43f6f28362..92985f595d59 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -81,7 +81,7 @@ config CCWGROUP config ISM tristate "Support for ISM vPCI Adapter" - depends on PCI + depends on PCI && DIBS imply SMC default n help diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index b5b03db52fce..3078779fa71e 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 6cd60b174315..8ecd0cccc7e8 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -599,8 +599,39 @@ static void ism_dev_release(struct device *dev) kfree(ism); } +static void ism_dev_exit(struct ism_dev *ism) +{ + struct pci_dev *pdev = ism->pdev; + unsigned long flags; + int i; + + spin_lock_irqsave(&ism->lock, flags); + for (i = 0; i < max_client; ++i) + ism->subs[i] = NULL; + spin_unlock_irqrestore(&ism->lock, flags); + + mutex_lock(&ism_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->remove(ism); + } + mutex_unlock(&clients_lock); + + if (ism_v2_capable) + ism_del_vlan_id(ism, ISM_RESERVED_VLANID); + unregister_ieq(ism); + unregister_sba(ism); + free_irq(pci_irq_vector(pdev, 0), ism); + kfree(ism->sba_client_arr); + pci_free_irq_vectors(pdev); + list_del_init(&ism->list); + mutex_unlock(&ism_dev_list.mutex); +} + static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) { + struct dibs_dev *dibs; struct ism_dev *ism; int ret; @@ -636,12 +667,28 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) dma_set_max_seg_size(&pdev->dev, SZ_1M); pci_set_master(pdev); + dibs = dibs_dev_alloc(); + if (!dibs) { + ret = -ENOMEM; + goto err_resource; + } + ism->dibs = dibs; + ret = ism_dev_init(ism); if (ret) - goto err_resource; + goto err_dibs; + + ret = dibs_dev_add(dibs); + if (ret) + goto err_ism; return 0; +err_ism: + ism_dev_exit(ism); +err_dibs: + /* pairs with dibs_dev_alloc() */ + kfree(dibs); err_resource: pci_release_mem_regions(pdev); err_disable: @@ -655,41 +702,15 @@ err_dev: return ret; } -static void ism_dev_exit(struct ism_dev *ism) -{ - struct pci_dev *pdev = ism->pdev; - unsigned long flags; - int i; - - spin_lock_irqsave(&ism->lock, flags); - for (i = 0; i < max_client; ++i) - ism->subs[i] = NULL; - spin_unlock_irqrestore(&ism->lock, flags); - - mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < max_client; ++i) { - if (clients[i]) - clients[i]->remove(ism); - } - mutex_unlock(&clients_lock); - - if (ism_v2_capable) - ism_del_vlan_id(ism, ISM_RESERVED_VLANID); - unregister_ieq(ism); - unregister_sba(ism); - free_irq(pci_irq_vector(pdev, 0), ism); - kfree(ism->sba_client_arr); - pci_free_irq_vectors(pdev); - list_del_init(&ism->list); - mutex_unlock(&ism_dev_list.mutex); -} - static void ism_remove(struct pci_dev *pdev) { struct ism_dev *ism = dev_get_drvdata(&pdev->dev); + struct dibs_dev *dibs = ism->dibs; + dibs_dev_del(dibs); ism_dev_exit(ism); + /* pairs with dibs_dev_alloc() */ + kfree(dibs); pci_release_mem_regions(pdev); pci_disable_device(pdev); diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 7bedeaf52c1b..c12db19c98c0 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -9,6 +9,7 @@ #ifndef _DIBS_H #define _DIBS_H +#include /* DIBS - Direct Internal Buffer Sharing - concept * ----------------------------------------------- * In the case of multiple system sharing the same hardware, dibs fabrics can @@ -62,4 +63,41 @@ int dibs_register_client(struct dibs_client *client); */ int dibs_unregister_client(struct dibs_client *client); +/* DIBS devices + * ------------ + */ +struct dibs_dev { + struct list_head list; +}; + +/* ------- End of client-only functions ----------- */ + +/* + * Functions to be called by dibs device drivers: + */ +/** + * dibs_dev_alloc() - allocate and reference device structure + * + * The following fields will be valid upon successful return: dev + * NOTE: Use put_device(dibs_get_dev(@dibs)) to give up your reference instead + * of freeing @dibs @dev directly once you have successfully called this + * function. + * Return: Pointer to dibs device structure + */ +struct dibs_dev *dibs_dev_alloc(void); +/** + * dibs_dev_add() - register with dibs layer and all clients + * @dibs: dibs device + * + * The following fields must be valid upon entry: dev, ops, drv_priv + * All fields will be valid upon successful return. + * Return: zero on success + */ +int dibs_dev_add(struct dibs_dev *dibs); +/** + * dibs_dev_del() - unregister from dibs layer and all clients + * @dibs: dibs device + */ +void dibs_dev_del(struct dibs_dev *dibs); + #endif /* _DIBS_H */ diff --git a/include/linux/ism.h b/include/linux/ism.h index 8358b4cd7ba6..9a53d3c48c16 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -30,6 +30,7 @@ struct ism_dev { spinlock_t lock; /* protects the ism device */ spinlock_t cmd_lock; /* serializes cmds */ struct list_head list; + struct dibs_dev *dibs; struct pci_dev *pdev; struct ism_sba *sba; -- cgit v1.2.3 From 69baaac9361edd169713562f088829a1be9c51a9 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:53 +0200 Subject: dibs: Define dibs_client_ops and dibs_dev_ops Move the device add() and remove() functions from ism_client to dibs_client_ops and call add_dev()/del_dev() for ism devices and dibs_loopback devices. dibs_client_ops->add_dev() = smcd_register_dev() for the smc_dibs_client. This is the first step to handle ism and loopback devices alike (as dibs devices) in the smc dibs client. Define dibs_dev->ops and move smcd_ops->get_chid to dibs_dev_ops->get_fabric_id() for ism and loopback devices. See below for why this needs to be in the same patch as dibs_client_ops->add_dev(). The following changes contain intermediate steps, that will be obsoleted by follow-on patches, once more functionality has been moved to dibs: Use different smcd_ops and max_dmbs for ism and loopback. Follow-on patches will change SMC-D to directly use dibs_ops instead of smcd_ops. In smcd_register_dev() it is now necessary to identify a dibs_loopback device before smcd_dev and smcd_ops->get_chid() are available. So provide dibs_dev_ops->get_fabric_id() in this patch and evaluate it in smc_ism_is_loopback(). Call smc_loopback_init() in smcd_register_dev() and call smc_loopback_exit() in smcd_unregister_dev() to handle the functionality that is still in smc_loopback. Follow-on patches will move all smc_loopback code to dibs_loopback. In smcd_[un]register_dev() use only ism device name, this will be replaced by dibs device name by a follow-on patch. End of changes with intermediate parts. Allocate an smcd event workqueue for all dibs devices, although dibs_loopback does not generate events. Use kernel memory instead of devres memory for smcd_dev and smcd->conn. Since commit a72178cfe855 ("net/smc: Fix dependency of SMC on ISM") an ism device and its driver can have a longer lifetime than the smc module, so smc should not rely on devres to free its resources [1]. It is now the responsibility of the smc client to free smcd and smcd->conn for all dibs devices, ism devices as well as loopback. Call client->ops->del_dev() for all existing dibs devices in dibs_unregister_client(), so all device related structures can be freed in the client. When dibs_unregister_client() is called in the context of smc_exit() or smc_core_reboot_event(), these functions have already called smc_lgrs_shutdown() which calls smc_smcd_terminate_all(smcd) and sets going_away. This is done a second time in smcd_unregister_dev(). This is analogous to how smcr is handled in these functions, by calling first smc_lgrs_shutdown() and then smc_ib_unregister_client() > smc_ib_remove_dev(), so leave it that way. It may be worth investigating, whether smc_lgrs_shutdown() is still required or useful. Remove CONFIG_SMC_LO. CONFIG_DIBS_LO now controls whether a dibs loopback device exists or not. Link: https://www.kernel.org/doc/Documentation/driver-model/devres.txt [1] Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-8-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - drivers/dibs/dibs_loopback.c | 11 ++++ drivers/dibs/dibs_main.c | 36 +++++++++++ drivers/s390/net/ism_drv.c | 43 ++++++------- include/linux/dibs.h | 89 ++++++++++++++++++++++++- include/linux/ism.h | 2 - include/net/smc.h | 3 +- net/smc/Kconfig | 13 ---- net/smc/Makefile | 2 +- net/smc/af_smc.c | 12 +--- net/smc/smc_ism.c | 132 ++++++++++++++++++++++++++------------ net/smc/smc_ism.h | 7 +- net/smc/smc_loopback.c | 94 ++++----------------------- net/smc/smc_loopback.h | 17 +---- 15 files changed, 270 insertions(+), 193 deletions(-) (limited to 'include') diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 5a2ed07b6198..a97c8d19f643 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -123,7 +123,6 @@ CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y CONFIG_SMC_DIAG=m -CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 4cbdd7e2ff9f..7f7b52d9a33c 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -114,7 +114,6 @@ CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y CONFIG_SMC_DIAG=m -CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 225514a452a8..215986ae54a4 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -18,6 +18,15 @@ /* global loopback device */ static struct dibs_lo_dev *lo_dev; +static u16 dibs_lo_get_fabric_id(struct dibs_dev *dibs) +{ + return DIBS_LOOPBACK_FABRIC; +} + +static const struct dibs_dev_ops dibs_lo_ops = { + .get_fabric_id = dibs_lo_get_fabric_id, +}; + static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) { dibs_dev_del(ldev->dibs); @@ -40,6 +49,8 @@ static int dibs_lo_dev_probe(void) } ldev->dibs = dibs; + dibs->drv_priv = ldev; + dibs->ops = &dibs_lo_ops; ret = dibs_dev_add(dibs); if (ret) diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index a7e33be36158..f1cfa5849277 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -36,8 +36,10 @@ static struct dibs_dev_list dibs_dev_list = { int dibs_register_client(struct dibs_client *client) { + struct dibs_dev *dibs; int i, rc = -ENOSPC; + mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { if (!clients[i]) { @@ -51,19 +53,37 @@ int dibs_register_client(struct dibs_client *client) } mutex_unlock(&clients_lock); + if (i < MAX_DIBS_CLIENTS) { + /* initialize with all devices that we got so far */ + list_for_each_entry(dibs, &dibs_dev_list.list, list) { + dibs->priv[i] = NULL; + client->ops->add_dev(dibs); + } + } + mutex_unlock(&dibs_dev_list.mutex); + return rc; } EXPORT_SYMBOL_GPL(dibs_register_client); int dibs_unregister_client(struct dibs_client *client) { + struct dibs_dev *dibs; int rc = 0; + mutex_lock(&dibs_dev_list.mutex); + list_for_each_entry(dibs, &dibs_dev_list.list, list) { + clients[client->id]->ops->del_dev(dibs); + dibs->priv[client->id] = NULL; + } + mutex_lock(&clients_lock); clients[client->id] = NULL; if (client->id + 1 == max_client) max_client--; mutex_unlock(&clients_lock); + + mutex_unlock(&dibs_dev_list.mutex); return rc; } EXPORT_SYMBOL_GPL(dibs_unregister_client); @@ -80,7 +100,15 @@ EXPORT_SYMBOL_GPL(dibs_dev_alloc); int dibs_dev_add(struct dibs_dev *dibs) { + int i; + mutex_lock(&dibs_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->ops->add_dev(dibs); + } + mutex_unlock(&clients_lock); list_add(&dibs->list, &dibs_dev_list.list); mutex_unlock(&dibs_dev_list.mutex); @@ -90,7 +118,15 @@ EXPORT_SYMBOL_GPL(dibs_dev_add); void dibs_dev_del(struct dibs_dev *dibs) { + int i; + mutex_lock(&dibs_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->ops->del_dev(dibs); + } + mutex_unlock(&clients_lock); list_del_init(&dibs->list); mutex_unlock(&dibs_dev_list.mutex); } diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 8ecd0cccc7e8..2bd8f64ebb56 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -79,7 +79,6 @@ int ism_register_client(struct ism_client *client) /* initialize with all devices that we got so far */ list_for_each_entry(ism, &ism_dev_list.list, list) { ism->priv[i] = NULL; - client->add(ism); ism_setup_forwarding(client, ism); } } @@ -465,6 +464,16 @@ int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, } EXPORT_SYMBOL_GPL(ism_move); +static u16 ism_get_chid(struct dibs_dev *dibs) +{ + struct ism_dev *ism = dibs->drv_priv; + + if (!ism || !ism->pdev) + return 0; + + return to_zpci(ism->pdev)->pchid; +} + static void ism_handle_event(struct ism_dev *ism) { struct ism_event *entry; @@ -523,6 +532,10 @@ static irqreturn_t ism_handle_irq(int irq, void *data) return IRQ_HANDLED; } +static const struct dibs_dev_ops ism_ops = { + .get_fabric_id = ism_get_chid, +}; + static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; @@ -564,7 +577,6 @@ static int ism_dev_init(struct ism_dev *ism) mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { if (clients[i]) { - clients[i]->add(ism); ism_setup_forwarding(clients[i], ism); } } @@ -611,12 +623,6 @@ static void ism_dev_exit(struct ism_dev *ism) spin_unlock_irqrestore(&ism->lock, flags); mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < max_client; ++i) { - if (clients[i]) - clients[i]->remove(ism); - } - mutex_unlock(&clients_lock); if (ism_v2_capable) ism_del_vlan_id(ism, ISM_RESERVED_VLANID); @@ -672,7 +678,10 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = -ENOMEM; goto err_resource; } + /* set this up before we enable interrupts */ ism->dibs = dibs; + dibs->drv_priv = ism; + dibs->ops = &ism_ops; ret = ism_dev_init(ism); if (ret) @@ -857,19 +866,6 @@ static void smcd_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = 0; } -static u16 ism_get_chid(struct ism_dev *ism) -{ - if (!ism || !ism->pdev) - return 0; - - return to_zpci(ism->pdev)->pchid; -} - -static u16 smcd_get_chid(struct smcd_dev *smcd) -{ - return ism_get_chid(smcd->priv); -} - static inline struct device *smcd_get_dev(struct smcd_dev *dev) { struct ism_dev *ism = dev->priv; @@ -877,7 +873,7 @@ static inline struct device *smcd_get_dev(struct smcd_dev *dev) return &ism->dev; } -static const struct smcd_ops ism_ops = { +static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, @@ -889,13 +885,12 @@ static const struct smcd_ops ism_ops = { .move_data = smcd_move, .supports_v2 = smcd_supports_v2, .get_local_gid = smcd_get_local_gid, - .get_chid = smcd_get_chid, .get_dev = smcd_get_dev, }; const struct smcd_ops *ism_get_smcd_ops(void) { - return &ism_ops; + return &ism_smcd_ops; } EXPORT_SYMBOL_GPL(ism_get_smcd_ops); #endif diff --git a/include/linux/dibs.h b/include/linux/dibs.h index c12db19c98c0..805ab33271b5 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -34,14 +34,45 @@ * clients. */ +struct dibs_dev; + /* DIBS client * ----------- */ #define MAX_DIBS_CLIENTS 8 +/* All dibs clients have access to all dibs devices. + * A dibs client provides the following functions to be called by dibs layer or + * dibs device drivers: + */ +struct dibs_client_ops { + /** + * add_dev() - add a dibs device + * @dev: device that was added + * + * Will be called during dibs_register_client() for all existing + * dibs devices and whenever a new dibs device is registered. + * dev is usable until dibs_client.remove() is called. + * *dev is protected by device refcounting. + */ + void (*add_dev)(struct dibs_dev *dev); + /** + * del_dev() - remove a dibs device + * @dev: device to be removed + * + * Will be called whenever a dibs device is removed. + * Will be called during dibs_unregister_client() for all existing + * dibs devices and whenever a dibs device is unregistered. + * The device has already stopped initiative for this client: + * No new handlers will be started. + * The device is no longer usable by this client after this call. + */ + void (*del_dev)(struct dibs_dev *dev); +}; struct dibs_client { /* client name for logging and debugging purposes */ const char *name; + const struct dibs_client_ops *ops; /* client index - provided and used by dibs layer */ u8 id; }; @@ -52,6 +83,7 @@ struct dibs_client { * dibs_register_client() - register a client with dibs layer * @client: this client * + * Will call client->ops->add_dev() for all existing dibs devices. * Return: zero on success. */ int dibs_register_client(struct dibs_client *client); @@ -59,21 +91,74 @@ int dibs_register_client(struct dibs_client *client); * dibs_unregister_client() - unregister a client with dibs layer * @client: this client * + * Will call client->ops->del_dev() for all existing dibs devices. * Return: zero on success. */ int dibs_unregister_client(struct dibs_client *client); +/* dibs clients can call dibs device ops. */ + /* DIBS devices * ------------ */ + +/* Defined fabric id / CHID for all loopback devices: + * All dibs loopback devices report this fabric id. In this case devices with + * the same fabric id can NOT communicate via dibs. Only loopback devices with + * the same dibs device gid can communicate (=same device with itself). + */ +#define DIBS_LOOPBACK_FABRIC 0xFFFF + +/* A dibs device provides the following functions to be called by dibs clients. + * They are mandatory, unless marked 'optional'. + */ +struct dibs_dev_ops { + /** + * get_fabric_id() + * @dev: local dibs device + * + * Only devices on the same dibs fabric can communicate. Fabric_id is + * unique inside the same HW system. Use fabric_id for fast negative + * checks, but only query_remote_gid() can give a reliable positive + * answer: + * Different fabric_id: dibs is not possible + * Same fabric_id: dibs may be possible or not + * (e.g. different HW systems) + * EXCEPTION: DIBS_LOOPBACK_FABRIC denotes an ism_loopback device + * that can only communicate with itself. Use dibs_dev.gid + * or query_remote_gid()to determine whether sender and + * receiver use the same ism_loopback device. + * Return: 2 byte dibs fabric id + */ + u16 (*get_fabric_id)(struct dibs_dev *dev); +}; + struct dibs_dev { struct list_head list; + /* To be filled by device driver, before calling dibs_dev_add(): */ + const struct dibs_dev_ops *ops; + /* priv pointer for device driver */ + void *drv_priv; + + /* priv pointer per client; for client usage only */ + void *priv[MAX_DIBS_CLIENTS]; }; +static inline void dibs_set_priv(struct dibs_dev *dev, + struct dibs_client *client, void *priv) +{ + dev->priv[client->id] = priv; +} + +static inline void *dibs_get_priv(struct dibs_dev *dev, + struct dibs_client *client) +{ + return dev->priv[client->id]; +} + /* ------- End of client-only functions ----------- */ -/* - * Functions to be called by dibs device drivers: +/* Functions to be called by dibs device drivers: */ /** * dibs_dev_alloc() - allocate and reference device structure diff --git a/include/linux/ism.h b/include/linux/ism.h index 9a53d3c48c16..c818a25996db 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -59,8 +59,6 @@ struct ism_event { struct ism_client { const char *name; - void (*add)(struct ism_dev *dev); - void (*remove)(struct ism_dev *dev); void (*handle_event)(struct ism_dev *dev, struct ism_event *event); /* Parameter dmbemask contains a bit vector with updated DMBEs, if sent * via ism_move_data(). Callback function must handle all active bits diff --git a/include/net/smc.h b/include/net/smc.h index a9c023dd1380..e271891b85e6 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -15,6 +15,7 @@ #include #include #include +#include #include "linux/ism.h" struct sock; @@ -62,7 +63,6 @@ struct smcd_ops { unsigned int size); int (*supports_v2)(void); void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); - u16 (*get_chid)(struct smcd_dev *dev); struct device* (*get_dev)(struct smcd_dev *dev); /* optional operations */ @@ -81,6 +81,7 @@ struct smcd_dev { const struct smcd_ops *ops; void *priv; void *client; + struct dibs_dev *dibs; struct list_head list; spinlock_t lock; struct smc_connection **conn; diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 40dd60c1d23f..9535d88c2acb 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -20,16 +20,3 @@ config SMC_DIAG smcss. if unsure, say Y. - -config SMC_LO - bool "SMC intra-OS shortcut with loopback-ism" - depends on SMC - default n - help - SMC_LO enables the creation of an Emulated-ISM device named - loopback-ism in SMC and makes use of it for transferring data - when communication occurs within the same OS. This helps in - convenient testing of SMC-D since loopback-ism is independent - of architecture or hardware. - - if unsure, say N. diff --git a/net/smc/Makefile b/net/smc/Makefile index 60f1c87d5212..96ccfdf246df 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,4 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o -smc-$(CONFIG_SMC_LO) += smc_loopback.o +smc-y += smc_loopback.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9097e4f24d2b..77b99e8ef35a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -57,7 +57,6 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" -#include "smc_loopback.h" #include "smc_inet.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group @@ -3591,16 +3590,10 @@ static int __init smc_init(void) goto out_sock; } - rc = smc_loopback_init(); - if (rc) { - pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc); - goto out_ib; - } - rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_lo; + goto out_ib; } rc = smc_inet_init(); if (rc) { @@ -3611,8 +3604,6 @@ static int __init smc_init(void) return 0; out_ulp: tcp_unregister_ulp(&smc_ulp_ops); -out_lo: - smc_loopback_exit(); out_ib: smc_ib_unregister_client(); out_sock: @@ -3651,7 +3642,6 @@ static void __exit smc_exit(void) tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); - smc_loopback_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index a7a965e3c0ce..415f03910c91 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -15,6 +15,7 @@ #include "smc.h" #include "smc_core.h" #include "smc_ism.h" +#include "smc_loopback.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" @@ -28,23 +29,27 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; +static void smcd_register_dev(struct dibs_dev *dibs); +static void smcd_unregister_dev(struct dibs_dev *dibs); #if IS_ENABLED(CONFIG_ISM) -static void smcd_register_dev(struct ism_dev *ism); -static void smcd_unregister_dev(struct ism_dev *ism); static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", - .add = smcd_register_dev, - .remove = smcd_unregister_dev, .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; #endif +static struct dibs_client_ops smc_client_ops = { + .add_dev = smcd_register_dev, + .del_dev = smcd_unregister_dev, +}; + static struct dibs_client smc_dibs_client = { .name = "SMC-D", + .ops = &smc_client_ops, }; static void smc_ism_create_system_eid(void) @@ -86,7 +91,7 @@ void smc_ism_get_system_eid(u8 **eid) u16 smc_ism_get_chid(struct smcd_dev *smcd) { - return smcd->ops->get_chid(smcd); + return smcd->dibs->ops->get_fabric_id(smcd->dibs); } /* HW supports ISM V2 and thus System EID is defined */ @@ -318,7 +323,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); + smc_set_pci_values(ism->pdev, &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -368,7 +373,7 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; - if (smc_ism_is_loopback(smcd)) + if (smc_ism_is_loopback(smcd->dibs)) goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; @@ -453,24 +458,26 @@ static void smc_ism_event_work(struct work_struct *work) } kfree(wrk); } +#endif -static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, + const struct smcd_ops *ops, + int max_dmbs) { struct smcd_dev *smcd; - smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; - smcd->conn = devm_kcalloc(parent, max_dmbs, - sizeof(struct smc_connection *), GFP_KERNEL); + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); if (!smcd->conn) - return NULL; + goto free_smcd; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) - return NULL; + goto free_conn; smcd->ops = ops; @@ -480,27 +487,58 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; + +free_conn: + kfree(smcd->conn); +free_smcd: + kfree(smcd); + return NULL; } -static void smcd_register_dev(struct ism_dev *ism) +static void smcd_register_dev(struct dibs_dev *dibs) { - const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd, *fentry; + const struct smcd_ops *ops; + struct smc_lo_dev *smc_lo; + struct ism_dev *ism; - if (!ops) - return; + if (smc_ism_is_loopback(dibs)) { + if (smc_loopback_init(&smc_lo)) + return; + } - smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, - ISM_NR_DMBS); + if (smc_ism_is_loopback(dibs)) { + ops = smc_lo_get_smcd_ops(); + smcd = smcd_alloc_dev(dev_name(&smc_lo->dev), ops, + SMC_LO_MAX_DMBS); + } else { + ism = dibs->drv_priv; +#if IS_ENABLED(CONFIG_ISM) + ops = ism_get_smcd_ops(); +#endif + smcd = smcd_alloc_dev(dev_name(&ism->pdev->dev), ops, + ISM_NR_DMBS); + } if (!smcd) return; - smcd->priv = ism; - smcd->client = &smc_ism_client; - ism_set_priv(ism, &smc_ism_client, smcd); - if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); - if (smcd->ops->supports_v2()) + smcd->dibs = dibs; + dibs_set_priv(dibs, &smc_dibs_client, smcd); + + if (smc_ism_is_loopback(dibs)) { + smcd->priv = smc_lo; + smc_lo->smcd = smcd; + } else { + smcd->priv = ism; +#if IS_ENABLED(CONFIG_ISM) + ism_set_priv(ism, &smc_ism_client, smcd); + smcd->client = &smc_ism_client; +#endif + if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); + } + + if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); /* sort list: @@ -510,7 +548,7 @@ static void smcd_register_dev(struct ism_dev *ism) if (!smcd->pnetid[0]) { fentry = list_first_entry_or_null(&smcd_dev_list.list, struct smcd_dev, list); - if (fentry && smc_ism_is_loopback(fentry)) + if (fentry && smc_ism_is_loopback(fentry->dibs)) list_add(&smcd->list, &fentry->list); else list_add(&smcd->list, &smcd_dev_list.list); @@ -519,32 +557,46 @@ static void smcd_register_dev(struct ism_dev *ism) } mutex_unlock(&smcd_dev_list.mutex); - if (smc_pnet_is_pnetid_set(smcd->pnetid)) - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? - " (user defined)" : - ""); - else - pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", - dev_name(&ism->dev)); + if (smc_ism_is_loopback(dibs)) { + pr_warn_ratelimited("smc: adding smcd loopback device\n"); + } else { + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&ism->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&ism->dev)); + } return; } -static void smcd_unregister_dev(struct ism_dev *ism) +static void smcd_unregister_dev(struct dibs_dev *dibs) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); + struct ism_dev *ism = dibs->drv_priv; - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ism->dev)); + if (smc_ism_is_loopback(dibs)) { + pr_warn_ratelimited("smc: removing smcd loopback device\n"); + } else { + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&ism->dev)); + } smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); + if (smc_ism_is_loopback(dibs)) + smc_loopback_exit(); + kfree(smcd->conn); + kfree(smcd); } +#if IS_ENABLED(CONFIG_ISM) /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 765aa8fae6fa..04699951d03f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "smc.h" @@ -85,14 +86,14 @@ static inline bool __smc_ism_is_emulated(u16 chid) static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) { - u16 chid = smcd->ops->get_chid(smcd); + u16 chid = smcd->dibs->ops->get_fabric_id(smcd->dibs); return __smc_ism_is_emulated(chid); } -static inline bool smc_ism_is_loopback(struct smcd_dev *smcd) +static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) { - return (smcd->ops->get_chid(smcd) == 0xFFFF); + return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); } #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 1853c26fbbbb..37d8366419f7 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -35,8 +35,6 @@ static void smc_lo_generate_ids(struct smc_lo_dev *ldev) memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), sizeof(lgid->gid_ext)); - - ldev->chid = SMC_LO_RESERVED_CHID; } static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, @@ -257,11 +255,6 @@ static void smc_lo_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = ldev->local_gid.gid_ext; } -static u16 smc_lo_get_chid(struct smcd_dev *smcd) -{ - return ((struct smc_lo_dev *)smcd->priv)->chid; -} - static struct device *smc_lo_get_dev(struct smcd_dev *smcd) { return &((struct smc_lo_dev *)smcd->priv)->dev; @@ -281,72 +274,15 @@ static const struct smcd_ops lo_ops = { .signal_event = NULL, .move_data = smc_lo_move_data, .get_local_gid = smc_lo_get_local_gid, - .get_chid = smc_lo_get_chid, .get_dev = smc_lo_get_dev, }; -static struct smcd_dev *smcd_lo_alloc_dev(const struct smcd_ops *ops, - int max_dmbs) -{ - struct smcd_dev *smcd; - - smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); - if (!smcd) - return NULL; - - smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), - GFP_KERNEL); - if (!smcd->conn) - goto out_smcd; - - smcd->ops = ops; - - spin_lock_init(&smcd->lock); - spin_lock_init(&smcd->lgr_lock); - INIT_LIST_HEAD(&smcd->vlan); - INIT_LIST_HEAD(&smcd->lgr_list); - init_waitqueue_head(&smcd->lgrs_deleted); - return smcd; - -out_smcd: - kfree(smcd); - return NULL; -} - -static int smcd_lo_register_dev(struct smc_lo_dev *ldev) -{ - struct smcd_dev *smcd; - - smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS); - if (!smcd) - return -ENOMEM; - ldev->smcd = smcd; - smcd->priv = ldev; - smc_ism_set_v2_capable(); - mutex_lock(&smcd_dev_list.mutex); - list_add(&smcd->list, &smcd_dev_list.list); - mutex_unlock(&smcd_dev_list.mutex); - pr_warn_ratelimited("smc: adding smcd device %s\n", - dev_name(&ldev->dev)); - return 0; -} - -static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev) +const struct smcd_ops *smc_lo_get_smcd_ops(void) { - struct smcd_dev *smcd = ldev->smcd; - - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ldev->dev)); - smcd->going_away = 1; - smc_smcd_terminate_all(smcd); - mutex_lock(&smcd_dev_list.mutex); - list_del_init(&smcd->list); - mutex_unlock(&smcd_dev_list.mutex); - kfree(smcd->conn); - kfree(smcd); + return &lo_ops; } -static int smc_lo_dev_init(struct smc_lo_dev *ldev) +static void smc_lo_dev_init(struct smc_lo_dev *ldev) { smc_lo_generate_ids(ldev); rwlock_init(&ldev->dmb_ht_lock); @@ -354,12 +290,11 @@ static int smc_lo_dev_init(struct smc_lo_dev *ldev) atomic_set(&ldev->dmb_cnt, 0); init_waitqueue_head(&ldev->ldev_release); - return smcd_lo_register_dev(ldev); + return; } static void smc_lo_dev_exit(struct smc_lo_dev *ldev) { - smcd_lo_unregister_dev(ldev); if (atomic_read(&ldev->dmb_cnt)) wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); } @@ -375,7 +310,6 @@ static void smc_lo_dev_release(struct device *dev) static int smc_lo_dev_probe(void) { struct smc_lo_dev *ldev; - int ret; ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); if (!ldev) @@ -385,17 +319,11 @@ static int smc_lo_dev_probe(void) ldev->dev.release = smc_lo_dev_release; device_initialize(&ldev->dev); dev_set_name(&ldev->dev, smc_lo_dev_name); - - ret = smc_lo_dev_init(ldev); - if (ret) - goto free_dev; + smc_lo_dev_init(ldev); lo_dev = ldev; /* global loopback device */ - return 0; -free_dev: - put_device(&ldev->dev); - return ret; + return 0; } static void smc_lo_dev_remove(void) @@ -405,11 +333,17 @@ static void smc_lo_dev_remove(void) smc_lo_dev_exit(lo_dev); put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ + lo_dev = NULL; } -int smc_loopback_init(void) +int smc_loopback_init(struct smc_lo_dev **smc_lb) { - return smc_lo_dev_probe(); + int ret; + + ret = smc_lo_dev_probe(); + if (!ret) + *smc_lb = lo_dev; + return ret; } void smc_loopback_exit(void) diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index 04dc6808d2e1..76c62526e2e5 100644 --- a/net/smc/smc_loopback.h +++ b/net/smc/smc_loopback.h @@ -17,10 +17,8 @@ #include #include -#if IS_ENABLED(CONFIG_SMC_LO) #define SMC_LO_MAX_DMBS 5000 #define SMC_LO_DMBS_HASH_BITS 12 -#define SMC_LO_RESERVED_CHID 0xFFFF struct smc_lo_dmb_node { struct hlist_node list; @@ -35,7 +33,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; struct device dev; - u16 chid; struct smcd_gid local_gid; atomic_t dmb_cnt; rwlock_t dmb_ht_lock; @@ -44,17 +41,9 @@ struct smc_lo_dev { wait_queue_head_t ldev_release; }; -int smc_loopback_init(void); +const struct smcd_ops *smc_lo_get_smcd_ops(void); + +int smc_loopback_init(struct smc_lo_dev **smc_lb); void smc_loopback_exit(void); -#else -static inline int smc_loopback_init(void) -{ - return 0; -} - -static inline void smc_loopback_exit(void) -{ -} -#endif #endif /* _SMC_LOOPBACK_H */ -- cgit v1.2.3 From 845c334a0186a23c2ac4abfb444e499fec831b24 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Thu, 18 Sep 2025 13:04:54 +0200 Subject: dibs: Move struct device to dibs_dev Move struct device from ism_dev and smc_lo_dev to dibs_dev, and define a corresponding release function. Free ism_dev in ism_remove() and smc_lo_dev in smc_lo_dev_remove(). Replace smcd->ops->get_dev(smcd) by using dibs->dev directly. An alternative design would be to embed dibs_dev as a field in ism_dev and do the same for other dibs device driver specific structs. However that would have the disadvantage that each dibs device driver needs to allocate dibs_dev and each dibs device driver needs a different device release function. The advantage would be that ism_dev and other device driver specific structs would be covered by device reference counts. Signed-off-by: Julian Ruess Co-developed-by: Alexandra Winter Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-9-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 15 +++++++-------- drivers/dibs/dibs_main.c | 21 +++++++++++++++++++- drivers/s390/net/ism_drv.c | 40 ++++++++------------------------------ include/linux/dibs.h | 1 + include/linux/ism.h | 1 - include/net/smc.h | 1 - net/smc/smc_core.c | 4 ++-- net/smc/smc_ism.c | 46 ++++++++++++++++++-------------------------- net/smc/smc_loopback.c | 21 +------------------- net/smc/smc_loopback.h | 1 - net/smc/smc_pnet.c | 13 +++++-------- 11 files changed, 63 insertions(+), 101 deletions(-) (limited to 'include') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 215986ae54a4..76e479d5724b 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -15,6 +15,7 @@ #include "dibs_loopback.h" +static const char dibs_lo_dev_name[] = "lo"; /* global loopback device */ static struct dibs_lo_dev *lo_dev; @@ -27,11 +28,6 @@ static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, }; -static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) -{ - dibs_dev_del(ldev->dibs); -} - static int dibs_lo_dev_probe(void) { struct dibs_lo_dev *ldev; @@ -52,6 +48,9 @@ static int dibs_lo_dev_probe(void) dibs->drv_priv = ldev; dibs->ops = &dibs_lo_ops; + dibs->dev.parent = NULL; + dev_set_name(&dibs->dev, "%s", dibs_lo_dev_name); + ret = dibs_dev_add(dibs); if (ret) goto err_reg; @@ -60,7 +59,7 @@ static int dibs_lo_dev_probe(void) err_reg: /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); kfree(ldev); return ret; @@ -71,9 +70,9 @@ static void dibs_lo_dev_remove(void) if (!lo_dev) return; - dibs_lo_dev_exit(lo_dev); + dibs_dev_del(lo_dev->dibs); /* pairs with dibs_dev_alloc() */ - kfree(lo_dev->dibs); + put_device(&lo_dev->dibs->dev); kfree(lo_dev); lo_dev = NULL; } diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index f1cfa5849277..610b6c452211 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -88,11 +88,24 @@ int dibs_unregister_client(struct dibs_client *client) } EXPORT_SYMBOL_GPL(dibs_unregister_client); +static void dibs_dev_release(struct device *dev) +{ + struct dibs_dev *dibs; + + dibs = container_of(dev, struct dibs_dev, dev); + + kfree(dibs); +} + struct dibs_dev *dibs_dev_alloc(void) { struct dibs_dev *dibs; dibs = kzalloc(sizeof(*dibs), GFP_KERNEL); + if (!dibs) + return dibs; + dibs->dev.release = dibs_dev_release; + device_initialize(&dibs->dev); return dibs; } @@ -100,7 +113,11 @@ EXPORT_SYMBOL_GPL(dibs_dev_alloc); int dibs_dev_add(struct dibs_dev *dibs) { - int i; + int i, ret; + + ret = device_add(&dibs->dev); + if (ret) + return ret; mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); @@ -129,6 +146,8 @@ void dibs_dev_del(struct dibs_dev *dibs) mutex_unlock(&clients_lock); list_del_init(&dibs->list); mutex_unlock(&dibs_dev_list.mutex); + + device_del(&dibs->dev); } EXPORT_SYMBOL_GPL(dibs_dev_del); diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2bd8f64ebb56..4096ea9faa7e 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -602,15 +602,6 @@ out: return ret; } -static void ism_dev_release(struct device *dev) -{ - struct ism_dev *ism; - - ism = container_of(dev, struct ism_dev, dev); - - kfree(ism); -} - static void ism_dev_exit(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; @@ -649,17 +640,10 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) spin_lock_init(&ism->cmd_lock); dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; - ism->dev.parent = &pdev->dev; - ism->dev.release = ism_dev_release; - device_initialize(&ism->dev); - dev_set_name(&ism->dev, "%s", dev_name(&pdev->dev)); - ret = device_add(&ism->dev); - if (ret) - goto err_dev; ret = pci_enable_device_mem(pdev); if (ret) - goto err; + goto err_dev; ret = pci_request_mem_regions(pdev, DRV_NAME); if (ret) @@ -687,6 +671,9 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (ret) goto err_dibs; + dibs->dev.parent = &pdev->dev; + dev_set_name(&dibs->dev, "%s", dev_name(&pdev->dev)); + ret = dibs_dev_add(dibs); if (ret) goto err_ism; @@ -697,16 +684,14 @@ err_ism: ism_dev_exit(ism); err_dibs: /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); err_resource: pci_release_mem_regions(pdev); err_disable: pci_disable_device(pdev); -err: - device_del(&ism->dev); err_dev: dev_set_drvdata(&pdev->dev, NULL); - put_device(&ism->dev); + kfree(ism); return ret; } @@ -719,13 +704,12 @@ static void ism_remove(struct pci_dev *pdev) dibs_dev_del(dibs); ism_dev_exit(ism); /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); pci_release_mem_regions(pdev); pci_disable_device(pdev); - device_del(&ism->dev); dev_set_drvdata(&pdev->dev, NULL); - put_device(&ism->dev); + kfree(ism); } static struct pci_driver ism_driver = { @@ -866,13 +850,6 @@ static void smcd_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = 0; } -static inline struct device *smcd_get_dev(struct smcd_dev *dev) -{ - struct ism_dev *ism = dev->priv; - - return &ism->dev; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, @@ -885,7 +862,6 @@ static const struct smcd_ops ism_smcd_ops = { .move_data = smcd_move, .supports_v2 = smcd_supports_v2, .get_local_gid = smcd_get_local_gid, - .get_dev = smcd_get_dev, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 805ab33271b5..793c6e1ece0f 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -135,6 +135,7 @@ struct dibs_dev_ops { struct dibs_dev { struct list_head list; + struct device dev; /* To be filled by device driver, before calling dibs_dev_add(): */ const struct dibs_dev_ops *ops; /* priv pointer for device driver */ diff --git a/include/linux/ism.h b/include/linux/ism.h index c818a25996db..84f1afb3dded 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -42,7 +42,6 @@ struct ism_dev { struct ism_eq *ieq; dma_addr_t ieq_dma_addr; - struct device dev; u64 local_gid; int ieq_idx; diff --git a/include/net/smc.h b/include/net/smc.h index e271891b85e6..05faac83371e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -63,7 +63,6 @@ struct smcd_ops { unsigned int size); int (*supports_v2)(void); void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); - struct device* (*get_dev)(struct smcd_dev *dev); /* optional operations */ int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a9e80f44307d..42ab0795d563 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -924,7 +924,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) if (ini->is_smcd) { /* SMC-D specific settings */ smcd = ini->ism_dev[ini->ism_selected]; - get_device(smcd->ops->get_dev(smcd)); + get_device(&smcd->dibs->dev); lgr->peer_gid.gid = ini->ism_peer_gid[ini->ism_selected].gid; lgr->peer_gid.gid_ext = @@ -1474,7 +1474,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(lgr->smcd->ops->get_dev(lgr->smcd)); + put_device(&lgr->smcd->dibs->dev); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 415f03910c91..6a6e7c9641e8 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -303,12 +303,12 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; + struct dibs_dev *dibs; struct nlattr *attrs; - struct ism_dev *ism; int use_cnt = 0; void *nlh; - ism = smcd->priv; + dibs = smcd->dibs; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); @@ -323,7 +323,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(ism->pdev, &smc_pci_dev); + smc_set_pci_values(to_pci_dev(dibs->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -509,14 +509,14 @@ static void smcd_register_dev(struct dibs_dev *dibs) if (smc_ism_is_loopback(dibs)) { ops = smc_lo_get_smcd_ops(); - smcd = smcd_alloc_dev(dev_name(&smc_lo->dev), ops, + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, SMC_LO_MAX_DMBS); } else { ism = dibs->drv_priv; #if IS_ENABLED(CONFIG_ISM) ops = ism_get_smcd_ops(); #endif - smcd = smcd_alloc_dev(dev_name(&ism->pdev->dev), ops, + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, ISM_NR_DMBS); } if (!smcd) @@ -534,10 +534,11 @@ static void smcd_register_dev(struct dibs_dev *dibs) ism_set_priv(ism, &smc_ism_client, smcd); smcd->client = &smc_ism_client; #endif - if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); } + if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); + if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); @@ -557,33 +558,24 @@ static void smcd_register_dev(struct dibs_dev *dibs) } mutex_unlock(&smcd_dev_list.mutex); - if (smc_ism_is_loopback(dibs)) { - pr_warn_ratelimited("smc: adding smcd loopback device\n"); - } else { - if (smc_pnet_is_pnetid_set(smcd->pnetid)) - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? - " (user defined)" : - ""); - else - pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", - dev_name(&ism->dev)); - } + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&dibs->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&dibs->dev)); return; } static void smcd_unregister_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); - struct ism_dev *ism = dibs->drv_priv; - if (smc_ism_is_loopback(dibs)) { - pr_warn_ratelimited("smc: removing smcd loopback device\n"); - } else { - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ism->dev)); - } + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&dibs->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 37d8366419f7..262d0d0df4d0 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -23,7 +23,6 @@ #define SMC_LO_SUPPORT_NOCOPY 0x1 #define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) -static const char smc_lo_dev_name[] = "loopback-ism"; static struct smc_lo_dev *lo_dev; static void smc_lo_generate_ids(struct smc_lo_dev *ldev) @@ -255,11 +254,6 @@ static void smc_lo_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = ldev->local_gid.gid_ext; } -static struct device *smc_lo_get_dev(struct smcd_dev *smcd) -{ - return &((struct smc_lo_dev *)smcd->priv)->dev; -} - static const struct smcd_ops lo_ops = { .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, @@ -274,7 +268,6 @@ static const struct smcd_ops lo_ops = { .signal_event = NULL, .move_data = smc_lo_move_data, .get_local_gid = smc_lo_get_local_gid, - .get_dev = smc_lo_get_dev, }; const struct smcd_ops *smc_lo_get_smcd_ops(void) @@ -299,14 +292,6 @@ static void smc_lo_dev_exit(struct smc_lo_dev *ldev) wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); } -static void smc_lo_dev_release(struct device *dev) -{ - struct smc_lo_dev *ldev = - container_of(dev, struct smc_lo_dev, dev); - - kfree(ldev); -} - static int smc_lo_dev_probe(void) { struct smc_lo_dev *ldev; @@ -315,10 +300,6 @@ static int smc_lo_dev_probe(void) if (!ldev) return -ENOMEM; - ldev->dev.parent = NULL; - ldev->dev.release = smc_lo_dev_release; - device_initialize(&ldev->dev); - dev_set_name(&ldev->dev, smc_lo_dev_name); smc_lo_dev_init(ldev); lo_dev = ldev; /* global loopback device */ @@ -332,7 +313,7 @@ static void smc_lo_dev_remove(void) return; smc_lo_dev_exit(lo_dev); - put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ + kfree(lo_dev); lo_dev = NULL; } diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index 76c62526e2e5..a033bf10890a 100644 --- a/net/smc/smc_loopback.h +++ b/net/smc/smc_loopback.h @@ -32,7 +32,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; - struct device dev; struct smcd_gid local_gid; atomic_t dmb_cnt; rwlock_t dmb_ht_lock; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 7225b5fa17a6..d0df7f2b03aa 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -169,7 +169,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", - dev_name(smcd->ops->get_dev(smcd)), + dev_name(&smcd->dibs->dev), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; @@ -332,7 +332,7 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), + if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name, IB_DEVICE_NAME_MAX - 1)) goto out; } @@ -413,7 +413,6 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; - struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ @@ -431,10 +430,8 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, if (smcd) { smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { - dev = smcd->ops->get_dev(smcd); - pr_warn_ratelimited("smc: smcd device %s " - "applied user defined pnetid " - "%.16s\n", dev_name(dev), + pr_warn_ratelimited("smc: smcd device %s applied user defined pnetid %.16s\n", + dev_name(&smcd->dibs->dev), smcd->pnetid); } } @@ -1193,7 +1190,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); + const char *ib_name = dev_name(&smcddev->dibs->dev); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; -- cgit v1.2.3 From 05e68d8dedf34f270cc3769ffe7f0ed413f23add Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:56 +0200 Subject: dibs: Local gid for dibs devices Define a uuid_t GID attribute to identify a dibs device. SMC uses 64 Bit and 128 Bit Global Identifiers (GIDs) per device, that need to be sent via the SMC protocol. Because the smc code uses integers, network endianness and host endianness need to be considered. Avoid this in the dibs layer by using uuid_t byte arrays. Future patches could change SMC to use uuid_t. For now conversion helper functions are introduced. ISM devices provide 64 Bit GIDs. Map them to dibs uuid_t GIDs like this: _________________________________________ | 64 Bit ISM-vPCI GID | 00000000_00000000 | ----------------------------------------- If interpreted as UUID [1], this would be interpreted as the UIID variant, that is reserved for NCS backward compatibility. So it will not collide with UUIDs that were generated according to the standard. smc_loopback already uses version 4 UUIDs as 128 Bit GIDs, move that to dibs loopback. A temporary change to smc_lo_query_rgid() is required, that will be moved to dibs_loopback with a follow-on patch. Provide gid of a dibs device as sysfs read-only attribute. Link: https://datatracker.ietf.org/doc/html/rfc4122 [1] Signed-off-by: Alexandra Winter Reviewed-by: Julian Ruess Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-11-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 1 + drivers/dibs/dibs_main.c | 12 ++++++++++++ drivers/s390/net/ism.h | 9 +++++++++ drivers/s390/net/ism_drv.c | 30 +++++++++--------------------- include/linux/dibs.h | 3 +++ include/linux/ism.h | 1 - include/net/smc.h | 1 - net/smc/smc_clc.c | 6 +++--- net/smc/smc_core.c | 2 +- net/smc/smc_diag.c | 2 +- net/smc/smc_ism.h | 22 ++++++++++++++++++++++ net/smc/smc_loopback.c | 29 ++++------------------------- net/smc/smc_loopback.h | 1 - 13 files changed, 65 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 76e479d5724b..d7e6fa5e90f3 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -46,6 +46,7 @@ static int dibs_lo_dev_probe(void) ldev->dibs = dibs; dibs->drv_priv = ldev; + uuid_gen(&dibs->gid); dibs->ops = &dibs_lo_ops; dibs->dev.parent = NULL; diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index b3f21805aa59..f20ed0594a51 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -114,6 +114,17 @@ struct dibs_dev *dibs_dev_alloc(void) } EXPORT_SYMBOL_GPL(dibs_dev_alloc); +static ssize_t gid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct dibs_dev *dibs; + + dibs = container_of(dev, struct dibs_dev, dev); + + return sysfs_emit(buf, "%pUb\n", &dibs->gid); +} +static DEVICE_ATTR_RO(gid); + static ssize_t fabric_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -128,6 +139,7 @@ static ssize_t fabric_id_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_RO(fabric_id); static struct attribute *dibs_dev_attrs[] = { + &dev_attr_gid.attr, &dev_attr_fabric_id.attr, NULL, }; diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 3078779fa71e..1b9fa14da20c 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -67,6 +67,15 @@ union ism_reg_ieq { } response; } __aligned(16); +/* ISM-vPCI devices provide 64 Bit GIDs + * Map them to ISM UUID GIDs like this: + * _________________________________________ + * | 64 Bit ISM-vPCI GID | 00000000_00000000 | + * ----------------------------------------- + * This will be interpreted as a UIID variant, that is reserved + * for NCS backward compatibility. So it will not collide with + * proper UUIDs. + */ union ism_read_gid { struct { struct ism_req_hdr hdr; diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index ab1d61eb3e3b..e58c55fb03c2 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -272,8 +272,9 @@ static int unregister_ieq(struct ism_dev *ism) return 0; } -static int ism_read_local_gid(struct ism_dev *ism) +static int ism_read_local_gid(struct dibs_dev *dibs) { + struct ism_dev *ism = dibs->drv_priv; union ism_read_gid cmd; int ret; @@ -285,7 +286,8 @@ static int ism_read_local_gid(struct ism_dev *ism) if (ret) goto out; - ism->local_gid = cmd.response.gid; + memset(&dibs->gid, 0, sizeof(dibs->gid)); + memcpy(&dibs->gid, &cmd.response.gid, sizeof(cmd.response.gid)); out: return ret; } @@ -563,10 +565,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - ret = ism_read_local_gid(ism); - if (ret) - goto unreg_ieq; - if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID)) /* hardware is V2 capable */ ism_v2_capable = true; @@ -588,8 +586,6 @@ static int ism_dev_init(struct ism_dev *ism) query_info(ism); return 0; -unreg_ieq: - unregister_ieq(ism); unreg_sba: unregister_sba(ism); free_irq: @@ -672,6 +668,11 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (ret) goto err_dibs; + /* after ism_dev_init() we can call ism function to set gid */ + ret = ism_read_local_gid(dibs); + if (ret) + goto err_ism; + dibs->dev.parent = &pdev->dev; zdev = to_zpci(pdev); @@ -841,18 +842,6 @@ static int smcd_supports_v2(void) return ism_v2_capable; } -static u64 ism_get_local_gid(struct ism_dev *ism) -{ - return ism->local_gid; -} - -static void smcd_get_local_gid(struct smcd_dev *smcd, - struct smcd_gid *smcd_gid) -{ - smcd_gid->gid = ism_get_local_gid(smcd->priv); - smcd_gid->gid_ext = 0; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, @@ -864,7 +853,6 @@ static const struct smcd_ops ism_smcd_ops = { .signal_event = smcd_signal_ieq, .move_data = smcd_move, .supports_v2 = smcd_supports_v2, - .get_local_gid = smcd_get_local_gid, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 793c6e1ece0f..904f37505c27 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -10,6 +10,8 @@ #define _DIBS_H #include +#include + /* DIBS - Direct Internal Buffer Sharing - concept * ----------------------------------------------- * In the case of multiple system sharing the same hardware, dibs fabrics can @@ -138,6 +140,7 @@ struct dibs_dev { struct device dev; /* To be filled by device driver, before calling dibs_dev_add(): */ const struct dibs_dev_ops *ops; + uuid_t gid; /* priv pointer for device driver */ void *drv_priv; diff --git a/include/linux/ism.h b/include/linux/ism.h index 84f1afb3dded..a926dd61b5a1 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -42,7 +42,6 @@ struct ism_dev { struct ism_eq *ieq; dma_addr_t ieq_dma_addr; - u64 local_gid; int ieq_idx; struct ism_client *subs[MAX_CLIENTS]; diff --git a/include/net/smc.h b/include/net/smc.h index 05faac83371e..9cb8385bbc6e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -62,7 +62,6 @@ struct smcd_ops { bool sf, unsigned int offset, void *data, unsigned int size); int (*supports_v2)(void); - void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); /* optional operations */ int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 09745baa1017..157aace169d4 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -916,7 +916,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) /* add SMC-D specifics */ if (ini->ism_dev[0]) { smcd = ini->ism_dev[0]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); pclc_smcd->ism.gid = htonll(smcd_gid.gid); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); @@ -966,7 +966,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { smcd = ini->ism_dev[i]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); gidchids[entry].gid = htonll(smcd_gid.gid); @@ -1059,7 +1059,7 @@ smcd_clc_prep_confirm_accept(struct smc_connection *conn, /* SMC-D specific settings */ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); clc->hdr.typev1 = SMC_TYPE_D; clc->d0.gid = htonll(smcd_gid.gid); clc->d0.token = htonll(conn->rmb_desc->token); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 42ab0795d563..be0c2da83d2b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -555,7 +555,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 8ed2f6689b01..bf0beaa23bdb 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -175,7 +175,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid.gid; dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); dinfo.my_gid = smcd_gid.gid; dinfo.my_gid_ext = smcd_gid.gid_ext; dinfo.token = conn->rmb_desc->token; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 04699951d03f..139e99da2c9f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -96,4 +96,26 @@ static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); } +static inline void copy_to_smcdgid(struct smcd_gid *sgid, uuid_t *dibs_gid) +{ + __be64 temp; + + memcpy(&temp, dibs_gid, sizeof(sgid->gid)); + sgid->gid = ntohll(temp); + memcpy(&temp, (uint8_t *)dibs_gid + sizeof(sgid->gid), + sizeof(sgid->gid_ext)); + sgid->gid_ext = ntohll(temp); +} + +static inline void copy_to_dibsgid(uuid_t *dibs_gid, struct smcd_gid *sgid) +{ + __be64 temp; + + temp = htonll(sgid->gid); + memcpy(dibs_gid, &temp, sizeof(sgid->gid)); + temp = htonll(sgid->gid_ext); + memcpy((uint8_t *)dibs_gid + sizeof(sgid->gid), &temp, + sizeof(sgid->gid_ext)); +} + #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 262d0d0df4d0..454d9d6a6e8f 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -13,6 +13,7 @@ #include #include +#include #include #include "smc_cdc.h" @@ -25,25 +26,14 @@ static struct smc_lo_dev *lo_dev; -static void smc_lo_generate_ids(struct smc_lo_dev *ldev) -{ - struct smcd_gid *lgid = &ldev->local_gid; - uuid_t uuid; - - uuid_gen(&uuid); - memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); - memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), - sizeof(lgid->gid_ext)); -} - static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, u32 vid_valid, u32 vid) { - struct smc_lo_dev *ldev = smcd->priv; + uuid_t temp; + copy_to_dibsgid(&temp, rgid); /* rgid should be the same as lgid */ - if (!ldev || rgid->gid != ldev->local_gid.gid || - rgid->gid_ext != ldev->local_gid.gid_ext) + if (!uuid_equal(&temp, &smcd->dibs->gid)) return -ENETUNREACH; return 0; } @@ -245,15 +235,6 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, return 0; } -static void smc_lo_get_local_gid(struct smcd_dev *smcd, - struct smcd_gid *smcd_gid) -{ - struct smc_lo_dev *ldev = smcd->priv; - - smcd_gid->gid = ldev->local_gid.gid; - smcd_gid->gid_ext = ldev->local_gid.gid_ext; -} - static const struct smcd_ops lo_ops = { .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, @@ -267,7 +248,6 @@ static const struct smcd_ops lo_ops = { .reset_vlan_required = NULL, .signal_event = NULL, .move_data = smc_lo_move_data, - .get_local_gid = smc_lo_get_local_gid, }; const struct smcd_ops *smc_lo_get_smcd_ops(void) @@ -277,7 +257,6 @@ const struct smcd_ops *smc_lo_get_smcd_ops(void) static void smc_lo_dev_init(struct smc_lo_dev *ldev) { - smc_lo_generate_ids(ldev); rwlock_init(&ldev->dmb_ht_lock); hash_init(ldev->dmb_ht); atomic_set(&ldev->dmb_cnt, 0); diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index a033bf10890a..33bb96ec8b77 100644 --- a/net/smc/smc_loopback.h +++ b/net/smc/smc_loopback.h @@ -32,7 +32,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; - struct smcd_gid local_gid; atomic_t dmb_cnt; rwlock_t dmb_ht_lock; DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); -- cgit v1.2.3 From 92a0f7bb081dde6e88368816b8ba51352ddabb1d Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:57 +0200 Subject: dibs: Move vlan support to dibs_dev_ops It can be debated how much benefit definition of vlan ids for dibs devices brings, as the dmbs are accessible only by a single peer anyhow. But ism provides vlan support and smcd exploits it, so move it to dibs layer as an optional feature. smcd_loopback simply ignores all vlan settings, do the same in dibs_loopback. SMC-D and ISM have a method to use the invalid VLAN ID 1FFF (ISM_RESERVED_VLANID), to indicate that both communication peers support routable SMC-Dv2. Tolerate it in dibs, but move it to SMC only. Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-12-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/s390/net/ism_drv.c | 47 ++++++---------------------------------------- include/linux/dibs.h | 19 +++++++++++++++++++ include/net/smc.h | 5 ----- net/smc/smc_ism.c | 14 +++++++++----- net/smc/smc_loopback.c | 5 ----- 5 files changed, 34 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index e58c55fb03c2..ed4c28ca355b 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -36,7 +36,6 @@ static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ /* a list for fast mapping */ static u8 max_client; static DEFINE_MUTEX(clients_lock); -static bool ism_v2_capable; struct ism_dev_list { struct list_head list; struct mutex mutex; /* protects ism device list */ @@ -409,8 +408,9 @@ out: } EXPORT_SYMBOL_GPL(ism_unregister_dmb); -static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) +static int ism_add_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { + struct ism_dev *ism = dibs->drv_priv; union ism_set_vlan_id cmd; memset(&cmd, 0, sizeof(cmd)); @@ -422,8 +422,9 @@ static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) return ism_cmd(ism, &cmd); } -static int ism_del_vlan_id(struct ism_dev *ism, u64 vlan_id) +static int ism_del_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { + struct ism_dev *ism = dibs->drv_priv; union ism_set_vlan_id cmd; memset(&cmd, 0, sizeof(cmd)); @@ -536,6 +537,8 @@ static irqreturn_t ism_handle_irq(int irq, void *data) static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, + .add_vlan_id = ism_add_vlan_id, + .del_vlan_id = ism_del_vlan_id, }; static int ism_dev_init(struct ism_dev *ism) @@ -565,12 +568,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID)) - /* hardware is V2 capable */ - ism_v2_capable = true; - else - ism_v2_capable = false; - mutex_lock(&ism_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { @@ -611,8 +608,6 @@ static void ism_dev_exit(struct ism_dev *ism) mutex_lock(&ism_dev_list.mutex); - if (ism_v2_capable) - ism_del_vlan_id(ism, ISM_RESERVED_VLANID); unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); @@ -786,26 +781,6 @@ static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); } -static int smcd_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id) -{ - return ism_add_vlan_id(smcd->priv, vlan_id); -} - -static int smcd_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id) -{ - return ism_del_vlan_id(smcd->priv, vlan_id); -} - -static int smcd_set_vlan_required(struct smcd_dev *smcd) -{ - return ism_cmd_simple(smcd->priv, ISM_SET_VLAN); -} - -static int smcd_reset_vlan_required(struct smcd_dev *smcd) -{ - return ism_cmd_simple(smcd->priv, ISM_RESET_VLAN); -} - static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, u32 event_code, u64 info) { @@ -837,22 +812,12 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size); } -static int smcd_supports_v2(void) -{ - return ism_v2_capable; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, - .add_vlan_id = smcd_add_vlan_id, - .del_vlan_id = smcd_del_vlan_id, - .set_vlan_required = smcd_set_vlan_required, - .reset_vlan_required = smcd_reset_vlan_required, .signal_event = smcd_signal_ieq, .move_data = smcd_move, - .supports_v2 = smcd_supports_v2, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 904f37505c27..166148fb8d76 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -133,6 +133,25 @@ struct dibs_dev_ops { * Return: 2 byte dibs fabric id */ u16 (*get_fabric_id)(struct dibs_dev *dev); + /** + * add_vlan_id() - add dibs device to vlan (optional, deprecated) + * @dev: dibs device + * @vlan_id: vlan id + * + * In order to write into a vlan-tagged dmb, the remote device needs + * to belong to the this vlan. A device can belong to more than 1 vlan. + * Any device can access an untagged dmb. + * Deprecated, only supported for backwards compatibility. + * Return: zero on success + */ + int (*add_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * del_vlan_id() - remove dibs device from vlan (optional, deprecated) + * @dev: dibs device + * @vlan_id: vlan id + * Return: zero on success + */ + int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); }; struct dibs_dev { diff --git a/include/net/smc.h b/include/net/smc.h index 9cb8385bbc6e..51b4aefc106a 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -61,13 +61,8 @@ struct smcd_ops { int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); - int (*supports_v2)(void); /* optional operations */ - int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*set_vlan_required)(struct smcd_dev *dev); - int (*reset_vlan_required)(struct smcd_dev *dev); int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, u32 trigger_irq, u32 event_code, u64 info); int (*support_dmb_nocopy)(struct smcd_dev *dev); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 6a6e7c9641e8..5118441bed18 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -140,7 +140,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->add_vlan_id) + if (!smcd->dibs->ops->add_vlan_id) return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ @@ -163,7 +163,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) /* no existing entry found. * add new entry to device; might fail, e.g., if HW limit reached */ - if (smcd->ops->add_vlan_id(smcd, vlanid)) { + if (smcd->dibs->ops->add_vlan_id(smcd->dibs, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; @@ -187,7 +187,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->del_vlan_id) + if (!smcd->dibs->ops->del_vlan_id) return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); @@ -205,7 +205,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) } /* Found and the last reference just gone */ - if (smcd->ops->del_vlan_id(smcd, vlanid)) + if (smcd->dibs->ops->del_vlan_id(smcd->dibs, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); @@ -539,8 +539,12 @@ static void smcd_register_dev(struct dibs_dev *dibs) if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); - if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) + if (smc_ism_is_loopback(dibs) || + (dibs->ops->add_vlan_id && + !dibs->ops->add_vlan_id(dibs, ISM_RESERVED_VLANID))) { smc_ism_set_v2_capable(); + } + mutex_lock(&smcd_dev_list.mutex); /* sort list: * - devices without pnetid before devices with pnetid; diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 454d9d6a6e8f..982a19430313 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -20,7 +20,6 @@ #include "smc_ism.h" #include "smc_loopback.h" -#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */ #define SMC_LO_SUPPORT_NOCOPY 0x1 #define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) @@ -242,10 +241,6 @@ static const struct smcd_ops lo_ops = { .support_dmb_nocopy = smc_lo_support_dmb_nocopy, .attach_dmb = smc_lo_attach_dmb, .detach_dmb = smc_lo_detach_dmb, - .add_vlan_id = NULL, - .del_vlan_id = NULL, - .set_vlan_required = NULL, - .reset_vlan_required = NULL, .signal_event = NULL, .move_data = smc_lo_move_data, }; -- cgit v1.2.3 From 719c3b67bb7ea95bb8158b03c75641c8fc8f94a0 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:58 +0200 Subject: dibs: Move query_remote_gid() to dibs_dev_ops Provide the dibs_dev_ops->query_remote_gid() in ism and dibs_loopback dibs_devices. And call it in smc dibs_client. Reviewed-by: Julian Ruess Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-13-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 10 ++++++++++ drivers/s390/net/ism_drv.c | 41 ++++++++++++++++++----------------------- include/linux/dibs.h | 14 ++++++++++++++ include/net/smc.h | 2 -- net/smc/smc_ism.c | 8 ++++++-- net/smc/smc_loopback.c | 13 ------------- 6 files changed, 48 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index d7e6fa5e90f3..6b53e626a6d1 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -24,8 +24,18 @@ static u16 dibs_lo_get_fabric_id(struct dibs_dev *dibs) return DIBS_LOOPBACK_FABRIC; } +static int dibs_lo_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, + u32 vid_valid, u32 vid) +{ + /* rgid should be the same as lgid */ + if (!uuid_equal(rgid, &dibs->gid)) + return -ENETUNREACH; + return 0; +} + static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, + .query_remote_gid = dibs_lo_query_rgid, }; static int dibs_lo_dev_probe(void) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index ed4c28ca355b..121b3a2be760 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -291,6 +291,23 @@ out: return ret; } +static int ism_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, + u32 vid_valid, u32 vid) +{ + struct ism_dev *ism = dibs->drv_priv; + union ism_query_rgid cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_QUERY_RGID; + cmd.request.hdr.len = sizeof(cmd.request); + + memcpy(&cmd.request.rgid, rgid, sizeof(cmd.request.rgid)); + cmd.request.vlan_valid = vid_valid; + cmd.request.vlan_id = vid; + + return ism_cmd(ism, &cmd); +} + static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); @@ -537,6 +554,7 @@ static irqreturn_t ism_handle_irq(int irq, void *data) static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, + .query_remote_gid = ism_query_rgid, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, }; @@ -748,28 +766,6 @@ module_exit(ism_exit); /*************************** SMC-D Implementation *****************************/ #if IS_ENABLED(CONFIG_SMC) -static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, - u32 vid) -{ - union ism_query_rgid cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_QUERY_RGID; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.vlan_valid = vid_valid; - cmd.request.vlan_id = vid; - - return ism_cmd(ism, &cmd); -} - -static int smcd_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 vid_valid, u32 vid) -{ - return ism_query_rgid(smcd->priv, rgid->gid, vid_valid, vid); -} - static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, void *client) { @@ -813,7 +809,6 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, } static const struct smcd_ops ism_smcd_ops = { - .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, .signal_event = smcd_signal_ieq, diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 166148fb8d76..c75a40fe3039 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -133,6 +133,20 @@ struct dibs_dev_ops { * Return: 2 byte dibs fabric id */ u16 (*get_fabric_id)(struct dibs_dev *dev); + /** + * query_remote_gid() + * @dev: local dibs device + * @rgid: gid of remote dibs device + * @vid_valid: if zero, vid will be ignored; + * deprecated, ignored if device does not support vlan + * @vid: VLAN id; deprecated, ignored if device does not support vlan + * + * Query whether a remote dibs device is reachable via this local device + * and this vlan id. + * Return: 0 if remote gid is reachable. + */ + int (*query_remote_gid)(struct dibs_dev *dev, const uuid_t *rgid, + u32 vid_valid, u32 vid); /** * add_vlan_id() - add dibs device to vlan (optional, deprecated) * @dev: dibs device diff --git a/include/net/smc.h b/include/net/smc.h index 51b4aefc106a..5bd135fb4d49 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -53,8 +53,6 @@ struct smcd_gid { }; struct smcd_ops { - int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 vid_valid, u32 vid); int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, void *client); int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5118441bed18..d20d00b46825 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -77,8 +77,12 @@ static void smc_ism_create_system_eid(void) int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { - return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, - vlan_id); + struct dibs_dev *dibs = smcd->dibs; + uuid_t ism_rgid; + + copy_to_dibsgid(&ism_rgid, peer_gid); + return dibs->ops->query_remote_gid(dibs, &ism_rgid, vlan_id ? 1 : 0, + vlan_id); } void smc_ism_get_system_eid(u8 **eid) diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 982a19430313..52cba01cb209 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -25,18 +25,6 @@ static struct smc_lo_dev *lo_dev; -static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 vid_valid, u32 vid) -{ - uuid_t temp; - - copy_to_dibsgid(&temp, rgid); - /* rgid should be the same as lgid */ - if (!uuid_equal(&temp, &smcd->dibs->gid)) - return -ENETUNREACH; - return 0; -} - static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, void *client_priv) { @@ -235,7 +223,6 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, } static const struct smcd_ops lo_ops = { - .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, .unregister_dmb = smc_lo_unregister_dmb, .support_dmb_nocopy = smc_lo_support_dmb_nocopy, -- cgit v1.2.3 From cc21191b584c6f7836b0f10774f8278b7cbfba10 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:59 +0200 Subject: dibs: Move data path to dibs layer Use struct dibs_dmb instead of struct smc_dmb and move the corresponding client tables to dibs_dev. Leave driver specific implementation details like sba in the device drivers. Register and unregister dmbs via dibs_dev_ops. A dmb is dedicated to a single client, but a dibs device can have dmbs for more than one client. Trigger dibs clients via dibs_client_ops->handle_irq(), when data is received into a dmb. For dibs_loopback replace scheduling an smcd receive tasklet with calling dibs_client_ops->handle_irq(). For loopback devices attach_dmb(), detach_dmb() and move_data() need to access the dmb tables, so move those to dibs_dev_ops in this patch as well. Remove remaining definitions of smc_loopback as they are no longer required, now that everything is in dibs_loopback. Note that struct ism_client and struct ism_dev are still required in smc until a follow-on patch moves event handling to dibs. (Loopback does not use events). Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-14-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 257 +++++++++++++++++++++++++++++++++++++ drivers/dibs/dibs_loopback.h | 19 +++ drivers/dibs/dibs_main.c | 56 ++++++++- drivers/s390/net/ism_drv.c | 121 ++++++++---------- include/linux/dibs.h | 177 ++++++++++++++++++++++++++ include/linux/ism.h | 23 ---- include/net/smc.h | 22 ---- net/smc/Makefile | 1 - net/smc/smc_ism.c | 70 +++++------ net/smc/smc_ism.h | 4 +- net/smc/smc_loopback.c | 294 ------------------------------------------- net/smc/smc_loopback.h | 47 ------- 12 files changed, 591 insertions(+), 500 deletions(-) delete mode 100644 net/smc/smc_loopback.c delete mode 100644 net/smc/smc_loopback.h (limited to 'include') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 6b53e626a6d1..b3fd0f8100d4 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -9,12 +9,18 @@ * */ +#include +#include #include #include +#include #include #include "dibs_loopback.h" +#define DIBS_LO_SUPPORT_NOCOPY 0x1 +#define DIBS_DMA_ADDR_INVALID (~(dma_addr_t)0) + static const char dibs_lo_dev_name[] = "lo"; /* global loopback device */ static struct dibs_lo_dev *lo_dev; @@ -33,11 +39,259 @@ static int dibs_lo_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, return 0; } +static int dibs_lo_max_dmbs(void) +{ + return DIBS_LO_MAX_DMBS; +} + +static int dibs_lo_register_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb, + struct dibs_client *client) +{ + struct dibs_lo_dmb_node *dmb_node, *tmp_node; + struct dibs_lo_dev *ldev; + unsigned long flags; + int sba_idx, rc; + + ldev = dibs->drv_priv; + sba_idx = dmb->idx; + /* check space for new dmb */ + for_each_clear_bit(sba_idx, ldev->sba_idx_mask, DIBS_LO_MAX_DMBS) { + if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) + break; + } + if (sba_idx == DIBS_LO_MAX_DMBS) + return -ENOSPC; + + dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); + if (!dmb_node) { + rc = -ENOMEM; + goto err_bit; + } + + dmb_node->sba_idx = sba_idx; + dmb_node->len = dmb->dmb_len; + dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!dmb_node->cpu_addr) { + rc = -ENOMEM; + goto err_node; + } + dmb_node->dma_addr = DIBS_DMA_ADDR_INVALID; + refcount_set(&dmb_node->refcnt, 1); + +again: + /* add new dmb into hash table */ + get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); + write_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { + if (tmp_node->token == dmb_node->token) { + write_unlock_bh(&ldev->dmb_ht_lock); + goto again; + } + } + hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); + write_unlock_bh(&ldev->dmb_ht_lock); + atomic_inc(&ldev->dmb_cnt); + + dmb->idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[sba_idx] = client->id; + spin_unlock_irqrestore(&dibs->lock, flags); + + return 0; + +err_node: + kfree(dmb_node); +err_bit: + clear_bit(sba_idx, ldev->sba_idx_mask); + return rc; +} + +static void __dibs_lo_unregister_dmb(struct dibs_lo_dev *ldev, + struct dibs_lo_dmb_node *dmb_node) +{ + /* remove dmb from hash table */ + write_lock_bh(&ldev->dmb_ht_lock); + hash_del(&dmb_node->list); + write_unlock_bh(&ldev->dmb_ht_lock); + + clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); + kfree(dmb_node->cpu_addr); + kfree(dmb_node); + + if (atomic_dec_and_test(&ldev->dmb_cnt)) + wake_up(&ldev->ldev_release); +} + +static int dibs_lo_unregister_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + unsigned long flags; + + ldev = dibs->drv_priv; + + /* find dmb from hash table */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + read_unlock_bh(&ldev->dmb_ht_lock); + if (!dmb_node) + return -EINVAL; + + if (refcount_dec_and_test(&dmb_node->refcnt)) { + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb_node->sba_idx] = NO_DIBS_CLIENT; + spin_unlock_irqrestore(&dibs->lock, flags); + + __dibs_lo_unregister_dmb(ldev, dmb_node); + } + return 0; +} + +static int dibs_lo_support_dmb_nocopy(struct dibs_dev *dibs) +{ + return DIBS_LO_SUPPORT_NOCOPY; +} + +static int dibs_lo_attach_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + + ldev = dibs->drv_priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!refcount_inc_not_zero(&dmb_node->refcnt)) + /* the dmb is being unregistered, but has + * not been removed from the hash table. + */ + return -EINVAL; + + /* provide dmb information */ + dmb->idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + return 0; +} + +static int dibs_lo_detach_dmb(struct dibs_dev *dibs, u64 token) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + + ldev = dibs->drv_priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { + if (tmp_node->token == token) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (refcount_dec_and_test(&dmb_node->refcnt)) + __dibs_lo_unregister_dmb(ldev, dmb_node); + return 0; +} + +static int dibs_lo_move_data(struct dibs_dev *dibs, u64 dmb_tok, + unsigned int idx, bool sf, unsigned int offset, + void *data, unsigned int size) +{ + struct dibs_lo_dmb_node *rmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + u16 s_mask; + u8 client_id; + u32 sba_idx; + + ldev = dibs->drv_priv; + + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { + if (tmp_node->token == dmb_tok) { + rmb_node = tmp_node; + break; + } + } + if (!rmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + memcpy((char *)rmb_node->cpu_addr + offset, data, size); + sba_idx = rmb_node->sba_idx; + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!sf) + return 0; + + spin_lock(&dibs->lock); + client_id = dibs->dmb_clientid_arr[sba_idx]; + s_mask = ror16(0x1000, idx); + if (likely(client_id != NO_DIBS_CLIENT && dibs->subs[client_id])) + dibs->subs[client_id]->ops->handle_irq(dibs, sba_idx, s_mask); + spin_unlock(&dibs->lock); + + return 0; +} + static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, .query_remote_gid = dibs_lo_query_rgid, + .max_dmbs = dibs_lo_max_dmbs, + .register_dmb = dibs_lo_register_dmb, + .unregister_dmb = dibs_lo_unregister_dmb, + .move_data = dibs_lo_move_data, + .support_mmapped_rdmb = dibs_lo_support_dmb_nocopy, + .attach_dmb = dibs_lo_attach_dmb, + .detach_dmb = dibs_lo_detach_dmb, }; +static void dibs_lo_dev_init(struct dibs_lo_dev *ldev) +{ + rwlock_init(&ldev->dmb_ht_lock); + hash_init(ldev->dmb_ht); + atomic_set(&ldev->dmb_cnt, 0); + init_waitqueue_head(&ldev->ldev_release); +} + +static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) +{ + if (atomic_read(&ldev->dmb_cnt)) + wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); +} + static int dibs_lo_dev_probe(void) { struct dibs_lo_dev *ldev; @@ -56,6 +310,7 @@ static int dibs_lo_dev_probe(void) ldev->dibs = dibs; dibs->drv_priv = ldev; + dibs_lo_dev_init(ldev); uuid_gen(&dibs->gid); dibs->ops = &dibs_lo_ops; @@ -69,6 +324,7 @@ static int dibs_lo_dev_probe(void) return 0; err_reg: + kfree(dibs->dmb_clientid_arr); /* pairs with dibs_dev_alloc() */ put_device(&dibs->dev); kfree(ldev); @@ -82,6 +338,7 @@ static void dibs_lo_dev_remove(void) return; dibs_dev_del(lo_dev->dibs); + dibs_lo_dev_exit(lo_dev); /* pairs with dibs_dev_alloc() */ put_device(&lo_dev->dibs->dev); kfree(lo_dev); diff --git a/drivers/dibs/dibs_loopback.h b/drivers/dibs/dibs_loopback.h index fd03b6333a24..0664f6a8e662 100644 --- a/drivers/dibs/dibs_loopback.h +++ b/drivers/dibs/dibs_loopback.h @@ -13,13 +13,32 @@ #define _DIBS_LOOPBACK_H #include +#include +#include #include #include #if IS_ENABLED(CONFIG_DIBS_LO) +#define DIBS_LO_DMBS_HASH_BITS 12 +#define DIBS_LO_MAX_DMBS 5000 + +struct dibs_lo_dmb_node { + struct hlist_node list; + u64 token; + u32 len; + u32 sba_idx; + void *cpu_addr; + dma_addr_t dma_addr; + refcount_t refcnt; +}; struct dibs_lo_dev { struct dibs_dev *dibs; + atomic_t dmb_cnt; + rwlock_t dmb_ht_lock; + DECLARE_BITMAP(sba_idx_mask, DIBS_LO_MAX_DMBS); + DECLARE_HASHTABLE(dmb_ht, DIBS_LO_DMBS_HASH_BITS); + wait_queue_head_t ldev_release; }; int dibs_loopback_init(void); diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index f20ed0594a51..aacb3ea7825a 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -36,6 +36,16 @@ static struct dibs_dev_list dibs_dev_list = { .mutex = __MUTEX_INITIALIZER(dibs_dev_list.mutex), }; +static void dibs_setup_forwarding(struct dibs_client *client, + struct dibs_dev *dibs) +{ + unsigned long flags; + + spin_lock_irqsave(&dibs->lock, flags); + dibs->subs[client->id] = client; + spin_unlock_irqrestore(&dibs->lock, flags); +} + int dibs_register_client(struct dibs_client *client) { struct dibs_dev *dibs; @@ -60,6 +70,7 @@ int dibs_register_client(struct dibs_client *client) list_for_each_entry(dibs, &dibs_dev_list.list, list) { dibs->priv[i] = NULL; client->ops->add_dev(dibs); + dibs_setup_forwarding(client, dibs); } } mutex_unlock(&dibs_dev_list.mutex); @@ -71,10 +82,25 @@ EXPORT_SYMBOL_GPL(dibs_register_client); int dibs_unregister_client(struct dibs_client *client) { struct dibs_dev *dibs; + unsigned long flags; + int max_dmbs; int rc = 0; mutex_lock(&dibs_dev_list.mutex); list_for_each_entry(dibs, &dibs_dev_list.list, list) { + spin_lock_irqsave(&dibs->lock, flags); + max_dmbs = dibs->ops->max_dmbs(); + for (int i = 0; i < max_dmbs; ++i) { + if (dibs->dmb_clientid_arr[i] == client->id) { + WARN(1, "%s: attempt to unregister '%s' with registered dmb(s)\n", + __func__, client->name); + rc = -EBUSY; + goto err_reg_dmb; + } + } + /* Stop forwarding IRQs */ + dibs->subs[client->id] = NULL; + spin_unlock_irqrestore(&dibs->lock, flags); clients[client->id]->ops->del_dev(dibs); dibs->priv[client->id] = NULL; } @@ -87,6 +113,11 @@ int dibs_unregister_client(struct dibs_client *client) mutex_unlock(&dibs_dev_list.mutex); return rc; + +err_reg_dmb: + spin_unlock_irqrestore(&dibs->lock, flags); + mutex_unlock(&dibs_dev_list.mutex); + return rc; } EXPORT_SYMBOL_GPL(dibs_unregister_client); @@ -150,11 +181,19 @@ static const struct attribute_group dibs_dev_attr_group = { int dibs_dev_add(struct dibs_dev *dibs) { + int max_dmbs; int i, ret; + max_dmbs = dibs->ops->max_dmbs(); + spin_lock_init(&dibs->lock); + dibs->dmb_clientid_arr = kzalloc(max_dmbs, GFP_KERNEL); + if (!dibs->dmb_clientid_arr) + return -ENOMEM; + memset(dibs->dmb_clientid_arr, NO_DIBS_CLIENT, max_dmbs); + ret = device_add(&dibs->dev); if (ret) - return ret; + goto free_client_arr; ret = sysfs_create_group(&dibs->dev.kobj, &dibs_dev_attr_group); if (ret) { @@ -164,8 +203,10 @@ int dibs_dev_add(struct dibs_dev *dibs) mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { - if (clients[i]) + if (clients[i]) { clients[i]->ops->add_dev(dibs); + dibs_setup_forwarding(clients[i], dibs); + } } mutex_unlock(&clients_lock); list_add(&dibs->list, &dibs_dev_list.list); @@ -175,6 +216,8 @@ int dibs_dev_add(struct dibs_dev *dibs) err_device_del: device_del(&dibs->dev); +free_client_arr: + kfree(dibs->dmb_clientid_arr); return ret; } @@ -182,8 +225,16 @@ EXPORT_SYMBOL_GPL(dibs_dev_add); void dibs_dev_del(struct dibs_dev *dibs) { + unsigned long flags; int i; + sysfs_remove_group(&dibs->dev.kobj, &dibs_dev_attr_group); + + spin_lock_irqsave(&dibs->lock, flags); + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) + dibs->subs[i] = NULL; + spin_unlock_irqrestore(&dibs->lock, flags); + mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { @@ -195,6 +246,7 @@ void dibs_dev_del(struct dibs_dev *dibs) mutex_unlock(&dibs_dev_list.mutex); device_del(&dibs->dev); + kfree(dibs->dmb_clientid_arr); } EXPORT_SYMBOL_GPL(dibs_dev_del); diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 121b3a2be760..346d1ea8650b 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -98,14 +98,6 @@ int ism_unregister_client(struct ism_client *client) spin_lock_irqsave(&ism->lock, flags); /* Stop forwarding IRQs and events */ ism->subs[client->id] = NULL; - for (int i = 0; i < ISM_NR_DMBS; ++i) { - if (ism->sba_client_arr[i] == client->id) { - WARN(1, "%s: attempt to unregister '%s' with registered dmb(s)\n", - __func__, client->name); - rc = -EBUSY; - goto err_reg_dmb; - } - } spin_unlock_irqrestore(&ism->lock, flags); } mutex_unlock(&ism_dev_list.mutex); @@ -116,11 +108,6 @@ int ism_unregister_client(struct ism_client *client) max_client--; mutex_unlock(&clients_lock); return rc; - -err_reg_dmb: - spin_unlock_irqrestore(&ism->lock, flags); - mutex_unlock(&ism_dev_list.mutex); - return rc; } EXPORT_SYMBOL_GPL(ism_unregister_client); @@ -308,15 +295,20 @@ static int ism_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, return ism_cmd(ism, &cmd); } -static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_max_dmbs(void) +{ + return ISM_NR_DMBS; +} + +static void ism_free_dmb(struct ism_dev *ism, struct dibs_dmb *dmb) { - clear_bit(dmb->sba_idx, ism->sba_bitmap); + clear_bit(dmb->idx, ism->sba_bitmap); dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, DMA_FROM_DEVICE); folio_put(virt_to_folio(dmb->cpu_addr)); } -static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_alloc_dmb(struct ism_dev *ism, struct dibs_dmb *dmb) { struct folio *folio; unsigned long bit; @@ -325,16 +317,16 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; - if (!dmb->sba_idx) { + if (!dmb->idx) { bit = find_next_zero_bit(ism->sba_bitmap, ISM_NR_DMBS, ISM_DMB_BIT_OFFSET); if (bit == ISM_NR_DMBS) return -ENOSPC; - dmb->sba_idx = bit; + dmb->idx = bit; } - if (dmb->sba_idx < ISM_DMB_BIT_OFFSET || - test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) + if (dmb->idx < ISM_DMB_BIT_OFFSET || + test_and_set_bit(dmb->idx, ism->sba_bitmap)) return -EINVAL; folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | @@ -359,13 +351,14 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) out_free: kfree(dmb->cpu_addr); out_bit: - clear_bit(dmb->sba_idx, ism->sba_bitmap); + clear_bit(dmb->idx, ism->sba_bitmap); return rc; } -int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, - struct ism_client *client) +static int ism_register_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb, + struct dibs_client *client) { + struct ism_dev *ism = dibs->drv_priv; union ism_reg_dmb cmd; unsigned long flags; int ret; @@ -380,10 +373,10 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, cmd.request.dmb = dmb->dma_addr; cmd.request.dmb_len = dmb->dmb_len; - cmd.request.sba_idx = dmb->sba_idx; + cmd.request.sba_idx = dmb->idx; cmd.request.vlan_valid = dmb->vlan_valid; cmd.request.vlan_id = dmb->vlan_id; - cmd.request.rgid = dmb->rgid; + memcpy(&cmd.request.rgid, &dmb->rgid, sizeof(u64)); ret = ism_cmd(ism, &cmd); if (ret) { @@ -391,16 +384,16 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, goto out; } dmb->dmb_tok = cmd.response.dmb_tok; - spin_lock_irqsave(&ism->lock, flags); - ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = client->id; - spin_unlock_irqrestore(&ism->lock, flags); + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb->idx - ISM_DMB_BIT_OFFSET] = client->id; + spin_unlock_irqrestore(&dibs->lock, flags); out: return ret; } -EXPORT_SYMBOL_GPL(ism_register_dmb); -int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_unregister_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) { + struct ism_dev *ism = dibs->drv_priv; union ism_unreg_dmb cmd; unsigned long flags; int ret; @@ -411,9 +404,9 @@ int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) cmd.request.dmb_tok = dmb->dmb_tok; - spin_lock_irqsave(&ism->lock, flags); - ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = NO_CLIENT; - spin_unlock_irqrestore(&ism->lock, flags); + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb->idx - ISM_DMB_BIT_OFFSET] = NO_DIBS_CLIENT; + spin_unlock_irqrestore(&dibs->lock, flags); ret = ism_cmd(ism, &cmd); if (ret && ret != ISM_ERROR) @@ -423,7 +416,6 @@ int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) out: return ret; } -EXPORT_SYMBOL_GPL(ism_unregister_dmb); static int ism_add_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { @@ -459,9 +451,11 @@ static unsigned int max_bytes(unsigned int start, unsigned int len, return min(boundary - (start & (boundary - 1)), len); } -int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, - unsigned int offset, void *data, unsigned int size) +static int ism_move(struct dibs_dev *dibs, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size) { + struct ism_dev *ism = dibs->drv_priv; unsigned int bytes; u64 dmb_req; int ret; @@ -482,7 +476,6 @@ int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, return 0; } -EXPORT_SYMBOL_GPL(ism_move); static u16 ism_get_chid(struct dibs_dev *dibs) { @@ -518,14 +511,17 @@ static irqreturn_t ism_handle_irq(int irq, void *data) { struct ism_dev *ism = data; unsigned long bit, end; + struct dibs_dev *dibs; unsigned long *bv; u16 dmbemask; u8 client_id; + dibs = ism->dibs; + bv = (void *) &ism->sba->dmb_bits[ISM_DMB_WORD_OFFSET]; end = sizeof(ism->sba->dmb_bits) * BITS_PER_BYTE - ISM_DMB_BIT_OFFSET; - spin_lock(&ism->lock); + spin_lock(&dibs->lock); ism->sba->s = 0; barrier(); for (bit = 0;;) { @@ -537,10 +533,13 @@ static irqreturn_t ism_handle_irq(int irq, void *data) dmbemask = ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET]; ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET] = 0; barrier(); - client_id = ism->sba_client_arr[bit]; - if (unlikely(client_id == NO_CLIENT || !ism->subs[client_id])) + client_id = dibs->dmb_clientid_arr[bit]; + if (unlikely(client_id == NO_DIBS_CLIENT || + !dibs->subs[client_id])) continue; - ism->subs[client_id]->handle_irq(ism, bit + ISM_DMB_BIT_OFFSET, dmbemask); + dibs->subs[client_id]->ops->handle_irq(dibs, + bit + ISM_DMB_BIT_OFFSET, + dmbemask); } if (ism->sba->e) { @@ -548,13 +547,17 @@ static irqreturn_t ism_handle_irq(int irq, void *data) barrier(); ism_handle_event(ism); } - spin_unlock(&ism->lock); + spin_unlock(&dibs->lock); return IRQ_HANDLED; } static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, .query_remote_gid = ism_query_rgid, + .max_dmbs = ism_max_dmbs, + .register_dmb = ism_register_dmb, + .unregister_dmb = ism_unregister_dmb, + .move_data = ism_move, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, }; @@ -568,15 +571,10 @@ static int ism_dev_init(struct ism_dev *ism) if (ret <= 0) goto out; - ism->sba_client_arr = kzalloc(ISM_NR_DMBS, GFP_KERNEL); - if (!ism->sba_client_arr) - goto free_vectors; - memset(ism->sba_client_arr, NO_CLIENT, ISM_NR_DMBS); - ret = request_irq(pci_irq_vector(pdev, 0), ism_handle_irq, 0, pci_name(pdev), ism); if (ret) - goto free_client_arr; + goto free_vectors; ret = register_sba(ism); if (ret) @@ -605,8 +603,6 @@ unreg_sba: unregister_sba(ism); free_irq: free_irq(pci_irq_vector(pdev, 0), ism); -free_client_arr: - kfree(ism->sba_client_arr); free_vectors: pci_free_irq_vectors(pdev); out: @@ -629,7 +625,6 @@ static void ism_dev_exit(struct ism_dev *ism) unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); - kfree(ism->sba_client_arr); pci_free_irq_vectors(pdev); list_del_init(&ism->list); mutex_unlock(&ism_dev_list.mutex); @@ -677,6 +672,9 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) dibs->drv_priv = ism; dibs->ops = &ism_ops; + /* enable ism device, but any interrupts and events will be ignored + * before dibs_dev_add() adds it to any clients. + */ ret = ism_dev_init(ism); if (ret) goto err_dibs; @@ -766,17 +764,6 @@ module_exit(ism_exit); /*************************** SMC-D Implementation *****************************/ #if IS_ENABLED(CONFIG_SMC) -static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, - void *client) -{ - return ism_register_dmb(smcd->priv, (struct ism_dmb *)dmb, client); -} - -static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); -} - static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, u32 event_code, u64 info) { @@ -801,18 +788,8 @@ static int smcd_signal_ieq(struct smcd_dev *smcd, struct smcd_gid *rgid, trigger_irq, event_code, info); } -static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size) -{ - return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size); -} - static const struct smcd_ops ism_smcd_ops = { - .register_dmb = smcd_register_dmb, - .unregister_dmb = smcd_unregister_dmb, .signal_event = smcd_signal_ieq, - .move_data = smcd_move, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index c75a40fe3039..be009c614205 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -36,12 +36,44 @@ * clients. */ +/* DMB - Direct Memory Buffer + * -------------------------- + * A dibs client provides a dmb as input buffer for a local receiving + * dibs device for exactly one (remote) sending dibs device. Only this + * sending device can send data into this dmb using move_data(). Sender + * and receiver can be the same device. A dmb belongs to exactly one client. + */ +struct dibs_dmb { + /* tok - Token for this dmb + * Used by remote and local devices and clients to address this dmb. + * Provided by dibs fabric. Unique per dibs fabric. + */ + u64 dmb_tok; + /* rgid - GID of designated remote sending device */ + uuid_t rgid; + /* cpu_addr - buffer address */ + void *cpu_addr; + /* len - buffer length */ + u32 dmb_len; + /* idx - Index of this DMB on this receiving device */ + u32 idx; + /* VLAN support (deprecated) + * In order to write into a vlan-tagged dmb, the remote device needs + * to belong to the this vlan + */ + u32 vlan_valid; + u32 vlan_id; + /* optional, used by device driver */ + dma_addr_t dma_addr; +}; + struct dibs_dev; /* DIBS client * ----------- */ #define MAX_DIBS_CLIENTS 8 +#define NO_DIBS_CLIENT 0xff /* All dibs clients have access to all dibs devices. * A dibs client provides the following functions to be called by dibs layer or * dibs device drivers: @@ -69,6 +101,22 @@ struct dibs_client_ops { * The device is no longer usable by this client after this call. */ void (*del_dev)(struct dibs_dev *dev); + /** + * handle_irq() - Handle signaling for a DMB + * @dev: device that owns the dmb + * @idx: Index of the dmb that got signalled + * @dmbemask: signaling mask of the dmb + * + * Handle signaling for a dmb that was registered by this client + * for this device. + * The dibs device can coalesce multiple signaling triggers into a + * single call of handle_irq(). dmbemask can be used to indicate + * different kinds of triggers. + * + * Context: Called in IRQ context by dibs device driver + */ + void (*handle_irq)(struct dibs_dev *dev, unsigned int idx, + u16 dmbemask); }; struct dibs_client { @@ -147,6 +195,77 @@ struct dibs_dev_ops { */ int (*query_remote_gid)(struct dibs_dev *dev, const uuid_t *rgid, u32 vid_valid, u32 vid); + /** + * max_dmbs() + * Return: Max number of DMBs that can be registered for this kind of + * dibs_dev + */ + int (*max_dmbs)(void); + /** + * register_dmb() - allocate and register a dmb + * @dev: dibs device + * @dmb: dmb struct to be registered + * @client: dibs client + * @vid: VLAN id; deprecated, ignored if device does not support vlan + * + * The following fields of dmb must provide valid input: + * @rgid: gid of remote user device + * @dmb_len: buffer length + * @idx: Optionally:requested idx (if non-zero) + * @vlan_valid: if zero, vlan_id will be ignored; + * deprecated, ignored if device does not support vlan + * @vlan_id: deprecated, ignored if device does not support vlan + * Upon return in addition the following fields will be valid: + * @dmb_tok: for usage by remote and local devices and clients + * @cpu_addr: allocated buffer + * @idx: dmb index, unique per dibs device + * @dma_addr: to be used by device driver,if applicable + * + * Allocate a dmb buffer and register it with this device and for this + * client. + * Return: zero on success + */ + int (*register_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb, + struct dibs_client *client); + /** + * unregister_dmb() - unregister and free a dmb + * @dev: dibs device + * @dmb: dmb struct to be unregistered + * The following fields of dmb must provide valid input: + * @dmb_tok + * @cpu_addr + * @idx + * + * Free dmb.cpu_addr and unregister the dmb from this device. + * Return: zero on success + */ + int (*unregister_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb); + /** + * move_data() - write into a remote dmb + * @dev: Local sending dibs device + * @dmb_tok: Token of the remote dmb + * @idx: signaling index in dmbemask + * @sf: signaling flag; + * if true, idx will be turned on at target dmbemask mask + * and target device will be signaled. + * @offset: offset within target dmb + * @data: pointer to data to be sent + * @size: length of data to be sent, can be zero. + * + * Use dev to write data of size at offset into a remote dmb + * identified by dmb_tok. Data is moved synchronously, *data can + * be freed when this function returns. + * + * If signaling flag (sf) is true, bit number idx bit will be turned + * on in the dmbemask mask when handle_irq() is called at the remote + * dibs client that owns the target dmb. The target device may chose + * to coalesce the signaling triggers of multiple move_data() calls + * to the same target dmb into a single handle_irq() call. + * Return: zero on success + */ + int (*move_data)(struct dibs_dev *dev, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size); /** * add_vlan_id() - add dibs device to vlan (optional, deprecated) * @dev: dibs device @@ -166,6 +285,55 @@ struct dibs_dev_ops { * Return: zero on success */ int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * support_mmapped_rdmb() - can this device provide memory mapped + * remote dmbs? (optional) + * @dev: dibs device + * + * A dibs device can provide a kernel address + length, that represent + * a remote target dmb (like MMIO). Alternatively to calling + * move_data(), a dibs client can write into such a ghost-send-buffer + * (= to this kernel address) and the data will automatically + * immediately appear in the target dmb, even without calling + * move_data(). + * + * Either all 3 function pointers for support_dmb_nocopy(), + * attach_dmb() and detach_dmb() are defined, or all of them must + * be NULL. + * + * Return: non-zero, if memory mapped remote dmbs are supported. + */ + int (*support_mmapped_rdmb)(struct dibs_dev *dev); + /** + * attach_dmb() - attach local memory to a remote dmb + * @dev: Local sending ism device + * @dmb: all other parameters are passed in the form of a + * dmb struct + * TODO: (THIS IS CONFUSING, should be changed) + * dmb_tok: (in) Token of the remote dmb, we want to attach to + * cpu_addr: (out) MMIO address + * dma_addr: (out) MMIO address (if applicable, invalid otherwise) + * dmb_len: (out) length of local MMIO region, + * equal to length of remote DMB. + * sba_idx: (out) index of remote dmb (NOT HELPFUL, should be removed) + * + * Provides a memory address to the sender that can be used to + * directly write into the remote dmb. + * Memory is available until detach_dmb is called + * + * Return: Zero upon success, Error code otherwise + */ + int (*attach_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb); + /** + * detach_dmb() - Detach the ghost buffer from a remote dmb + * @dev: ism device + * @token: dmb token of the remote dmb + * + * No need to free cpu_addr. + * + * Return: Zero upon success, Error code otherwise + */ + int (*detach_dmb)(struct dibs_dev *dev, u64 token); }; struct dibs_dev { @@ -179,6 +347,15 @@ struct dibs_dev { /* priv pointer per client; for client usage only */ void *priv[MAX_DIBS_CLIENTS]; + + /* get this lock before accessing any of the fields below */ + spinlock_t lock; + /* array of client ids indexed by dmb idx; + * can be used as indices into priv and subs arrays + */ + u8 *dmb_clientid_arr; + /* Sparse array of all ISM clients */ + struct dibs_client *subs[MAX_DIBS_CLIENTS]; }; static inline void dibs_set_priv(struct dibs_dev *dev, diff --git a/include/linux/ism.h b/include/linux/ism.h index a926dd61b5a1..b7feb4dcd5a8 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -11,17 +11,6 @@ #include -struct ism_dmb { - u64 dmb_tok; - u64 rgid; - u32 dmb_len; - u32 sba_idx; - u32 vlan_valid; - u32 vlan_id; - void *cpu_addr; - dma_addr_t dma_addr; -}; - /* Unless we gain unexpected popularity, this limit should hold for a while */ #define MAX_CLIENTS 8 #define ISM_NR_DMBS 1920 @@ -36,7 +25,6 @@ struct ism_dev { struct ism_sba *sba; dma_addr_t sba_dma_addr; DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); - u8 *sba_client_arr; /* entries are indices into 'clients' array */ void *priv[MAX_CLIENTS]; struct ism_eq *ieq; @@ -58,11 +46,6 @@ struct ism_event { struct ism_client { const char *name; void (*handle_event)(struct ism_dev *dev, struct ism_event *event); - /* Parameter dmbemask contains a bit vector with updated DMBEs, if sent - * via ism_move_data(). Callback function must handle all active bits - * indicated by dmbemask. - */ - void (*handle_irq)(struct ism_dev *dev, unsigned int bit, u16 dmbemask); /* Private area - don't touch! */ u8 id; }; @@ -79,12 +62,6 @@ static inline void ism_set_priv(struct ism_dev *dev, struct ism_client *client, dev->priv[client->id] = priv; } -int ism_register_dmb(struct ism_dev *dev, struct ism_dmb *dmb, - struct ism_client *client); -int ism_unregister_dmb(struct ism_dev *dev, struct ism_dmb *dmb); -int ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, - unsigned int offset, void *data, unsigned int size); - const struct smcd_ops *ism_get_smcd_ops(void); #endif /* _ISM_H */ diff --git a/include/net/smc.h b/include/net/smc.h index 5bd135fb4d49..8e3debcf7db5 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -28,17 +28,6 @@ struct smc_hashinfo { }; /* SMCD/ISM device driver interface */ -struct smcd_dmb { - u64 dmb_tok; - u64 rgid; - u32 dmb_len; - u32 sba_idx; - u32 vlan_valid; - u32 vlan_id; - void *cpu_addr; - dma_addr_t dma_addr; -}; - #define ISM_EVENT_DMB 0 #define ISM_EVENT_GID 1 #define ISM_EVENT_SWR 2 @@ -53,25 +42,14 @@ struct smcd_gid { }; struct smcd_ops { - int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, - void *client); - int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size); - /* optional operations */ int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, u32 trigger_irq, u32 event_code, u64 info); - int (*support_dmb_nocopy)(struct smcd_dev *dev); - int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*detach_dmb)(struct smcd_dev *dev, u64 token); }; struct smcd_dev { const struct smcd_ops *ops; void *priv; - void *client; struct dibs_dev *dibs; struct list_head list; spinlock_t lock; diff --git a/net/smc/Makefile b/net/smc/Makefile index 96ccfdf246df..0e754cbc38f9 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,4 +6,3 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o -smc-y += smc_loopback.o diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index d20d00b46825..01e49371d23d 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -15,7 +15,6 @@ #include "smc.h" #include "smc_core.h" #include "smc_ism.h" -#include "smc_loopback.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" @@ -33,18 +32,19 @@ static void smcd_register_dev(struct dibs_dev *dibs); static void smcd_unregister_dev(struct dibs_dev *dibs); #if IS_ENABLED(CONFIG_ISM) static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, - u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", .handle_event = smcd_handle_event, - .handle_irq = smcd_handle_irq, }; #endif +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, + u16 dmbemask); + static struct dibs_client_ops smc_client_ops = { .add_dev = smcd_register_dev, .del_dev = smcd_unregister_dev, + .handle_irq = smcd_handle_irq, }; static struct dibs_client smc_dibs_client = { @@ -221,18 +221,19 @@ out: void smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dmb dmb; if (!dmb_desc->dma_addr) return; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - smcd->ops->unregister_dmb(smcd, &dmb); + + smcd->dibs->ops->unregister_dmb(smcd->dibs, &dmb); return; } @@ -240,17 +241,20 @@ void smc_ism_unregister_dmb(struct smcd_dev *smcd, int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dev *dibs; + struct dibs_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; - dmb.rgid = lgr->peer_gid.gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); + copy_to_dibsgid(&dmb.rgid, &lgr->peer_gid); + + dibs = lgr->smcd->dibs; + rc = dibs->ops->register_dmb(dibs, &dmb, &smc_dibs_client); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; @@ -265,24 +269,24 @@ bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) * merging sndbuf with peer DMB to avoid * data copies between them. */ - return (smcd->ops->support_dmb_nocopy && - smcd->ops->support_dmb_nocopy(smcd)); + return (smcd->dibs->ops->support_mmapped_rdmb && + smcd->dibs->ops->support_mmapped_rdmb(smcd->dibs)); } int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dmb dmb; int rc = 0; - if (!dev->ops->attach_dmb) + if (!dev->dibs->ops->attach_dmb) return -EINVAL; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = token; - rc = dev->ops->attach_dmb(dev, &dmb); + rc = dev->dibs->ops->attach_dmb(dev->dibs, &dmb); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; @@ -294,10 +298,10 @@ int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) { - if (!dev->ops->detach_dmb) + if (!dev->dibs->ops->detach_dmb) return -EINVAL; - return dev->ops->detach_dmb(dev, token); + return dev->dibs->ops->detach_dmb(dev->dibs, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -503,26 +507,20 @@ static void smcd_register_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd, *fentry; const struct smcd_ops *ops; - struct smc_lo_dev *smc_lo; struct ism_dev *ism; + int max_dmbs; - if (smc_ism_is_loopback(dibs)) { - if (smc_loopback_init(&smc_lo)) - return; - } + max_dmbs = dibs->ops->max_dmbs(); if (smc_ism_is_loopback(dibs)) { - ops = smc_lo_get_smcd_ops(); - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, - SMC_LO_MAX_DMBS); + ops = NULL; } else { ism = dibs->drv_priv; #if IS_ENABLED(CONFIG_ISM) ops = ism_get_smcd_ops(); #endif - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, - ISM_NR_DMBS); } + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, max_dmbs); if (!smcd) return; @@ -530,13 +528,11 @@ static void smcd_register_dev(struct dibs_dev *dibs) dibs_set_priv(dibs, &smc_dibs_client, smcd); if (smc_ism_is_loopback(dibs)) { - smcd->priv = smc_lo; - smc_lo->smcd = smcd; + smcd->priv = NULL; } else { smcd->priv = ism; #if IS_ENABLED(CONFIG_ISM) ism_set_priv(ism, &smc_ism_client, smcd); - smcd->client = &smc_ism_client; #endif } @@ -590,8 +586,6 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); - if (smc_ism_is_loopback(dibs)) - smc_loopback_exit(); kfree(smcd->conn); kfree(smcd); } @@ -624,6 +618,7 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } +#endif /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. @@ -632,10 +627,10 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_connection *conn = NULL; unsigned long flags; @@ -645,7 +640,6 @@ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } -#endif int smc_ism_signal_shutdown(struct smc_link_group *lgr) { diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 139e99da2c9f..a1575e31df73 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -69,7 +69,9 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, { int rc; - rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len); + rc = smcd->dibs->ops->move_data(smcd->dibs, dmb_tok, idx, sf, offset, + data, len); + return rc < 0 ? rc : 0; } diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c deleted file mode 100644 index 52cba01cb209..000000000000 --- a/net/smc/smc_loopback.c +++ /dev/null @@ -1,294 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * Functions for loopback-ism device. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu - * Tony Lu - * - */ - -#include -#include -#include -#include - -#include "smc_cdc.h" -#include "smc_ism.h" -#include "smc_loopback.h" - -#define SMC_LO_SUPPORT_NOCOPY 0x1 -#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) - -static struct smc_lo_dev *lo_dev; - -static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, - void *client_priv) -{ - struct smc_lo_dmb_node *dmb_node, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - int sba_idx, rc; - - /* check space for new dmb */ - for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) { - if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) - break; - } - if (sba_idx == SMC_LO_MAX_DMBS) - return -ENOSPC; - - dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); - if (!dmb_node) { - rc = -ENOMEM; - goto err_bit; - } - - dmb_node->sba_idx = sba_idx; - dmb_node->len = dmb->dmb_len; - dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | - __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC); - if (!dmb_node->cpu_addr) { - rc = -ENOMEM; - goto err_node; - } - dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; - refcount_set(&dmb_node->refcnt, 1); - -again: - /* add new dmb into hash table */ - get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); - write_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { - if (tmp_node->token == dmb_node->token) { - write_unlock_bh(&ldev->dmb_ht_lock); - goto again; - } - } - hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); - write_unlock_bh(&ldev->dmb_ht_lock); - atomic_inc(&ldev->dmb_cnt); - - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - - return 0; - -err_node: - kfree(dmb_node); -err_bit: - clear_bit(sba_idx, ldev->sba_idx_mask); - return rc; -} - -static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, - struct smc_lo_dmb_node *dmb_node) -{ - /* remove dmb from hash table */ - write_lock_bh(&ldev->dmb_ht_lock); - hash_del(&dmb_node->list); - write_unlock_bh(&ldev->dmb_ht_lock); - - clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); - kvfree(dmb_node->cpu_addr); - kfree(dmb_node); - - if (atomic_dec_and_test(&ldev->dmb_cnt)) - wake_up(&ldev->ldev_release); -} - -static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb from hash table */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd) -{ - return SMC_LO_SUPPORT_NOCOPY; -} - -static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (!refcount_inc_not_zero(&dmb_node->refcnt)) - /* the dmb is being unregistered, but has - * not been removed from the hash table. - */ - return -EINVAL; - - /* provide dmb information */ - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - return 0; -} - -static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { - if (tmp_node->token == token) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, - unsigned int idx, bool sf, unsigned int offset, - void *data, unsigned int size) -{ - struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - struct smc_connection *conn; - - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { - if (tmp_node->token == dmb_tok) { - rmb_node = tmp_node; - break; - } - } - if (!rmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - memcpy((char *)rmb_node->cpu_addr + offset, data, size); - read_unlock_bh(&ldev->dmb_ht_lock); - - if (!sf) - return 0; - - conn = smcd->conn[rmb_node->sba_idx]; - if (!conn || conn->killed) - return -EPIPE; - tasklet_schedule(&conn->rx_tsklet); - return 0; -} - -static const struct smcd_ops lo_ops = { - .register_dmb = smc_lo_register_dmb, - .unregister_dmb = smc_lo_unregister_dmb, - .support_dmb_nocopy = smc_lo_support_dmb_nocopy, - .attach_dmb = smc_lo_attach_dmb, - .detach_dmb = smc_lo_detach_dmb, - .signal_event = NULL, - .move_data = smc_lo_move_data, -}; - -const struct smcd_ops *smc_lo_get_smcd_ops(void) -{ - return &lo_ops; -} - -static void smc_lo_dev_init(struct smc_lo_dev *ldev) -{ - rwlock_init(&ldev->dmb_ht_lock); - hash_init(ldev->dmb_ht); - atomic_set(&ldev->dmb_cnt, 0); - init_waitqueue_head(&ldev->ldev_release); - - return; -} - -static void smc_lo_dev_exit(struct smc_lo_dev *ldev) -{ - if (atomic_read(&ldev->dmb_cnt)) - wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); -} - -static int smc_lo_dev_probe(void) -{ - struct smc_lo_dev *ldev; - - ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); - if (!ldev) - return -ENOMEM; - - smc_lo_dev_init(ldev); - - lo_dev = ldev; /* global loopback device */ - - return 0; -} - -static void smc_lo_dev_remove(void) -{ - if (!lo_dev) - return; - - smc_lo_dev_exit(lo_dev); - kfree(lo_dev); - lo_dev = NULL; -} - -int smc_loopback_init(struct smc_lo_dev **smc_lb) -{ - int ret; - - ret = smc_lo_dev_probe(); - if (!ret) - *smc_lb = lo_dev; - return ret; -} - -void smc_loopback_exit(void) -{ - smc_lo_dev_remove(); -} diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h deleted file mode 100644 index 33bb96ec8b77..000000000000 --- a/net/smc/smc_loopback.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * SMC-D loopback-ism device structure definitions. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu - * Tony Lu - * - */ - -#ifndef _SMC_LOOPBACK_H -#define _SMC_LOOPBACK_H - -#include -#include - -#define SMC_LO_MAX_DMBS 5000 -#define SMC_LO_DMBS_HASH_BITS 12 - -struct smc_lo_dmb_node { - struct hlist_node list; - u64 token; - u32 len; - u32 sba_idx; - void *cpu_addr; - dma_addr_t dma_addr; - refcount_t refcnt; -}; - -struct smc_lo_dev { - struct smcd_dev *smcd; - atomic_t dmb_cnt; - rwlock_t dmb_ht_lock; - DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); - DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS); - wait_queue_head_t ldev_release; -}; - -const struct smcd_ops *smc_lo_get_smcd_ops(void); - -int smc_loopback_init(struct smc_lo_dev **smc_lb); -void smc_loopback_exit(void); - -#endif /* _SMC_LOOPBACK_H */ -- cgit v1.2.3 From a612dbe8d04d47af91fa88f0599c1370cc70f687 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Thu, 18 Sep 2025 13:05:00 +0200 Subject: dibs: Move event handling to dibs layer Add defines for all event types and subtypes an ism device is known to produce as it can be helpful for debugging purposes. Introduces a generic 'struct dibs_event' and adopt ism device driver and smc-d client accordingly. Tolerate and ignore other type and subtype values to enable future device extensions. SMC-D and ISM are now independent. struct ism_dev can be moved to drivers/s390/net/ism.h. Note that in smc, the term 'ism' is still used. Future patches could replace that with 'dibs' or 'smc-d' as appropriate. Signed-off-by: Julian Ruess Co-developed-by: Alexandra Winter Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-15-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- MAINTAINERS | 2 - arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + drivers/dibs/dibs_main.c | 2 +- drivers/s390/net/Kconfig | 1 - drivers/s390/net/ism.h | 42 +++++++- drivers/s390/net/ism_drv.c | 221 ++++++++++++-------------------------- include/linux/dibs.h | 62 +++++++++++ include/net/smc.h | 15 --- net/smc/Kconfig | 1 - net/smc/smc_ism.c | 99 ++++++----------- 11 files changed, 208 insertions(+), 239 deletions(-) (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index ecc55fae5f9d..1fbe541198f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17580,7 +17580,6 @@ F: include/linux/fddidevice.h F: include/linux/hippidevice.h F: include/linux/if_* F: include/linux/inetdevice.h -F: include/linux/ism.h F: include/linux/netdev* F: include/linux/platform_data/wiznet.h F: include/uapi/linux/cn_proc.h @@ -22237,7 +22236,6 @@ L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported F: drivers/s390/net/ -F: include/linux/ism.h S390 PCI SUBSYSTEM M: Niklas Schnelle diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index a97c8d19f643..fdde8ee0d7bd 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -122,6 +122,7 @@ CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y +CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 7f7b52d9a33c..bf9e7dbd4a89 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -113,6 +113,7 @@ CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y +CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index aacb3ea7825a..5425238d5a42 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -98,7 +98,7 @@ int dibs_unregister_client(struct dibs_client *client) goto err_reg_dmb; } } - /* Stop forwarding IRQs */ + /* Stop forwarding IRQs and events */ dibs->subs[client->id] = NULL; spin_unlock_irqrestore(&dibs->lock, flags); clients[client->id]->ops->del_dev(dibs); diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index 92985f595d59..0fd700c5745a 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -82,7 +82,6 @@ config CCWGROUP config ISM tristate "Support for ISM vPCI Adapter" depends on PCI && DIBS - imply SMC default n help Select this option if you want to use the Internal Shared Memory diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 1b9fa14da20c..08d17956cb36 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -6,13 +6,13 @@ #include #include #include -#include -#include #include #define UTIL_STR_LEN 16 #define ISM_ERROR 0xFFFF +#define ISM_NR_DMBS 1920 + /* * Do not use the first word of the DMB bits to ensure 8 byte aligned access. */ @@ -34,6 +34,23 @@ #define ISM_UNREG_SBA 0x11 #define ISM_UNREG_IEQ 0x12 +enum ism_event_type { + ISM_EVENT_BUF = 0x00, + ISM_EVENT_DEV = 0x01, + ISM_EVENT_SWR = 0x02 +}; + +enum ism_event_code { + ISM_BUF_DMB_UNREGISTERED = 0x04, + ISM_BUF_USING_ISM_DEV_DISABLED = 0x08, + ISM_BUF_OWNING_ISM_DEV_IN_ERR_STATE = 0x02, + ISM_BUF_USING_ISM_DEV_IN_ERR_STATE = 0x03, + ISM_BUF_VLAN_MISMATCH_WITH_OWNER = 0x05, + ISM_BUF_VLAN_MISMATCH_WITH_USER = 0x06, + ISM_DEV_GID_DISABLED = 0x07, + ISM_DEV_GID_ERR_STATE = 0x01 +}; + struct ism_req_hdr { u32 cmd; u16 : 16; @@ -185,6 +202,14 @@ struct ism_eq_header { u64 : 64; }; +struct ism_event { + u32 type; + u32 code; + u64 tok; + u64 time; + u64 info; +}; + struct ism_eq { struct ism_eq_header header; struct ism_event entry[15]; @@ -199,6 +224,19 @@ struct ism_sba { u16 dmbe_mask[ISM_NR_DMBS]; }; +struct ism_dev { + spinlock_t cmd_lock; /* serializes cmds */ + struct dibs_dev *dibs; + struct pci_dev *pdev; + struct ism_sba *sba; + dma_addr_t sba_dma_addr; + DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); + + struct ism_eq *ieq; + dma_addr_t ieq_dma_addr; + int ieq_idx; +}; + #define ISM_CREATE_REQ(dmb, idx, sf, offset) \ ((dmb) | (idx) << 24 | (sf) << 23 | (offset)) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 346d1ea8650b..f84aa2e676e9 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -31,86 +31,6 @@ MODULE_DEVICE_TABLE(pci, ism_device_table); static debug_info_t *ism_debug_info; -#define NO_CLIENT 0xff /* must be >= MAX_CLIENTS */ -static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ - /* a list for fast mapping */ -static u8 max_client; -static DEFINE_MUTEX(clients_lock); -struct ism_dev_list { - struct list_head list; - struct mutex mutex; /* protects ism device list */ -}; - -static struct ism_dev_list ism_dev_list = { - .list = LIST_HEAD_INIT(ism_dev_list.list), - .mutex = __MUTEX_INITIALIZER(ism_dev_list.mutex), -}; - -static void ism_setup_forwarding(struct ism_client *client, struct ism_dev *ism) -{ - unsigned long flags; - - spin_lock_irqsave(&ism->lock, flags); - ism->subs[client->id] = client; - spin_unlock_irqrestore(&ism->lock, flags); -} - -int ism_register_client(struct ism_client *client) -{ - struct ism_dev *ism; - int i, rc = -ENOSPC; - - mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < MAX_CLIENTS; ++i) { - if (!clients[i]) { - clients[i] = client; - client->id = i; - if (i == max_client) - max_client++; - rc = 0; - break; - } - } - mutex_unlock(&clients_lock); - - if (i < MAX_CLIENTS) { - /* initialize with all devices that we got so far */ - list_for_each_entry(ism, &ism_dev_list.list, list) { - ism->priv[i] = NULL; - ism_setup_forwarding(client, ism); - } - } - mutex_unlock(&ism_dev_list.mutex); - - return rc; -} -EXPORT_SYMBOL_GPL(ism_register_client); - -int ism_unregister_client(struct ism_client *client) -{ - struct ism_dev *ism; - unsigned long flags; - int rc = 0; - - mutex_lock(&ism_dev_list.mutex); - list_for_each_entry(ism, &ism_dev_list.list, list) { - spin_lock_irqsave(&ism->lock, flags); - /* Stop forwarding IRQs and events */ - ism->subs[client->id] = NULL; - spin_unlock_irqrestore(&ism->lock, flags); - } - mutex_unlock(&ism_dev_list.mutex); - - mutex_lock(&clients_lock); - clients[client->id] = NULL; - if (client->id + 1 == max_client) - max_client--; - mutex_unlock(&clients_lock); - return rc; -} -EXPORT_SYMBOL_GPL(ism_unregister_client); - static int ism_cmd(struct ism_dev *ism, void *cmd) { struct ism_req_hdr *req = cmd; @@ -445,6 +365,24 @@ static int ism_del_vlan_id(struct dibs_dev *dibs, u64 vlan_id) return ism_cmd(ism, &cmd); } +static int ism_signal_ieq(struct dibs_dev *dibs, const uuid_t *rgid, + u32 trigger_irq, u32 event_code, u64 info) +{ + struct ism_dev *ism = dibs->drv_priv; + union ism_sig_ieq cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; + cmd.request.hdr.len = sizeof(cmd.request); + + memcpy(&cmd.request.rgid, rgid, sizeof(cmd.request.rgid)); + cmd.request.trigger_irq = trigger_irq; + cmd.request.event_code = event_code; + cmd.request.info = info; + + return ism_cmd(ism, &cmd); +} + static unsigned int max_bytes(unsigned int start, unsigned int len, unsigned int boundary) { @@ -487,22 +425,68 @@ static u16 ism_get_chid(struct dibs_dev *dibs) return to_zpci(ism->pdev)->pchid; } +static int ism_match_event_type(u32 s390_event_type) +{ + switch (s390_event_type) { + case ISM_EVENT_BUF: + return DIBS_BUF_EVENT; + case ISM_EVENT_DEV: + return DIBS_DEV_EVENT; + case ISM_EVENT_SWR: + return DIBS_SW_EVENT; + default: + return DIBS_OTHER_TYPE; + } +} + +static int ism_match_event_subtype(u32 s390_event_subtype) +{ + switch (s390_event_subtype) { + case ISM_BUF_DMB_UNREGISTERED: + return DIBS_BUF_UNREGISTERED; + case ISM_DEV_GID_DISABLED: + return DIBS_DEV_DISABLED; + case ISM_DEV_GID_ERR_STATE: + return DIBS_DEV_ERR_STATE; + default: + return DIBS_OTHER_SUBTYPE; + } +} + static void ism_handle_event(struct ism_dev *ism) { + struct dibs_dev *dibs = ism->dibs; + struct dibs_event event; struct ism_event *entry; - struct ism_client *clt; + struct dibs_client *clt; int i; while ((ism->ieq_idx + 1) != READ_ONCE(ism->ieq->header.idx)) { - if (++(ism->ieq_idx) == ARRAY_SIZE(ism->ieq->entry)) + if (++ism->ieq_idx == ARRAY_SIZE(ism->ieq->entry)) ism->ieq_idx = 0; entry = &ism->ieq->entry[ism->ieq_idx]; debug_event(ism_debug_info, 2, entry, sizeof(*entry)); - for (i = 0; i < max_client; ++i) { - clt = ism->subs[i]; + __memset(&event, 0, sizeof(event)); + event.type = ism_match_event_type(entry->type); + if (event.type == DIBS_SW_EVENT) + event.subtype = entry->code; + else + event.subtype = ism_match_event_subtype(entry->code); + event.time = entry->time; + event.data = entry->info; + switch (event.type) { + case DIBS_BUF_EVENT: + event.buffer_tok = entry->tok; + break; + case DIBS_DEV_EVENT: + case DIBS_SW_EVENT: + memcpy(&event.gid, &entry->tok, sizeof(u64)); + } + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { + clt = dibs->subs[i]; if (clt) - clt->handle_event(ism, entry); + clt->ops->handle_event(dibs, &event); } } } @@ -560,12 +544,13 @@ static const struct dibs_dev_ops ism_ops = { .move_data = ism_move, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, + .signal_event = ism_signal_ieq, }; static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; - int i, ret; + int ret; ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI); if (ret <= 0) @@ -584,18 +569,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < max_client; ++i) { - if (clients[i]) { - ism_setup_forwarding(clients[i], ism); - } - } - mutex_unlock(&clients_lock); - - list_add(&ism->list, &ism_dev_list.list); - mutex_unlock(&ism_dev_list.mutex); - query_info(ism); return 0; @@ -612,22 +585,11 @@ out: static void ism_dev_exit(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; - unsigned long flags; - int i; - - spin_lock_irqsave(&ism->lock, flags); - for (i = 0; i < max_client; ++i) - ism->subs[i] = NULL; - spin_unlock_irqrestore(&ism->lock, flags); - - mutex_lock(&ism_dev_list.mutex); unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); pci_free_irq_vectors(pdev); - list_del_init(&ism->list); - mutex_unlock(&ism_dev_list.mutex); } static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) @@ -641,7 +603,6 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!ism) return -ENOMEM; - spin_lock_init(&ism->lock); spin_lock_init(&ism->cmd_lock); dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; @@ -742,8 +703,6 @@ static int __init ism_init(void) if (!ism_debug_info) return -ENODEV; - memset(clients, 0, sizeof(clients)); - max_client = 0; debug_register_view(ism_debug_info, &debug_hex_ascii_view); ret = pci_register_driver(&ism_driver); if (ret) @@ -760,41 +719,3 @@ static void __exit ism_exit(void) module_init(ism_init); module_exit(ism_exit); - -/*************************** SMC-D Implementation *****************************/ - -#if IS_ENABLED(CONFIG_SMC) -static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info) -{ - union ism_sig_ieq cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.trigger_irq = trigger_irq; - cmd.request.event_code = event_code; - cmd.request.info = info; - - return ism_cmd(ism, &cmd); -} - -static int smcd_signal_ieq(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 trigger_irq, u32 event_code, u64 info) -{ - return ism_signal_ieq(smcd->priv, rgid->gid, - trigger_irq, event_code, info); -} - -static const struct smcd_ops ism_smcd_ops = { - .signal_event = smcd_signal_ieq, -}; - -const struct smcd_ops *ism_get_smcd_ops(void) -{ - return &ism_smcd_ops; -} -EXPORT_SYMBOL_GPL(ism_get_smcd_ops); -#endif diff --git a/include/linux/dibs.h b/include/linux/dibs.h index be009c614205..c75607f8a5cf 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -67,6 +67,41 @@ struct dibs_dmb { dma_addr_t dma_addr; }; +/* DIBS events + * ----------- + * Dibs devices can optionally notify dibs clients about events that happened + * in the fabric or at the remote device or remote dmb. + */ +enum dibs_event_type { + /* Buffer event, e.g. a remote dmb was unregistered */ + DIBS_BUF_EVENT, + /* Device event, e.g. a remote dibs device was disabled */ + DIBS_DEV_EVENT, + /* Software event, a dibs client can send an event signal to a + * remote dibs device. + */ + DIBS_SW_EVENT, + DIBS_OTHER_TYPE }; + +enum dibs_event_subtype { + DIBS_BUF_UNREGISTERED, + DIBS_DEV_DISABLED, + DIBS_DEV_ERR_STATE, + DIBS_OTHER_SUBTYPE +}; + +struct dibs_event { + u32 type; + u32 subtype; + /* uuid_null if invalid */ + uuid_t gid; + /* zero if invalid */ + u64 buffer_tok; + u64 time; + /* additional data or zero */ + u64 data; +}; + struct dibs_dev; /* DIBS client @@ -117,6 +152,15 @@ struct dibs_client_ops { */ void (*handle_irq)(struct dibs_dev *dev, unsigned int idx, u16 dmbemask); + /** + * handle_event() - Handle control information sent by device + * @dev: device reporting the event + * @event: ism event structure + * + * * Context: Called in IRQ context by dibs device driver + */ + void (*handle_event)(struct dibs_dev *dev, + const struct dibs_event *event); }; struct dibs_client { @@ -285,6 +329,24 @@ struct dibs_dev_ops { * Return: zero on success */ int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * signal_event() - trigger an event at a remote dibs device (optional) + * @dev: local dibs device + * @rgid: gid of remote dibs device + * trigger_irq: zero: notification may be coalesced with other events + * non-zero: notify immediately + * @subtype: 4 byte event code, meaning is defined by dibs client + * @data: 8 bytes of additional information, + * meaning is defined by dibs client + * + * dibs devices can offer support for sending a control event of type + * EVENT_SWR to a remote dibs device. + * NOTE: handle_event() will be called for all registered dibs clients + * at the remote device. + * Return: zero on success + */ + int (*signal_event)(struct dibs_dev *dev, const uuid_t *rgid, + u32 trigger_irq, u32 event_code, u64 info); /** * support_mmapped_rdmb() - can this device provide memory mapped * remote dmbs? (optional) diff --git a/include/net/smc.h b/include/net/smc.h index 8e3debcf7db5..08bee529ed8d 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -16,7 +16,6 @@ #include #include #include -#include "linux/ism.h" struct sock; @@ -28,28 +27,14 @@ struct smc_hashinfo { }; /* SMCD/ISM device driver interface */ -#define ISM_EVENT_DMB 0 -#define ISM_EVENT_GID 1 -#define ISM_EVENT_SWR 2 - #define ISM_RESERVED_VLANID 0x1FFF -struct smcd_dev; - struct smcd_gid { u64 gid; u64 gid_ext; }; -struct smcd_ops { - /* optional operations */ - int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 trigger_irq, u32 event_code, u64 info); -}; - struct smcd_dev { - const struct smcd_ops *ops; - void *priv; struct dibs_dev *dibs; struct list_head list; spinlock_t lock; diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 9535d88c2acb..99ecd59d1f4b 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -2,7 +2,6 @@ config SMC tristate "SMC socket protocol family" depends on INET && INFINIBAND && DIBS - depends on m || ISM != m help SMC-R provides a "sockets over RDMA" solution making use of RDMA over Converged Ethernet (RoCE) technology to upgrade diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 01e49371d23d..7b228ca2f96a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -17,7 +17,6 @@ #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" -#include "linux/ism.h" #include "linux/dibs.h" struct smcd_dev_list smcd_dev_list = { @@ -30,20 +29,15 @@ static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; static void smcd_register_dev(struct dibs_dev *dibs); static void smcd_unregister_dev(struct dibs_dev *dibs); -#if IS_ENABLED(CONFIG_ISM) -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); - -static struct ism_client smc_ism_client = { - .name = "SMC-D", - .handle_event = smcd_handle_event, -}; -#endif +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event); static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask); static struct dibs_client_ops smc_client_ops = { .add_dev = smcd_register_dev, .del_dev = smcd_unregister_dev, + .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; @@ -399,11 +393,10 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -#if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; - struct ism_event event; + struct dibs_event event; }; #define ISM_EVENT_REQUEST 0x0001 @@ -423,25 +416,27 @@ union smcd_sw_event_info { static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { - struct smcd_gid peer_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct dibs_dev *dibs = wrk->smcd->dibs; union smcd_sw_event_info ev_info; + struct smcd_gid peer_gid; + uuid_t ism_rgid; - ev_info.info = wrk->event.info; - switch (wrk->event.code) { + copy_to_smcdgid(&peer_gid, &wrk->event.gid); + ev_info.info = wrk->event.data; + switch (wrk->event.subtype) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST && - wrk->smcd->ops->signal_event) { + dibs->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; - wrk->smcd->ops->signal_event(wrk->smcd, - &peer_gid, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_TESTLINK, - ev_info.info); - } + copy_to_dibsgid(&ism_rgid, &peer_gid); + dibs->ops->signal_event(dibs, &ism_rgid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_TESTLINK, + ev_info.info); + } break; } } @@ -451,26 +446,24 @@ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); - struct smcd_gid smcd_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct smcd_gid smcd_gid; + + copy_to_smcdgid(&smcd_gid, &wrk->event.gid); switch (wrk->event.type) { - case ISM_EVENT_GID: /* GID event, token is peer GID */ + case DIBS_DEV_EVENT: /* GID event, token is peer GID */ smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; - case ISM_EVENT_DMB: + case DIBS_BUF_EVENT: break; - case ISM_EVENT_SWR: /* Software defined event */ + case DIBS_SW_EVENT: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } -#endif -static struct smcd_dev *smcd_alloc_dev(const char *name, - const struct smcd_ops *ops, - int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, int max_dmbs) { struct smcd_dev *smcd; @@ -487,8 +480,6 @@ static struct smcd_dev *smcd_alloc_dev(const char *name, if (!smcd->event_wq) goto free_conn; - smcd->ops = ops; - spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); @@ -506,36 +497,17 @@ free_smcd: static void smcd_register_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd, *fentry; - const struct smcd_ops *ops; - struct ism_dev *ism; int max_dmbs; max_dmbs = dibs->ops->max_dmbs(); - if (smc_ism_is_loopback(dibs)) { - ops = NULL; - } else { - ism = dibs->drv_priv; -#if IS_ENABLED(CONFIG_ISM) - ops = ism_get_smcd_ops(); -#endif - } - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, max_dmbs); + smcd = smcd_alloc_dev(dev_name(&dibs->dev), max_dmbs); if (!smcd) return; smcd->dibs = dibs; dibs_set_priv(dibs, &smc_dibs_client, smcd); - if (smc_ism_is_loopback(dibs)) { - smcd->priv = NULL; - } else { - smcd->priv = ism; -#if IS_ENABLED(CONFIG_ISM) - ism_set_priv(ism, &smc_ism_client, smcd); -#endif - } - if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); @@ -590,7 +562,6 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) kfree(smcd); } -#if IS_ENABLED(CONFIG_ISM) /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), @@ -602,9 +573,10 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) * Context: * - Function called in IRQ context from ISM device driver event handler. */ -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_ism_event_work *wrk; if (smcd->going_away) @@ -618,7 +590,6 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } -#endif /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. @@ -644,22 +615,22 @@ static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, int smc_ism_signal_shutdown(struct smc_link_group *lgr) { int rc = 0; -#if IS_ENABLED(CONFIG_ISM) union smcd_sw_event_info ev_info; + uuid_t ism_rgid; if (lgr->peer_shutdown) return 0; - if (!lgr->smcd->ops->signal_event) + if (!lgr->smcd->dibs->ops->signal_event) return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid, + copy_to_dibsgid(&ism_rgid, &lgr->peer_gid); + rc = lgr->smcd->dibs->ops->signal_event(lgr->smcd->dibs, &ism_rgid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); -#endif return rc; } @@ -670,9 +641,6 @@ int smc_ism_init(void) smc_ism_v2_capable = false; smc_ism_create_system_eid(); -#if IS_ENABLED(CONFIG_ISM) - rc = ism_register_client(&smc_ism_client); -#endif rc = dibs_register_client(&smc_dibs_client); return rc; } @@ -680,7 +648,4 @@ int smc_ism_init(void) void smc_ism_exit(void) { dibs_unregister_client(&smc_dibs_client); -#if IS_ENABLED(CONFIG_ISM) - ism_unregister_client(&smc_ism_client); -#endif } -- cgit v1.2.3 From 9095d207417477eb50e84fd0652895db77ec584e Mon Sep 17 00:00:00 2001 From: André Almeida Date: Thu, 14 Aug 2025 14:22:12 -0300 Subject: fs: Create sb_encoding() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Filesystems that need to deal with the super block encoding need to use a if IS_ENABLED(CONFIG_UNICODE) around it because this struct member is not declared otherwise. In order to move this if/endif guards outside of the filesytem code and make it simpler, create a new function that returns the s_encoding member of struct super_block if Unicode is enabled, and return NULL otherwise. Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Signed-off-by: André Almeida Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Amir Goldstein --- include/linux/fs.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..43b3a7cf6750 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3740,15 +3740,20 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qst } #endif -static inline bool sb_has_encoding(const struct super_block *sb) +static inline struct unicode_map *sb_encoding(const struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) - return !!sb->s_encoding; + return sb->s_encoding; #else - return false; + return NULL; #endif } +static inline bool sb_has_encoding(const struct super_block *sb) +{ + return !!sb_encoding(sb); +} + int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); -- cgit v1.2.3 From 23253e278a4574114d4c2729ed70f70b4ec7a30e Mon Sep 17 00:00:00 2001 From: André Almeida Date: Thu, 14 Aug 2025 14:22:13 -0300 Subject: fs: Create sb_same_encoding() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For cases where a file lookup can look in different filesystems (like in overlayfs), both super blocks must have the same encoding and the same flags. To help with that, create a sb_same_encoding() function. Reviewed-by: Amir Goldstein Signed-off-by: André Almeida Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Amir Goldstein --- include/linux/fs.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 43b3a7cf6750..ec867f112fd5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3754,6 +3754,24 @@ static inline bool sb_has_encoding(const struct super_block *sb) return !!sb_encoding(sb); } +/* + * Compare if two super blocks have the same encoding and flags + */ +static inline bool sb_same_encoding(const struct super_block *sb1, + const struct super_block *sb2) +{ +#if IS_ENABLED(CONFIG_UNICODE) + if (sb1->s_encoding == sb2->s_encoding) + return true; + + return (sb1->s_encoding && sb2->s_encoding && + (sb1->s_encoding->version == sb2->s_encoding->version) && + (sb1->s_encoding_flags == sb2->s_encoding_flags)); +#else + return true; +#endif +} + int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); -- cgit v1.2.3 From 0f67b56d84b4c49adfd61f19f81f84ec613ab51a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 14 Aug 2025 16:46:21 +0100 Subject: clocksource/drivers/arm_arch_timer_mmio: Switch over to standalone driver Remove all the MMIO support from the per-CPU timer driver, and switch over to the standalove driver. Signed-off-by: Marc Zyngier Signed-off-by: Daniel Lezcano Tested-by: Sudeep Holla Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20250814154622.10193-4-maz@kernel.org --- drivers/clocksource/Makefile | 1 + drivers/clocksource/arm_arch_timer.c | 686 ++++------------------------------- include/clocksource/arm_arch_timer.h | 5 - 3 files changed, 66 insertions(+), 626 deletions(-) (limited to 'include') diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 77a0f08eb43b..ec4452ee958f 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_REALTEK_OTTO_TIMER) += timer-rtl-otto.o obj-$(CONFIG_ARC_TIMERS) += arc_timer.o obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer.o +obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer_mmio.o obj-$(CONFIG_ARM_GLOBAL_TIMER) += arm_global_timer.o obj-$(CONFIG_ARMV7M_SYSTICK) += armv7m_systick.o obj-$(CONFIG_ARM_TIMER_SP804) += timer-sp804.o diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 80ba6a54248c..90aeff44a276 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -34,42 +34,12 @@ #include -#define CNTTIDR 0x08 -#define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) - -#define CNTACR(n) (0x40 + ((n) * 4)) -#define CNTACR_RPCT BIT(0) -#define CNTACR_RVCT BIT(1) -#define CNTACR_RFRQ BIT(2) -#define CNTACR_RVOFF BIT(3) -#define CNTACR_RWVT BIT(4) -#define CNTACR_RWPT BIT(5) - -#define CNTPCT_LO 0x00 -#define CNTVCT_LO 0x08 -#define CNTFRQ 0x10 -#define CNTP_CVAL_LO 0x20 -#define CNTP_CTL 0x2c -#define CNTV_CVAL_LO 0x30 -#define CNTV_CTL 0x3c - /* * The minimum amount of time a generic counter is guaranteed to not roll over * (40 years) */ #define MIN_ROLLOVER_SECS (40ULL * 365 * 24 * 3600) -static unsigned arch_timers_present __initdata; - -struct arch_timer { - void __iomem *base; - struct clock_event_device evt; -}; - -static struct arch_timer *arch_timer_mem __ro_after_init; - -#define to_arch_timer(e) container_of(e, struct arch_timer, evt) - static u32 arch_timer_rate __ro_after_init; static int arch_timer_ppi[ARCH_TIMER_MAX_TIMER_PPI] __ro_after_init; @@ -85,7 +55,6 @@ static struct clock_event_device __percpu *arch_timer_evt; static enum arch_timer_ppi_nr arch_timer_uses_ppi __ro_after_init = ARCH_TIMER_VIRT_PPI; static bool arch_timer_c3stop __ro_after_init; -static bool arch_timer_mem_use_virtual __ro_after_init; static bool arch_counter_suspend_stop __ro_after_init; #ifdef CONFIG_GENERIC_GETTIMEOFDAY static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_ARCHTIMER; @@ -121,76 +90,6 @@ static int arch_counter_get_width(void) /* * Architected system timer support. */ - -static __always_inline -void arch_timer_reg_write(int access, enum arch_timer_reg reg, u64 val, - struct clock_event_device *clk) -{ - if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - writel_relaxed((u32)val, timer->base + CNTP_CTL); - break; - case ARCH_TIMER_REG_CVAL: - /* - * Not guaranteed to be atomic, so the timer - * must be disabled at this point. - */ - writeq_relaxed(val, timer->base + CNTP_CVAL_LO); - break; - default: - BUILD_BUG(); - } - } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - writel_relaxed((u32)val, timer->base + CNTV_CTL); - break; - case ARCH_TIMER_REG_CVAL: - /* Same restriction as above */ - writeq_relaxed(val, timer->base + CNTV_CVAL_LO); - break; - default: - BUILD_BUG(); - } - } else { - arch_timer_reg_write_cp15(access, reg, val); - } -} - -static __always_inline -u32 arch_timer_reg_read(int access, enum arch_timer_reg reg, - struct clock_event_device *clk) -{ - u32 val; - - if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - val = readl_relaxed(timer->base + CNTP_CTL); - break; - default: - BUILD_BUG(); - } - } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - val = readl_relaxed(timer->base + CNTV_CTL); - break; - default: - BUILD_BUG(); - } - } else { - val = arch_timer_reg_read_cp15(access, reg); - } - - return val; -} - static noinstr u64 raw_counter_get_cntpct_stable(void) { return __arch_counter_get_cntpct_stable(); @@ -424,7 +323,7 @@ void erratum_set_next_event_generic(const int access, unsigned long evt, unsigned long ctrl; u64 cval; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; @@ -436,7 +335,7 @@ void erratum_set_next_event_generic(const int access, unsigned long evt, write_sysreg(cval, cntv_cval_el0); } - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); } static __maybe_unused int erratum_set_next_event_virt(unsigned long evt, @@ -667,10 +566,10 @@ static __always_inline irqreturn_t timer_handler(const int access, { unsigned long ctrl; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, evt); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); if (ctrl & ARCH_TIMER_CTRL_IT_STAT) { ctrl |= ARCH_TIMER_CTRL_IT_MASK; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, evt); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); evt->event_handler(evt); return IRQ_HANDLED; } @@ -692,28 +591,14 @@ static irqreturn_t arch_timer_handler_phys(int irq, void *dev_id) return timer_handler(ARCH_TIMER_PHYS_ACCESS, evt); } -static irqreturn_t arch_timer_handler_phys_mem(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - - return timer_handler(ARCH_TIMER_MEM_PHYS_ACCESS, evt); -} - -static irqreturn_t arch_timer_handler_virt_mem(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - - return timer_handler(ARCH_TIMER_MEM_VIRT_ACCESS, evt); -} - static __always_inline int arch_timer_shutdown(const int access, struct clock_event_device *clk) { unsigned long ctrl; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl &= ~ARCH_TIMER_CTRL_ENABLE; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); return 0; } @@ -728,23 +613,13 @@ static int arch_timer_shutdown_phys(struct clock_event_device *clk) return arch_timer_shutdown(ARCH_TIMER_PHYS_ACCESS, clk); } -static int arch_timer_shutdown_virt_mem(struct clock_event_device *clk) -{ - return arch_timer_shutdown(ARCH_TIMER_MEM_VIRT_ACCESS, clk); -} - -static int arch_timer_shutdown_phys_mem(struct clock_event_device *clk) -{ - return arch_timer_shutdown(ARCH_TIMER_MEM_PHYS_ACCESS, clk); -} - static __always_inline void set_next_event(const int access, unsigned long evt, struct clock_event_device *clk) { unsigned long ctrl; u64 cnt; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; @@ -753,8 +628,8 @@ static __always_inline void set_next_event(const int access, unsigned long evt, else cnt = __arch_counter_get_cntvct(); - arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CVAL, evt + cnt); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); } static int arch_timer_set_next_event_virt(unsigned long evt, @@ -771,60 +646,6 @@ static int arch_timer_set_next_event_phys(unsigned long evt, return 0; } -static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) -{ - u32 cnt_lo, cnt_hi, tmp_hi; - - do { - cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); - cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); - tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); - } while (cnt_hi != tmp_hi); - - return ((u64) cnt_hi << 32) | cnt_lo; -} - -static __always_inline void set_next_event_mem(const int access, unsigned long evt, - struct clock_event_device *clk) -{ - struct arch_timer *timer = to_arch_timer(clk); - unsigned long ctrl; - u64 cnt; - - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); - - /* Timer must be disabled before programming CVAL */ - if (ctrl & ARCH_TIMER_CTRL_ENABLE) { - ctrl &= ~ARCH_TIMER_CTRL_ENABLE; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); - } - - ctrl |= ARCH_TIMER_CTRL_ENABLE; - ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; - - if (access == ARCH_TIMER_MEM_VIRT_ACCESS) - cnt = arch_counter_get_cnt_mem(timer, CNTVCT_LO); - else - cnt = arch_counter_get_cnt_mem(timer, CNTPCT_LO); - - arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); -} - -static int arch_timer_set_next_event_virt_mem(unsigned long evt, - struct clock_event_device *clk) -{ - set_next_event_mem(ARCH_TIMER_MEM_VIRT_ACCESS, evt, clk); - return 0; -} - -static int arch_timer_set_next_event_phys_mem(unsigned long evt, - struct clock_event_device *clk) -{ - set_next_event_mem(ARCH_TIMER_MEM_PHYS_ACCESS, evt, clk); - return 0; -} - static u64 __arch_timer_check_delta(void) { #ifdef CONFIG_ARM64 @@ -850,63 +671,41 @@ static u64 __arch_timer_check_delta(void) return CLOCKSOURCE_MASK(arch_counter_get_width()); } -static void __arch_timer_setup(unsigned type, - struct clock_event_device *clk) +static void __arch_timer_setup(struct clock_event_device *clk) { + typeof(clk->set_next_event) sne; u64 max_delta; clk->features = CLOCK_EVT_FEAT_ONESHOT; - if (type == ARCH_TIMER_TYPE_CP15) { - typeof(clk->set_next_event) sne; - - arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL); - - if (arch_timer_c3stop) - clk->features |= CLOCK_EVT_FEAT_C3STOP; - clk->name = "arch_sys_timer"; - clk->rating = 450; - clk->cpumask = cpumask_of(smp_processor_id()); - clk->irq = arch_timer_ppi[arch_timer_uses_ppi]; - switch (arch_timer_uses_ppi) { - case ARCH_TIMER_VIRT_PPI: - clk->set_state_shutdown = arch_timer_shutdown_virt; - clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; - sne = erratum_handler(set_next_event_virt); - break; - case ARCH_TIMER_PHYS_SECURE_PPI: - case ARCH_TIMER_PHYS_NONSECURE_PPI: - case ARCH_TIMER_HYP_PPI: - clk->set_state_shutdown = arch_timer_shutdown_phys; - clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; - sne = erratum_handler(set_next_event_phys); - break; - default: - BUG(); - } + arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL); - clk->set_next_event = sne; - max_delta = __arch_timer_check_delta(); - } else { - clk->features |= CLOCK_EVT_FEAT_DYNIRQ; - clk->name = "arch_mem_timer"; - clk->rating = 400; - clk->cpumask = cpu_possible_mask; - if (arch_timer_mem_use_virtual) { - clk->set_state_shutdown = arch_timer_shutdown_virt_mem; - clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem; - clk->set_next_event = - arch_timer_set_next_event_virt_mem; - } else { - clk->set_state_shutdown = arch_timer_shutdown_phys_mem; - clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem; - clk->set_next_event = - arch_timer_set_next_event_phys_mem; - } - - max_delta = CLOCKSOURCE_MASK(56); + if (arch_timer_c3stop) + clk->features |= CLOCK_EVT_FEAT_C3STOP; + clk->name = "arch_sys_timer"; + clk->rating = 450; + clk->cpumask = cpumask_of(smp_processor_id()); + clk->irq = arch_timer_ppi[arch_timer_uses_ppi]; + switch (arch_timer_uses_ppi) { + case ARCH_TIMER_VIRT_PPI: + clk->set_state_shutdown = arch_timer_shutdown_virt; + clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; + sne = erratum_handler(set_next_event_virt); + break; + case ARCH_TIMER_PHYS_SECURE_PPI: + case ARCH_TIMER_PHYS_NONSECURE_PPI: + case ARCH_TIMER_HYP_PPI: + clk->set_state_shutdown = arch_timer_shutdown_phys; + clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; + sne = erratum_handler(set_next_event_phys); + break; + default: + BUG(); } + clk->set_next_event = sne; + max_delta = __arch_timer_check_delta(); + clk->set_state_shutdown(clk); clockevents_config_and_register(clk, arch_timer_rate, 0xf, max_delta); @@ -1029,7 +828,7 @@ static int arch_timer_starting_cpu(unsigned int cpu) struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt); u32 flags; - __arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk); + __arch_timer_setup(clk); flags = check_ppi_trigger(arch_timer_ppi[arch_timer_uses_ppi]); enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], flags); @@ -1075,22 +874,12 @@ static void __init arch_timer_of_configure_rate(u32 rate, struct device_node *np pr_warn("frequency not available\n"); } -static void __init arch_timer_banner(unsigned type) +static void __init arch_timer_banner(void) { - pr_info("%s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n", - type & ARCH_TIMER_TYPE_CP15 ? "cp15" : "", - type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? - " and " : "", - type & ARCH_TIMER_TYPE_MEM ? "mmio" : "", + pr_info("cp15 timer running at %lu.%02luMHz (%s).\n", (unsigned long)arch_timer_rate / 1000000, (unsigned long)(arch_timer_rate / 10000) % 100, - type & ARCH_TIMER_TYPE_CP15 ? - (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? "virt" : "phys" : - "", - type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? "/" : "", - type & ARCH_TIMER_TYPE_MEM ? - arch_timer_mem_use_virtual ? "virt" : "phys" : - ""); + (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? "virt" : "phys"); } u32 arch_timer_get_rate(void) @@ -1108,11 +897,6 @@ bool arch_timer_evtstrm_available(void) return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available); } -static noinstr u64 arch_counter_get_cntvct_mem(void) -{ - return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO); -} - static struct arch_timer_kvm_info arch_timer_kvm_info; struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) @@ -1120,42 +904,35 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) return &arch_timer_kvm_info; } -static void __init arch_counter_register(unsigned type) +static void __init arch_counter_register(void) { u64 (*scr)(void); + u64 (*rd)(void); u64 start_count; int width; - /* Register the CP15 based counter if we have one */ - if (type & ARCH_TIMER_TYPE_CP15) { - u64 (*rd)(void); - - if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || - arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { - if (arch_timer_counter_has_wa()) { - rd = arch_counter_get_cntvct_stable; - scr = raw_counter_get_cntvct_stable; - } else { - rd = arch_counter_get_cntvct; - scr = arch_counter_get_cntvct; - } + if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || + arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { + if (arch_timer_counter_has_wa()) { + rd = arch_counter_get_cntvct_stable; + scr = raw_counter_get_cntvct_stable; } else { - if (arch_timer_counter_has_wa()) { - rd = arch_counter_get_cntpct_stable; - scr = raw_counter_get_cntpct_stable; - } else { - rd = arch_counter_get_cntpct; - scr = arch_counter_get_cntpct; - } + rd = arch_counter_get_cntvct; + scr = arch_counter_get_cntvct; } - - arch_timer_read_counter = rd; - clocksource_counter.vdso_clock_mode = vdso_default; } else { - arch_timer_read_counter = arch_counter_get_cntvct_mem; - scr = arch_counter_get_cntvct_mem; + if (arch_timer_counter_has_wa()) { + rd = arch_counter_get_cntpct_stable; + scr = raw_counter_get_cntpct_stable; + } else { + rd = arch_counter_get_cntpct; + scr = arch_counter_get_cntpct; + } } + arch_timer_read_counter = rd; + clocksource_counter.vdso_clock_mode = vdso_default; + width = arch_counter_get_width(); clocksource_counter.mask = CLOCKSOURCE_MASK(width); cyclecounter.mask = CLOCKSOURCE_MASK(width); @@ -1303,76 +1080,10 @@ out: return err; } -static int __init arch_timer_mem_register(void __iomem *base, unsigned int irq) -{ - int ret; - irq_handler_t func; - - arch_timer_mem = kzalloc(sizeof(*arch_timer_mem), GFP_KERNEL); - if (!arch_timer_mem) - return -ENOMEM; - - arch_timer_mem->base = base; - arch_timer_mem->evt.irq = irq; - __arch_timer_setup(ARCH_TIMER_TYPE_MEM, &arch_timer_mem->evt); - - if (arch_timer_mem_use_virtual) - func = arch_timer_handler_virt_mem; - else - func = arch_timer_handler_phys_mem; - - ret = request_irq(irq, func, IRQF_TIMER, "arch_mem_timer", &arch_timer_mem->evt); - if (ret) { - pr_err("Failed to request mem timer irq\n"); - kfree(arch_timer_mem); - arch_timer_mem = NULL; - } - - return ret; -} - -static const struct of_device_id arch_timer_of_match[] __initconst = { - { .compatible = "arm,armv7-timer", }, - { .compatible = "arm,armv8-timer", }, - {}, -}; - -static const struct of_device_id arch_timer_mem_of_match[] __initconst = { - { .compatible = "arm,armv7-timer-mem", }, - {}, -}; - -static bool __init arch_timer_needs_of_probing(void) -{ - struct device_node *dn; - bool needs_probing = false; - unsigned int mask = ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM; - - /* We have two timers, and both device-tree nodes are probed. */ - if ((arch_timers_present & mask) == mask) - return false; - - /* - * Only one type of timer is probed, - * check if we have another type of timer node in device-tree. - */ - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) - dn = of_find_matching_node(NULL, arch_timer_mem_of_match); - else - dn = of_find_matching_node(NULL, arch_timer_of_match); - - if (dn && of_device_is_available(dn)) - needs_probing = true; - - of_node_put(dn); - - return needs_probing; -} - static int __init arch_timer_common_init(void) { - arch_timer_banner(arch_timers_present); - arch_counter_register(arch_timers_present); + arch_timer_banner(); + arch_counter_register(); return arch_timer_arch_init(); } @@ -1421,13 +1132,11 @@ static int __init arch_timer_of_init(struct device_node *np) u32 rate; bool has_names; - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { + if (arch_timer_evt) { pr_warn("multiple nodes in dt, skipping\n"); return 0; } - arch_timers_present |= ARCH_TIMER_TYPE_CP15; - has_names = of_property_present(np, "interrupt-names"); for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) { @@ -1472,283 +1181,22 @@ static int __init arch_timer_of_init(struct device_node *np) if (ret) return ret; - if (arch_timer_needs_of_probing()) - return 0; - return arch_timer_common_init(); } TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init); TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init); -static u32 __init -arch_timer_mem_frame_get_cntfrq(struct arch_timer_mem_frame *frame) -{ - void __iomem *base; - u32 rate; - - base = ioremap(frame->cntbase, frame->size); - if (!base) { - pr_err("Unable to map frame @ %pa\n", &frame->cntbase); - return 0; - } - - rate = readl_relaxed(base + CNTFRQ); - - iounmap(base); - - return rate; -} - -static struct arch_timer_mem_frame * __init -arch_timer_mem_find_best_frame(struct arch_timer_mem *timer_mem) -{ - struct arch_timer_mem_frame *frame, *best_frame = NULL; - void __iomem *cntctlbase; - u32 cnttidr; - int i; - - cntctlbase = ioremap(timer_mem->cntctlbase, timer_mem->size); - if (!cntctlbase) { - pr_err("Can't map CNTCTLBase @ %pa\n", - &timer_mem->cntctlbase); - return NULL; - } - - cnttidr = readl_relaxed(cntctlbase + CNTTIDR); - - /* - * Try to find a virtual capable frame. Otherwise fall back to a - * physical capable frame. - */ - for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { - u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT | - CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT; - - frame = &timer_mem->frame[i]; - if (!frame->valid) - continue; - - /* Try enabling everything, and see what sticks */ - writel_relaxed(cntacr, cntctlbase + CNTACR(i)); - cntacr = readl_relaxed(cntctlbase + CNTACR(i)); - - if ((cnttidr & CNTTIDR_VIRT(i)) && - !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) { - best_frame = frame; - arch_timer_mem_use_virtual = true; - break; - } - - if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) - continue; - - best_frame = frame; - } - - iounmap(cntctlbase); - - return best_frame; -} - -static int __init -arch_timer_mem_frame_register(struct arch_timer_mem_frame *frame) -{ - void __iomem *base; - int ret, irq; - - if (arch_timer_mem_use_virtual) - irq = frame->virt_irq; - else - irq = frame->phys_irq; - - if (!irq) { - pr_err("Frame missing %s irq.\n", - arch_timer_mem_use_virtual ? "virt" : "phys"); - return -EINVAL; - } - - if (!request_mem_region(frame->cntbase, frame->size, - "arch_mem_timer")) - return -EBUSY; - - base = ioremap(frame->cntbase, frame->size); - if (!base) { - pr_err("Can't map frame's registers\n"); - return -ENXIO; - } - - ret = arch_timer_mem_register(base, irq); - if (ret) { - iounmap(base); - return ret; - } - - arch_timers_present |= ARCH_TIMER_TYPE_MEM; - - return 0; -} - -static int __init arch_timer_mem_of_init(struct device_node *np) -{ - struct arch_timer_mem *timer_mem; - struct arch_timer_mem_frame *frame; - struct resource res; - int ret = -EINVAL; - u32 rate; - - timer_mem = kzalloc(sizeof(*timer_mem), GFP_KERNEL); - if (!timer_mem) - return -ENOMEM; - - if (of_address_to_resource(np, 0, &res)) - goto out; - timer_mem->cntctlbase = res.start; - timer_mem->size = resource_size(&res); - - for_each_available_child_of_node_scoped(np, frame_node) { - u32 n; - struct arch_timer_mem_frame *frame; - - if (of_property_read_u32(frame_node, "frame-number", &n)) { - pr_err(FW_BUG "Missing frame-number.\n"); - goto out; - } - if (n >= ARCH_TIMER_MEM_MAX_FRAMES) { - pr_err(FW_BUG "Wrong frame-number, only 0-%u are permitted.\n", - ARCH_TIMER_MEM_MAX_FRAMES - 1); - goto out; - } - frame = &timer_mem->frame[n]; - - if (frame->valid) { - pr_err(FW_BUG "Duplicated frame-number.\n"); - goto out; - } - - if (of_address_to_resource(frame_node, 0, &res)) - goto out; - - frame->cntbase = res.start; - frame->size = resource_size(&res); - - frame->virt_irq = irq_of_parse_and_map(frame_node, - ARCH_TIMER_VIRT_SPI); - frame->phys_irq = irq_of_parse_and_map(frame_node, - ARCH_TIMER_PHYS_SPI); - - frame->valid = true; - } - - frame = arch_timer_mem_find_best_frame(timer_mem); - if (!frame) { - pr_err("Unable to find a suitable frame in timer @ %pa\n", - &timer_mem->cntctlbase); - ret = -EINVAL; - goto out; - } - - rate = arch_timer_mem_frame_get_cntfrq(frame); - arch_timer_of_configure_rate(rate, np); - - ret = arch_timer_mem_frame_register(frame); - if (!ret && !arch_timer_needs_of_probing()) - ret = arch_timer_common_init(); -out: - kfree(timer_mem); - return ret; -} -TIMER_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", - arch_timer_mem_of_init); - #ifdef CONFIG_ACPI_GTDT -static int __init -arch_timer_mem_verify_cntfrq(struct arch_timer_mem *timer_mem) -{ - struct arch_timer_mem_frame *frame; - u32 rate; - int i; - - for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { - frame = &timer_mem->frame[i]; - - if (!frame->valid) - continue; - - rate = arch_timer_mem_frame_get_cntfrq(frame); - if (rate == arch_timer_rate) - continue; - - pr_err(FW_BUG "CNTFRQ mismatch: frame @ %pa: (0x%08lx), CPU: (0x%08lx)\n", - &frame->cntbase, - (unsigned long)rate, (unsigned long)arch_timer_rate); - - return -EINVAL; - } - - return 0; -} - -static int __init arch_timer_mem_acpi_init(int platform_timer_count) -{ - struct arch_timer_mem *timers, *timer; - struct arch_timer_mem_frame *frame, *best_frame = NULL; - int timer_count, i, ret = 0; - - timers = kcalloc(platform_timer_count, sizeof(*timers), - GFP_KERNEL); - if (!timers) - return -ENOMEM; - - ret = acpi_arch_timer_mem_init(timers, &timer_count); - if (ret || !timer_count) - goto out; - - /* - * While unlikely, it's theoretically possible that none of the frames - * in a timer expose the combination of feature we want. - */ - for (i = 0; i < timer_count; i++) { - timer = &timers[i]; - - frame = arch_timer_mem_find_best_frame(timer); - if (!best_frame) - best_frame = frame; - - ret = arch_timer_mem_verify_cntfrq(timer); - if (ret) { - pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n"); - goto out; - } - - if (!best_frame) /* implies !frame */ - /* - * Only complain about missing suitable frames if we - * haven't already found one in a previous iteration. - */ - pr_err("Unable to find a suitable frame in timer @ %pa\n", - &timer->cntctlbase); - } - - if (best_frame) - ret = arch_timer_mem_frame_register(best_frame); -out: - kfree(timers); - return ret; -} - -/* Initialize per-processor generic timer and memory-mapped timer(if present) */ static int __init arch_timer_acpi_init(struct acpi_table_header *table) { - int ret, platform_timer_count; + int ret; - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { + if (arch_timer_evt) { pr_warn("already initialized, skipping\n"); return -EINVAL; } - arch_timers_present |= ARCH_TIMER_TYPE_CP15; - - ret = acpi_gtdt_init(table, &platform_timer_count); + ret = acpi_gtdt_init(table, NULL); if (ret) return ret; @@ -1790,10 +1238,6 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table) if (ret) return ret; - if (platform_timer_count && - arch_timer_mem_acpi_init(platform_timer_count)) - pr_err("Failed to initialize memory-mapped timer.\n"); - return arch_timer_common_init(); } TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index ce6521ad04d1..2eda895f19f5 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -9,9 +9,6 @@ #include #include -#define ARCH_TIMER_TYPE_CP15 BIT(0) -#define ARCH_TIMER_TYPE_MEM BIT(1) - #define ARCH_TIMER_CTRL_ENABLE (1 << 0) #define ARCH_TIMER_CTRL_IT_MASK (1 << 1) #define ARCH_TIMER_CTRL_IT_STAT (1 << 2) @@ -51,8 +48,6 @@ enum arch_timer_spi_nr { #define ARCH_TIMER_PHYS_ACCESS 0 #define ARCH_TIMER_VIRT_ACCESS 1 -#define ARCH_TIMER_MEM_PHYS_ACCESS 2 -#define ARCH_TIMER_MEM_VIRT_ACCESS 3 #define ARCH_TIMER_MEM_MAX_FRAMES 8 -- cgit v1.2.3 From 0494fc345b377d1207c2cbfef67dc51f6ec874c0 Mon Sep 17 00:00:00 2001 From: Gokul Praveen Date: Tue, 12 Aug 2025 16:23:46 +0530 Subject: clocksource/drivers/timer-ti-dm : Capture functionality for OMAP DM timer Add PWM capture function in DM timer driver. OMAP DM timer hardware supports capture feature.It can be used to timestamp events (falling/rising edges) detected on input signal. Signed-off-by: Gokul Praveen Signed-off-by: Daniel Lezcano Reviewed-by: Neha Malcom Francis Link: https://lore.kernel.org/r/20250812105346.203541-1-g-praveen@ti.com --- drivers/clocksource/timer-ti-dm.c | 119 ++++++++++++++++++++++++++++- include/linux/platform_data/dmtimer-omap.h | 4 + 2 files changed, 121 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index e9e32df6b566..793e7cdcb1b1 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -31,6 +31,7 @@ #include #include +#include /* * timer errata flags @@ -836,6 +837,48 @@ static int omap_dm_timer_set_match(struct omap_dm_timer *cookie, int enable, return 0; } +static int omap_dm_timer_set_cap(struct omap_dm_timer *cookie, + int autoreload, bool config_period) +{ + struct dmtimer *timer; + struct device *dev; + int rc; + u32 l; + + timer = to_dmtimer(cookie); + if (unlikely(!timer)) + return -EINVAL; + + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + /* + * 1. Select autoreload mode. TIMER_TCLR[1] AR bit. + * 2. TIMER_TCLR[14]: Sets the functionality of the TIMER IO pin. + * 3. TIMER_TCLR[13] : Capture mode select bit. + * 3. TIMER_TCLR[9-8] : Select transition capture mode. + */ + + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + + if (autoreload) + l |= OMAP_TIMER_CTRL_AR; + + l |= OMAP_TIMER_CTRL_CAPTMODE | OMAP_TIMER_CTRL_GPOCFG; + + if (config_period == true) + l |= OMAP_TIMER_CTRL_TCM_LOWTOHIGH; /* Time Period config */ + else + l |= OMAP_TIMER_CTRL_TCM_BOTHEDGES; /* Duty Cycle config */ + + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + pm_runtime_put_sync(dev); + + return 0; +} + static int omap_dm_timer_set_pwm(struct omap_dm_timer *cookie, int def_on, int toggle, int trigger, int autoreload) { @@ -1023,23 +1066,92 @@ static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *cookie) return __omap_dm_timer_read_counter(timer); } +static inline unsigned int __omap_dm_timer_cap(struct dmtimer *timer, int idx) +{ + return idx == 0 ? dmtimer_read(timer, OMAP_TIMER_CAPTURE_REG) : + dmtimer_read(timer, OMAP_TIMER_CAPTURE2_REG); +} + static int omap_dm_timer_write_counter(struct omap_dm_timer *cookie, unsigned int value) { struct dmtimer *timer; + struct device *dev; timer = to_dmtimer(cookie); - if (unlikely(!timer || !atomic_read(&timer->enabled))) { - pr_err("%s: timer not available or enabled.\n", __func__); + if (unlikely(!timer)) { + pr_err("%s: timer not available.\n", __func__); return -EINVAL; } + dev = &timer->pdev->dev; + + pm_runtime_resume_and_get(dev); dmtimer_write(timer, OMAP_TIMER_COUNTER_REG, value); + pm_runtime_put_sync(dev); /* Save the context */ timer->context.tcrr = value; return 0; } +/** + * omap_dm_timer_cap_counter() - Calculate the high count or period count depending on the + * configuration. + * @cookie:Pointer to OMAP DM timer + * @is_period:Whether to configure timer in period or duty cycle mode + * + * Return high count or period count if timer is enabled else appropriate error. + */ +static unsigned int omap_dm_timer_cap_counter(struct omap_dm_timer *cookie, bool is_period) +{ + struct dmtimer *timer; + unsigned int cap1 = 0; + unsigned int cap2 = 0; + u32 l, ret; + + timer = to_dmtimer(cookie); + if (unlikely(!timer || !atomic_read(&timer->enabled))) { + pr_err("%s:timer is not available or enabled.%p\n", __func__, (void *)timer); + return -EINVAL; + } + + /* Stop the timer */ + omap_dm_timer_stop(cookie); + + /* Clear the timer counter value to 0 */ + ret = omap_dm_timer_write_counter(cookie, 0); + if (ret) + return ret; + + /* Sets the timer capture configuration for period/duty cycle calculation */ + ret = omap_dm_timer_set_cap(cookie, true, is_period); + if (ret) { + pr_err("%s: Failed to set timer capture configuration.\n", __func__); + return ret; + } + /* Start the timer */ + omap_dm_timer_start(cookie); + + /* + * 1 sec delay is given so as to provide + * enough time to capture low frequency signals. + */ + msleep(1000); + + cap1 = __omap_dm_timer_cap(timer, 0); + cap2 = __omap_dm_timer_cap(timer, 1); + + /* + * Clears the TCLR configuration. + * The start bit must be set to 1 as the timer is already in start mode. + */ + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + l &= ~(0xffff) | 0x1; + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + return (cap2-cap1); +} + static int __maybe_unused omap_dm_timer_runtime_suspend(struct device *dev) { struct dmtimer *timer = dev_get_drvdata(dev); @@ -1246,6 +1358,9 @@ static const struct omap_dm_timer_ops dmtimer_ops = { .write_counter = omap_dm_timer_write_counter, .read_status = omap_dm_timer_read_status, .write_status = omap_dm_timer_write_status, + .set_cap = omap_dm_timer_set_cap, + .get_cap_status = omap_dm_timer_get_pwm_status, + .read_cap = omap_dm_timer_cap_counter, }; static const struct dmtimer_platform_data omap3plus_pdata = { diff --git a/include/linux/platform_data/dmtimer-omap.h b/include/linux/platform_data/dmtimer-omap.h index 95d852aef130..726d89143842 100644 --- a/include/linux/platform_data/dmtimer-omap.h +++ b/include/linux/platform_data/dmtimer-omap.h @@ -36,9 +36,13 @@ struct omap_dm_timer_ops { int (*set_pwm)(struct omap_dm_timer *timer, int def_on, int toggle, int trigger, int autoreload); int (*get_pwm_status)(struct omap_dm_timer *timer); + int (*set_cap)(struct omap_dm_timer *timer, + int autoreload, bool config_period); + int (*get_cap_status)(struct omap_dm_timer *timer); int (*set_prescaler)(struct omap_dm_timer *timer, int prescaler); unsigned int (*read_counter)(struct omap_dm_timer *timer); + unsigned int (*read_cap)(struct omap_dm_timer *timer, bool is_period); int (*write_counter)(struct omap_dm_timer *timer, unsigned int value); unsigned int (*read_status)(struct omap_dm_timer *timer); -- cgit v1.2.3 From 17eb98d6b517b6e7faaebed496fd688dbb1771d9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:48 +1000 Subject: VFS/ovl: add lookup_one_positive_killable() ovl wants a lookup which won't block on a fatal signal. It currently uses down_write_killable() and then repeatedly calls to lookup_one() The lock may not be needed if the name is already in the dcache and it aids proposed future changes if the locking is kept internal to namei.c So this patch adds lookup_one_positive_killable() which is like lookup_one_positive() but will abort in the face of a fatal signal. overlayfs is changed to use this. Note that instead of always getting an exclusive lock, ovl now only gets a shared lock, and only sometimes. The exclusive lock was never needed. However down_read_killable() was only added in v4.15 but overlayfs started using down_write_killable() here in v4.7. Note that the linked list ->first_maybe_whiteout ->next_maybe_white is local to the thread so there is no concurrency in that list which could be threatened by removing the locking. Reviewed-by: Amir Goldstein Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- fs/namei.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/overlayfs/readdir.c | 28 ++++++++++++------------- include/linux/namei.h | 3 +++ 3 files changed, 72 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/namei.c b/fs/namei.c index cd43ff89fbaa..c7c6b255db2c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1827,6 +1827,20 @@ static struct dentry *lookup_slow(const struct qstr *name, return res; } +static struct dentry *lookup_slow_killable(const struct qstr *name, + struct dentry *dir, + unsigned int flags) +{ + struct inode *inode = dir->d_inode; + struct dentry *res; + + if (inode_lock_shared_killable(inode)) + return ERR_PTR(-EINTR); + res = __lookup_slow(name, dir, flags); + inode_unlock_shared(inode); + return res; +} + static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *restrict nd) { @@ -3010,6 +3024,47 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, } EXPORT_SYMBOL(lookup_one_unlocked); +/** + * lookup_one_positive_killable - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr olding pathname component to lookup + * @base: base directory to lookup from + * + * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns + * known positive or ERR_PTR(). This is what most of the users want. + * + * Note that pinned negative with unlocked parent _can_ become positive at any + * time, so callers of lookup_one_unlocked() need to be very careful; pinned + * positives have >d_inode stable, so this one avoids such problems. + * + * This can be used for in-kernel filesystem clients such as file servers. + * + * It should be called without the parent i_rwsem held, and will take + * the i_rwsem itself if necessary. If a fatal signal is pending or + * delivered, it will return %-EINTR if the lock is needed. + */ +struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base) +{ + int err; + struct dentry *ret; + + err = lookup_one_common(idmap, name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow_killable(name, base, 0); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_one_positive_killable); + /** * lookup_one_positive_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index b65cdfce31ce..15cb06fa0c9a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -270,26 +270,26 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { - int err; + int err = 0; struct dentry *dentry, *dir = path->dentry; const struct cred *old_cred; old_cred = ovl_override_creds(rdd->dentry->d_sb); - err = down_write_killable(&dir->d_inode->i_rwsem); - if (!err) { - while (rdd->first_maybe_whiteout) { - struct ovl_cache_entry *p = - rdd->first_maybe_whiteout; - rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one(mnt_idmap(path->mnt), - &QSTR_LEN(p->name, p->len), dir); - if (!IS_ERR(dentry)) { - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } + while (rdd->first_maybe_whiteout) { + struct ovl_cache_entry *p = + rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p->next_maybe_whiteout; + dentry = lookup_one_positive_killable(mnt_idmap(path->mnt), + &QSTR_LEN(p->name, p->len), + dir); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } else if (PTR_ERR(dentry) == -EINTR) { + err = -EINTR; + break; } - inode_unlock(dir->d_inode); } ovl_revert_creds(old_cred); diff --git a/include/linux/namei.h b/include/linux/namei.h index 5d085428e471..551a1a01e5e7 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -80,6 +80,9 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); +struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base); extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); -- cgit v1.2.3 From d7fb2c410240348edee7867c29b60688175dcc11 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:50 +1000 Subject: VFS: unify old_mnt_idmap and new_mnt_idmap in renamedata A rename operation can only rename within a single mount. Callers of vfs_rename() must and do ensure this is the case. So there is no point in having two mnt_idmaps in renamedata as they are always the same. Only one of them is passed to ->rename in any case. This patch replaces both with a single "mnt_idmap" and changes all callers. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- fs/cachefiles/namei.c | 3 +-- fs/ecryptfs/inode.c | 3 +-- fs/namei.c | 17 ++++++++--------- fs/nfsd/vfs.c | 3 +-- fs/overlayfs/overlayfs.h | 3 +-- fs/smb/server/vfs.c | 3 +-- include/linux/fs.h | 6 ++---- 7 files changed, 15 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 91dfd0231877..d1edb2ac3837 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -387,10 +387,9 @@ try_again: cachefiles_io_error(cache, "Rename security error %d", ret); } else { struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, + .mnt_idmap = &nop_mnt_idmap, .old_parent = dir, .old_dentry = rep, - .new_mnt_idmap = &nop_mnt_idmap, .new_parent = cache->graveyard, .new_dentry = grave, }; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 72fbe1316ab8..abd954c6a14e 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -634,10 +634,9 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_lock; } - rd.old_mnt_idmap = &nop_mnt_idmap; + rd.mnt_idmap = &nop_mnt_idmap; rd.old_parent = lower_old_dir_dentry; rd.old_dentry = lower_old_dentry; - rd.new_mnt_idmap = &nop_mnt_idmap; rd.new_parent = lower_new_dir_dentry; rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); diff --git a/fs/namei.c b/fs/namei.c index e2c2ab286bc0..180037b96956 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5077,20 +5077,20 @@ int vfs_rename(struct renamedata *rd) if (source == target) return 0; - error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); + error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); + error = may_create(rd->mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(rd->new_mnt_idmap, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, is_dir); else - error = may_delete(rd->new_mnt_idmap, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) @@ -5105,13 +5105,13 @@ int vfs_rename(struct renamedata *rd) */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(rd->old_mnt_idmap, source, + error = inode_permission(rd->mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(rd->new_mnt_idmap, target, + error = inode_permission(rd->mnt_idmap, target, MAY_WRITE); if (error) return error; @@ -5179,7 +5179,7 @@ int vfs_rename(struct renamedata *rd) if (error) goto out; } - error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, + error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; @@ -5322,10 +5322,9 @@ retry_deleg: rd.old_parent = old_path.dentry; rd.old_dentry = old_dentry; - rd.old_mnt_idmap = mnt_idmap(old_path.mnt); + rd.mnt_idmap = mnt_idmap(old_path.mnt); rd.new_parent = new_path.dentry; rd.new_dentry = new_dentry; - rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 98ab55ba3ced..5f3e99f956ca 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1943,10 +1943,9 @@ retry: goto out_dput_old; } else { struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, + .mnt_idmap = &nop_mnt_idmap, .old_parent = fdentry, .old_dentry = odentry, - .new_mnt_idmap = &nop_mnt_idmap, .new_parent = tdentry, .new_dentry = ndentry, }; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index bb0d7ded8e76..4f84abaa0d68 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -361,10 +361,9 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, { int err; struct renamedata rd = { - .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), + .mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_parent = olddir, .old_dentry = olddentry, - .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), .new_parent = newdir, .new_dentry = newdentry, .flags = flags, diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 04539037108c..07739055ac9f 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -770,10 +770,9 @@ retry: goto out4; } - rd.old_mnt_idmap = mnt_idmap(old_path->mnt), + rd.mnt_idmap = mnt_idmap(old_path->mnt), rd.old_parent = old_parent, rd.old_dentry = old_child, - rd.new_mnt_idmap = mnt_idmap(new_path.mnt), rd.new_parent = new_path.dentry, rd.new_dentry = new_dentry, rd.flags = flags, diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..73b39e5bb9e4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2008,20 +2008,18 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, /** * struct renamedata - contains all information required for renaming - * @old_mnt_idmap: idmap of the old mount the inode was found from + * @mnt_idmap: idmap of the mount in which the rename is happening. * @old_parent: parent of source * @old_dentry: source - * @new_mnt_idmap: idmap of the new mount the inode was found from * @new_parent: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { - struct mnt_idmap *old_mnt_idmap; + struct mnt_idmap *mnt_idmap; struct dentry *old_parent; struct dentry *old_dentry; - struct mnt_idmap *new_mnt_idmap; struct dentry *new_parent; struct dentry *new_dentry; struct inode **delegated_inode; -- cgit v1.2.3 From 76a53de6f7ff0641570364234fb4489f4d4fc8e9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:51 +1000 Subject: VFS/audit: introduce kern_path_parent() for audit audit_alloc_mark() and audit_get_nd() both need to perform a path lookup getting the parent dentry (which must exist) and the final target (following a LAST_NORM name) which sometimes doesn't need to exist. They don't need the parent to be locked, but use kern_path_locked() or kern_path_locked_negative() anyway. This is somewhat misleading to the casual reader. This patch introduces a more targeted function, kern_path_parent(), which returns not holding locks. On success the "path" will be set to the parent, which must be found, and the return value is the dentry of the target, which might be negative. This will clear the way to rename kern_path_locked() which is otherwise only used to prepare for removing something. It also allows us to remove kern_path_locked_negative(), which is transformed into the new kern_path_parent(). Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- fs/namei.c | 23 +++++++++++++++++------ include/linux/namei.h | 2 +- kernel/audit_fsnotify.c | 11 ++++++----- kernel/audit_watch.c | 3 +-- 4 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/namei.c b/fs/namei.c index 180037b96956..4017bc8641d3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2781,7 +2781,20 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct return d; } -struct dentry *kern_path_locked_negative(const char *name, struct path *path) +/** + * kern_path_parent: lookup path returning parent and target + * @name: path name + * @path: path to store parent in + * + * The path @name should end with a normal component, not "." or ".." or "/". + * A lookup is performed and if successful the parent information + * is store in @parent and the dentry is returned. + * + * The dentry maybe negative, the parent will be positive. + * + * Returns: dentry or error. + */ +struct dentry *kern_path_parent(const char *name, struct path *path) { struct path parent_path __free(path_put) = {}; struct filename *filename __free(putname) = getname_kernel(name); @@ -2794,12 +2807,10 @@ struct dentry *kern_path_locked_negative(const char *name, struct path *path) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); - inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE); - if (IS_ERR(d)) { - inode_unlock(parent_path.dentry->d_inode); + + d = lookup_noperm_unlocked(&last, parent_path.dentry); + if (IS_ERR(d)) return d; - } path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; diff --git a/include/linux/namei.h b/include/linux/namei.h index 551a1a01e5e7..1d5038c21c20 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -57,12 +57,12 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags); extern int kern_path(const char *, unsigned, struct path *); +struct dentry *kern_path_parent(const char *name, struct path *parent); extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); -extern struct dentry *kern_path_locked_negative(const char *, struct path *); extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index c565fbf66ac8..b92805b317a2 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -76,17 +76,18 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa struct audit_fsnotify_mark *audit_mark; struct path path; struct dentry *dentry; - struct inode *inode; int ret; if (pathname[0] != '/' || pathname[len-1] == '/') return ERR_PTR(-EINVAL); - dentry = kern_path_locked(pathname, &path); + dentry = kern_path_parent(pathname, &path); if (IS_ERR(dentry)) return ERR_CAST(dentry); /* returning an error */ - inode = path.dentry->d_inode; - inode_unlock(inode); + if (d_really_is_negative(dentry)) { + audit_mark = ERR_PTR(-ENOENT); + goto out; + } audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { @@ -100,7 +101,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); + ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0); if (ret < 0) { audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0ebbbe37a60f..a700e3c8925f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -349,7 +349,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { struct dentry *d; - d = kern_path_locked_negative(watch->path, parent); + d = kern_path_parent(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); @@ -359,7 +359,6 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) watch->ino = d_backing_inode(d)->i_ino; } - inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; } -- cgit v1.2.3 From 3d18f80ce181ba27f37d0ec1c550b22acb01dd49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:52 +1000 Subject: VFS: rename kern_path_locked() and related functions. kern_path_locked() is now only used to prepare for removing an object from the filesystem (and that is the only credible reason for wanting a positive locked dentry). Thus it corresponds to kern_path_create() and so should have a corresponding name. Unfortunately the name "kern_path_create" is somewhat misleading as it doesn't actually create anything. The recently added simple_start_creating() provides a better pattern I believe. The "start" can be matched with "end" to bracket the creating or removing. So this patch changes names: kern_path_locked -> start_removing_path kern_path_create -> start_creating_path user_path_create -> start_creating_user_path user_path_locked_at -> start_removing_user_path_at done_path_create -> end_creating_path and also introduces end_removing_path() which is identical to end_creating_path(). __start_removing_path (which was __kern_path_locked) is enhanced to call mnt_want_write() for consistency with the start_creating_path(). Reviewed-by: Amir Goldstein Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 12 ++++++ arch/powerpc/platforms/cell/spufs/syscalls.c | 4 +- drivers/base/devtmpfs.c | 22 +++++------ fs/bcachefs/fs-ioctl.c | 10 ++--- fs/init.c | 17 ++++---- fs/namei.c | 59 +++++++++++++++++----------- fs/ocfs2/refcounttree.c | 4 +- fs/smb/server/vfs.c | 8 ++-- include/linux/namei.h | 14 ++++--- kernel/bpf/inode.c | 4 +- net/unix/af_unix.c | 6 +-- 11 files changed, 93 insertions(+), 67 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 85f590254f07..e0494860be6b 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1285,3 +1285,15 @@ rather than a VMA, as the VMA at this stage is not yet valid. The vm_area_desc provides the minimum required information for a filesystem to initialise state upon memory mapping of a file-backed region, and output parameters for the file system to set this state. + +--- + +**mandatory** + +Several functions are renamed: + +- kern_path_locked -> start_removing_path +- kern_path_create -> start_creating_path +- user_path_create -> start_creating_user_path +- user_path_locked_at -> start_removing_user_path_at +- done_path_create -> end_creating_path diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 157e046e6e93..ea4ba1b6ce6a 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -67,11 +67,11 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, struct dentry *dentry; int ret; - dentry = user_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_user_path(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); ret = PTR_ERR(dentry); if (!IS_ERR(dentry)) { ret = spufs_create(&path, dentry, flags, mode, neighbor); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); } return ret; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 31bfb3194b4c..9d4e46ad8352 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -176,7 +176,7 @@ static int dev_mkdir(const char *name, umode_t mode) struct dentry *dentry; struct path path; - dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -184,7 +184,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return PTR_ERR_OR_ZERO(dentry); } @@ -222,10 +222,10 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, struct path path; int err; - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -246,7 +246,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -256,7 +256,7 @@ static int dev_rmdir(const char *name) struct dentry *dentry; int err; - dentry = kern_path_locked(name, &parent); + dentry = start_removing_path(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) @@ -265,9 +265,7 @@ static int dev_rmdir(const char *name) else err = -EPERM; - dput(dentry); - inode_unlock(d_inode(parent.dentry)); - path_put(&parent); + end_removing_path(&parent, dentry); return err; } @@ -325,7 +323,7 @@ static int handle_remove(const char *nodename, struct device *dev) int deleted = 0; int err = 0; - dentry = kern_path_locked(nodename, &parent); + dentry = start_removing_path(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -349,10 +347,8 @@ static int handle_remove(const char *nodename, struct device *dev) if (!err || err == -ENOENT) deleted = 1; } - dput(dentry); - inode_unlock(d_inode(parent.dentry)); + end_removing_path(&parent, dentry); - path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 4e72e654da96..43510da5e734 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -255,7 +255,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); } - dst_dentry = user_path_create(arg.dirfd, + dst_dentry = start_creating_user_path(arg.dirfd, (const char __user *)(unsigned long)arg.dst_ptr, &dst_path, lookup_flags); error = PTR_ERR_OR_ZERO(dst_dentry); @@ -314,7 +314,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, d_instantiate(dst_dentry, &inode->v); fsnotify_mkdir(dir, dst_dentry); err3: - done_path_create(&dst_path, dst_dentry); + end_creating_path(&dst_path, dst_dentry); err2: if (arg.src_ptr) path_put(&src_path); @@ -334,7 +334,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, if (arg.flags) return -EINVAL; - victim = user_path_locked_at(arg.dirfd, name, &path); + victim = start_removing_user_path_at(arg.dirfd, name, &path); if (IS_ERR(victim)) return PTR_ERR(victim); @@ -351,9 +351,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, d_invalidate(victim); } err: - inode_unlock(dir); - dput(victim); - path_put(&path); + end_removing_path(&path, victim); return ret; } diff --git a/fs/init.c b/fs/init.c index eef5124885e3..07f592ccdba8 100644 --- a/fs/init.c +++ b/fs/init.c @@ -149,7 +149,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) else if (!(S_ISBLK(mode) || S_ISCHR(mode))) return -EINVAL; - dentry = kern_path_create(AT_FDCWD, filename, &path, 0); + dentry = start_creating_path(AT_FDCWD, filename, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -158,7 +158,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -173,7 +173,7 @@ int __init init_link(const char *oldname, const char *newname) if (error) return error; - new_dentry = kern_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out; @@ -191,7 +191,7 @@ int __init init_link(const char *oldname, const char *newname) error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, NULL); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); return error; @@ -203,14 +203,14 @@ int __init init_symlink(const char *oldname, const char *newname) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, newname, &path, 0); + dentry = start_creating_path(AT_FDCWD, newname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, oldname); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -225,7 +225,8 @@ int __init init_mkdir(const char *pathname, umode_t mode) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, pathname, &path, + LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); mode = mode_strip_umask(d_inode(path.dentry), mode); @@ -236,7 +237,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } diff --git a/fs/namei.c b/fs/namei.c index 4017bc8641d3..92973a7a8091 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2758,7 +2758,8 @@ static int filename_parentat(int dfd, struct filename *name, } /* does lookup, returns the object with parent locked */ -static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) +static struct dentry *__start_removing_path(int dfd, struct filename *name, + struct path *path) { struct path parent_path __free(path_put) = {}; struct dentry *d; @@ -2770,15 +2771,26 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); + /* don't fail immediately if it's r/o, at least try to report other errors */ + error = mnt_want_write(parent_path.mnt); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); - if (IS_ERR(d)) { - inode_unlock(parent_path.dentry->d_inode); - return d; - } + if (IS_ERR(d)) + goto unlock; + if (error) + goto fail; path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; + +fail: + dput(d); + d = ERR_PTR(error); +unlock: + inode_unlock(parent_path.dentry->d_inode); + if (!error) + mnt_drop_write(parent_path.mnt); + return d; } /** @@ -2816,24 +2828,26 @@ struct dentry *kern_path_parent(const char *name, struct path *path) return d; } -struct dentry *kern_path_locked(const char *name, struct path *path) +struct dentry *start_removing_path(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); - struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); + struct dentry *res = __start_removing_path(AT_FDCWD, filename, path); putname(filename); return res; } -struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) +struct dentry *start_removing_user_path_at(int dfd, + const char __user *name, + struct path *path) { struct filename *filename = getname(name); - struct dentry *res = __kern_path_locked(dfd, filename, path); + struct dentry *res = __start_removing_path(dfd, filename, path); putname(filename); return res; } -EXPORT_SYMBOL(user_path_locked_at); +EXPORT_SYMBOL(start_removing_user_path_at); int kern_path(const char *name, unsigned int flags, struct path *path) { @@ -4223,8 +4237,8 @@ out: return dentry; } -struct dentry *kern_path_create(int dfd, const char *pathname, - struct path *path, unsigned int lookup_flags) +struct dentry *start_creating_path(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4232,9 +4246,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, putname(filename); return res; } -EXPORT_SYMBOL(kern_path_create); +EXPORT_SYMBOL(start_creating_path); -void done_path_create(struct path *path, struct dentry *dentry) +void end_creating_path(struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); @@ -4242,10 +4256,11 @@ void done_path_create(struct path *path, struct dentry *dentry) mnt_drop_write(path->mnt); path_put(path); } -EXPORT_SYMBOL(done_path_create); +EXPORT_SYMBOL(end_creating_path); -inline struct dentry *user_path_create(int dfd, const char __user *pathname, - struct path *path, unsigned int lookup_flags) +inline struct dentry *start_creating_user_path( + int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4253,7 +4268,7 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, putname(filename); return res; } -EXPORT_SYMBOL(user_path_create); +EXPORT_SYMBOL(start_creating_user_path); /** * vfs_mknod - create device node or file @@ -4361,7 +4376,7 @@ retry: break; } out2: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4465,7 +4480,7 @@ retry: if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4819,7 +4834,7 @@ retry: if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4988,7 +5003,7 @@ retry: error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8f732742b26e..267b50e8e42e 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4418,7 +4418,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, return error; } - new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) { mlog_errno(error); @@ -4435,7 +4435,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, d_inode(new_path.dentry), new_dentry, preserve); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 07739055ac9f..1cfa688904b2 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -196,7 +196,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) pr_err("File(%s): creation failed (err:%d)\n", name, err); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -237,7 +237,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) if (!err && dentry != d) ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); return err; @@ -669,7 +669,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, ksmbd_debug(VFS, "vfs_link failed err %d\n", err); out3: - done_path_create(&newpath, dentry); + end_creating_path(&newpath, dentry); out2: path_put(&oldpath); out1: @@ -1325,7 +1325,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, if (!abs_name) return ERR_PTR(-ENOMEM); - dent = kern_path_create(AT_FDCWD, abs_name, path, flags); + dent = start_creating_path(AT_FDCWD, abs_name, path, flags); kfree(abs_name); return dent; } diff --git a/include/linux/namei.h b/include/linux/namei.h index 1d5038c21c20..a7800ef04e76 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -59,11 +59,15 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, extern int kern_path(const char *, unsigned, struct path *); struct dentry *kern_path_parent(const char *name, struct path *parent); -extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); -extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); -extern void done_path_create(struct path *, struct dentry *); -extern struct dentry *kern_path_locked(const char *, struct path *); -extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); +extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int); +extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); +extern void end_creating_path(struct path *, struct dentry *); +extern struct dentry *start_removing_path(const char *, struct path *); +extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); +static inline void end_removing_path(struct path *path , struct dentry *dentry) +{ + end_creating_path(path, dentry); +} int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392..fadf3817a9c5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(path_fd, pathname, &path, 0); + dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, ret = -EPERM; } out: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return ret; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6d7c110814ff..768098dec231 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1387,7 +1387,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); + dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; @@ -1417,7 +1417,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); return 0; out_unlock: @@ -1427,7 +1427,7 @@ out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); out: unix_release_addr(addr); return err == -EEXIST ? -EADDRINUSE : err; -- cgit v1.2.3 From 1abc1b212effe920f4729353880c8e03f1d76b4b Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 31 Jul 2025 13:23:40 +0100 Subject: coresight: Appropriately disable programming clocks Some CoreSight components have programming clocks (pclk) and are enabled using clk_get() and clk_prepare_enable(). However, in many cases, these clocks are not disabled when modules exit and only released by clk_put(). To fix the issue, this commit refactors programming clock by replacing clk_get() and clk_prepare_enable() with devm_clk_get_optional_enabled() for enabling APB clock. If the "apb_pclk" clock is not found, a NULL pointer is returned, and the function proceeds to attempt enabling the "apb" clock. Since ACPI platforms rely on firmware to manage clocks, returning a NULL pointer in this case leaves clock management to the firmware rather than the driver. This effectively avoids a clock imbalance issue during module removal - where the clock could be disabled twice: once during the ACPI runtime suspend and again during the devm resource release. Callers are updated to reuse the returned error value. With the change, programming clocks are managed as resources in driver model layer, allowing clock cleanup to be handled automatically. As a result, manual cleanup operations are no longer needed and are removed from the Coresight drivers. Fixes: 73d779a03a76 ("coresight: etm4x: Change etm4_platform_driver driver for MMIO devices") Reviewed-by: Yeoreum Yun Tested-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250731-arm_cs_fix_clock_v4-v6-4-1dfe10bb3f6f@arm.com --- drivers/hwtracing/coresight/coresight-catu.c | 9 ++------- drivers/hwtracing/coresight/coresight-cpu-debug.c | 6 +----- drivers/hwtracing/coresight/coresight-ctcu-core.c | 10 ++-------- drivers/hwtracing/coresight/coresight-etm4x-core.c | 9 ++------- drivers/hwtracing/coresight/coresight-funnel.c | 6 +----- drivers/hwtracing/coresight/coresight-replicator.c | 6 +----- drivers/hwtracing/coresight/coresight-stm.c | 4 +--- drivers/hwtracing/coresight/coresight-tmc-core.c | 4 +--- drivers/hwtracing/coresight/coresight-tpiu.c | 4 +--- include/linux/coresight.h | 22 +++++++++------------- 10 files changed, 21 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c index af2a55f0c907..4c345ff2cff1 100644 --- a/drivers/hwtracing/coresight/coresight-catu.c +++ b/drivers/hwtracing/coresight/coresight-catu.c @@ -636,7 +636,7 @@ static int catu_platform_probe(struct platform_device *pdev) drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); @@ -645,11 +645,8 @@ static int catu_platform_probe(struct platform_device *pdev) dev_set_drvdata(&pdev->dev, drvdata); ret = __catu_probe(&pdev->dev, res); pm_runtime_put(&pdev->dev); - if (ret) { + if (ret) pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); - } return ret; } @@ -663,8 +660,6 @@ static void catu_platform_remove(struct platform_device *pdev) __catu_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_PM diff --git a/drivers/hwtracing/coresight/coresight-cpu-debug.c b/drivers/hwtracing/coresight/coresight-cpu-debug.c index a871d997330b..e39dfb886688 100644 --- a/drivers/hwtracing/coresight/coresight-cpu-debug.c +++ b/drivers/hwtracing/coresight/coresight-cpu-debug.c @@ -699,7 +699,7 @@ static int debug_platform_probe(struct platform_device *pdev) drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); dev_set_drvdata(&pdev->dev, drvdata); pm_runtime_get_noresume(&pdev->dev); @@ -710,8 +710,6 @@ static int debug_platform_probe(struct platform_device *pdev) if (ret) { pm_runtime_put_noidle(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } return ret; } @@ -725,8 +723,6 @@ static void debug_platform_remove(struct platform_device *pdev) __debug_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_ACPI diff --git a/drivers/hwtracing/coresight/coresight-ctcu-core.c b/drivers/hwtracing/coresight/coresight-ctcu-core.c index c6bafc96db96..de279efe3405 100644 --- a/drivers/hwtracing/coresight/coresight-ctcu-core.c +++ b/drivers/hwtracing/coresight/coresight-ctcu-core.c @@ -209,7 +209,7 @@ static int ctcu_probe(struct platform_device *pdev) drvdata->apb_clk = coresight_get_enable_apb_pclk(dev); if (IS_ERR(drvdata->apb_clk)) - return -ENODEV; + return PTR_ERR(drvdata->apb_clk); cfgs = of_device_get_match_data(dev); if (cfgs) { @@ -233,12 +233,8 @@ static int ctcu_probe(struct platform_device *pdev) desc.access = CSDEV_ACCESS_IOMEM(base); drvdata->csdev = coresight_register(&desc); - if (IS_ERR(drvdata->csdev)) { - if (!IS_ERR_OR_NULL(drvdata->apb_clk)) - clk_put(drvdata->apb_clk); - + if (IS_ERR(drvdata->csdev)) return PTR_ERR(drvdata->csdev); - } return 0; } @@ -275,8 +271,6 @@ static void ctcu_platform_remove(struct platform_device *pdev) ctcu_remove(pdev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->apb_clk)) - clk_put(drvdata->apb_clk); } #ifdef CONFIG_PM diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c index 81f20a167e00..4b98a7bf4cb7 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x-core.c +++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c @@ -2309,14 +2309,12 @@ static int etm4_probe_platform_dev(struct platform_device *pdev) drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); if (res) { drvdata->base = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(drvdata->base)) { - clk_put(drvdata->pclk); + if (IS_ERR(drvdata->base)) return PTR_ERR(drvdata->base); - } } dev_set_drvdata(&pdev->dev, drvdata); @@ -2423,9 +2421,6 @@ static void etm4_remove_platform_dev(struct platform_device *pdev) if (drvdata) etm4_remove_dev(drvdata); pm_runtime_disable(&pdev->dev); - - if (drvdata && !IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } static const struct amba_id etm4_ids[] = { diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c index b1922dbe9292..36fc4e991458 100644 --- a/drivers/hwtracing/coresight/coresight-funnel.c +++ b/drivers/hwtracing/coresight/coresight-funnel.c @@ -240,7 +240,7 @@ static int funnel_probe(struct device *dev, struct resource *res) drvdata->pclk = coresight_get_enable_apb_pclk(dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); /* * Map the device base for dynamic-funnel, which has been @@ -284,8 +284,6 @@ static int funnel_probe(struct device *dev, struct resource *res) out_disable_clk: if (ret && !IS_ERR_OR_NULL(drvdata->atclk)) clk_disable_unprepare(drvdata->atclk); - if (ret && !IS_ERR_OR_NULL(drvdata->pclk)) - clk_disable_unprepare(drvdata->pclk); return ret; } @@ -355,8 +353,6 @@ static void funnel_platform_remove(struct platform_device *pdev) funnel_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } static const struct of_device_id funnel_match[] = { diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c index 06efd2b01a0f..6dd24eb10a94 100644 --- a/drivers/hwtracing/coresight/coresight-replicator.c +++ b/drivers/hwtracing/coresight/coresight-replicator.c @@ -247,7 +247,7 @@ static int replicator_probe(struct device *dev, struct resource *res) drvdata->pclk = coresight_get_enable_apb_pclk(dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); /* * Map the device base for dynamic-replicator, which has been @@ -296,8 +296,6 @@ static int replicator_probe(struct device *dev, struct resource *res) out_disable_clk: if (ret && !IS_ERR_OR_NULL(drvdata->atclk)) clk_disable_unprepare(drvdata->atclk); - if (ret && !IS_ERR_OR_NULL(drvdata->pclk)) - clk_disable_unprepare(drvdata->pclk); return ret; } @@ -335,8 +333,6 @@ static void replicator_platform_remove(struct platform_device *pdev) replicator_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_PM diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c index 464b0c85c3f7..f2de16e4d3b4 100644 --- a/drivers/hwtracing/coresight/coresight-stm.c +++ b/drivers/hwtracing/coresight/coresight-stm.c @@ -851,7 +851,7 @@ static int __stm_probe(struct device *dev, struct resource *res) drvdata->pclk = coresight_get_enable_apb_pclk(dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); dev_set_drvdata(dev, drvdata); base = devm_ioremap_resource(dev, res); @@ -1033,8 +1033,6 @@ static void stm_platform_remove(struct platform_device *pdev) __stm_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_ACPI diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c index 85ba037c27f7..519f22542807 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-core.c +++ b/drivers/hwtracing/coresight/coresight-tmc-core.c @@ -981,7 +981,7 @@ static int tmc_platform_probe(struct platform_device *pdev) drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); dev_set_drvdata(&pdev->dev, drvdata); pm_runtime_get_noresume(&pdev->dev); @@ -1005,8 +1005,6 @@ static void tmc_platform_remove(struct platform_device *pdev) __tmc_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_PM diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c index 3e0159288428..b2559c6fac6d 100644 --- a/drivers/hwtracing/coresight/coresight-tpiu.c +++ b/drivers/hwtracing/coresight/coresight-tpiu.c @@ -153,7 +153,7 @@ static int __tpiu_probe(struct device *dev, struct resource *res) drvdata->pclk = coresight_get_enable_apb_pclk(dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); dev_set_drvdata(dev, drvdata); /* Validity for the resource is already checked by the AMBA core */ @@ -293,8 +293,6 @@ static void tpiu_platform_remove(struct platform_device *pdev) __tpiu_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_ACPI diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 4ac65c68bbf4..1e652e157841 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -6,6 +6,7 @@ #ifndef _LINUX_CORESIGHT_H #define _LINUX_CORESIGHT_H +#include #include #include #include @@ -480,26 +481,21 @@ static inline bool is_coresight_device(void __iomem *base) * Returns: * * clk - Clock is found and enabled - * NULL - clock is not found + * NULL - Clock is controlled by firmware (ACPI device only) * ERROR - Clock is found but failed to enable */ static inline struct clk *coresight_get_enable_apb_pclk(struct device *dev) { struct clk *pclk; - int ret; - pclk = clk_get(dev, "apb_pclk"); - if (IS_ERR(pclk)) { - pclk = clk_get(dev, "apb"); - if (IS_ERR(pclk)) - return NULL; - } + /* Firmware controls clocks for an ACPI device. */ + if (has_acpi_companion(dev)) + return NULL; + + pclk = devm_clk_get_optional_enabled(dev, "apb_pclk"); + if (!pclk) + pclk = devm_clk_get_optional_enabled(dev, "apb"); - ret = clk_prepare_enable(pclk); - if (ret) { - clk_put(pclk); - return ERR_PTR(ret); - } return pclk; } -- cgit v1.2.3 From d091c6312561821f216ced63a7ad17c946b6d335 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 31 Jul 2025 13:23:42 +0100 Subject: coresight: Avoid enable programming clock duplicately The programming clock is enabled by AMBA bus driver before a dynamic probe. As a result, a CoreSight driver may redundantly enable the same clock. To avoid this, add a check for device type and skip enabling the programming clock for AMBA devices. The returned NULL pointer will be tolerated by the drivers. Fixes: 73d779a03a76 ("coresight: etm4x: Change etm4_platform_driver driver for MMIO devices") Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Tested-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250731-arm_cs_fix_clock_v4-v6-6-1dfe10bb3f6f@arm.com --- include/linux/coresight.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 1e652e157841..bb49080ec8f9 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -481,20 +481,23 @@ static inline bool is_coresight_device(void __iomem *base) * Returns: * * clk - Clock is found and enabled - * NULL - Clock is controlled by firmware (ACPI device only) + * NULL - Clock is controlled by firmware (ACPI device only) or when managed + * by the AMBA bus driver instead * ERROR - Clock is found but failed to enable */ static inline struct clk *coresight_get_enable_apb_pclk(struct device *dev) { - struct clk *pclk; + struct clk *pclk = NULL; /* Firmware controls clocks for an ACPI device. */ if (has_acpi_companion(dev)) return NULL; - pclk = devm_clk_get_optional_enabled(dev, "apb_pclk"); - if (!pclk) - pclk = devm_clk_get_optional_enabled(dev, "apb"); + if (!dev_is_amba(dev)) { + pclk = devm_clk_get_optional_enabled(dev, "apb_pclk"); + if (!pclk) + pclk = devm_clk_get_optional_enabled(dev, "apb"); + } return pclk; } -- cgit v1.2.3 From fbe7514a7912959e384acf108931ac1bfbb16466 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 31 Jul 2025 13:23:43 +0100 Subject: coresight: Consolidate clock enabling CoreSight drivers enable pclk and atclk conditionally. For example, pclk is only enabled in the static probe, while atclk is an optional clock that it is enabled for both dynamic and static probes, if it is present. In the current CoreSight drivers, these two clocks are initialized separately. This causes complex and duplicate codes. CoreSight drivers are refined so that clocks are initialized in one go. For this purpose, this commit renames coresight_get_enable_apb_pclk() to coresight_get_enable_clocks() and encapsulates clock initialization logic: - If a clock is initialized successfully, its clock pointer is assigned to the double pointer passed as an argument. - For ACPI devices, clocks are controlled by firmware, directly bail out. - Skip enabling pclk for an AMBA device. - If atclk is not found, the corresponding double pointer is set to NULL. The function returns Success (0) to guide callers can proceed with no error. - Otherwise, an error number is returned for failures. The function became complex, move it from the header to the CoreSight core layer and the symbol is exported. Added comments for recording details. Suggested-by: Suzuki K Poulose Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Tested-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250731-arm_cs_fix_clock_v4-v6-7-1dfe10bb3f6f@arm.com --- drivers/hwtracing/coresight/coresight-catu.c | 10 ++--- drivers/hwtracing/coresight/coresight-core.c | 48 ++++++++++++++++++++++ drivers/hwtracing/coresight/coresight-cpu-debug.c | 8 ++-- drivers/hwtracing/coresight/coresight-ctcu-core.c | 8 ++-- drivers/hwtracing/coresight/coresight-etm4x-core.c | 11 ++--- drivers/hwtracing/coresight/coresight-funnel.c | 11 ++--- drivers/hwtracing/coresight/coresight-replicator.c | 11 ++--- drivers/hwtracing/coresight/coresight-stm.c | 9 ++-- drivers/hwtracing/coresight/coresight-tmc-core.c | 10 ++--- drivers/hwtracing/coresight/coresight-tpiu.c | 10 ++--- include/linux/coresight.h | 30 +------------- 11 files changed, 83 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c index 4c345ff2cff1..0f476a0cbd74 100644 --- a/drivers/hwtracing/coresight/coresight-catu.c +++ b/drivers/hwtracing/coresight/coresight-catu.c @@ -520,9 +520,9 @@ static int __catu_probe(struct device *dev, struct resource *res) struct coresight_platform_data *pdata = NULL; void __iomem *base; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; catu_desc.name = coresight_alloc_device_name(&catu_devs, dev); if (!catu_desc.name) @@ -634,10 +634,6 @@ static int catu_platform_probe(struct platform_device *pdev) if (!drvdata) return -ENOMEM; - drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); - pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c index 1accd7cbd54b..3267192f0c1c 100644 --- a/drivers/hwtracing/coresight/coresight-core.c +++ b/drivers/hwtracing/coresight/coresight-core.c @@ -3,6 +3,7 @@ * Copyright (c) 2012, The Linux Foundation. All rights reserved. */ +#include #include #include #include @@ -1700,6 +1701,53 @@ int coresight_etm_get_trace_id(struct coresight_device *csdev, enum cs_mode mode } EXPORT_SYMBOL_GPL(coresight_etm_get_trace_id); +/* + * Attempt to find and enable programming clock (pclk) and trace clock (atclk) + * for the given device. + * + * For ACPI devices, clocks are controlled by firmware, so bail out early in + * this case. Also, skip enabling pclk if the clock is managed by the AMBA + * bus driver instead. + * + * atclk is an optional clock, it will be only enabled when it is existed. + * Otherwise, a NULL pointer will be returned to caller. + * + * Returns: '0' on Success; Error code otherwise. + */ +int coresight_get_enable_clocks(struct device *dev, struct clk **pclk, + struct clk **atclk) +{ + WARN_ON(!pclk); + + if (has_acpi_companion(dev)) + return 0; + + if (!dev_is_amba(dev)) { + /* + * "apb_pclk" is the default clock name for an Arm Primecell + * peripheral, while "apb" is used only by the CTCU driver. + * + * For easier maintenance, CoreSight drivers should use + * "apb_pclk" as the programming clock name. + */ + *pclk = devm_clk_get_optional_enabled(dev, "apb_pclk"); + if (!*pclk) + *pclk = devm_clk_get_optional_enabled(dev, "apb"); + if (IS_ERR(*pclk)) + return PTR_ERR(*pclk); + } + + /* Initialization of atclk is skipped if it is a NULL pointer. */ + if (atclk) { + *atclk = devm_clk_get_optional_enabled(dev, "atclk"); + if (IS_ERR(*atclk)) + return PTR_ERR(*atclk); + } + + return 0; +} +EXPORT_SYMBOL_GPL(coresight_get_enable_clocks); + MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Pratik Patel "); MODULE_AUTHOR("Mathieu Poirier "); diff --git a/drivers/hwtracing/coresight/coresight-cpu-debug.c b/drivers/hwtracing/coresight/coresight-cpu-debug.c index e39dfb886688..5f6db2fb95d4 100644 --- a/drivers/hwtracing/coresight/coresight-cpu-debug.c +++ b/drivers/hwtracing/coresight/coresight-cpu-debug.c @@ -566,6 +566,10 @@ static int __debug_probe(struct device *dev, struct resource *res) void __iomem *base; int ret; + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, NULL); + if (ret) + return ret; + drvdata->cpu = coresight_get_cpu(dev); if (drvdata->cpu < 0) return drvdata->cpu; @@ -697,10 +701,6 @@ static int debug_platform_probe(struct platform_device *pdev) if (!drvdata) return -ENOMEM; - drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); - dev_set_drvdata(&pdev->dev, drvdata); pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); diff --git a/drivers/hwtracing/coresight/coresight-ctcu-core.c b/drivers/hwtracing/coresight/coresight-ctcu-core.c index de279efe3405..75b5114ef652 100644 --- a/drivers/hwtracing/coresight/coresight-ctcu-core.c +++ b/drivers/hwtracing/coresight/coresight-ctcu-core.c @@ -188,7 +188,7 @@ static int ctcu_probe(struct platform_device *pdev) const struct ctcu_config *cfgs; struct ctcu_drvdata *drvdata; void __iomem *base; - int i; + int i, ret; desc.name = coresight_alloc_device_name(&ctcu_devs, dev); if (!desc.name) @@ -207,9 +207,9 @@ static int ctcu_probe(struct platform_device *pdev) if (IS_ERR(base)) return PTR_ERR(base); - drvdata->apb_clk = coresight_get_enable_apb_pclk(dev); - if (IS_ERR(drvdata->apb_clk)) - return PTR_ERR(drvdata->apb_clk); + ret = coresight_get_enable_clocks(dev, &drvdata->apb_clk, NULL); + if (ret) + return ret; cfgs = of_device_get_match_data(dev); if (cfgs) { diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c index 4b98a7bf4cb7..020f070bf17d 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x-core.c +++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c @@ -2217,13 +2217,14 @@ static int etm4_probe(struct device *dev) struct csdev_access access = { 0 }; struct etm4_init_arg init_arg = { 0 }; struct etm4_init_arg *delayed; + int ret; if (WARN_ON(!drvdata)) return -ENOMEM; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; if (pm_save_enable == PARAM_PM_SAVE_FIRMWARE) pm_save_enable = coresight_loses_context_with_cpu(dev) ? @@ -2307,10 +2308,6 @@ static int etm4_probe_platform_dev(struct platform_device *pdev) if (!drvdata) return -ENOMEM; - drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); - if (res) { drvdata->base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(drvdata->base)) diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c index b044a4125310..02e0dc678a32 100644 --- a/drivers/hwtracing/coresight/coresight-funnel.c +++ b/drivers/hwtracing/coresight/coresight-funnel.c @@ -217,6 +217,7 @@ static int funnel_probe(struct device *dev, struct resource *res) struct coresight_platform_data *pdata = NULL; struct funnel_drvdata *drvdata; struct coresight_desc desc = { 0 }; + int ret; if (is_of_node(dev_fwnode(dev)) && of_device_is_compatible(dev->of_node, "arm,coresight-funnel")) @@ -230,13 +231,9 @@ static int funnel_probe(struct device *dev, struct resource *res) if (!drvdata) return -ENOMEM; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); - - drvdata->pclk = coresight_get_enable_apb_pclk(dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; /* * Map the device base for dynamic-funnel, which has been diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c index 9e8bd36e7a9a..f1bbd12e63e0 100644 --- a/drivers/hwtracing/coresight/coresight-replicator.c +++ b/drivers/hwtracing/coresight/coresight-replicator.c @@ -223,6 +223,7 @@ static int replicator_probe(struct device *dev, struct resource *res) struct replicator_drvdata *drvdata; struct coresight_desc desc = { 0 }; void __iomem *base; + int ret; if (is_of_node(dev_fwnode(dev)) && of_device_is_compatible(dev->of_node, "arm,coresight-replicator")) @@ -237,13 +238,9 @@ static int replicator_probe(struct device *dev, struct resource *res) if (!drvdata) return -ENOMEM; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); - - drvdata->pclk = coresight_get_enable_apb_pclk(dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; /* * Map the device base for dynamic-replicator, which has been diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c index 275d67b91dfd..23ba33ac4d2e 100644 --- a/drivers/hwtracing/coresight/coresight-stm.c +++ b/drivers/hwtracing/coresight/coresight-stm.c @@ -842,13 +842,10 @@ static int __stm_probe(struct device *dev, struct resource *res) if (!drvdata) return -ENOMEM; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; - drvdata->pclk = coresight_get_enable_apb_pclk(dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); dev_set_drvdata(dev, drvdata); base = devm_ioremap_resource(dev, res); diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c index 519f22542807..25c987e2d114 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-core.c +++ b/drivers/hwtracing/coresight/coresight-tmc-core.c @@ -779,9 +779,9 @@ static int __tmc_probe(struct device *dev, struct resource *res) struct coresight_desc desc = { 0 }; struct coresight_dev_list *dev_list = NULL; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; ret = -ENOMEM; @@ -979,10 +979,6 @@ static int tmc_platform_probe(struct platform_device *pdev) if (!drvdata) return -ENOMEM; - drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); - dev_set_drvdata(&pdev->dev, drvdata); pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c index 8d6179c83e5d..5e47d761e1c4 100644 --- a/drivers/hwtracing/coresight/coresight-tpiu.c +++ b/drivers/hwtracing/coresight/coresight-tpiu.c @@ -132,6 +132,7 @@ static int __tpiu_probe(struct device *dev, struct resource *res) struct coresight_platform_data *pdata = NULL; struct tpiu_drvdata *drvdata; struct coresight_desc desc = { 0 }; + int ret; desc.name = coresight_alloc_device_name(&tpiu_devs, dev); if (!desc.name) @@ -143,13 +144,10 @@ static int __tpiu_probe(struct device *dev, struct resource *res) spin_lock_init(&drvdata->spinlock); - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; - drvdata->pclk = coresight_get_enable_apb_pclk(dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); dev_set_drvdata(dev, drvdata); /* Validity for the resource is already checked by the AMBA core */ diff --git a/include/linux/coresight.h b/include/linux/coresight.h index bb49080ec8f9..6de59ce8ef8c 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -6,7 +6,6 @@ #ifndef _LINUX_CORESIGHT_H #define _LINUX_CORESIGHT_H -#include #include #include #include @@ -475,33 +474,6 @@ static inline bool is_coresight_device(void __iomem *base) return cid == CORESIGHT_CID; } -/* - * Attempt to find and enable "APB clock" for the given device - * - * Returns: - * - * clk - Clock is found and enabled - * NULL - Clock is controlled by firmware (ACPI device only) or when managed - * by the AMBA bus driver instead - * ERROR - Clock is found but failed to enable - */ -static inline struct clk *coresight_get_enable_apb_pclk(struct device *dev) -{ - struct clk *pclk = NULL; - - /* Firmware controls clocks for an ACPI device. */ - if (has_acpi_companion(dev)) - return NULL; - - if (!dev_is_amba(dev)) { - pclk = devm_clk_get_optional_enabled(dev, "apb_pclk"); - if (!pclk) - pclk = devm_clk_get_optional_enabled(dev, "apb"); - } - - return pclk; -} - #define CORESIGHT_PIDRn(i) (0xFE0 + ((i) * 4)) static inline u32 coresight_get_pid(struct csdev_access *csa) @@ -732,4 +704,6 @@ void coresight_remove_driver(struct amba_driver *amba_drv, struct platform_driver *pdev_drv); int coresight_etm_get_trace_id(struct coresight_device *csdev, enum cs_mode mode, struct coresight_device *sink); +int coresight_get_enable_clocks(struct device *dev, struct clk **pclk, + struct clk **atclk); #endif /* _LINUX_COREISGHT_H */ -- cgit v1.2.3 From 559f2eacc8a23c7f44daac09d4f3efd958d497f2 Mon Sep 17 00:00:00 2001 From: Huisong Li Date: Tue, 23 Sep 2025 11:24:28 +0800 Subject: ACPI: processor: Do not expose global variable acpi_idle_driver Move the cpuidle driver check from __acpi_processor_start() to acpi_processor_power_init() which allows variable acpi_idle_driver to become static. No intentional functional impact. Signed-off-by: Huisong Li Link: https://patch.msgid.link/20250923032428.2656329-2-lihuisong@huawei.com [ rjw: Subject tweak, new changelog, adjustment of a new comment ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/processor_driver.c | 3 +-- drivers/acpi/processor_idle.c | 9 ++++++++- include/acpi/processor.h | 1 - 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index de17c1412678..5d824435b26b 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -166,8 +166,7 @@ static int __acpi_processor_start(struct acpi_device *device) if (result && !IS_ENABLED(CONFIG_ACPI_CPU_FREQ_PSS)) dev_dbg(&device->dev, "CPPC data invalid or not present\n"); - if (cpuidle_get_driver() == &acpi_idle_driver) - acpi_processor_power_init(pr); + acpi_processor_power_init(pr); acpi_pss_perf_init(pr); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 698d14c19587..22b051b94a86 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -51,7 +51,7 @@ module_param(latency_factor, uint, 0644); static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device); -struct cpuidle_driver acpi_idle_driver = { +static struct cpuidle_driver acpi_idle_driver = { .name = "acpi_idle", .owner = THIS_MODULE, }; @@ -1404,6 +1404,13 @@ void acpi_processor_power_init(struct acpi_processor *pr) { struct cpuidle_device *dev; + /* + * The code below only works if the current cpuidle driver is the ACPI + * idle driver. + */ + if (cpuidle_get_driver() != &acpi_idle_driver) + return; + if (disabled_by_idle_boot_param()) return; diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 24fdaa3c2899..7146a8e9e9c2 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -417,7 +417,6 @@ static inline void acpi_processor_throttling_init(void) {} #endif /* CONFIG_ACPI_CPU_FREQ_PSS */ /* in processor_idle.c */ -extern struct cpuidle_driver acpi_idle_driver; #ifdef CONFIG_ACPI_PROCESSOR_IDLE void acpi_processor_power_init(struct acpi_processor *pr); void acpi_processor_power_exit(struct acpi_processor *pr); -- cgit v1.2.3 From ade2105e748f85eb026d26701091213855aea633 Mon Sep 17 00:00:00 2001 From: Elijah Wright Date: Wed, 20 Aug 2025 22:39:07 -0700 Subject: tracing: Move buffer in trace_seq to end of struct TRACE_SEQ_BUFFER_SIZE is dependent on the architecture for its size. on 64-bit systems, it is 8148 bytes. forced 8-byte alignment in size_t and seq_buf means that trace_seq is 8200 bytes on 64-bit systems. moving the buffer to the end of the struct fixes the issue. there shouldn't be any side effects, i.e. pointer arithmetic on trace_seq Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250821053917.23301-1-git@elijahs.space Signed-off-by: Elijah Wright Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_seq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index a93ed5ac3226..557780fe1c77 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -21,10 +21,10 @@ (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int))) struct trace_seq { - char buffer[TRACE_SEQ_BUFFER_SIZE]; struct seq_buf seq; size_t readpos; int full; + char buffer[TRACE_SEQ_BUFFER_SIZE]; }; static inline void -- cgit v1.2.3 From 5c8fd7e2b5b0a527cf88740da122166695382a78 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:24:00 +0100 Subject: bpf: bpf task work plumbing This patch adds necessary plumbing in verifier, syscall and maps to support handling new kfunc bpf_task_work_schedule and kernel structure bpf_task_work. The idea is similar to how we already handle bpf_wq and bpf_timer. verifier changes validate calls to bpf_task_work_schedule to make sure it is safe and expected invariants hold. btf part is required to detect bpf_task_work structure inside map value and store its offset, which will be used in the next patch to calculate key and value addresses. arraymap and hashtab changes are needed to handle freeing of the bpf_task_work: run code needed to deinitialize it, for example cancel task_work callback if possible. The use of bpf_task_work and proper implementation for kfuncs are introduced in the next patch. Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250923112404.668720-6-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++ include/uapi/linux/bpf.h | 4 ++ kernel/bpf/arraymap.c | 8 +-- kernel/bpf/btf.c | 7 +++ kernel/bpf/hashtab.c | 19 ++++--- kernel/bpf/helpers.c | 40 ++++++++++++++ kernel/bpf/syscall.c | 16 +++++- kernel/bpf/verifier.c | 117 ++++++++++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 4 ++ 9 files changed, 208 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dfc1a27b56d5..a2ab51fa8b0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,6 +209,7 @@ enum btf_field_type { BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), BPF_RES_SPIN_LOCK = (1 << 12), + BPF_TASK_WORK = (1 << 13), }; enum bpf_cgroup_storage_type { @@ -262,6 +263,7 @@ struct btf_record { int timer_off; int wq_off; int refcount_off; + int task_work_off; struct btf_field fields[]; }; @@ -363,6 +365,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; + case BPF_TASK_WORK: + return "bpf_task_work"; default: WARN_ON_ONCE(1); return "unknown"; @@ -401,6 +405,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); + case BPF_TASK_WORK: + return sizeof(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -433,6 +439,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); + case BPF_TASK_WORK: + return __alignof__(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -464,6 +472,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: + case BPF_TASK_WORK: break; default: WARN_ON_ONCE(1); @@ -600,6 +609,7 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_wq_cancel_and_free(void *timer); +void bpf_task_work_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, @@ -2426,6 +2436,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f3b173e48b0f..ae83d8649ef1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7436,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 26d5dda989bc..80b1765a3159 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -443,7 +443,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers_wq(struct bpf_map *map) +static void array_map_free_internal_structs(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -451,12 +451,14 @@ static void array_map_free_timers_wq(struct bpf_map *map) /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. */ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { for (i = 0; i < array->map.max_entries; i++) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); } } } @@ -795,7 +797,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers_wq, + .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c51e16bbf0c1..9f47a3aa7ff8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3490,6 +3490,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, { BPF_TIMER, "bpf_timer", true }, { BPF_WORKQUEUE, "bpf_wq", true }, + { BPF_TASK_WORK, "bpf_task_work", true }, { BPF_LIST_HEAD, "bpf_list_head", false }, { BPF_LIST_NODE, "bpf_list_node", false }, { BPF_RB_ROOT, "bpf_rb_root", false }, @@ -3675,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf, case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: + case BPF_TASK_WORK: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? &info[0] : &tmp); if (ret < 0) @@ -3967,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; + rec->task_work_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { @@ -4006,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->wq_off = rec->fields[i].offset; break; + case BPF_TASK_WORK: + WARN_ON_ONCE(rec->task_work_off >= 0); + rec->task_work_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2319f8f8fa3e..c2fcd0cd51e5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -223,9 +223,12 @@ static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem * if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, htab_elem_value(elem, htab->map.key_size)); + if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) + bpf_obj_free_task_work(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); } -static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; @@ -1495,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } -static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_malloced_internal_structs(struct bpf_htab *htab) { int i; @@ -1514,16 +1517,16 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) rcu_read_unlock(); } -static void htab_map_free_timers_and_wq(struct bpf_map *map) +static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_and_wq(htab); + htab_free_malloced_internal_structs(htab); else - htab_free_prealloced_timers_and_wq(htab); + htab_free_prealloced_internal_structs(htab); } } @@ -2255,7 +2258,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2276,7 +2279,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 969f63f8ca28..7f5e528df13b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3906,8 +3906,48 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, } #endif /* CONFIG_KEYS */ +typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); + +/** + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return 0; +} + +/** + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return 0; +} + __bpf_kfunc_end_defs(); +void bpf_task_work_cancel_and_free(void *val) +{ +} + BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a3c3d26f6e2..adb05d235011 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -674,6 +674,7 @@ void btf_record_free(struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to release */ break; default: @@ -727,6 +728,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to acquire */ break; default: @@ -785,6 +787,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) bpf_wq_cancel_and_free(obj + rec->wq_off); } +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) + return; + bpf_task_work_cancel_and_free(obj + rec->task_work_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -809,6 +818,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_WORKQUEUE: bpf_wq_cancel_and_free(field_ptr); break; + case BPF_TASK_WORK: + bpf_task_work_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -1240,7 +1252,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | + BPF_TASK_WORK, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1272,6 +1285,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, break; case BPF_TIMER: case BPF_WORKQUEUE: + case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02b93a54a446..ceeb0ffe7d67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2224,10 +2224,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) - reg->map_uid = reg->id; - if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + if (btf_record_has_field(map->inner_map_meta->record, + BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { reg->map_uid = reg->id; + } } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -8461,6 +8461,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, case BPF_TIMER: field_off = map->record->timer_off; break; + case BPF_TASK_WORK: + field_off = map->record->task_work_off; + break; default: verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; @@ -8514,6 +8517,26 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_task_work_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_map_field_pointer(env, regno, BPF_TASK_WORK); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_task_work helper"); + return -EFAULT; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -10343,6 +10366,8 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); +static bool is_task_work_add_kfunc(u32 func_id); + static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); @@ -10561,7 +10586,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm)); + is_bpf_wq_set_callback_impl_kfunc(insn->imm) || + is_task_work_add_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -10876,6 +10902,36 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; + + /* + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_async_callback_fn = true; + callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); + return 0; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id); /* Are we currently verifying the callback for a rbtree helper that must @@ -12000,6 +12056,7 @@ enum { KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, + KF_ARG_TASK_WORK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12010,6 +12067,7 @@ BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) +BTF_ID(struct, bpf_task_work) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12058,6 +12116,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); +} + static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); @@ -12145,6 +12208,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, + KF_ARG_PTR_TO_TASK_WORK, }; enum special_kfunc_type { @@ -12194,6 +12258,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, }; BTF_ID_LIST(special_kfunc_list) @@ -12262,6 +12328,14 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) + +static bool is_task_work_add_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; +} static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12352,6 +12426,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TASK_WORK; + if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; @@ -12695,7 +12772,8 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + is_task_work_add_kfunc(btf_id); } static bool is_bpf_throw_kfunc(struct bpf_insn *insn) @@ -13076,7 +13154,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } - if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || + reg->map_ptr->record->task_work_off >= 0)) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -13091,6 +13170,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ */ if (meta->map.ptr != reg->map_ptr || meta->map.uid != reg->map_uid) { + if (reg->map_ptr->record->task_work_off >= 0) { + verbose(env, + "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } verbose(env, "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", meta->map.uid, reg->map_uid); @@ -13129,6 +13214,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; @@ -13422,6 +13508,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_TASK_WORK: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_task_work_func(env, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); @@ -13788,6 +13883,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_task_work_add_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_task_work_schedule_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f3b173e48b0f..ae83d8649ef1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7436,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); -- cgit v1.2.3 From 66e2d96b1c5875122bfb94239989d832ccf51477 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Sep 2025 23:37:26 +0800 Subject: LoongArch: KVM: Move kvm_iocsr tracepoint out of generic code The tracepoint kvm_iocsr is only used by the loongarch architecture. As trace events can take up to 5K of memory, move this tracepoint into the LoongArch specific tracing file so that it doesn't waste memory for all other architectures. Reviewed-by: Bibo Mao Signed-off-by: Steven Rostedt (Google) Signed-off-by: Huacai Chen --- arch/loongarch/kvm/trace.h | 35 +++++++++++++++++++++++++++++++++++ include/trace/events/kvm.h | 35 ----------------------------------- 2 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/arch/loongarch/kvm/trace.h b/arch/loongarch/kvm/trace.h index 145514dab6d5..3467ee22b704 100644 --- a/arch/loongarch/kvm/trace.h +++ b/arch/loongarch/kvm/trace.h @@ -161,6 +161,41 @@ TRACE_EVENT(kvm_aux, __entry->pc) ); +#define KVM_TRACE_IOCSR_READ_UNSATISFIED 0 +#define KVM_TRACE_IOCSR_READ 1 +#define KVM_TRACE_IOCSR_WRITE 2 + +#define kvm_trace_symbol_iocsr \ + { KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \ + { KVM_TRACE_IOCSR_READ, "read" }, \ + { KVM_TRACE_IOCSR_WRITE, "write" } + +TRACE_EVENT(kvm_iocsr, + TP_PROTO(int type, int len, u64 gpa, void *val), + TP_ARGS(type, len, gpa, val), + + TP_STRUCT__entry( + __field( u32, type ) + __field( u32, len ) + __field( u64, gpa ) + __field( u64, val ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->len = len; + __entry->gpa = gpa; + __entry->val = 0; + if (val) + memcpy(&__entry->val, val, + min_t(u32, sizeof(__entry->val), len)); + ), + + TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx", + __print_symbolic(__entry->type, kvm_trace_symbol_iocsr), + __entry->len, __entry->gpa, __entry->val) +); + TRACE_EVENT(kvm_vpid_change, TP_PROTO(struct kvm_vcpu *vcpu, unsigned long vpid), TP_ARGS(vcpu, vpid), diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 8b7252b8d751..b282e3a86769 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -156,41 +156,6 @@ TRACE_EVENT(kvm_mmio, __entry->len, __entry->gpa, __entry->val) ); -#define KVM_TRACE_IOCSR_READ_UNSATISFIED 0 -#define KVM_TRACE_IOCSR_READ 1 -#define KVM_TRACE_IOCSR_WRITE 2 - -#define kvm_trace_symbol_iocsr \ - { KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \ - { KVM_TRACE_IOCSR_READ, "read" }, \ - { KVM_TRACE_IOCSR_WRITE, "write" } - -TRACE_EVENT(kvm_iocsr, - TP_PROTO(int type, int len, u64 gpa, void *val), - TP_ARGS(type, len, gpa, val), - - TP_STRUCT__entry( - __field( u32, type ) - __field( u32, len ) - __field( u64, gpa ) - __field( u64, val ) - ), - - TP_fast_assign( - __entry->type = type; - __entry->len = len; - __entry->gpa = gpa; - __entry->val = 0; - if (val) - memcpy(&__entry->val, val, - min_t(u32, sizeof(__entry->val), len)); - ), - - TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx", - __print_symbolic(__entry->type, kvm_trace_symbol_iocsr), - __entry->len, __entry->gpa, __entry->val) -); - #define kvm_fpu_load_symbol \ {0, "unload"}, \ {1, "load"} -- cgit v1.2.3 From 9082aae154be2d9e208b56e249cb886612f7c6cf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Aug 2025 09:19:22 -0400 Subject: sunrpc: remove dfprintk_cont() and dfprintk_rcu_cont() KERN_CONT hails from a simpler time, when SMP wasn't the norm. These days, it doesn't quite work right since another printk() can always race in between the first one and the one being "continued". Nothing calls dprintk_rcu_cont(), so just remove it. The only caller of dprintk_cont() is in nfs_commit_release_pages(). Just use a normal dprintk() there instead, since this is not SMP-safe anyway. Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Reviewed-by: Simon Horman Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 6 +++--- include/linux/sunrpc/debug.h | 24 ++---------------------- 2 files changed, 5 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 52a763b6aa3a..66acc5247435 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1818,7 +1818,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_mapping_set_error(folio, status); nfs_inode_remove_request(req); } - dprintk_cont(", error = %d\n", status); + dprintk(", error = %d\n", status); goto next; } @@ -1828,11 +1828,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a match */ if (folio) nfs_inode_remove_request(req); - dprintk_cont(" OK\n"); + dprintk(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ - dprintk_cont(" mismatch\n"); + dprintk(" mismatch\n"); nfs_mark_request_dirty(req); atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index f6aeed07fe04..99a6fa4a1d6a 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -23,12 +23,8 @@ extern unsigned int nlm_debug; #define dprintk(fmt, ...) \ dfprintk(FACILITY, fmt, ##__VA_ARGS__) -#define dprintk_cont(fmt, ...) \ - dfprintk_cont(FACILITY, fmt, ##__VA_ARGS__) #define dprintk_rcu(fmt, ...) \ dfprintk_rcu(FACILITY, fmt, ##__VA_ARGS__) -#define dprintk_rcu_cont(fmt, ...) \ - dfprintk_rcu_cont(FACILITY, fmt, ##__VA_ARGS__) #undef ifdebug #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -37,29 +33,14 @@ extern unsigned int nlm_debug; # define dfprintk(fac, fmt, ...) \ do { \ ifdebug(fac) \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ -} while (0) - -# define dfprintk_cont(fac, fmt, ...) \ -do { \ - ifdebug(fac) \ - printk(KERN_CONT fmt, ##__VA_ARGS__); \ + printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ } while (0) # define dfprintk_rcu(fac, fmt, ...) \ do { \ ifdebug(fac) { \ rcu_read_lock(); \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ - rcu_read_unlock(); \ - } \ -} while (0) - -# define dfprintk_rcu_cont(fac, fmt, ...) \ -do { \ - ifdebug(fac) { \ - rcu_read_lock(); \ - printk(KERN_CONT fmt, ##__VA_ARGS__); \ + printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ } \ } while (0) @@ -68,7 +49,6 @@ do { \ #else # define ifdebug(fac) if (0) # define dfprintk(fac, fmt, ...) do {} while (0) -# define dfprintk_cont(fac, fmt, ...) do {} while (0) # define dfprintk_rcu(fac, fmt, ...) do {} while (0) # define RPC_IFDEBUG(x) #endif -- cgit v1.2.3 From ec7d8e68ef0ec5c635c8f9e93cd881673445a397 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Aug 2025 09:19:23 -0400 Subject: sunrpc: add a Kconfig option to redirect dfprintk() output to trace buffer We have a lot of old dprintk() call sites that aren't going anywhere anytime soon. At the same time, turning them up is a serious burden on the host due to the console locking overhead. Add a new Kconfig option that redirects dfprintk() output to the trace buffer. This is more efficient than logging to the console and allows for proper interleaving of dprintk and static tracepoint events. Since using trace_printk() causes scary warnings to pop at boot time, this new option defaults to "n". Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Reviewed-by: Simon Horman Signed-off-by: Anna Schumaker --- include/linux/sunrpc/debug.h | 10 ++++++++-- net/sunrpc/Kconfig | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 99a6fa4a1d6a..891f6173c951 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -30,17 +30,23 @@ extern unsigned int nlm_debug; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define ifdebug(fac) if (unlikely(rpc_debug & RPCDBG_##fac)) +# if IS_ENABLED(CONFIG_SUNRPC_DEBUG_TRACE) +# define __sunrpc_printk(fmt, ...) trace_printk(fmt, ##__VA_ARGS__) +# else +# define __sunrpc_printk(fmt, ...) printk(KERN_DEFAULT fmt, ##__VA_ARGS__) +# endif + # define dfprintk(fac, fmt, ...) \ do { \ ifdebug(fac) \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ + __sunrpc_printk(fmt, ##__VA_ARGS__); \ } while (0) # define dfprintk_rcu(fac, fmt, ...) \ do { \ ifdebug(fac) { \ rcu_read_lock(); \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ + __sunrpc_printk(fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ } \ } while (0) diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 2d8b67dac7b5..a570e7adf270 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -101,6 +101,20 @@ config SUNRPC_DEBUG If unsure, say Y. +config SUNRPC_DEBUG_TRACE + bool "RPC: Send dfprintk() output to the trace buffer" + depends on SUNRPC_DEBUG && TRACING + default n + help + dprintk() output can be voluminous, which can overwhelm the + kernel's logging facility as it must be sent to the console. + This option causes dprintk() output to go to the trace buffer + instead of the kernel log. + + This will cause warnings about trace_printk() being used to be + logged at boot time, so say N unless you are debugging a problem + with sunrpc-based clients or services. + config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS -- cgit v1.2.3 From 62c0c0e7491211969d8d1c2a9ab0e112b34664cf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Aug 2025 10:27:28 -0400 Subject: SUNRPC: Move the svc_rpcb_cleanup() call sites Clean up: because svc_rpcb_cleanup() and svc_xprt_destroy_all() are always invoked in pairs, we can deduplicate code by moving the svc_rpcb_cleanup() call sites into svc_xprt_destroy_all(). Signed-off-by: Chuck Lever Tested-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/lockd/svc.c | 6 ++---- fs/nfs/callback.c | 2 +- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 7 ++----- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc.c | 1 - net/sunrpc/svc_xprt.c | 7 ++++++- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e80262a51884..d68afa196535 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -216,8 +216,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); return err; } @@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! net=%x\n", diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 511f80878809..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc6b776fc657..63d52edcad72 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1993,7 +1993,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * remaining listeners and recreate the list. */ if (delete) - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82b0111ac469..7057ddd7a0a8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net) #endif } - svc_xprt_destroy_all(serv, net); - /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are + * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ - svc_rpcb_cleanup(serv, net); - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 369a89aea186..fde60d4e2cd5 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -165,7 +165,8 @@ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred); -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net); +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister); void svc_xprt_received(struct svc_xprt *xprt); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b1fab3a69544..9c7245d811eb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 8b1837228799..049ab53088e9 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1102,6 +1102,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1114,7 +1115,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. */ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1124,6 +1126,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); -- cgit v1.2.3 From 301f3470273c89df3a933762b7495569f650e68b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 21 Aug 2025 12:27:00 -0400 Subject: nfs: remove NFS_WBACK_BUSY() Nothing calls this macro. Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_page.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 9aed39abc94b..afe1d8f09d89 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -122,8 +122,6 @@ struct nfs_pageio_descriptor { /* arbitrarily selected limit to number of mirrors */ #define NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX 16 -#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) - extern struct nfs_page *nfs_page_create_from_page(struct nfs_open_context *ctx, struct page *page, unsigned int pgbase, -- cgit v1.2.3 From c8a127596edc026e5364a7a609986dcd3999914c Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 30 Jun 2025 10:04:54 -0400 Subject: SUNRPC: Introduce xdr_set_scratch_folio() This will replace xdr_set_scratch_page() when we switch pages to folios. Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8a9ec617cf66..3ce17321689a 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -300,6 +300,19 @@ xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page) xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE); } +/** + * xdr_set_scratch_folio - Attach a scratch buffer for decoding data + * @xdr: pointer to xdr_stream struct + * @page: an anonymous folio + * + * See xdr_set_scratch_buffer(). + */ +static inline void +xdr_set_scratch_folio(struct xdr_stream *xdr, struct folio *folio) +{ + xdr_set_scratch_buffer(xdr, folio_address(folio), folio_size(folio)); +} + /** * xdr_reset_scratch_buffer - Clear scratch buffer information * @xdr: pointer to xdr_stream struct -- cgit v1.2.3 From 2f8416f23edf3b5c49ce8e08616d0071cda5c37b Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 30 Jun 2025 10:32:29 -0400 Subject: NFS: Update getacl to use xdr_set_scratch_folio() Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 4 ++-- fs/nfs/nfs4xdr.c | 2 +- include/linux/nfs_xdr.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4de3e4bd724b..a5085820ec0a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6160,7 +6160,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, } /* for decoding across pages */ - res.acl_scratch = alloc_page(GFP_KERNEL); + res.acl_scratch = folio_alloc(GFP_KERNEL, 0); if (!res.acl_scratch) goto out_free; @@ -6196,7 +6196,7 @@ out_free: while (--i >= 0) __free_page(pages[i]); if (res.acl_scratch) - __free_page(res.acl_scratch); + folio_put(res.acl_scratch); kfree(pages); return ret; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4ee88a4c1daa..1d0e6c10f921 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6585,7 +6585,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; if (res->acl_scratch != NULL) - xdr_set_scratch_page(xdr, res->acl_scratch); + xdr_set_scratch_folio(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ac4bff6e9913..7823d4574e29 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -862,7 +862,7 @@ struct nfs_getaclres { size_t acl_len; size_t acl_data_offset; int acl_flags; - struct page * acl_scratch; + struct folio * acl_scratch; }; struct nfs_setattrres { -- cgit v1.2.3 From c9cefd7ae86aa3463adfedca309696ba0946f9c5 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 30 Jun 2025 11:14:41 -0400 Subject: NFS: Update listxattr to use xdr_set_scratch_folio() Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 4 ++-- fs/nfs/nfs42xdr.c | 2 +- include/linux/nfs_xdr.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6a0b5871ba3b..d537fb0c230e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1514,7 +1514,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, ret = -ENOMEM; - res.scratch = alloc_page(GFP_KERNEL); + res.scratch = folio_alloc(GFP_KERNEL, 0); if (!res.scratch) goto out; @@ -1552,7 +1552,7 @@ out_free_pages: } kfree(pages); out_free_scratch: - __free_page(res.scratch); + folio_put(res.scratch); out: return ret; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 4cc915d5741d..e10d83ba835e 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1781,7 +1781,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_page(xdr, res->scratch); + xdr_set_scratch_folio(xdr, res->scratch); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7823d4574e29..d56583572c98 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1596,7 +1596,7 @@ struct nfs42_listxattrsargs { struct nfs42_listxattrsres { struct nfs4_sequence_res seq_res; - struct page *scratch; + struct folio *scratch; void *xattr_buf; size_t xattr_len; u64 cookie; -- cgit v1.2.3 From d57e43b72bf2071caac90da323849c3983a695f0 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 30 Jun 2025 14:53:09 -0400 Subject: SUNRPC: Update svcxdr_init_decode() to call xdr_set_scratch_folio() The only snag here is that __folio_alloc_node() doesn't handle NUMA_NO_NODE, so I also need to update svc_pool_map_get_node() to return numa_mem_id() instead. I arrived at this approach by looking at what other users of __folio_alloc_node() do for this case. Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svc.h | 4 ++-- net/sunrpc/svc.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 40cbe81360ed..5506d20857c3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -196,7 +196,7 @@ struct svc_rqst { struct xdr_buf rq_arg; struct xdr_stream rq_arg_stream; struct xdr_stream rq_res_stream; - struct page *rq_scratch_page; + struct folio *rq_scratch_folio; struct xdr_buf rq_res; unsigned long rq_maxpages; /* num of entries in rq_pages */ struct page * *rq_pages; @@ -503,7 +503,7 @@ static inline void svcxdr_init_decode(struct svc_rqst *rqstp) buf->len = buf->head->iov_len + buf->page_len + buf->tail->iov_len; xdr_init_decode(xdr, buf, argv->iov_base, NULL); - xdr_set_scratch_page(xdr, rqstp->rq_scratch_page); + xdr_set_scratch_folio(xdr, rqstp->rq_scratch_folio); } /** diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 9c7245d811eb..de05ef637bdc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -352,7 +352,7 @@ static int svc_pool_map_get_node(unsigned int pidx) if (m->mode == SVC_POOL_PERNODE) return m->pool_to[pidx]; } - return NUMA_NO_NODE; + return numa_mem_id(); } /* * Set the given thread's cpus_allowed mask so that it @@ -669,8 +669,8 @@ svc_rqst_free(struct svc_rqst *rqstp) folio_batch_release(&rqstp->rq_fbatch); kfree(rqstp->rq_bvec); svc_release_buffer(rqstp); - if (rqstp->rq_scratch_page) - put_page(rqstp->rq_scratch_page); + if (rqstp->rq_scratch_folio) + folio_put(rqstp->rq_scratch_folio); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); @@ -691,8 +691,8 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) rqstp->rq_server = serv; rqstp->rq_pool = pool; - rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); - if (!rqstp->rq_scratch_page) + rqstp->rq_scratch_folio = __folio_alloc_node(GFP_KERNEL, 0, node); + if (!rqstp->rq_scratch_folio) goto out_enomem; rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); -- cgit v1.2.3 From cc6ac66f1c0946299b8f192026cff9a320aaad18 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 1 Jul 2025 10:46:50 -0400 Subject: SUNRPC: Update gssx_accept_sec_context() to use xdr_set_scratch_folio() This was the last caller of xdr_set_scratch_page(), so I remove this function while I'm at it. Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 13 ------------- net/sunrpc/auth_gss/gss_rpc_xdr.c | 8 ++++---- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 3ce17321689a..49278749ad0c 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -287,19 +287,6 @@ xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) xdr->scratch.iov_len = buflen; } -/** - * xdr_set_scratch_page - Attach a scratch buffer for decoding data - * @xdr: pointer to xdr_stream struct - * @page: an anonymous page - * - * See xdr_set_scratch_buffer(). - */ -static inline void -xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page) -{ - xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE); -} - /** * xdr_set_scratch_folio - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index cb32ab9a8395..7d2cdc2bd374 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -794,12 +794,12 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, struct gssx_res_accept_sec_context *res = data; u32 value_follows; int err; - struct page *scratch; + struct folio *scratch; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (!scratch) return -ENOMEM; - xdr_set_scratch_page(xdr, scratch); + xdr_set_scratch_folio(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); @@ -844,6 +844,6 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, err = gssx_dec_option_array(xdr, &res->options); out_free: - __free_page(scratch); + folio_put(scratch); return err; } -- cgit v1.2.3 From 24bbd533f596a4544e17579e9f622918680e7bff Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Sep 2025 12:48:14 -0400 Subject: filemap: Add a helper for filesystems implementing dropbehind Add a helper to allow filesystems to attempt to free the 'dropbehind' folio. Signed-off-by: Trond Myklebust Link: https://lore.kernel.org/all/5588a06f6d5a2cf6746828e2d36e7ada668b1739.1745381692.git.trond.myklebust@hammerspace.com/ Reviewed-by: Mike Snitzer Signed-off-by: Anna Schumaker --- include/linux/pagemap.h | 1 + mm/filemap.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 12a12dae727d..201b7c6f6441 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1221,6 +1221,7 @@ void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); +void folio_end_dropbehind(struct folio *folio); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); diff --git a/mm/filemap.c b/mm/filemap.c index 751838ef05e5..66cec689bec4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1608,7 +1608,7 @@ static void filemap_end_dropbehind(struct folio *folio) * completes. Do that now. If we fail, it's likely because of a big folio - * just reset dropbehind for that case and latter completions should invalidate. */ -static void filemap_end_dropbehind_write(struct folio *folio) +void folio_end_dropbehind(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; @@ -1625,6 +1625,7 @@ static void filemap_end_dropbehind_write(struct folio *folio) folio_unlock(folio); } } +EXPORT_SYMBOL_GPL(folio_end_dropbehind); /** * folio_end_writeback - End writeback against a folio. @@ -1660,7 +1661,7 @@ void folio_end_writeback(struct folio *folio) if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); - filemap_end_dropbehind_write(folio); + folio_end_dropbehind(folio); acct_reclaim_writeback(folio); folio_put(folio); } -- cgit v1.2.3 From 010054a530aa266ee1711dfbe23fc06b6eb0fa48 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Sep 2025 12:48:15 -0400 Subject: filemap: Add a version of folio_end_writeback that ignores dropbehind Filesystems such as NFS may need to defer dropbehind until after their 2-stage writes are done. This adds a helper folio_end_writeback_no_dropbehind() that allows them to release the writeback flag without immediately dropping the folio. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/pagemap.h | 1 + mm/filemap.c | 29 +++++++++++++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 201b7c6f6441..5b26465358ce 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1221,6 +1221,7 @@ void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); +void folio_end_writeback_no_dropbehind(struct folio *folio); void folio_end_dropbehind(struct folio *folio); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); diff --git a/mm/filemap.c b/mm/filemap.c index 66cec689bec4..d12bbb4c9d8a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1628,14 +1628,15 @@ void folio_end_dropbehind(struct folio *folio) EXPORT_SYMBOL_GPL(folio_end_dropbehind); /** - * folio_end_writeback - End writeback against a folio. + * folio_end_writeback_no_dropbehind - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. + * This call is intended for filesystems that need to defer dropbehind. * * Context: May be called from process or interrupt context. */ -void folio_end_writeback(struct folio *folio) +void folio_end_writeback_no_dropbehind(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); @@ -1651,6 +1652,25 @@ void folio_end_writeback(struct folio *folio) folio_rotate_reclaimable(folio); } + if (__folio_end_writeback(folio)) + folio_wake_bit(folio, PG_writeback); + + acct_reclaim_writeback(folio); +} +EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind); + +/** + * folio_end_writeback - End writeback against a folio. + * @folio: The folio. + * + * The folio must actually be under writeback. + * + * Context: May be called from process or interrupt context. + */ +void folio_end_writeback(struct folio *folio) +{ + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); + /* * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. @@ -1658,11 +1678,8 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). */ folio_get(folio); - if (__folio_end_writeback(folio)) - folio_wake_bit(folio, PG_writeback); - + folio_end_writeback_no_dropbehind(folio); folio_end_dropbehind(folio); - acct_reclaim_writeback(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); -- cgit v1.2.3 From a91ae3c89311648cbaa9b46b860e4f76004a24b8 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Sep 2025 11:01:49 +0000 Subject: bpf, x86: Add support for signed arena loads Currently, signed load instructions into arena memory are unsupported. The compiler is free to generate these, and on GCC-14 we see a corresponding error when it happens. The hurdle in supporting them is deciding which unused opcode to use to mark them for the JIT's own consumption. After much thinking, it appears 0xc0 / BPF_NOSPEC can be combined with load instructions to identify signed arena loads. Use this to recognize and JIT them appropriately, and remove the verifier side limitation on the program if the JIT supports them. Co-developed-by: Puranjay Mohan Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20250923110157.18326-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 5 +++++ arch/riscv/net/bpf_jit_comp64.c | 5 +++++ arch/s390/net/bpf_jit_comp.c | 5 +++++ arch/x86/net/bpf_jit_comp.c | 40 +++++++++++++++++++++++++++++++++++++--- include/linux/filter.h | 3 +++ kernel/bpf/verifier.c | 11 ++++++++--- 6 files changed, 63 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index e36261c63952..796938b535cd 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -3064,6 +3064,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) if (!bpf_atomic_is_load_store(insn) && !cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) return false; + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } return true; } diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 14d7aab61fcb..83672373d026 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -2066,6 +2066,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) case BPF_STX | BPF_ATOMIC | BPF_DW: if (insn->imm == BPF_CMPXCHG) return rv_ext_enabled(ZACAS); + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 8b57d8532f36..cf461d76e9da 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2967,6 +2967,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) case BPF_STX | BPF_ATOMIC | BPF_DW: if (bpf_atomic_is_load_store(insn)) return false; + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } return true; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8d34a9400a5e..fc13306af15f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1152,11 +1152,38 @@ static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 i *pprog = prog; } +static void emit_ldsx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* movsx rax, byte ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBE); + break; + case BPF_H: + /* movsx rax, word ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBF); + break; + case BPF_W: + /* movsx rax, dword ptr [rax + r12 + off] */ + EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x63); + break; + } + emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off); + *pprog = prog; +} + static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off); } +static void emit_ldsx_r12(u8 **prog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + emit_ldsx_index(prog, size, dst_reg, src_reg, X86_REG_R12, off); +} + /* STX: *(u8*)(dst_reg + off) = src_reg */ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -2109,15 +2136,22 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_B: case BPF_STX | BPF_PROBE_MEM32 | BPF_H: case BPF_STX | BPF_PROBE_MEM32 | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: start_of_ldx = prog; - if (BPF_CLASS(insn->code) == BPF_LDX) - emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); - else + if (BPF_CLASS(insn->code) == BPF_LDX) { + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) + emit_ldsx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + else + emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } else { emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } populate_extable: { struct exception_table_entry *ex; diff --git a/include/linux/filter.h b/include/linux/filter.h index 4241a885975f..f5c859b8131a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -78,6 +78,9 @@ struct ctl_table_header; /* unused opcode to mark special atomic instruction */ #define BPF_PROBE_ATOMIC 0xe0 +/* unused opcode to mark special ldsx instruction. Same as BPF_NOSPEC */ +#define BPF_PROBE_MEM32SX 0xc0 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ceeb0ffe7d67..164199237176 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21379,10 +21379,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; case PTR_TO_ARENA: if (BPF_MODE(insn->code) == BPF_MEMSX) { - verbose(env, "sign extending loads from arena are not supported yet\n"); - return -EOPNOTSUPP; + if (!bpf_jit_supports_insn(insn, true)) { + verbose(env, "sign extending loads from arena are not supported yet\n"); + return -EOPNOTSUPP; + } + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code); + } else { + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); } - insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); env->prog->aux->num_exentries++; continue; default: @@ -21588,6 +21592,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEM32 || + BPF_MODE(insn->code) == BPF_PROBE_MEM32SX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; if ((BPF_CLASS(insn->code) == BPF_STX || -- cgit v1.2.3 From edf005fa274a0c224e550a52726aa7a426384e36 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: sched_ext: Improve SCX_KF_DISPATCH comment The comment for SCX_KF_DISPATCH was incomplete and didn't explain that ops.dispatch() may temporarily release the rq lock, allowing ENQUEUE and SELECT_CPU operations to be nested inside DISPATCH contexts. Update the comment to clarify this nesting behavior and provide better context for when these operations can occur within dispatch. Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 7047101dbf58..d82b7a9b0658 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -108,7 +108,11 @@ enum scx_kf_mask { SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ - /* ops.dequeue (in REST) may be nested inside DISPATCH */ + /* + * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and + * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be + * nested inside DISPATCH. + */ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ -- cgit v1.2.3 From ccb4f5d91ec43c05ba165ccfc7ed889eb9cdfd05 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Fri, 19 Sep 2025 12:41:09 +0800 Subject: bpf: Allow union argument in trampoline based programs Currently, functions with 'union' arguments cannot be traced with fentry/fexit: bpftrace -e 'fentry:release_pages { exit(); }' -v The function release_pages arg0 type UNION is unsupported. The type of the 'release_pages' arg0 is defined as: typedef union { struct page **pages; struct folio **folios; struct encoded_page **encoded_pages; } release_pages_arg __attribute__ ((__transparent_union__)); This patch relaxes the restriction by allowing function arguments of type 'union' to be traced in verifier. Reviewed-by: Amery Hung Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20250919044110.23729-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/btf.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a2ab51fa8b0a..ba3a3be7eb2a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1128,7 +1128,7 @@ struct bpf_prog_offload { */ #define MAX_BPF_FUNC_REG_ARGS 5 -/* The argument is a structure. */ +/* The argument is a structure or a union. */ #define BTF_FMODEL_STRUCT_ARG BIT(0) /* The argument is signed. */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9f47a3aa7ff8..0de8fc8a0e0b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6751,7 +6751,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -7323,7 +7323,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t)) return t->size; return -EINVAL; } @@ -7332,7 +7332,7 @@ static u8 __get_type_fmodel_flags(const struct btf_type *t) { u8 flags = 0; - if (__btf_type_is_struct(t)) + if (btf_type_is_struct(t)) flags |= BTF_FMODEL_STRUCT_ARG; if (btf_type_is_signed_int(t)) flags |= BTF_FMODEL_SIGNED_ARG; @@ -7373,7 +7373,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } ret = __get_type_size(btf, func->type, &t); - if (ret < 0 || __btf_type_is_struct(t)) { + if (ret < 0 || btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", tname, btf_type_str(t)); -- cgit v1.2.3 From 8f12d1137c2382c80aada8e05d7cc650cd4e403c Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:49 -0700 Subject: bpf: Clear pfmemalloc flag when freeing all fragments It is possible for bpf_xdp_adjust_tail() to free all fragments. The kfunc currently clears the XDP_FLAGS_HAS_FRAGS bit, but not XDP_FLAGS_FRAGS_PF_MEMALLOC. So far, this has not caused a issue when building sk_buff from xdp_buff since all readers of xdp_buff->flags use the flag only when there are fragments. Clear the XDP_FLAGS_FRAGS_PF_MEMALLOC bit as well to make the flags correct. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Reviewed-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250922233356.3356453-2-ameryhung@gmail.com --- include/net/xdp.h | 5 +++++ net/core/filter.c | 1 + 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index b40f1f96cb11..f288c348a6c1 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -115,6 +115,11 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } +static __always_inline void xdp_buff_clear_frag_pfmemalloc(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_FRAGS_PF_MEMALLOC; +} + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { diff --git a/net/core/filter.c b/net/core/filter.c index 63f3baee2daf..5837534f4352 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4210,6 +4210,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); xdp->data_end -= offset; } -- cgit v1.2.3 From dea1526fbafb55099a788cde0b659530ee5b1c66 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:50 -0700 Subject: bpf: Allow bpf_xdp_shrink_data to shrink a frag from head and tail Move skb_frag_t adjustment into bpf_xdp_shrink_data() and extend its functionality to be able to shrink an xdp fragment from both head and tail. In a later patch, bpf_xdp_pull_data() will reuse it to shrink an xdp fragment from head. Additionally, in bpf_xdp_frags_shrink_tail(), breaking the loop when bpf_xdp_shrink_data() returns false (i.e., not releasing the current fragment) is not necessary as the loop condition, offset > 0, has the same effect. Remove the else branch to simplify the code. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Reviewed-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250922233356.3356453-3-ameryhung@gmail.com --- include/net/xdp_sock_drv.h | 21 ++++++++++++++++++--- net/core/filter.c | 41 ++++++++++++++++++++++++----------------- 2 files changed, 42 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 513c8e9704f6..4f2d3268a676 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -160,13 +160,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return ret; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) { - struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); list_del(&xskb->list_node); } +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_first_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + list_node); + return &frag->xdp; +} + static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); @@ -389,8 +399,13 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return NULL; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) +{ +} + +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) { + return NULL; } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) diff --git a/net/core/filter.c b/net/core/filter.c index 5837534f4352..8cae575ad437 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4153,34 +4153,45 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) return 0; } -static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - enum xdp_mem_type mem_type, bool release) +static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, + bool tail, bool release) { - struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); + struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) : + xsk_buff_get_head(xdp); if (release) { - xsk_buff_del_tail(zc_frag); - __xdp_return(0, mem_type, false, zc_frag); + xsk_buff_del_frag(zc_frag); } else { - zc_frag->data_end -= shrink; + if (tail) + zc_frag->data_end -= shrink; + else + zc_frag->data += shrink; } + + return zc_frag; } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, - int shrink) + int shrink, bool tail) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; + netmem_ref netmem = skb_frag_netmem(frag); + struct xdp_buff *zc_frag = NULL; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); - goto out; + netmem = 0; + zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); } - if (release) - __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); + if (release) { + __xdp_return(netmem, mem_type, false, zc_frag); + } else { + if (!tail) + skb_frag_off_add(frag, shrink); + skb_frag_size_sub(frag, shrink); + } -out: return release; } @@ -4198,12 +4209,8 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) len_free += shrink; offset -= shrink; - if (bpf_xdp_shrink_data(xdp, frag, shrink)) { + if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) n_frags_free++; - } else { - skb_frag_size_sub(frag, shrink); - break; - } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; -- cgit v1.2.3 From b650bf0977d34c52befb31a9fa711534e11b220f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2025 10:42:40 +0000 Subject: udp: remove busylock and add per NUMA queues busylock was protecting UDP sockets against packet floods, but unfortunately was not protecting the host itself. Under stress, many cpus could spin while acquiring the busylock, and NIC had to drop packets. Or packets would be dropped in cpu backlog if RPS/RFS were in place. This patch replaces the busylock by intermediate lockless queues. (One queue per NUMA node). This means that fewer number of cpus have to acquire the UDP receive queue lock. Most of the cpus can either: - immediately drop the packet. - or queue it in their NUMA aware lockless queue. Then one of the cpu is chosen to process this lockless queue in a batch. The batch only contains packets that were cooked on the same NUMA node, thus with very limited latency impact. Tested: DDOS targeting a victim UDP socket, on a platform with 6 NUMA nodes (Intel(R) Xeon(R) 6985P-C) Before: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 1004179 0.0 Udp6InErrors 3117 0.0 Udp6RcvbufErrors 3117 0.0 After: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 1116633 0.0 Udp6InErrors 14197275 0.0 Udp6RcvbufErrors 14197275 0.0 We can see this host can now proces 14.2 M more packets per second while under attack, and the victim socket can receive 11 % more packets. I used a small bpftrace program measuring time (in us) spent in __udp_enqueue_schedule_skb(). Before: @udp_enqueue_us[398]: [0] 24901 |@@@ | [1] 63512 |@@@@@@@@@ | [2, 4) 344827 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [4, 8) 244673 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [8, 16) 54022 |@@@@@@@@ | [16, 32) 222134 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [32, 64) 232042 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [64, 128) 4219 | | [128, 256) 188 | | After: @udp_enqueue_us[398]: [0] 5608855 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [1] 1111277 |@@@@@@@@@@ | [2, 4) 501439 |@@@@ | [4, 8) 102921 | | [8, 16) 29895 | | [16, 32) 43500 | | [32, 64) 31552 | | [64, 128) 979 | | [128, 256) 13 | | Note that the remaining bottleneck for this platform is in udp_drops_inc() because we limited struct numa_drop_counters to only two nodes so far. Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Reviewed-by: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250922104240.2182559-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 9 +++- include/net/udp.h | 11 ++++- net/ipv4/udp.c | 117 +++++++++++++++++++++++++++++++--------------------- net/ipv6/udp.c | 5 ++- 4 files changed, 91 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index e554890c4415..58795688a186 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -44,6 +44,12 @@ enum { UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */ }; +/* per NUMA structure for lockless producer usage. */ +struct udp_prod_queue { + struct llist_head ll_root ____cacheline_aligned_in_smp; + atomic_t rmem_alloc; +}; + struct udp_sock { /* inet_sock has to be the first member */ struct inet_sock inet; @@ -90,6 +96,8 @@ struct udp_sock { struct sk_buff *skb, int nhoff); + struct udp_prod_queue *udp_prod_queue; + /* udp_recvmsg try to use this before splicing sk_receive_queue */ struct sk_buff_head reader_queue ____cacheline_aligned_in_smp; @@ -109,7 +117,6 @@ struct udp_sock { */ struct hlist_node tunnel_list; struct numa_drop_counters drop_counters; - spinlock_t busylock ____cacheline_aligned_in_smp; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index 059a0cee5f55..cffedb3e40f2 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -284,16 +284,23 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); -static inline void udp_lib_init_sock(struct sock *sk) +static inline int udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); sk->sk_drop_counters = &up->drop_counters; - spin_lock_init(&up->busylock); skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); + + up->udp_prod_queue = kcalloc(nr_node_ids, sizeof(*up->udp_prod_queue), + GFP_KERNEL); + if (!up->udp_prod_queue) + return -ENOMEM; + for (int i = 0; i < nr_node_ids; i++) + init_llist_head(&up->udp_prod_queue[i].ll_root); + return 0; } static inline void udp_drops_inc(struct sock *sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 85cfc32eb2cc..95241093b7f0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1685,25 +1685,6 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } -/* Idea of busylocks is to let producers grab an extra spinlock - * to relieve pressure on the receive_queue spinlock shared by consumer. - * Under flood, this means that only one producer can be in line - * trying to acquire the receive_queue spinlock. - */ -static spinlock_t *busylock_acquire(struct sock *sk) -{ - spinlock_t *busy = &udp_sk(sk)->busylock; - - spin_lock(busy); - return busy; -} - -static void busylock_release(spinlock_t *busy) -{ - if (busy) - spin_unlock(busy); -} - static int udp_rmem_schedule(struct sock *sk, int size) { int delta; @@ -1718,14 +1699,24 @@ static int udp_rmem_schedule(struct sock *sk, int size) int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; + struct udp_prod_queue *udp_prod_queue; + struct sk_buff *next, *to_drop = NULL; + struct llist_node *ll_list; unsigned int rmem, rcvbuf; - spinlock_t *busy = NULL; int size, err = -ENOMEM; + int total_size = 0; + int q_size = 0; + int dropcount; + int nb = 0; rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); size = skb->truesize; + udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()]; + + rmem += atomic_read(&udp_prod_queue->rmem_alloc); + /* Immediately drop when the receive queue is full. * Cast to unsigned int performs the boundary check for INT_MAX. */ @@ -1747,45 +1738,77 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > rcvbuf) - goto uncharge_drop; - busy = busylock_acquire(sk); - } else { - atomic_add(size, &sk->sk_rmem_alloc); } udp_set_dev_scratch(skb); + atomic_add(size, &udp_prod_queue->rmem_alloc); + + if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root)) + return 0; + + dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0; + spin_lock(&list->lock); - err = udp_rmem_schedule(sk, size); - if (err) { - spin_unlock(&list->lock); - goto uncharge_drop; - } - sk_forward_alloc_add(sk, -size); + ll_list = llist_del_all(&udp_prod_queue->ll_root); - /* no need to setup a destructor, we will explicitly release the - * forward allocated memory on dequeue - */ - sock_skb_set_dropcount(sk, skb); + ll_list = llist_reverse_order(ll_list); + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + size = udp_skb_truesize(skb); + total_size += size; + err = udp_rmem_schedule(sk, size); + if (unlikely(err)) { + /* Free the skbs outside of locked section. */ + skb->next = to_drop; + to_drop = skb; + continue; + } + + q_size += size; + sk_forward_alloc_add(sk, -size); + + /* no need to setup a destructor, we will explicitly release the + * forward allocated memory on dequeue + */ + SOCK_SKB_CB(skb)->dropcount = dropcount; + nb++; + __skb_queue_tail(list, skb); + } + + atomic_add(q_size, &sk->sk_rmem_alloc); - __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); + if (!sock_flag(sk, SOCK_DEAD)) { + /* Multiple threads might be blocked in recvmsg(), + * using prepare_to_wait_exclusive(). + */ + while (nb) { + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + nb--; + } + } + + if (unlikely(to_drop)) { + for (nb = 0; to_drop != NULL; nb++) { + skb = to_drop; + to_drop = skb->next; + skb_mark_not_on_list(skb); + /* TODO: update SNMP values. */ + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); + } + numa_drop_add(&udp_sk(sk)->drop_counters, nb); + } - busylock_release(busy); - return 0; + atomic_sub(total_size, &udp_prod_queue->rmem_alloc); -uncharge_drop: - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + return 0; drop: udp_drops_inc(sk); - busylock_release(busy); return err; } EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); @@ -1803,6 +1826,7 @@ void udp_destruct_common(struct sock *sk) kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); + kfree(up->udp_prod_queue); } EXPORT_IPV6_MOD_GPL(udp_destruct_common); @@ -1814,10 +1838,11 @@ static void udp_destruct_sock(struct sock *sk) int udp_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udp_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9f4d340d1e3a..813a2ba75824 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -67,10 +67,11 @@ static void udpv6_destruct_sock(struct sock *sk) int udpv6_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udpv6_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } INDIRECT_CALLABLE_SCOPE -- cgit v1.2.3 From 092263a03105539b8dfe74c59be4c6cce1304d5f Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 20 Sep 2025 23:34:07 +0200 Subject: net: phy: stop exporting phy_driver_register phy_driver_register() isn't used outside phy_device.c any longer, so we can stop exporting it. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/dff44b83-4a85-4fff-bf6b-f12efd97b56e@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 4 ++-- include/linux/phy.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index c82c1997147b..01269b865f5e 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3544,7 +3544,8 @@ static int phy_remove(struct device *dev) * @new_driver: new phy_driver to register * @owner: module owning this PHY */ -int phy_driver_register(struct phy_driver *new_driver, struct module *owner) +static int phy_driver_register(struct phy_driver *new_driver, + struct module *owner) { int retval; @@ -3587,7 +3588,6 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner) return 0; } -EXPORT_SYMBOL(phy_driver_register); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner) diff --git a/include/linux/phy.h b/include/linux/phy.h index d09fc42e61f3..b377dfaa6801 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2032,7 +2032,6 @@ static inline int phy_read_status(struct phy_device *phydev) void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); -int phy_driver_register(struct phy_driver *new_driver, struct module *owner); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); void phy_error(struct phy_device *phydev); -- cgit v1.2.3 From 6043819e707cefb1c9e59d6e431dcfa735c4f975 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 22 Sep 2025 10:11:32 +0300 Subject: net/mlx5: fs, fix UAF in flow counter release Fix a kernel trace [1] caused by releasing an HWS action of a local flow counter in mlx5_cmd_hws_delete_fte(), where the HWS action refcount and mutex were not initialized and the counter struct could already be freed when deleting the rule. Fix it by adding the missing initializations and adding refcount for the local flow counter struct. [1] Kernel log: Call Trace: dump_stack_lvl+0x34/0x48 mlx5_fs_put_hws_action.part.0.cold+0x21/0x94 [mlx5_core] mlx5_fc_put_hws_action+0x96/0xad [mlx5_core] mlx5_fs_destroy_fs_actions+0x8b/0x152 [mlx5_core] mlx5_cmd_hws_delete_fte+0x5a/0xa0 [mlx5_core] del_hw_fte+0x1ce/0x260 [mlx5_core] mlx5_del_flow_rules+0x12d/0x240 [mlx5_core] ? ttwu_queue_wakelist+0xf4/0x110 mlx5_ib_destroy_flow+0x103/0x1b0 [mlx5_ib] uverbs_free_flow+0x20/0x50 [ib_uverbs] destroy_hw_idr_uobject+0x1b/0x50 [ib_uverbs] uverbs_destroy_uobject+0x34/0x1a0 [ib_uverbs] uobj_destroy+0x3c/0x80 [ib_uverbs] ib_uverbs_run_method+0x23e/0x360 [ib_uverbs] ? uverbs_finalize_object+0x60/0x60 [ib_uverbs] ib_uverbs_cmd_verbs+0x14f/0x2c0 [ib_uverbs] ? do_tty_write+0x1a9/0x270 ? file_tty_write.constprop.0+0x98/0xc0 ? new_sync_write+0xfc/0x190 ib_uverbs_ioctl+0xd7/0x160 [ib_uverbs] __x64_sys_ioctl+0x87/0xc0 do_syscall_64+0x59/0x90 Fixes: b581f4266928 ("net/mlx5: fs, manage flow counters HWS action sharing by refcount") Signed-off-by: Moshe Shemesh Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758525094-816583-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 1 + .../net/ethernet/mellanox/mlx5/core/fs_counters.c | 25 +++++++++++++++++++--- .../mellanox/mlx5/core/steering/hws/fs_hws_pools.c | 8 ++++++- include/linux/mlx5/fs.h | 2 ++ 5 files changed, 33 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index db552c012b4f..80245c38dbad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -663,7 +663,7 @@ static void del_sw_hw_rule(struct fs_node *node) BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) | BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS); fte->act_dests.action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT; - mlx5_fc_local_destroy(rule->dest_attr.counter); + mlx5_fc_local_put(rule->dest_attr.counter); goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 500826229b0b..e6a95b310b55 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -343,6 +343,7 @@ struct mlx5_fc { enum mlx5_fc_type type; struct mlx5_fc_bulk *bulk; struct mlx5_fc_cache cache; + refcount_t fc_local_refcount; /* last{packets,bytes} are used for calculating deltas since last reading. */ u64 lastpackets; u64 lastbytes; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 492775d3d193..83001eda3884 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -562,17 +562,36 @@ mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size) counter->id = counter_id; fc_bulk->base_id = counter_id - offset; fc_bulk->fs_bulk.bulk_len = bulk_size; + refcount_set(&fc_bulk->hws_data.hws_action_refcount, 0); + mutex_init(&fc_bulk->hws_data.lock); counter->bulk = fc_bulk; + refcount_set(&counter->fc_local_refcount, 1); return counter; } EXPORT_SYMBOL(mlx5_fc_local_create); void mlx5_fc_local_destroy(struct mlx5_fc *counter) { - if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) - return; - kfree(counter->bulk); kfree(counter); } EXPORT_SYMBOL(mlx5_fc_local_destroy); + +void mlx5_fc_local_get(struct mlx5_fc *counter) +{ + if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) + return; + + refcount_inc(&counter->fc_local_refcount); +} + +void mlx5_fc_local_put(struct mlx5_fc *counter) +{ + if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) + return; + + if (!refcount_dec_and_test(&counter->fc_local_refcount)) + return; + + mlx5_fc_local_destroy(counter); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c index f1ecdba74e1f..839d71bd4216 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c @@ -407,15 +407,21 @@ struct mlx5hws_action *mlx5_fc_get_hws_action(struct mlx5hws_context *ctx, { struct mlx5_fs_hws_create_action_ctx create_ctx; struct mlx5_fc_bulk *fc_bulk = counter->bulk; + struct mlx5hws_action *hws_action; create_ctx.hws_ctx = ctx; create_ctx.id = fc_bulk->base_id; create_ctx.actions_type = MLX5HWS_ACTION_TYP_CTR; - return mlx5_fs_get_hws_action(&fc_bulk->hws_data, &create_ctx); + mlx5_fc_local_get(counter); + hws_action = mlx5_fs_get_hws_action(&fc_bulk->hws_data, &create_ctx); + if (!hws_action) + mlx5_fc_local_put(counter); + return hws_action; } void mlx5_fc_put_hws_action(struct mlx5_fc *counter) { mlx5_fs_put_hws_action(&counter->bulk->hws_data); + mlx5_fc_local_put(counter); } diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 86055d55836d..6ac76a0c3827 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -308,6 +308,8 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); struct mlx5_fc *mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size); void mlx5_fc_local_destroy(struct mlx5_fc *counter); +void mlx5_fc_local_get(struct mlx5_fc *counter); +void mlx5_fc_local_put(struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); -- cgit v1.2.3 From 7384893d970ea114952aef54ad7e3d7d2ca82d4f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 16 Sep 2025 23:52:56 +0200 Subject: bpf: Allow uprobe program to change context registers Currently uprobe (BPF_PROG_TYPE_KPROBE) program can't write to the context registers data. While this makes sense for kprobe attachments, for uprobe attachment it might make sense to be able to change user space registers to alter application execution. Since uprobe and kprobe programs share the same type (BPF_PROG_TYPE_KPROBE), we can't deny write access to context during the program load. We need to check on it during program attachment to see if it's going to be kprobe or uprobe. Storing the program's write attempt to context and checking on it during the attachment. Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20250916215301.664963-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/events/core.c | 4 ++++ kernel/trace/bpf_trace.c | 9 +++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ba3a3be7eb2a..ea2ed6771cc6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1639,6 +1639,7 @@ struct bpf_prog_aux { bool priv_stack_requested; bool changes_pkt_data; bool might_sleep; + bool kprobe_write_ctx; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; diff --git a/kernel/events/core.c b/kernel/events/core.c index 820127536e62..1d354778dcd4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11232,6 +11232,10 @@ static int __perf_event_set_bpf_prog(struct perf_event *event, if (prog->kprobe_override && !is_kprobe) return -EINVAL; + /* Writing to context allowed only for uprobes. */ + if (prog->aux->kprobe_write_ctx && !is_uprobe) + return -EINVAL; + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f2360579658e..8f23f5273bab 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1338,8 +1338,6 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type { if (off < 0 || off >= sizeof(struct pt_regs)) return false; - if (type != BPF_READ) - return false; if (off % size != 0) return false; /* @@ -1349,6 +1347,9 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type if (off + size > sizeof(struct pt_regs)) return false; + if (type == BPF_WRITE) + prog->aux->kprobe_write_ctx = true; + return true; } @@ -2735,6 +2736,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!is_kprobe_multi(prog)) return -EINVAL; + /* Writing to context is not allowed for kprobes. */ + if (prog->aux->kprobe_write_ctx) + return -EINVAL; + flags = attr->link_create.kprobe_multi.flags; if (flags & ~BPF_F_KPROBE_MULTI_RETURN) return -EINVAL; -- cgit v1.2.3 From d4680a11e14c7baf683cb8453d91d71d2e0b9d3e Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 24 Sep 2025 10:14:26 +0200 Subject: bpf: Mark kfuncs as __noclone Some distributions (e.g., CachyOS) support building the kernel with -O3, but doing so may break kfuncs, resulting in their symbols not being properly exported. In fact, with gcc -O3, some kfuncs may be optimized away despite being annotated as noinline. This happens because gcc can still clone the function during IPA optimizations, e.g., by duplicating or inlining it into callers, and then dropping the standalone symbol. This breaks BTF ID resolution since resolve_btfids relies on the presence of a global symbol for each kfunc. Currently, this is not an issue for upstream, because we don't allow building the kernel with -O3, but it may be safer to address it anyway, to prevent potential issues in the future if compilers become more aggressive with optimizations. Therefore, add __noclone to __bpf_kfunc to ensure kfuncs are never cloned and remain distinct, globally visible symbols, regardless of the optimization level. Fixes: 57e7c169cd6af ("bpf: Add __bpf_kfunc tag for marking kernel functions as kfuncs") Acked-by: David Vernet Acked-by: Yonghong Song Signed-off-by: Andrea Righi Link: https://lore.kernel.org/r/20250924081426.156934-1-arighi@nvidia.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 9eda6b113f9b..f06976ffb63f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -86,7 +86,7 @@ * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ -#define __bpf_kfunc __used __retain noinline +#define __bpf_kfunc __used __retain __noclone noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ -- cgit v1.2.3 From 64f89f6e1f2b7f8f203d218a8c8d90922e1d4048 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 17 Sep 2025 10:54:05 +0200 Subject: gpio: generic: rename BGPIOF_ flags to GPIO_GENERIC_ Make the flags passed to gpio_generic_chip_init() use the same prefix as the rest of the modernized generic GPIO chip API. Link: https://lore.kernel.org/r/20250917-gpio-generic-flags-v1-1-69f51fee8c89@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-amdpt.c | 2 +- drivers/gpio/gpio-brcmstb.c | 2 +- drivers/gpio/gpio-cadence.c | 2 +- drivers/gpio/gpio-ge.c | 2 +- drivers/gpio/gpio-grgpio.c | 2 +- drivers/gpio/gpio-hisi.c | 3 ++- drivers/gpio/gpio-hlwd.c | 2 +- drivers/gpio/gpio-ixp4xx.c | 2 +- drivers/gpio/gpio-mmio.c | 28 ++++++++++++++-------------- drivers/gpio/gpio-mpc8xxx.c | 4 ++-- drivers/gpio/gpio-mt7621.c | 2 +- drivers/gpio/gpio-mxc.c | 2 +- drivers/gpio/gpio-rda.c | 2 +- drivers/gpio/gpio-realtek-otto.c | 2 +- drivers/gpio/gpio-sifive.c | 2 +- drivers/gpio/gpio-spacemit-k1.c | 3 ++- drivers/gpio/gpio-vf610.c | 4 ++-- drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c | 2 +- drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c | 2 +- drivers/pinctrl/nuvoton/pinctrl-wpcm450.c | 2 +- drivers/pinctrl/stm32/pinctrl-stm32-hdp.c | 2 +- include/linux/gpio/driver.h | 18 +++++++++--------- 22 files changed, 47 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/drivers/gpio/gpio-amdpt.c b/drivers/gpio/gpio-amdpt.c index bbaf42307bc3..8458a6949c65 100644 --- a/drivers/gpio/gpio-amdpt.c +++ b/drivers/gpio/gpio-amdpt.c @@ -94,7 +94,7 @@ static int pt_gpio_probe(struct platform_device *pdev) .dat = pt_gpio->reg_base + PT_INPUTDATA_REG, .set = pt_gpio->reg_base + PT_OUTPUTDATA_REG, .dirout = pt_gpio->reg_base + PT_DIRECTION_REG, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&pt_gpio->chip, &config); diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c index be3ff916e134..f40c9472588b 100644 --- a/drivers/gpio/gpio-brcmstb.c +++ b/drivers/gpio/gpio-brcmstb.c @@ -630,7 +630,7 @@ static int brcmstb_gpio_probe(struct platform_device *pdev) * else leave I/O in little endian mode. */ #if defined(CONFIG_MIPS) && defined(__BIG_ENDIAN) - flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; #endif of_property_for_each_u32(np, "brcm,gpio-bank-widths", bank_width) { diff --git a/drivers/gpio/gpio-cadence.c b/drivers/gpio/gpio-cadence.c index c647953521c7..b75734ca22dd 100644 --- a/drivers/gpio/gpio-cadence.c +++ b/drivers/gpio/gpio-cadence.c @@ -181,7 +181,7 @@ static int cdns_gpio_probe(struct platform_device *pdev) config.dat = cgpio->regs + CDNS_GPIO_INPUT_VALUE; config.set = cgpio->regs + CDNS_GPIO_OUTPUT_VALUE; config.dirin = cgpio->regs + CDNS_GPIO_DIRECTION_MODE; - config.flags = BGPIOF_READ_OUTPUT_REG_SET; + config.flags = GPIO_GENERIC_READ_OUTPUT_REG_SET; ret = gpio_generic_chip_init(&cgpio->gen_gc, &config); if (ret) { diff --git a/drivers/gpio/gpio-ge.c b/drivers/gpio/gpio-ge.c index b5cbf27b8f44..66bdff36eb61 100644 --- a/drivers/gpio/gpio-ge.c +++ b/drivers/gpio/gpio-ge.c @@ -73,7 +73,7 @@ static int __init gef_gpio_probe(struct platform_device *pdev) .dat = regs + GEF_GPIO_IN, .set = regs + GEF_GPIO_OUT, .dirin = regs + GEF_GPIO_DIRECT, - .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, }; ret = gpio_generic_chip_init(chip, &config); diff --git a/drivers/gpio/gpio-grgpio.c b/drivers/gpio/gpio-grgpio.c index 5930f4c6f2b5..0c0f97fa14fc 100644 --- a/drivers/gpio/gpio-grgpio.c +++ b/drivers/gpio/gpio-grgpio.c @@ -359,7 +359,7 @@ static int grgpio_probe(struct platform_device *ofdev) .dat = regs + GRGPIO_DATA, .set = regs + GRGPIO_OUTPUT, .dirout = regs + GRGPIO_DIR, - .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, }; gc = &priv->chip.gc; diff --git a/drivers/gpio/gpio-hisi.c b/drivers/gpio/gpio-hisi.c index d8c4ab02ceae..d26298c8351b 100644 --- a/drivers/gpio/gpio-hisi.c +++ b/drivers/gpio/gpio-hisi.c @@ -300,7 +300,8 @@ static int hisi_gpio_probe(struct platform_device *pdev) .clr = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_CLR_WX, .dirout = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_SET_WX, .dirin = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_CLR_WX, - .flags = BGPIOF_NO_SET_ON_INPUT | BGPIOF_UNREADABLE_REG_DIR, + .flags = GPIO_GENERIC_NO_SET_ON_INPUT | + GPIO_GENERIC_UNREADABLE_REG_DIR, }; ret = gpio_generic_chip_init(&hisi_gpio->chip, &config); diff --git a/drivers/gpio/gpio-hlwd.c b/drivers/gpio/gpio-hlwd.c index a395f87436ac..043ce5ef3b07 100644 --- a/drivers/gpio/gpio-hlwd.c +++ b/drivers/gpio/gpio-hlwd.c @@ -253,7 +253,7 @@ static int hlwd_gpio_probe(struct platform_device *pdev) .dat = hlwd->regs + HW_GPIOB_IN, .set = hlwd->regs + HW_GPIOB_OUT, .dirout = hlwd->regs + HW_GPIOB_DIR, - .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, }; res = gpio_generic_chip_init(&hlwd->gpioc, &config); diff --git a/drivers/gpio/gpio-ixp4xx.c b/drivers/gpio/gpio-ixp4xx.c index 8a3b6b192288..f34d87869c8b 100644 --- a/drivers/gpio/gpio-ixp4xx.c +++ b/drivers/gpio/gpio-ixp4xx.c @@ -289,7 +289,7 @@ static int ixp4xx_gpio_probe(struct platform_device *pdev) * for big endian. */ #if defined(CONFIG_CPU_BIG_ENDIAN) - flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; #else flags = 0; #endif diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index a3df14d672a9..7d6dd36cf1ae 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -554,7 +554,7 @@ static int bgpio_setup_io(struct gpio_generic_chip *chip, chip->reg_set = cfg->set; gc->set = bgpio_set_set; gc->set_multiple = bgpio_set_multiple_set; - } else if (cfg->flags & BGPIOF_NO_OUTPUT) { + } else if (cfg->flags & GPIO_GENERIC_NO_OUTPUT) { gc->set = bgpio_set_none; gc->set_multiple = NULL; } else { @@ -562,8 +562,8 @@ static int bgpio_setup_io(struct gpio_generic_chip *chip, gc->set_multiple = bgpio_set_multiple; } - if (!(cfg->flags & BGPIOF_UNREADABLE_REG_SET) && - (cfg->flags & BGPIOF_READ_OUTPUT_REG_SET)) { + if (!(cfg->flags & GPIO_GENERIC_UNREADABLE_REG_SET) && + (cfg->flags & GPIO_GENERIC_READ_OUTPUT_REG_SET)) { gc->get = bgpio_get_set; if (!chip->be_bits) gc->get_multiple = bgpio_get_set_multiple; @@ -593,19 +593,19 @@ static int bgpio_setup_direction(struct gpio_generic_chip *chip, if (cfg->dirout || cfg->dirin) { chip->reg_dir_out = cfg->dirout; chip->reg_dir_in = cfg->dirin; - if (cfg->flags & BGPIOF_NO_SET_ON_INPUT) + if (cfg->flags & GPIO_GENERIC_NO_SET_ON_INPUT) gc->direction_output = bgpio_dir_out_dir_first; else gc->direction_output = bgpio_dir_out_val_first; gc->direction_input = bgpio_dir_in; gc->get_direction = bgpio_get_dir; } else { - if (cfg->flags & BGPIOF_NO_OUTPUT) + if (cfg->flags & GPIO_GENERIC_NO_OUTPUT) gc->direction_output = bgpio_dir_out_err; else gc->direction_output = bgpio_simple_dir_out; - if (cfg->flags & BGPIOF_NO_INPUT) + if (cfg->flags & GPIO_GENERIC_NO_INPUT) gc->direction_input = bgpio_dir_in_err; else gc->direction_input = bgpio_simple_dir_in; @@ -654,7 +654,7 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, gc->label = dev_name(dev); gc->base = -1; gc->request = bgpio_request; - chip->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); + chip->be_bits = !!(flags & GPIO_GENERIC_BIG_ENDIAN); ret = gpiochip_get_ngpios(gc, dev); if (ret) @@ -665,7 +665,7 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, return ret; ret = bgpio_setup_accessors(dev, chip, - flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); + flags & GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER); if (ret) return ret; @@ -673,7 +673,7 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, if (ret) return ret; - if (flags & BGPIOF_PINCTRL_BACKEND) { + if (flags & GPIO_GENERIC_PINCTRL_BACKEND) { chip->pinctrl = true; /* Currently this callback is only used for pincontrol */ gc->free = gpiochip_generic_free; @@ -681,17 +681,17 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, chip->sdata = chip->read_reg(chip->reg_dat); if (gc->set == bgpio_set_set && - !(flags & BGPIOF_UNREADABLE_REG_SET)) + !(flags & GPIO_GENERIC_UNREADABLE_REG_SET)) chip->sdata = chip->read_reg(chip->reg_set); - if (flags & BGPIOF_UNREADABLE_REG_DIR) + if (flags & GPIO_GENERIC_UNREADABLE_REG_DIR) chip->dir_unreadable = true; /* * Inspect hardware to find initial direction setting. */ if ((chip->reg_dir_out || chip->reg_dir_in) && - !(flags & BGPIOF_UNREADABLE_REG_DIR)) { + !(flags & GPIO_GENERIC_UNREADABLE_REG_DIR)) { if (chip->reg_dir_out) chip->sdir = chip->read_reg(chip->reg_dir_out); else if (chip->reg_dir_in) @@ -787,10 +787,10 @@ static int bgpio_pdev_probe(struct platform_device *pdev) return -ENOMEM; if (device_is_big_endian(dev)) - flags |= BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags |= GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; if (device_property_read_bool(dev, "no-output")) - flags |= BGPIOF_NO_OUTPUT; + flags |= GPIO_GENERIC_NO_OUTPUT; config = (struct gpio_generic_chip_config) { .dev = dev, diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index a2a83afb41bb..bfe828734ee1 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -350,13 +350,13 @@ static int mpc8xxx_probe(struct platform_device *pdev) .sz = 4, .dat = mpc8xxx_gc->regs + GPIO_DAT, .dirout = mpc8xxx_gc->regs + GPIO_DIR, - .flags = BGPIOF_BIG_ENDIAN + .flags = GPIO_GENERIC_BIG_ENDIAN }; if (device_property_read_bool(dev, "little-endian")) { dev_dbg(dev, "GPIO registers are LITTLE endian\n"); } else { - config.flags |= BGPIOF_BIG_ENDIAN_BYTE_ORDER; + config.flags |= GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; dev_dbg(dev, "GPIO registers are BIG endian\n"); } diff --git a/drivers/gpio/gpio-mt7621.c b/drivers/gpio/gpio-mt7621.c index e7bb9b2cd6cf..91230be51587 100644 --- a/drivers/gpio/gpio-mt7621.c +++ b/drivers/gpio/gpio-mt7621.c @@ -242,7 +242,7 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) .set = set, .clr = ctrl, .dirout = diro, - .flags = BGPIOF_NO_SET_ON_INPUT, + .flags = GPIO_GENERIC_NO_SET_ON_INPUT, }; ret = gpio_generic_chip_init(&rg->chip, &config); diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c index 433cbadc3a4c..52060b3ec745 100644 --- a/drivers/gpio/gpio-mxc.c +++ b/drivers/gpio/gpio-mxc.c @@ -481,7 +481,7 @@ static int mxc_gpio_probe(struct platform_device *pdev) config.dat = port->base + GPIO_PSR; config.set = port->base + GPIO_DR; config.dirout = port->base + GPIO_GDIR; - config.flags = BGPIOF_READ_OUTPUT_REG_SET; + config.flags = GPIO_GENERIC_READ_OUTPUT_REG_SET; err = gpio_generic_chip_init(&port->gen_gc, &config); if (err) diff --git a/drivers/gpio/gpio-rda.c b/drivers/gpio/gpio-rda.c index fb479d13eb01..7bbc6f0ce4c8 100644 --- a/drivers/gpio/gpio-rda.c +++ b/drivers/gpio/gpio-rda.c @@ -245,7 +245,7 @@ static int rda_gpio_probe(struct platform_device *pdev) .clr = rda_gpio->base + RDA_GPIO_CLR, .dirout = rda_gpio->base + RDA_GPIO_OEN_SET_OUT, .dirin = rda_gpio->base + RDA_GPIO_OEN_SET_IN, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&rda_gpio->chip, &config); diff --git a/drivers/gpio/gpio-realtek-otto.c b/drivers/gpio/gpio-realtek-otto.c index 37b4f73771e6..de527f4fc6c2 100644 --- a/drivers/gpio/gpio-realtek-otto.c +++ b/drivers/gpio/gpio-realtek-otto.c @@ -395,7 +395,7 @@ static int realtek_gpio_probe(struct platform_device *pdev) ctrl->bank_write = realtek_gpio_bank_write; ctrl->line_imr_pos = realtek_gpio_line_imr_pos; } else { - gen_gc_flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + gen_gc_flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; ctrl->bank_read = realtek_gpio_bank_read_swapped; ctrl->bank_write = realtek_gpio_bank_write_swapped; ctrl->line_imr_pos = realtek_gpio_line_imr_pos_swapped; diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 2ced87ffd3bb..94ef2efbd14f 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -223,7 +223,7 @@ static int sifive_gpio_probe(struct platform_device *pdev) .set = chip->base + SIFIVE_GPIO_OUTPUT_VAL, .dirout = chip->base + SIFIVE_GPIO_OUTPUT_EN, .dirin = chip->base + SIFIVE_GPIO_INPUT_EN, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&chip->gen_gc, &config); diff --git a/drivers/gpio/gpio-spacemit-k1.c b/drivers/gpio/gpio-spacemit-k1.c index a0af23f73281..eb66a15c002f 100644 --- a/drivers/gpio/gpio-spacemit-k1.c +++ b/drivers/gpio/gpio-spacemit-k1.c @@ -197,7 +197,8 @@ static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, .clr = clr, .dirout = dirout, .dirin = dirin, - .flags = BGPIOF_UNREADABLE_REG_SET | BGPIOF_UNREADABLE_REG_DIR, + .flags = GPIO_GENERIC_UNREADABLE_REG_SET | + GPIO_GENERIC_UNREADABLE_REG_DIR, }; /* This registers 32 GPIO lines per bank */ diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index f3590db72b14..aa8586d8a787 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -296,14 +296,14 @@ static int vf610_gpio_probe(struct platform_device *pdev) } gc = &port->chip.gc; - flags = BGPIOF_PINCTRL_BACKEND; + flags = GPIO_GENERIC_PINCTRL_BACKEND; /* * We only read the output register for current value on output * lines if the direction register is available so we can switch * direction. */ if (port->sdata->have_paddr) - flags |= BGPIOF_READ_OUTPUT_REG_SET; + flags |= GPIO_GENERIC_READ_OUTPUT_REG_SET; config = (struct gpio_generic_chip_config) { .dev = dev, diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c index c2ca71ebb973..10765f19b48b 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c @@ -1842,7 +1842,7 @@ static int npcm7xx_gpio_of(struct npcm7xx_pinctrl *pctrl) .dat = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DIN, .set = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DOUT, .dirin = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_IEM, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&pctrl->gpio_bank[id].chip, &config); diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c index 0f155a685bba..1005b464a469 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c @@ -2335,7 +2335,7 @@ static int npcm8xx_gpio_fw(struct npcm8xx_pinctrl *pctrl) .dat = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DIN, .set = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DOUT, .dirin = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_IEM, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&pctrl->gpio_bank[id].chip, &config); diff --git a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c index 4dd8a3daa83e..c575949e42e6 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c +++ b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c @@ -1061,7 +1061,7 @@ static int wpcm450_gpio_register(struct platform_device *pdev, set = pctrl->gpio_base + bank->dataout; dirout = pctrl->gpio_base + bank->cfg0; } else { - flags = BGPIOF_NO_OUTPUT; + flags = GPIO_GENERIC_NO_OUTPUT; } config = (typeof(config)){ diff --git a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c index dea49b9aabf2..971959a75b0c 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c @@ -648,7 +648,7 @@ static int stm32_hdp_probe(struct platform_device *pdev) .dat = hdp->base + HDP_GPOVAL, .set = hdp->base + HDP_GPOSET, .clr = hdp->base + HDP_GPOCLR, - .flags = BGPIOF_NO_INPUT, + .flags = GPIO_GENERIC_NO_INPUT, }; err = gpio_generic_chip_init(&hdp->gpio_chip, &config); diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 9b14fd20f13e..e62622e42cad 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -684,15 +684,15 @@ int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -#define BGPIOF_BIG_ENDIAN BIT(0) -#define BGPIOF_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ -#define BGPIOF_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ -#define BGPIOF_BIG_ENDIAN_BYTE_ORDER BIT(3) -#define BGPIOF_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ -#define BGPIOF_NO_OUTPUT BIT(5) /* only input */ -#define BGPIOF_NO_SET_ON_INPUT BIT(6) -#define BGPIOF_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ -#define BGPIOF_NO_INPUT BIT(8) /* only output */ +#define GPIO_GENERIC_BIG_ENDIAN BIT(0) +#define GPIO_GENERIC_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ +#define GPIO_GENERIC_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ +#define GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER BIT(3) +#define GPIO_GENERIC_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ +#define GPIO_GENERIC_NO_OUTPUT BIT(5) /* only input */ +#define GPIO_GENERIC_NO_SET_ON_INPUT BIT(6) +#define GPIO_GENERIC_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ +#define GPIO_GENERIC_NO_INPUT BIT(8) /* only output */ #ifdef CONFIG_GPIOLIB_IRQCHIP int gpiochip_irqchip_add_domain(struct gpio_chip *gc, -- cgit v1.2.3 From 2235b26c1b25daf253748acff501af3ea85faaa8 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 17 Sep 2025 10:54:06 +0200 Subject: gpio: generic: move GPIO_GENERIC_ flags to the correct header These flags are specific to gpio-mmio and belong in linux/gpio/generic.h so move them there. Link: https://lore.kernel.org/r/20250917-gpio-generic-flags-v1-2-69f51fee8c89@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 10 ---------- include/linux/gpio/generic.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index e62622e42cad..fabe2baf7b50 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -684,16 +684,6 @@ int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -#define GPIO_GENERIC_BIG_ENDIAN BIT(0) -#define GPIO_GENERIC_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ -#define GPIO_GENERIC_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ -#define GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER BIT(3) -#define GPIO_GENERIC_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ -#define GPIO_GENERIC_NO_OUTPUT BIT(5) /* only input */ -#define GPIO_GENERIC_NO_SET_ON_INPUT BIT(6) -#define GPIO_GENERIC_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ -#define GPIO_GENERIC_NO_INPUT BIT(8) /* only output */ - #ifdef CONFIG_GPIOLIB_IRQCHIP int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain); diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index 162430d96660..ff566dc9c3cb 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -9,6 +9,16 @@ struct device; +#define GPIO_GENERIC_BIG_ENDIAN BIT(0) +#define GPIO_GENERIC_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ +#define GPIO_GENERIC_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ +#define GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER BIT(3) +#define GPIO_GENERIC_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ +#define GPIO_GENERIC_NO_OUTPUT BIT(5) /* only input */ +#define GPIO_GENERIC_NO_SET_ON_INPUT BIT(6) +#define GPIO_GENERIC_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ +#define GPIO_GENERIC_NO_INPUT BIT(8) /* only output */ + /** * struct gpio_generic_chip_config - Generic GPIO chip configuration data * @dev: Parent device of the new GPIO chip (compulsory). -- cgit v1.2.3 From 8327bd4fcb6c1dab01ce5c6ff00b42496836dcd2 Mon Sep 17 00:00:00 2001 From: Varad Gautam Date: Sun, 30 Mar 2025 16:42:29 +0000 Subject: asm-generic/io.h: Skip trace helpers if rwmmio events are disabled With `CONFIG_TRACE_MMIO_ACCESS=y`, the `{read,write}{b,w,l,q}{_relaxed}()` mmio accessors unconditionally call `log_{post_}{read,write}_mmio()` helpers, which in turn call the ftrace ops for `rwmmio` trace events This adds a performance penalty per mmio accessor call, even when `rwmmio` events are disabled at runtime (~80% overhead on local measurement). Guard these with `tracepoint_enabled()`. Signed-off-by: Varad Gautam Fixes: 210031971cdd ("asm-generic/io: Add logging support for MMIO accessors") Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann --- include/asm-generic/io.h | 98 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 11abad6c87e1..ca5a1ce6f0f8 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -75,6 +75,7 @@ #if IS_ENABLED(CONFIG_TRACE_MMIO_ACCESS) && !(defined(__DISABLE_TRACE_MMIO__)) #include +#define rwmmio_tracepoint_enabled(tracepoint) tracepoint_enabled(tracepoint) DECLARE_TRACEPOINT(rwmmio_write); DECLARE_TRACEPOINT(rwmmio_post_write); DECLARE_TRACEPOINT(rwmmio_read); @@ -91,6 +92,7 @@ void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr, #else +#define rwmmio_tracepoint_enabled(tracepoint) false static inline void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr, unsigned long caller_addr, unsigned long caller_addr0) {} static inline void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr, @@ -189,11 +191,13 @@ static inline u8 readb(const volatile void __iomem *addr) { u8 val; - log_read_mmio(8, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_read)) + log_read_mmio(8, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __raw_readb(addr); __io_ar(val); - log_post_read_mmio(val, 8, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_read)) + log_post_read_mmio(val, 8, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -204,11 +208,13 @@ static inline u16 readw(const volatile void __iomem *addr) { u16 val; - log_read_mmio(16, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_read)) + log_read_mmio(16, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __le16_to_cpu((__le16 __force)__raw_readw(addr)); __io_ar(val); - log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_read)) + log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -219,11 +225,13 @@ static inline u32 readl(const volatile void __iomem *addr) { u32 val; - log_read_mmio(32, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_read)) + log_read_mmio(32, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __le32_to_cpu((__le32 __force)__raw_readl(addr)); __io_ar(val); - log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_read)) + log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -235,11 +243,13 @@ static inline u64 readq(const volatile void __iomem *addr) { u64 val; - log_read_mmio(64, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_read)) + log_read_mmio(64, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __le64_to_cpu((__le64 __force)__raw_readq(addr)); __io_ar(val); - log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_read)) + log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -249,11 +259,13 @@ static inline u64 readq(const volatile void __iomem *addr) #define writeb writeb static inline void writeb(u8 value, volatile void __iomem *addr) { - log_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_write)) + log_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writeb(value, addr); __io_aw(); - log_post_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_write)) + log_post_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); } #endif @@ -261,11 +273,13 @@ static inline void writeb(u8 value, volatile void __iomem *addr) #define writew writew static inline void writew(u16 value, volatile void __iomem *addr) { - log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_write)) + log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writew((u16 __force)cpu_to_le16(value), addr); __io_aw(); - log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_write)) + log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); } #endif @@ -273,11 +287,13 @@ static inline void writew(u16 value, volatile void __iomem *addr) #define writel writel static inline void writel(u32 value, volatile void __iomem *addr) { - log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_write)) + log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writel((u32 __force)__cpu_to_le32(value), addr); __io_aw(); - log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_write)) + log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); } #endif @@ -286,11 +302,13 @@ static inline void writel(u32 value, volatile void __iomem *addr) #define writeq writeq static inline void writeq(u64 value, volatile void __iomem *addr) { - log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_write)) + log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writeq((u64 __force)__cpu_to_le64(value), addr); __io_aw(); - log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_write)) + log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); } #endif #endif /* CONFIG_64BIT */ @@ -306,9 +324,11 @@ static inline u8 readb_relaxed(const volatile void __iomem *addr) { u8 val; - log_read_mmio(8, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_read)) + log_read_mmio(8, addr, _THIS_IP_, _RET_IP_); val = __raw_readb(addr); - log_post_read_mmio(val, 8, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_read)) + log_post_read_mmio(val, 8, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -319,9 +339,11 @@ static inline u16 readw_relaxed(const volatile void __iomem *addr) { u16 val; - log_read_mmio(16, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_read)) + log_read_mmio(16, addr, _THIS_IP_, _RET_IP_); val = __le16_to_cpu((__le16 __force)__raw_readw(addr)); - log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_read)) + log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -332,9 +354,11 @@ static inline u32 readl_relaxed(const volatile void __iomem *addr) { u32 val; - log_read_mmio(32, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_read)) + log_read_mmio(32, addr, _THIS_IP_, _RET_IP_); val = __le32_to_cpu((__le32 __force)__raw_readl(addr)); - log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_read)) + log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -345,9 +369,11 @@ static inline u64 readq_relaxed(const volatile void __iomem *addr) { u64 val; - log_read_mmio(64, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_read)) + log_read_mmio(64, addr, _THIS_IP_, _RET_IP_); val = __le64_to_cpu((__le64 __force)__raw_readq(addr)); - log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_read)) + log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -356,9 +382,11 @@ static inline u64 readq_relaxed(const volatile void __iomem *addr) #define writeb_relaxed writeb_relaxed static inline void writeb_relaxed(u8 value, volatile void __iomem *addr) { - log_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_write)) + log_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); __raw_writeb(value, addr); - log_post_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_write)) + log_post_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); } #endif @@ -366,9 +394,11 @@ static inline void writeb_relaxed(u8 value, volatile void __iomem *addr) #define writew_relaxed writew_relaxed static inline void writew_relaxed(u16 value, volatile void __iomem *addr) { - log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_write)) + log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); __raw_writew((u16 __force)cpu_to_le16(value), addr); - log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_write)) + log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); } #endif @@ -376,9 +406,11 @@ static inline void writew_relaxed(u16 value, volatile void __iomem *addr) #define writel_relaxed writel_relaxed static inline void writel_relaxed(u32 value, volatile void __iomem *addr) { - log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_write)) + log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); __raw_writel((u32 __force)__cpu_to_le32(value), addr); - log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_write)) + log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); } #endif @@ -386,9 +418,11 @@ static inline void writel_relaxed(u32 value, volatile void __iomem *addr) #define writeq_relaxed writeq_relaxed static inline void writeq_relaxed(u64 value, volatile void __iomem *addr) { - log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_write)) + log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); __raw_writeq((u64 __force)__cpu_to_le64(value), addr); - log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); + if (rwmmio_tracepoint_enabled(rwmmio_post_write)) + log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); } #endif -- cgit v1.2.3 From 1c1658058c99bcfd3b2347e587a556986037f80a Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Wed, 17 Sep 2025 20:10:35 +0200 Subject: hwmon: (dell-smm) Add support for automatic fan mode Many machines treat fan state 3 as some sort of automatic mode, which is superior to the separate SMM calls for switching to automatic fan mode for two reasons: - the fan control mode can be controlled for each fan separately - the current fan control mode can be retrieved from the BIOS On some machines however, this special fan state does not exist. Fan state 3 acts like a regular fan state on such machines or does not exist at all. Such machines usually use separate SMM calls for enabling/disabling automatic fan control. Add support for it. If the machine supports separate SMM calls for changing the fan control mode, then the other interface is ignored. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20250917181036.10972-4-W_Armin@gmx.de Signed-off-by: Guenter Roeck --- Documentation/hwmon/dell-smm-hwmon.rst | 56 +++++++++++++++---------- drivers/hwmon/dell-smm-hwmon.c | 74 ++++++++++++++++++++++++++++------ include/uapi/linux/i8k.h | 2 + 3 files changed, 98 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/Documentation/hwmon/dell-smm-hwmon.rst b/Documentation/hwmon/dell-smm-hwmon.rst index 5a4edb6565cf..3e4e2d916ac5 100644 --- a/Documentation/hwmon/dell-smm-hwmon.rst +++ b/Documentation/hwmon/dell-smm-hwmon.rst @@ -38,7 +38,7 @@ fan[1-4]_min RO Minimal Fan speed in RPM fan[1-4]_max RO Maximal Fan speed in RPM fan[1-4]_target RO Expected Fan speed in RPM pwm[1-4] RW Control the fan PWM duty-cycle. -pwm1_enable WO Enable or disable automatic BIOS fan +pwm[1-4]_enable RW/WO Enable or disable automatic BIOS fan control (not supported on all laptops, see below for details). temp[1-10]_input RO Temperature reading in milli-degrees @@ -49,26 +49,40 @@ temp[1-10]_label RO Temperature sensor label. Due to the nature of the SMM interface, each pwmX attribute controls fan number X. -Disabling automatic BIOS fan control ------------------------------------- - -On some laptops the BIOS automatically sets fan speed every few -seconds. Therefore the fan speed set by mean of this driver is quickly -overwritten. - -There is experimental support for disabling automatic BIOS fan -control, at least on laptops where the corresponding SMM command is -known, by writing the value ``1`` in the attribute ``pwm1_enable`` -(writing ``2`` enables automatic BIOS control again). Even if you have -more than one fan, all of them are set to either enabled or disabled -automatic fan control at the same time and, notwithstanding the name, -``pwm1_enable`` sets automatic control for all fans. - -If ``pwm1_enable`` is not available, then it means that SMM codes for -enabling and disabling automatic BIOS fan control are not whitelisted -for your hardware. It is possible that codes that work for other -laptops actually work for yours as well, or that you have to discover -new codes. +Enabling/Disabling automatic BIOS fan control +--------------------------------------------- + +There exist two methods for enabling/disabling automatic BIOS fan control: + +1. Separate SMM commands to enable/disable automatic BIOS fan control for all fans. + +2. A special fan state that enables automatic BIOS fan control for a individual fan. + +The driver cannot reliably detect what method should be used on a given +device, so instead the following heuristic is used: + +- use fan state 3 for enabling BIOS fan control if the maximum fan state + setable by the user is smaller than 3 (default setting). + +- use separate SMM commands if device is whitelisted to support them. + +When using the first method, each fan will have a standard ``pwmX_enable`` +sysfs attribute. Writing ``1`` into this attribute will disable automatic +BIOS fan control for the associated fan and set it to maximum speed. Enabling +BIOS fan control again can be achieved by writing ``2`` into this attribute. +Reading this sysfs attributes returns the current setting as reported by +the underlying hardware. + +When using the second method however, only the ``pwm1_enable`` sysfs attribute +will be available to enable/disable automatic BIOS fan control globaly for all +fans available on a given device. Additionally, this sysfs attribute is write-only +as there exists no SMM command for reading the current fan control setting. + +If no ``pwmX_enable`` attributes are available, then it means that the driver +cannot use the first method and the SMM codes for enabling and disabling automatic +BIOS fan control are not whitelisted for your device. It is possible that codes +that work for other laptops actually work for yours as well, or that you have to +discover new codes. Check the list ``i8k_whitelist_fan_control`` in file ``drivers/hwmon/dell-smm-hwmon.c`` in the kernel tree: as a first diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 36576db09706..79befa13b699 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -764,6 +764,13 @@ static int dell_smm_get_cur_state(struct thermal_cooling_device *dev, unsigned l if (ret < 0) return ret; + /* + * A fan state bigger than i8k_fan_max might indicate that + * the fan is currently in automatic mode. + */ + if (ret > cdata->data->i8k_fan_max) + return -ENODATA; + *state = ret; return 0; @@ -851,7 +858,14 @@ static umode_t dell_smm_is_visible(const void *drvdata, enum hwmon_sensor_types break; case hwmon_pwm_enable: - if (auto_fan) + if (auto_fan) { + /* + * The setting affects all fans, so only create a + * single attribute. + */ + if (channel != 1) + return 0; + /* * There is no command for retrieve the current status * from BIOS, and userspace/firmware itself can change @@ -859,6 +873,10 @@ static umode_t dell_smm_is_visible(const void *drvdata, enum hwmon_sensor_types * Thus we can only provide write-only access for now. */ return 0200; + } + + if (data->fan[channel] && data->i8k_fan_max < I8K_FAN_AUTO) + return 0644; break; default: @@ -928,14 +946,28 @@ static int dell_smm_read(struct device *dev, enum hwmon_sensor_types type, u32 a } break; case hwmon_pwm: + ret = i8k_get_fan_status(data, channel); + if (ret < 0) + return ret; + switch (attr) { case hwmon_pwm_input: - ret = i8k_get_fan_status(data, channel); - if (ret < 0) - return ret; + /* + * A fan state bigger than i8k_fan_max might indicate that + * the fan is currently in automatic mode. + */ + if (ret > data->i8k_fan_max) + return -ENODATA; *val = clamp_val(ret * data->i8k_pwm_mult, 0, 255); + return 0; + case hwmon_pwm_enable: + if (ret == I8K_FAN_AUTO) + *val = 2; + else + *val = 1; + return 0; default: break; @@ -1022,16 +1054,32 @@ static int dell_smm_write(struct device *dev, enum hwmon_sensor_types type, u32 return 0; case hwmon_pwm_enable: - if (!val) - return -EINVAL; - - if (val == 1) + switch (val) { + case 1: enable = false; - else + break; + case 2: enable = true; + break; + default: + return -EINVAL; + } mutex_lock(&data->i8k_mutex); - err = i8k_enable_fan_auto_mode(data, enable); + if (auto_fan) { + err = i8k_enable_fan_auto_mode(data, enable); + } else { + /* + * When putting the fan into manual control mode we have to ensure + * that the device does not overheat until the userspace fan control + * software takes over. Because of this we set the fan speed to + * i8k_fan_max when disabling automatic fan control. + */ + if (enable) + err = i8k_set_fan(data, channel, I8K_FAN_AUTO); + else + err = i8k_set_fan(data, channel, data->i8k_fan_max); + } mutex_unlock(&data->i8k_mutex); if (err < 0) @@ -1082,9 +1130,9 @@ static const struct hwmon_channel_info * const dell_smm_info[] = { ), HWMON_CHANNEL_INFO(pwm, HWMON_PWM_INPUT | HWMON_PWM_ENABLE, - HWMON_PWM_INPUT, - HWMON_PWM_INPUT, - HWMON_PWM_INPUT + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE ), NULL }; diff --git a/include/uapi/linux/i8k.h b/include/uapi/linux/i8k.h index 268e6268f6c8..a16e4049710f 100644 --- a/include/uapi/linux/i8k.h +++ b/include/uapi/linux/i8k.h @@ -36,6 +36,8 @@ #define I8K_FAN_LOW 1 #define I8K_FAN_HIGH 2 #define I8K_FAN_TURBO 3 +/* Many machines treat this mode as some sort of automatic mode */ +#define I8K_FAN_AUTO 3 #define I8K_FAN_MAX I8K_FAN_TURBO #define I8K_VOL_UP 1 -- cgit v1.2.3 From 23049938605bda390f875ce20e0704252c2e5c3d Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:37:10 +0900 Subject: can: populate the minimum and maximum MTU values By populating: net_device->min_mtu and net_device->max_mtu the net core infrastructure will automatically: 1. validate that the user's inputs are in range. 2. report those min and max MTU values through the netlink interface. Add can_set_default_mtu() which sets the default mtu value as well as the minimum and maximum values. The logic for the default mtu value remains unchanged: - CANFD_MTU if the device has a static CAN_CTRLMODE_FD. - CAN_MTU otherwise. Call can_set_default_mtu() each time the CAN_CTRLMODE_FD is modified. This will guarantee that the MTU value is always consistent with the control mode flags. With this, the checks done in can_change_mtu() become fully redundant and will be removed in an upcoming change and it is now possible to confirm the minimum and maximum MTU values on a physical CAN interface by doing: $ ip --details link show can0 The virtual interfaces (vcan and vxcan) are not impacted by this change. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-can-fix-mtu-v3-3-581bde113f52@kernel.org [mkl: squashed https://patch.msgid.link/20250924143644.17622-2-mailhol@kernel.org] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 20 ++++++++++++++++++-- drivers/net/can/dev/netlink.c | 9 ++++----- include/linux/can/dev.h | 1 + 3 files changed, 23 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 02bfed37cc93..befdeb4c54c2 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -240,6 +240,8 @@ void can_setup(struct net_device *dev) { dev->type = ARPHRD_CAN; dev->mtu = CAN_MTU; + dev->min_mtu = CAN_MTU; + dev->max_mtu = CAN_MTU; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 10; @@ -309,6 +311,21 @@ void free_candev(struct net_device *dev) } EXPORT_SYMBOL_GPL(free_candev); +void can_set_default_mtu(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + + if (priv->ctrlmode & CAN_CTRLMODE_FD) { + dev->mtu = CANFD_MTU; + dev->min_mtu = CANFD_MTU; + dev->max_mtu = CANFD_MTU; + } else { + dev->mtu = CAN_MTU; + dev->min_mtu = CAN_MTU; + dev->max_mtu = CAN_MTU; + } +} + /* changing MTU and control mode for CAN/CANFD devices */ int can_change_mtu(struct net_device *dev, int new_mtu) { @@ -361,8 +378,7 @@ int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) priv->ctrlmode = static_mode; /* override MTU which was set by default in can_setup()? */ - if (static_mode & CAN_CTRLMODE_FD) - dev->mtu = CANFD_MTU; + can_set_default_mtu(dev); return 0; } diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index d9f6ab3efb97..248f607e3864 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -223,17 +223,16 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], priv->ctrlmode &= ~cm->mask; priv->ctrlmode |= maskedflags; - /* CAN_CTRLMODE_FD can only be set when driver supports FD */ - if (priv->ctrlmode & CAN_CTRLMODE_FD) { - dev->mtu = CANFD_MTU; - } else { - dev->mtu = CAN_MTU; + /* Wipe potential leftovers from previous CAN FD config */ + if (!(priv->ctrlmode & CAN_CTRLMODE_FD)) { memset(&priv->fd.data_bittiming, 0, sizeof(priv->fd.data_bittiming)); priv->ctrlmode &= ~CAN_CTRLMODE_FD_TDC_MASK; memset(&priv->fd.tdc, 0, sizeof(priv->fd.tdc)); } + can_set_default_mtu(dev); + fd_tdc_flag_provided = cm->mask & CAN_CTRLMODE_FD_TDC_MASK; /* CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually * exclusive: make sure to turn the other one off diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 5dc58360c2d7..3354f70ed2c6 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -166,6 +166,7 @@ struct can_priv *safe_candev_priv(struct net_device *dev); int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); +void can_set_default_mtu(struct net_device *dev); int can_change_mtu(struct net_device *dev, int new_mtu); int __must_check can_set_static_ctrlmode(struct net_device *dev, u32 static_mode); -- cgit v1.2.3 From cc470fcf1d59f9d6186810ea5253da49a4f85f83 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:26 +0900 Subject: can: dev: move struct data_bittiming_params to linux/can/bittiming.h In commit b803c4a4f788 ("can: dev: add struct data_bittiming_params to group FD parameters"), struct data_bittiming_params was put into linux/can/dev.h. This structure being a collection of bittiming parameters, on second thought, bittiming.h is actually a better location. This way, users of struct data_bittiming_params will not have to forcefully include linux/can/dev.h thus removing some complexity and reducing the risk of circular dependencies in headers. Move struct data_bittiming_params from linux/can/dev.h to linux/can/bittiming.h. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-1-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 11 +++++++++++ include/linux/can/dev.h | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 5dfdbb63b1d5..6572ec1712ca 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -114,6 +114,17 @@ struct can_tdc_const { u32 tdcf_max; }; +struct data_bittiming_params { + const struct can_bittiming_const *data_bittiming_const; + struct can_bittiming data_bittiming; + const struct can_tdc_const *tdc_const; + struct can_tdc tdc; + const u32 *data_bitrate_const; + unsigned int data_bitrate_const_cnt; + int (*do_set_data_bittiming)(struct net_device *dev); + int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); +}; + #ifdef CONFIG_CAN_CALC_BITTIMING int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc, struct netlink_ext_ack *extack); diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 3354f70ed2c6..c2fe956ab776 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -38,17 +38,6 @@ enum can_termination_gpio { CAN_TERMINATION_GPIO_MAX, }; -struct data_bittiming_params { - const struct can_bittiming_const *data_bittiming_const; - struct can_bittiming data_bittiming; - const struct can_tdc_const *tdc_const; - struct can_tdc tdc; - const u32 *data_bitrate_const; - unsigned int data_bitrate_const_cnt; - int (*do_set_data_bittiming)(struct net_device *dev); - int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); -}; - /* * CAN common private data */ -- cgit v1.2.3 From 7208385df7846d30e29febc6c6280cb32e91ee82 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:27 +0900 Subject: can: dev: make can_get_relative_tdco() FD agnostic and move it to bittiming.h can_get_relative_tdco() needs to access can_priv->fd making it specific to CAN FD. Change the function parameter from struct can_priv to struct data_bittiming_params. This way, the function becomes CAN FD agnostic and can be reused later on for the CAN XL TDC. Now that we dropped the dependency on struct can_priv, also move can_get_relative_tdco() back to bittiming.h where it was meant to belong to. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-2-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 29 +++++++++++++++++++++++++++++ include/linux/can/dev.h | 29 ----------------------------- 2 files changed, 29 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 6572ec1712ca..4d5f7794194a 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -160,6 +160,35 @@ int can_get_bittiming(const struct net_device *dev, struct can_bittiming *bt, const unsigned int bitrate_const_cnt, struct netlink_ext_ack *extack); +/* + * can_get_relative_tdco() - TDCO relative to the sample point + * + * struct can_tdc::tdco represents the absolute offset from TDCV. Some + * controllers use instead an offset relative to the Sample Point (SP) + * such that: + * + * SSP = TDCV + absolute TDCO + * = TDCV + SP + relative TDCO + * + * -+----------- one bit ----------+-- TX pin + * |<--- Sample Point --->| + * + * --+----------- one bit ----------+-- RX pin + * |<-------- TDCV -------->| + * |<------------------------>| absolute TDCO + * |<--- Sample Point --->| + * | |<->| relative TDCO + * |<------------- Secondary Sample Point ------------>| + */ +static inline s32 can_get_relative_tdco(const struct data_bittiming_params *dbt_params) +{ + const struct can_bittiming *dbt = &dbt_params->data_bittiming; + s32 sample_point_in_tc = (CAN_SYNC_SEG + dbt->prop_seg + + dbt->phase_seg1) * dbt->brp; + + return (s32)dbt_params->tdc.tdco - sample_point_in_tc; +} + /* * can_bit_time() - Duration of one bit * diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index c2fe956ab776..8e75e9b3830a 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -85,35 +85,6 @@ static inline bool can_fd_tdc_is_enabled(const struct can_priv *priv) return !!(priv->ctrlmode & CAN_CTRLMODE_FD_TDC_MASK); } -/* - * can_get_relative_tdco() - TDCO relative to the sample point - * - * struct can_tdc::tdco represents the absolute offset from TDCV. Some - * controllers use instead an offset relative to the Sample Point (SP) - * such that: - * - * SSP = TDCV + absolute TDCO - * = TDCV + SP + relative TDCO - * - * -+----------- one bit ----------+-- TX pin - * |<--- Sample Point --->| - * - * --+----------- one bit ----------+-- RX pin - * |<-------- TDCV -------->| - * |<------------------------>| absolute TDCO - * |<--- Sample Point --->| - * | |<->| relative TDCO - * |<------------- Secondary Sample Point ------------>| - */ -static inline s32 can_get_relative_tdco(const struct can_priv *priv) -{ - const struct can_bittiming *dbt = &priv->fd.data_bittiming; - s32 sample_point_in_tc = (CAN_SYNC_SEG + dbt->prop_seg + - dbt->phase_seg1) * dbt->brp; - - return (s32)priv->fd.tdc.tdco - sample_point_in_tc; -} - static inline u32 can_get_static_ctrlmode(struct can_priv *priv) { return priv->ctrlmode & ~priv->ctrlmode_supported; -- cgit v1.2.3 From 94040a8f484576cb1b7df3b2e93118c3b3e3aff4 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:28 +0900 Subject: can: netlink: document which symbols are FD specific The CAN XL netlink interface will also have data bitrate and TDC parameters. The current FD parameters do not have a prefix in their names to differentiate them. Because the netlink interface is part of the UAPI, it is unfortunately not feasible to rename the existing symbols to add an FD_ prefix. The best alternative is to add a comment for each of the symbols to notify the reader of which parts are CAN FD specific. While at it, fix a typo: transiver -> transceiver. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-3-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/netlink.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 02ec32d69474..ef62f56eaaef 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -101,8 +101,8 @@ struct can_ctrlmode { #define CAN_CTRLMODE_PRESUME_ACK 0x40 /* Ignore missing CAN ACKs */ #define CAN_CTRLMODE_FD_NON_ISO 0x80 /* CAN FD in non-ISO mode */ #define CAN_CTRLMODE_CC_LEN8_DLC 0x100 /* Classic CAN DLC option */ -#define CAN_CTRLMODE_TDC_AUTO 0x200 /* CAN transiver automatically calculates TDCV */ -#define CAN_CTRLMODE_TDC_MANUAL 0x400 /* TDCV is manually set up by user */ +#define CAN_CTRLMODE_TDC_AUTO 0x200 /* FD transceiver automatically calculates TDCV */ +#define CAN_CTRLMODE_TDC_MANUAL 0x400 /* FD TDCV is manually set up by user */ /* * CAN device statistics @@ -129,14 +129,14 @@ enum { IFLA_CAN_RESTART_MS, IFLA_CAN_RESTART, IFLA_CAN_BERR_COUNTER, - IFLA_CAN_DATA_BITTIMING, - IFLA_CAN_DATA_BITTIMING_CONST, + IFLA_CAN_DATA_BITTIMING, /* FD */ + IFLA_CAN_DATA_BITTIMING_CONST, /* FD */ IFLA_CAN_TERMINATION, IFLA_CAN_TERMINATION_CONST, IFLA_CAN_BITRATE_CONST, - IFLA_CAN_DATA_BITRATE_CONST, + IFLA_CAN_DATA_BITRATE_CONST, /* FD */ IFLA_CAN_BITRATE_MAX, - IFLA_CAN_TDC, + IFLA_CAN_TDC, /* FD */ IFLA_CAN_CTRLMODE_EXT, /* add new constants above here */ @@ -145,7 +145,7 @@ enum { }; /* - * CAN FD Transmitter Delay Compensation (TDC) + * CAN FD/XL Transmitter Delay Compensation (TDC) * * Please refer to struct can_tdc_const and can_tdc in * include/linux/can/bittiming.h for further details. -- cgit v1.2.3 From b23a8425cba5d7908d69f3bce8f3c697362b50ae Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:30 +0900 Subject: can: netlink: add can_validate_tdc() Factorise the TDC validation out of can_validate() and move it in the new can_validate_tdc() function. This is a preparation patch for the introduction of CAN XL because this TDC validation will be reused later on. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-5-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/netlink.c | 82 +++++++++++++++++++++++++------------------ include/linux/can/bittiming.h | 4 +++ 2 files changed, 52 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index 13555253e789..25c08adee9ad 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -57,6 +57,49 @@ static int can_validate_bittiming(struct nlattr *data[], return 0; } +static int can_validate_tdc(struct nlattr *data_tdc, + struct netlink_ext_ack *extack, u32 tdc_flags) +{ + bool tdc_manual = tdc_flags & CAN_CTRLMODE_TDC_MANUAL_MASK; + bool tdc_auto = tdc_flags & CAN_CTRLMODE_TDC_AUTO_MASK; + int err; + + /* CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually exclusive */ + if (tdc_auto && tdc_manual) + return -EOPNOTSUPP; + + /* If one of the CAN_CTRLMODE_TDC_* flag is set then TDC + * must be set and vice-versa + */ + if ((tdc_auto || tdc_manual) != !!data_tdc) + return -EOPNOTSUPP; + + /* If providing TDC parameters, at least TDCO is needed. TDCV + * is needed if and only if CAN_CTRLMODE_TDC_MANUAL is set + */ + if (data_tdc) { + struct nlattr *tb_tdc[IFLA_CAN_TDC_MAX + 1]; + + err = nla_parse_nested(tb_tdc, IFLA_CAN_TDC_MAX, + data_tdc, can_tdc_policy, extack); + if (err) + return err; + + if (tb_tdc[IFLA_CAN_TDC_TDCV]) { + if (tdc_auto) + return -EOPNOTSUPP; + } else { + if (tdc_manual) + return -EOPNOTSUPP; + } + + if (!tb_tdc[IFLA_CAN_TDC_TDCO]) + return -EOPNOTSUPP; + } + + return 0; +} + static int can_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -67,7 +110,7 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[], * - nominal/arbitration bittiming * - data bittiming * - control mode with CAN_CTRLMODE_FD set - * - TDC parameters are coherent (details below) + * - TDC parameters are coherent (details in can_validate_tdc()) */ if (!data) @@ -75,42 +118,13 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[], if (data[IFLA_CAN_CTRLMODE]) { struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]); - u32 tdc_flags = cm->flags & CAN_CTRLMODE_FD_TDC_MASK; is_can_fd = cm->flags & cm->mask & CAN_CTRLMODE_FD; - /* CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually exclusive */ - if (tdc_flags == CAN_CTRLMODE_FD_TDC_MASK) - return -EOPNOTSUPP; - /* If one of the CAN_CTRLMODE_TDC_* flag is set then - * TDC must be set and vice-versa - */ - if (!!tdc_flags != !!data[IFLA_CAN_TDC]) - return -EOPNOTSUPP; - /* If providing TDC parameters, at least TDCO is - * needed. TDCV is needed if and only if - * CAN_CTRLMODE_TDC_MANUAL is set - */ - if (data[IFLA_CAN_TDC]) { - struct nlattr *tb_tdc[IFLA_CAN_TDC_MAX + 1]; - - err = nla_parse_nested(tb_tdc, IFLA_CAN_TDC_MAX, - data[IFLA_CAN_TDC], - can_tdc_policy, extack); - if (err) - return err; - - if (tb_tdc[IFLA_CAN_TDC_TDCV]) { - if (tdc_flags & CAN_CTRLMODE_TDC_AUTO) - return -EOPNOTSUPP; - } else { - if (tdc_flags & CAN_CTRLMODE_TDC_MANUAL) - return -EOPNOTSUPP; - } - - if (!tb_tdc[IFLA_CAN_TDC_TDCO]) - return -EOPNOTSUPP; - } + err = can_validate_tdc(data[IFLA_CAN_TDC], extack, + cm->flags & CAN_CTRLMODE_FD_TDC_MASK); + if (err) + return err; } err = can_validate_bittiming(data, extack, IFLA_CAN_BITTIMING); diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 4d5f7794194a..71f839c3f032 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -16,6 +16,10 @@ #define CAN_CTRLMODE_FD_TDC_MASK \ (CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL) +#define CAN_CTRLMODE_TDC_AUTO_MASK \ + (CAN_CTRLMODE_TDC_AUTO) +#define CAN_CTRLMODE_TDC_MANUAL_MASK \ + (CAN_CTRLMODE_TDC_MANUAL) /* * struct can_tdc - CAN FD Transmission Delay Compensation parameters -- cgit v1.2.3 From 6ffc1230d3a728e07d7d2464f388ad4bbefe90c2 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:43 +0900 Subject: can: calc_bittiming: make can_calc_tdco() FD agnostic can_calc_tdco() uses the CAN_CTRLMODE_FD_TDC_MASK and CAN_CTRLMODE_TDC_AUTO macros making it specific to CAN FD. Add the tdc mask to the function parameter list. The value of the tdc auto flag can then be derived from that mask and stored in a local variable. This way, the function becomes CAN FD agnostic and can be reused later on for the CAN XL TDC. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-18-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/calc_bittiming.c | 10 ++++++---- drivers/net/can/dev/netlink.c | 2 +- include/linux/can/bittiming.h | 4 ++-- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/can/dev/calc_bittiming.c b/drivers/net/can/dev/calc_bittiming.c index a94bd67c670c..394d6974f481 100644 --- a/drivers/net/can/dev/calc_bittiming.c +++ b/drivers/net/can/dev/calc_bittiming.c @@ -173,13 +173,15 @@ int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, const struct can_bittiming *dbt, - u32 *ctrlmode, u32 ctrlmode_supported) + u32 tdc_mask, u32 *ctrlmode, u32 ctrlmode_supported) { - if (!tdc_const || !(ctrlmode_supported & CAN_CTRLMODE_TDC_AUTO)) + u32 tdc_auto = tdc_mask & CAN_CTRLMODE_TDC_AUTO_MASK; + + if (!tdc_const || !(ctrlmode_supported & tdc_auto)) return; - *ctrlmode &= ~CAN_CTRLMODE_FD_TDC_MASK; + *ctrlmode &= ~tdc_mask; /* As specified in ISO 11898-1 section 11.3.3 "Transmitter * delay compensation" (TDC) is only applicable if data BRP is @@ -193,6 +195,6 @@ void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, if (sample_point_in_tc < tdc_const->tdco_min) return; tdc->tdco = min(sample_point_in_tc, tdc_const->tdco_max); - *ctrlmode |= CAN_CTRLMODE_TDC_AUTO; + *ctrlmode |= tdc_auto; } } diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index 99038e0fb25f..92d8df13e886 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -341,7 +341,7 @@ static int can_dbt_changelink(struct net_device *dev, struct nlattr *data[], * do calculation */ can_calc_tdco(&dbt_params->tdc, dbt_params->tdc_const, &dbt, - &priv->ctrlmode, priv->ctrlmode_supported); + tdc_mask, &priv->ctrlmode, priv->ctrlmode_supported); } /* else: both CAN_CTRLMODE_TDC_{AUTO,MANUAL} are explicitly * turned off. TDC is disabled: do nothing */ diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 71f839c3f032..d30816dd93c7 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -135,7 +135,7 @@ int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, const struct can_bittiming *dbt, - u32 *ctrlmode, u32 ctrlmode_supported); + u32 tdc_mask, u32 *ctrlmode, u32 ctrlmode_supported); #else /* !CONFIG_CAN_CALC_BITTIMING */ static inline int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, @@ -148,7 +148,7 @@ can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, static inline void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, const struct can_bittiming *dbt, - u32 *ctrlmode, u32 ctrlmode_supported) + u32 tdc_mask, u32 *ctrlmode, u32 ctrlmode_supported) { } #endif /* CONFIG_CAN_CALC_BITTIMING */ -- cgit v1.2.3 From 7de54546fff11cb0a53f47847d62f7b1a5792d17 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:44 +0900 Subject: can: dev: add can_get_ctrlmode_str() In an effort to give more human readable messages when errors occur because of conflicting options, it can be useful to convert the CAN control mode flags into text. Add a function which converts the first set CAN control mode into a human readable string. The reason to only convert the first one is to simplify edge cases: imagine that there are several invalid control modes, we would just return the first invalid one to the user, thus not having to handle complex string concatenation. The user can then solve the first problem, call the netlink interface again and see the next issue. People who wish to enumerate all the control modes can still do so by, for example, using this new function in a for_each_set_bit() loop. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-19-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 33 +++++++++++++++++++++++++++++++++ include/linux/can/dev.h | 2 ++ 2 files changed, 35 insertions(+) (limited to 'include') diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index befdeb4c54c2..15ccedbb3f8d 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -88,6 +88,39 @@ const char *can_get_state_str(const enum can_state state) } EXPORT_SYMBOL_GPL(can_get_state_str); +const char *can_get_ctrlmode_str(u32 ctrlmode) +{ + switch (ctrlmode & ~(ctrlmode - 1)) { + case 0: + return "none"; + case CAN_CTRLMODE_LOOPBACK: + return "loopback"; + case CAN_CTRLMODE_LISTENONLY: + return "listen-only"; + case CAN_CTRLMODE_3_SAMPLES: + return "triple-sampling"; + case CAN_CTRLMODE_ONE_SHOT: + return "one-shot"; + case CAN_CTRLMODE_BERR_REPORTING: + return "berr-reporting"; + case CAN_CTRLMODE_FD: + return "fd"; + case CAN_CTRLMODE_PRESUME_ACK: + return "presume-ack"; + case CAN_CTRLMODE_FD_NON_ISO: + return "fd-non-iso"; + case CAN_CTRLMODE_CC_LEN8_DLC: + return "cc-len8-dlc"; + case CAN_CTRLMODE_TDC_AUTO: + return "fd-tdc-auto"; + case CAN_CTRLMODE_TDC_MANUAL: + return "fd-tdc-manual"; + default: + return ""; + } +} +EXPORT_SYMBOL_GPL(can_get_ctrlmode_str); + static enum can_state can_state_err_to_state(u16 err) { if (err < CAN_ERROR_WARNING_THRESHOLD) diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 8e75e9b3830a..a2229a61ccde 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -141,6 +141,8 @@ int can_restart_now(struct net_device *dev); void can_bus_off(struct net_device *dev); const char *can_get_state_str(const enum can_state state); +const char *can_get_ctrlmode_str(u32 ctrlmode); + void can_state_get_by_berr_counter(const struct net_device *dev, const struct can_berr_counter *bec, enum can_state *tx_state, -- cgit v1.2.3 From 3e86e4d74c0490e5fc5a7f8de8f29e7579c9ffe5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 18 Sep 2025 10:05:47 +0200 Subject: kbuild: keep .modinfo section in vmlinux.unstripped Keep the .modinfo section during linking, but strip it from the final vmlinux. Adjust scripts/mksysmap to exclude modinfo symbols from kallsyms. This change will allow the next commit to extract the .modinfo section from the vmlinux.unstripped intermediate. Signed-off-by: Masahiro Yamada Signed-off-by: Alexey Gladkov Reviewed-by: Nicolas Schier Link: https://patch.msgid.link/aaf67c07447215463300fccaa758904bac42f992.1758182101.git.legion@kernel.org Signed-off-by: Nathan Chancellor --- include/asm-generic/vmlinux.lds.h | 2 +- scripts/Makefile.vmlinux | 7 +++++-- scripts/mksysmap | 3 +++ 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ae2d2359b79e..cfa63860dfd4 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -831,6 +831,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) /* Required sections not related to debugging. */ #define ELF_DETAILS \ + .modinfo : { *(.modinfo) } \ .comment 0 : { *(.comment) } \ .symtab 0 : { *(.symtab) } \ .strtab 0 : { *(.strtab) } \ @@ -1044,7 +1045,6 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) *(.discard.*) \ *(.export_symbol) \ *(.no_trim_symbol) \ - *(.modinfo) \ /* ld.bfd warns about .gnu.version* even when not emitted */ \ *(.gnu.version*) \ diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index 4f2d4c3fb737..70856dab0f54 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -86,11 +86,14 @@ endif # vmlinux # --------------------------------------------------------------------------- -remove-section-y := +remove-section-y := .modinfo remove-section-$(CONFIG_ARCH_VMLINUX_NEEDS_RELOCS) += '.rel*' +# To avoid warnings: "empty loadable segment detected at ..." from GNU objcopy, +# it is necessary to remove the PT_LOAD flag from the segment. quiet_cmd_strip_relocs = OBJCOPY $@ - cmd_strip_relocs = $(OBJCOPY) $(addprefix --remove-section=,$(remove-section-y)) $< $@ + cmd_strip_relocs = $(OBJCOPY) $(patsubst %,--set-section-flags %=noload,$(remove-section-y)) $< $@; \ + $(OBJCOPY) $(addprefix --remove-section=,$(remove-section-y)) $@ targets += vmlinux vmlinux: vmlinux.unstripped FORCE diff --git a/scripts/mksysmap b/scripts/mksysmap index 3accbdb269ac..a607a0059d11 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -79,6 +79,9 @@ / _SDA_BASE_$/d / _SDA2_BASE_$/d +# MODULE_INFO() +/ __UNIQUE_ID_modinfo[0-9]*$/d + # --------------------------------------------------------------------------- # Ignored patterns # (symbols that contain the pattern are ignored) -- cgit v1.2.3 From 83fb49389bbe07defb85b063f7ff0fd016f06b35 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 18 Sep 2025 10:05:50 +0200 Subject: modpost: Add modname to mod_device_table alias At this point, if a symbol is compiled as part of the kernel, information about which module the symbol belongs to is lost. To save this it is possible to add the module name to the alias name. It's not very pretty, but it's possible for now. Cc: Miguel Ojeda Cc: Andreas Hindborg Cc: Danilo Krummrich Cc: Alex Gaynor Cc: rust-for-linux@vger.kernel.org Signed-off-by: Alexey Gladkov Acked-by: Danilo Krummrich Acked-by: Nicolas Schier Link: https://patch.msgid.link/1a0d0bd87a4981d465b9ed21e14f4e78eaa03ded.1758182101.git.legion@kernel.org Signed-off-by: Nathan Chancellor --- include/linux/module.h | 14 +++++++++++++- rust/kernel/device_id.rs | 8 ++++---- scripts/mod/file2alias.c | 15 ++++++++++++--- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index 3319a5269d28..e31ee29fac6b 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -244,10 +244,22 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name); /* What your module does. */ #define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description) +/* + * Format: __mod_device_table__kmod_____ + * Parts of the string `__kmod_` and `__` are used as delimiters when parsing + * a symbol in file2alias.c + */ +#define __mod_device_table(type, name) \ + __PASTE(__mod_device_table__, \ + __PASTE(__KBUILD_MODNAME, \ + __PASTE(__, \ + __PASTE(type, \ + __PASTE(__, name))))) + #ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ -static typeof(name) __mod_device_table__##type##__##name \ +static typeof(name) __mod_device_table(type, name) \ __attribute__ ((used, alias(__stringify(name)))) #else /* !MODULE */ #define MODULE_DEVICE_TABLE(type, name) diff --git a/rust/kernel/device_id.rs b/rust/kernel/device_id.rs index 70d57814ff79..62c42da12e9d 100644 --- a/rust/kernel/device_id.rs +++ b/rust/kernel/device_id.rs @@ -195,10 +195,10 @@ macro_rules! module_device_table { ($table_type: literal, $module_table_name:ident, $table_name:ident) => { #[rustfmt::skip] #[export_name = - concat!("__mod_device_table__", $table_type, - "__", module_path!(), - "_", line!(), - "_", stringify!($table_name)) + concat!("__mod_device_table__", line!(), + "__kmod_", module_path!(), + "__", $table_type, + "__", stringify!($table_name)) ] static $module_table_name: [::core::mem::MaybeUninit; $table_name.raw_ids().size()] = unsafe { ::core::mem::transmute_copy($table_name.raw_ids()) }; diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 00586119a25b..1260bc2287fb 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1476,7 +1476,7 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, { void *symval; char *zeros = NULL; - const char *type, *name; + const char *type, *name, *modname; size_t typelen; static const char *prefix = "__mod_device_table__"; @@ -1488,10 +1488,19 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, if (ELF_ST_TYPE(sym->st_info) != STT_OBJECT) return; - /* All our symbols are of form __mod_device_table____. */ + /* All our symbols are of form __mod_device_table__kmod_____. */ if (!strstarts(symname, prefix)) return; - type = symname + strlen(prefix); + + modname = strstr(symname, "__kmod_"); + if (!modname) + return; + modname += strlen("__kmod_"); + + type = strstr(modname, "__"); + if (!type) + return; + type += strlen("__"); name = strstr(type, "__"); if (!name) -- cgit v1.2.3 From 5ab23c7923a1d2ae1890026866a2d8506b010a4a Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 18 Sep 2025 10:05:51 +0200 Subject: modpost: Create modalias for builtin modules For some modules, modalias is generated using the modpost utility and the section is added to the module file. When a module is added inside vmlinux, modpost does not generate modalias for such modules and the information is lost. As a result kmod (which uses modules.builtin.modinfo in userspace) cannot determine that modalias is handled by a builtin kernel module. $ cat /sys/devices/pci0000:00/0000:00:14.0/modalias pci:v00008086d0000A36Dsv00001043sd00008694bc0Csc03i30 $ modinfo xhci_pci name: xhci_pci filename: (builtin) license: GPL file: drivers/usb/host/xhci-pci description: xHCI PCI Host Controller Driver Missing modalias "pci:v*d*sv*sd*bc0Csc03i30*" which will be generated by modpost if the module is built separately. To fix this it is necessary to generate the same modalias for vmlinux as for the individual modules. Fortunately '.vmlinux.export.o' is already generated from which '.modinfo' can be extracted in the same way as for vmlinux.o. Signed-off-by: Masahiro Yamada Signed-off-by: Alexey Gladkov Tested-by: Stephen Rothwell Reviewed-by: Nicolas Schier Link: https://patch.msgid.link/28d4da3b0e3fc8474142746bcf469e03752c3208.1758182101.git.legion@kernel.org Signed-off-by: Nathan Chancellor --- include/linux/module.h | 4 ---- scripts/Makefile.vmlinux | 4 +++- scripts/mksysmap | 3 +++ scripts/mod/file2alias.c | 19 ++++++++++++++++++- scripts/mod/modpost.c | 15 +++++++++++++++ scripts/mod/modpost.h | 2 ++ 6 files changed, 41 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index e31ee29fac6b..e135cc79acee 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -256,14 +256,10 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name); __PASTE(type, \ __PASTE(__, name))))) -#ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ static typeof(name) __mod_device_table(type, name) \ __attribute__ ((used, alias(__stringify(name)))) -#else /* !MODULE */ -#define MODULE_DEVICE_TABLE(type, name) -#endif /* Version of form [:][-]. * Or for CVS/RCS ID version, everything but the number is stripped. diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index ce7946171497..1e5e37aadcd0 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -89,11 +89,13 @@ endif remove-section-y := .modinfo remove-section-$(CONFIG_ARCH_VMLINUX_NEEDS_RELOCS) += '.rel*' +remove-symbols := -w --strip-symbol='__mod_device_table__*' + # To avoid warnings: "empty loadable segment detected at ..." from GNU objcopy, # it is necessary to remove the PT_LOAD flag from the segment. quiet_cmd_strip_relocs = OBJCOPY $@ cmd_strip_relocs = $(OBJCOPY) $(patsubst %,--set-section-flags %=noload,$(remove-section-y)) $< $@; \ - $(OBJCOPY) $(addprefix --remove-section=,$(remove-section-y)) $@ + $(OBJCOPY) $(addprefix --remove-section=,$(remove-section-y)) $(remove-symbols) $@ targets += vmlinux vmlinux: vmlinux.unstripped FORCE diff --git a/scripts/mksysmap b/scripts/mksysmap index a607a0059d11..c4531eacde20 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -59,6 +59,9 @@ # EXPORT_SYMBOL (namespace) / __kstrtabns_/d +# MODULE_DEVICE_TABLE (symbol name) +/ __mod_device_table__/d + # --------------------------------------------------------------------------- # Ignored suffixes # (do not forget '$' after each pattern) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 1260bc2287fb..7da9735e7ab3 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1477,7 +1477,7 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, void *symval; char *zeros = NULL; const char *type, *name, *modname; - size_t typelen; + size_t typelen, modnamelen; static const char *prefix = "__mod_device_table__"; /* We're looking for a section relative symbol */ @@ -1500,6 +1500,7 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, type = strstr(modname, "__"); if (!type) return; + modnamelen = type - modname; type += strlen("__"); name = strstr(type, "__"); @@ -1526,5 +1527,21 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, } } + if (mod->is_vmlinux) { + struct module_alias *alias; + + /* + * If this is vmlinux, record the name of the builtin module. + * Traverse the linked list in the reverse order, and set the + * builtin_modname unless it has already been set in the + * previous call. + */ + list_for_each_entry_reverse(alias, &mod->aliases, node) { + if (alias->builtin_modname) + break; + alias->builtin_modname = xstrndup(modname, modnamelen); + } + } + free(zeros); } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 5ca7c268294e..47c8aa2a6939 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2067,11 +2067,26 @@ static void write_if_changed(struct buffer *b, const char *fname) static void write_vmlinux_export_c_file(struct module *mod) { struct buffer buf = { }; + struct module_alias *alias, *next; buf_printf(&buf, "#include \n"); add_exported_symbols(&buf, mod); + + buf_printf(&buf, + "#include \n" + "#undef __MODULE_INFO_PREFIX\n" + "#define __MODULE_INFO_PREFIX\n"); + + list_for_each_entry_safe(alias, next, &mod->aliases, node) { + buf_printf(&buf, "MODULE_INFO(%s.alias, \"%s\");\n", + alias->builtin_modname, alias->str); + list_del(&alias->node); + free(alias->builtin_modname); + free(alias); + } + write_if_changed(&buf, ".vmlinux.export.c"); free(buf.p); } diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 9133e4c3803f..2aecb8f25c87 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -99,10 +99,12 @@ buf_write(struct buffer *buf, const char *s, int len); * struct module_alias - auto-generated MODULE_ALIAS() * * @node: linked to module::aliases + * @modname: name of the builtin module (only for vmlinux) * @str: a string for MODULE_ALIAS() */ struct module_alias { struct list_head node; + char *builtin_modname; char str[]; }; -- cgit v1.2.3 From 23ef9d439769d5f35353650e771c63d13824235b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 23 Sep 2025 14:34:19 -0700 Subject: kcfi: Rename CONFIG_CFI_CLANG to CONFIG_CFI The kernel's CFI implementation uses the KCFI ABI specifically, and is not strictly tied to a particular compiler. In preparation for GCC supporting KCFI, rename CONFIG_CFI_CLANG to CONFIG_CFI (along with associated options). Use new "transitional" Kconfig option for old CONFIG_CFI_CLANG that will enable CONFIG_CFI during olddefconfig. Reviewed-by: Linus Walleij Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250923213422.1105654-3-kees@kernel.org Signed-off-by: Kees Cook --- Makefile | 2 +- arch/Kconfig | 36 +++++++++++++++++++++------------ arch/arm/Kconfig | 2 +- arch/arm/kernel/hw_breakpoint.c | 2 +- arch/arm/mm/Makefile | 2 +- arch/arm/mm/cache-fa.S | 2 +- arch/arm/mm/cache-v4.S | 2 +- arch/arm/mm/cache-v4wb.S | 4 ++-- arch/arm/mm/cache-v4wt.S | 2 +- arch/arm/mm/cache-v6.S | 2 +- arch/arm/mm/cache-v7.S | 2 +- arch/arm/mm/cache-v7m.S | 2 +- arch/arm/mm/proc-arm1020.S | 2 +- arch/arm/mm/proc-arm1020e.S | 2 +- arch/arm/mm/proc-arm1022.S | 2 +- arch/arm/mm/proc-arm1026.S | 2 +- arch/arm/mm/proc-arm920.S | 2 +- arch/arm/mm/proc-arm922.S | 2 +- arch/arm/mm/proc-arm925.S | 2 +- arch/arm/mm/proc-arm926.S | 2 +- arch/arm/mm/proc-arm940.S | 2 +- arch/arm/mm/proc-arm946.S | 2 +- arch/arm/mm/proc-feroceon.S | 2 +- arch/arm/mm/proc-mohawk.S | 2 +- arch/arm/mm/proc-xsc3.S | 2 +- arch/arm/mm/tlb-v4.S | 2 +- arch/arm64/Kconfig | 4 ++-- arch/arm64/kernel/debug-monitors.c | 2 +- arch/arm64/kernel/traps.c | 4 ++-- arch/arm64/kvm/handle_exit.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/riscv/Kconfig | 6 +++--- arch/riscv/include/asm/cfi.h | 4 ++-- arch/riscv/kernel/Makefile | 2 +- arch/riscv/net/bpf_jit_comp64.c | 4 ++-- arch/riscv/purgatory/Makefile | 2 +- arch/x86/Kconfig | 12 +++++------ arch/x86/include/asm/cfi.h | 4 ++-- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/alternative.c | 4 ++-- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/purgatory/Makefile | 2 +- drivers/misc/lkdtm/cfi.c | 2 +- include/asm-generic/vmlinux.lds.h | 2 +- include/linux/cfi.h | 6 +++--- include/linux/cfi_types.h | 8 ++++---- include/linux/compiler.h | 2 +- init/Kconfig | 4 ++-- kernel/Makefile | 2 +- kernel/configs/hardening.config | 4 ++-- kernel/module/Kconfig | 2 +- kernel/module/tree_lookup.c | 2 +- lib/Kconfig.debug | 2 +- tools/include/linux/cfi_types.h | 6 +++--- tools/perf/util/include/linux/linkage.h | 2 +- 55 files changed, 100 insertions(+), 90 deletions(-) (limited to 'include') diff --git a/Makefile b/Makefile index d1adb78c3596..437989d6e0be 100644 --- a/Makefile +++ b/Makefile @@ -1020,7 +1020,7 @@ KBUILD_AFLAGS += -fno-lto export CC_FLAGS_LTO endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI CC_FLAGS_CFI := -fsanitize=kcfi ifdef CONFIG_CFI_ICALL_NORMALIZE_INTEGERS CC_FLAGS_CFI += -fsanitize-cfi-icall-experimental-normalize-integers diff --git a/arch/Kconfig b/arch/Kconfig index d1b4ffd6e085..97642c08a124 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -867,22 +867,26 @@ config PROPELLER_CLANG If unsure, say N. -config ARCH_SUPPORTS_CFI_CLANG +config ARCH_SUPPORTS_CFI bool help - An architecture should select this option if it can support Clang's - Control-Flow Integrity (CFI) checking. + An architecture should select this option if it can support Kernel + Control-Flow Integrity (CFI) checking (-fsanitize=kcfi). config ARCH_USES_CFI_TRAPS bool + help + An architecture should select this option if it requires the + .kcfi_traps section for KCFI trap handling. -config CFI_CLANG - bool "Use Clang's Control Flow Integrity (CFI)" - depends on ARCH_SUPPORTS_CFI_CLANG +config CFI + bool "Use Kernel Control Flow Integrity (kCFI)" + default CFI_CLANG + depends on ARCH_SUPPORTS_CFI depends on $(cc-option,-fsanitize=kcfi) help - This option enables Clang's forward-edge Control Flow Integrity - (CFI) checking, where the compiler injects a runtime check to each + This option enables forward-edge Control Flow Integrity (CFI) + checking, where the compiler injects a runtime check to each indirect function call to ensure the target is a valid function with the correct static type. This restricts possible call targets and makes it more difficult for an attacker to exploit bugs that allow @@ -891,10 +895,16 @@ config CFI_CLANG https://clang.llvm.org/docs/ControlFlowIntegrity.html +config CFI_CLANG + bool + transitional + help + Transitional config for CFI_CLANG to CFI migration. + config CFI_ICALL_NORMALIZE_INTEGERS bool "Normalize CFI tags for integers" - depends on CFI_CLANG - depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG + depends on CFI + depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS help This option normalizes the CFI tags for integer types so that all integer types of the same size and signedness receive the same CFI @@ -907,7 +917,7 @@ config CFI_ICALL_NORMALIZE_INTEGERS This option is necessary for using CFI with Rust. If unsure, say N. -config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG +config HAVE_CFI_ICALL_NORMALIZE_INTEGERS def_bool y depends on $(cc-option,-fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers) # With GCOV/KASAN we need this fix: https://github.com/llvm/llvm-project/pull/104826 @@ -915,7 +925,7 @@ config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC def_bool y - depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG + depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS depends on RUSTC_VERSION >= 107900 # With GCOV/KASAN we need this fix: https://github.com/rust-lang/rust/pull/129373 depends on (RUSTC_LLVM_VERSION >= 190103 && RUSTC_VERSION >= 108200) || \ @@ -923,7 +933,7 @@ config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC config CFI_PERMISSIVE bool "Use CFI in permissive mode" - depends on CFI_CLANG + depends on CFI help When selected, Control Flow Integrity (CFI) violations result in a warning instead of a kernel panic. This option should only be used diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b1f3df39ed40..36ab8625be72 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -38,7 +38,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_NEED_CMPXCHG_1_EMU if CPU_V6 select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_CFI_CLANG + select ARCH_SUPPORTS_CFI select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_USE_BUILTIN_BSWAP diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index a12efd0f43e8..cd4b34c96e35 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -904,7 +904,7 @@ unlock: watchpoint_single_step_handler(addr); } -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI static void hw_breakpoint_cfi_handler(struct pt_regs *regs) { /* diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile index a195cd1d3e6d..1e2201013371 100644 --- a/arch/arm/mm/Makefile +++ b/arch/arm/mm/Makefile @@ -89,7 +89,7 @@ obj-$(CONFIG_CPU_V6) += proc-v6.o obj-$(CONFIG_CPU_V6K) += proc-v6.o obj-$(CONFIG_CPU_V7) += proc-v7.o proc-v7-bugs.o obj-$(CONFIG_CPU_V7M) += proc-v7m.o -obj-$(CONFIG_CFI_CLANG) += proc.o +obj-$(CONFIG_CFI) += proc.o obj-$(CONFIG_OUTER_CACHE) += l2c-common.o obj-$(CONFIG_CACHE_B15_RAC) += cache-b15-rac.o diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S index 4a3668b52a2d..e1641799569b 100644 --- a/arch/arm/mm/cache-fa.S +++ b/arch/arm/mm/cache-fa.S @@ -112,7 +112,7 @@ SYM_FUNC_END(fa_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(fa_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b fa_coherent_user_range #endif SYM_FUNC_END(fa_coherent_kern_range) diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S index 0e94e5193dbd..001d7042bd46 100644 --- a/arch/arm/mm/cache-v4.S +++ b/arch/arm/mm/cache-v4.S @@ -104,7 +104,7 @@ SYM_FUNC_END(v4_coherent_user_range) * - size - region size */ SYM_TYPED_FUNC_START(v4_flush_kern_dcache_area) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4_dma_flush_range #endif SYM_FUNC_END(v4_flush_kern_dcache_area) diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S index ce55a2eef5da..874fe5310f9a 100644 --- a/arch/arm/mm/cache-v4wb.S +++ b/arch/arm/mm/cache-v4wb.S @@ -136,7 +136,7 @@ SYM_FUNC_END(v4wb_flush_user_cache_range) */ SYM_TYPED_FUNC_START(v4wb_flush_kern_dcache_area) add r1, r0, r1 -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wb_coherent_user_range #endif SYM_FUNC_END(v4wb_flush_kern_dcache_area) @@ -152,7 +152,7 @@ SYM_FUNC_END(v4wb_flush_kern_dcache_area) * - end - virtual end address */ SYM_TYPED_FUNC_START(v4wb_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wb_coherent_user_range #endif SYM_FUNC_END(v4wb_coherent_kern_range) diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S index a97dc267b3b0..2ee62e4b2b07 100644 --- a/arch/arm/mm/cache-v4wt.S +++ b/arch/arm/mm/cache-v4wt.S @@ -108,7 +108,7 @@ SYM_FUNC_END(v4wt_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(v4wt_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wt_coherent_user_range #endif SYM_FUNC_END(v4wt_coherent_kern_range) diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S index 9f415476e218..5ceea8965ea1 100644 --- a/arch/arm/mm/cache-v6.S +++ b/arch/arm/mm/cache-v6.S @@ -117,7 +117,7 @@ SYM_FUNC_END(v6_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v6_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v6_coherent_user_range #endif SYM_FUNC_END(v6_coherent_kern_range) diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 201ca05436fa..726681fb7d4d 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -261,7 +261,7 @@ SYM_FUNC_END(v7_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v7_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v7_coherent_user_range #endif SYM_FUNC_END(v7_coherent_kern_range) diff --git a/arch/arm/mm/cache-v7m.S b/arch/arm/mm/cache-v7m.S index 14d719eba729..7f9cfad2ea21 100644 --- a/arch/arm/mm/cache-v7m.S +++ b/arch/arm/mm/cache-v7m.S @@ -286,7 +286,7 @@ SYM_FUNC_END(v7m_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v7m_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v7m_coherent_user_range #endif SYM_FUNC_END(v7m_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S index d0ce3414a13e..4612a4961e81 100644 --- a/arch/arm/mm/proc-arm1020.S +++ b/arch/arm/mm/proc-arm1020.S @@ -203,7 +203,7 @@ SYM_FUNC_END(arm1020_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1020_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1020_coherent_user_range #endif SYM_FUNC_END(arm1020_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S index 64f031bf6eff..b4a8a3a8eda3 100644 --- a/arch/arm/mm/proc-arm1020e.S +++ b/arch/arm/mm/proc-arm1020e.S @@ -200,7 +200,7 @@ SYM_FUNC_END(arm1020e_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1020e_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1020e_coherent_user_range #endif SYM_FUNC_END(arm1020e_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S index 42ed5ed07252..709870e99e19 100644 --- a/arch/arm/mm/proc-arm1022.S +++ b/arch/arm/mm/proc-arm1022.S @@ -199,7 +199,7 @@ SYM_FUNC_END(arm1022_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1022_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1022_coherent_user_range #endif SYM_FUNC_END(arm1022_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S index b3ae62cd553a..02f7370a8c5c 100644 --- a/arch/arm/mm/proc-arm1026.S +++ b/arch/arm/mm/proc-arm1026.S @@ -194,7 +194,7 @@ SYM_FUNC_END(arm1026_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1026_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1026_coherent_user_range #endif SYM_FUNC_END(arm1026_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S index a30df54ad5fa..4727f4b5b6e8 100644 --- a/arch/arm/mm/proc-arm920.S +++ b/arch/arm/mm/proc-arm920.S @@ -180,7 +180,7 @@ SYM_FUNC_END(arm920_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm920_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm920_coherent_user_range #endif SYM_FUNC_END(arm920_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S index aac4e048100d..5a4a3f4f2683 100644 --- a/arch/arm/mm/proc-arm922.S +++ b/arch/arm/mm/proc-arm922.S @@ -182,7 +182,7 @@ SYM_FUNC_END(arm922_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm922_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm922_coherent_user_range #endif SYM_FUNC_END(arm922_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S index 035941faeb2e..1c4830afe1d3 100644 --- a/arch/arm/mm/proc-arm925.S +++ b/arch/arm/mm/proc-arm925.S @@ -229,7 +229,7 @@ SYM_FUNC_END(arm925_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm925_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm925_coherent_user_range #endif SYM_FUNC_END(arm925_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S index 6f43d6af2d9a..a09cc3e02efd 100644 --- a/arch/arm/mm/proc-arm926.S +++ b/arch/arm/mm/proc-arm926.S @@ -192,7 +192,7 @@ SYM_FUNC_END(arm926_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm926_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm926_coherent_user_range #endif SYM_FUNC_END(arm926_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S index 0d30bb25c42b..545c076c36d2 100644 --- a/arch/arm/mm/proc-arm940.S +++ b/arch/arm/mm/proc-arm940.S @@ -153,7 +153,7 @@ SYM_FUNC_END(arm940_coherent_kern_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm940_coherent_user_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm940_flush_kern_dcache_area #endif SYM_FUNC_END(arm940_coherent_user_range) diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S index 27750ace2ced..f3d4e18c3fba 100644 --- a/arch/arm/mm/proc-arm946.S +++ b/arch/arm/mm/proc-arm946.S @@ -173,7 +173,7 @@ SYM_FUNC_END(arm946_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm946_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm946_coherent_user_range #endif SYM_FUNC_END(arm946_coherent_kern_range) diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S index f67b2ffac854..7f08d06c9625 100644 --- a/arch/arm/mm/proc-feroceon.S +++ b/arch/arm/mm/proc-feroceon.S @@ -208,7 +208,7 @@ SYM_FUNC_END(feroceon_flush_user_cache_range) */ .align 5 SYM_TYPED_FUNC_START(feroceon_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b feroceon_coherent_user_range #endif SYM_FUNC_END(feroceon_coherent_kern_range) diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S index 8e9f38da863a..4669c63e3121 100644 --- a/arch/arm/mm/proc-mohawk.S +++ b/arch/arm/mm/proc-mohawk.S @@ -163,7 +163,7 @@ SYM_FUNC_END(mohawk_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(mohawk_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b mohawk_coherent_user_range #endif SYM_FUNC_END(mohawk_coherent_kern_range) diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S index 14927b380452..fd25634a2ed5 100644 --- a/arch/arm/mm/proc-xsc3.S +++ b/arch/arm/mm/proc-xsc3.S @@ -223,7 +223,7 @@ SYM_FUNC_END(xsc3_flush_user_cache_range) * it also trashes the mini I-cache used by JTAG debuggers. */ SYM_TYPED_FUNC_START(xsc3_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b xsc3_coherent_user_range #endif SYM_FUNC_END(xsc3_coherent_kern_range) diff --git a/arch/arm/mm/tlb-v4.S b/arch/arm/mm/tlb-v4.S index 09ff69008d94..079774a02be6 100644 --- a/arch/arm/mm/tlb-v4.S +++ b/arch/arm/mm/tlb-v4.S @@ -52,7 +52,7 @@ SYM_FUNC_END(v4_flush_user_tlb_range) * - start - virtual address (may not be aligned) * - end - virtual address (may not be aligned) */ -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI SYM_TYPED_FUNC_START(v4_flush_kern_tlb_range) b .v4_flush_kern_tlb_range SYM_FUNC_END(v4_flush_kern_tlb_range) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a6..1e38b8885a46 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -100,7 +100,7 @@ config ARM64 select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK select ARCH_SUPPORTS_LTO_CLANG if CPU_LITTLE_ENDIAN select ARCH_SUPPORTS_LTO_CLANG_THIN - select ARCH_SUPPORTS_CFI_CLANG + select ARCH_SUPPORTS_CFI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING @@ -212,7 +212,7 @@ config ARM64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS \ if DYNAMIC_FTRACE_WITH_ARGS && DYNAMIC_FTRACE_WITH_CALL_OPS select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS \ - if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG && \ + if (DYNAMIC_FTRACE_WITH_ARGS && !CFI && \ (CC_IS_CLANG || !CC_OPTIMIZE_FOR_SIZE)) select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 110d9ff54174..ebf010443e22 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -212,7 +212,7 @@ static int call_el1_break_hook(struct pt_regs *regs, unsigned long esr) if (esr_brk_comment(esr) == BUG_BRK_IMM) return bug_brk_handler(regs, esr); - if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) + if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) return cfi_brk_handler(regs, esr); if (esr_brk_comment(esr) == FAULT_BRK_IMM) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f528b6041f6a..5041817af267 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -1015,7 +1015,7 @@ int bug_brk_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_HANDLED; } -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) { unsigned long target; @@ -1039,7 +1039,7 @@ int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ int reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr) { diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a598072f36d2..8bdb1eed090a 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -545,7 +545,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else print_nvhe_hyp_panic("BUG", panic_addr); - } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) { + } else if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) { kvm_nvhe_report_cfi_failure(panic_addr); } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) && ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 52ffe115a8c4..28996e0a9b00 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -185,7 +185,7 @@ static inline void emit_bti(u32 insn, struct jit_ctx *ctx) static inline void emit_kcfi(u32 hash, struct jit_ctx *ctx) { - if (IS_ENABLED(CONFIG_CFI_CLANG)) + if (IS_ENABLED(CONFIG_CFI)) emit_u32_data(hash, ctx); } diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659e..6043ad82b73c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -60,7 +60,7 @@ config RISCV select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW # clang >= 17: https://github.com/llvm/llvm-project/commit/62fa708ceb027713b386c7e0efda994f8bdc27e2 - select ARCH_SUPPORTS_CFI_CLANG if CLANG_VERSION >= 170000 + select ARCH_SUPPORTS_CFI if (!CC_IS_CLANG || CLANG_VERSION >= 170000) select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_HUGETLBFS if MMU @@ -76,7 +76,7 @@ config RISCV select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_SYM_ANNOTATIONS - select ARCH_USES_CFI_TRAPS if CFI_CLANG + select ARCH_USES_CFI_TRAPS if CFI select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if MMU select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS @@ -154,7 +154,7 @@ config RISCV select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE) select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS - select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG) + select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI) select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/riscv/include/asm/cfi.h b/arch/riscv/include/asm/cfi.h index 4508aaa7a2fd..710aa8192edd 100644 --- a/arch/riscv/include/asm/cfi.h +++ b/arch/riscv/include/asm/cfi.h @@ -11,7 +11,7 @@ struct pt_regs; -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); #define __bpfcall #else @@ -19,6 +19,6 @@ static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { return BUG_TRAP_TYPE_NONE; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #endif /* _ASM_RISCV_CFI_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index c7b542573407..f60fce69b725 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -113,7 +113,7 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_EFI) += efi.o obj-$(CONFIG_COMPAT) += compat_syscall_table.o diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 10e01ff06312..24ba546a1c0e 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -18,7 +18,7 @@ #define RV_MAX_REG_ARGS 8 #define RV_FENTRY_NINSNS 2 #define RV_FENTRY_NBYTES (RV_FENTRY_NINSNS * 4) -#define RV_KCFI_NINSNS (IS_ENABLED(CONFIG_CFI_CLANG) ? 1 : 0) +#define RV_KCFI_NINSNS (IS_ENABLED(CONFIG_CFI) ? 1 : 0) /* imm that allows emit_imm to emit max count insns */ #define RV_MAX_COUNT_IMM 0x7FFF7FF7FF7FF7FF @@ -469,7 +469,7 @@ static int emit_call(u64 addr, bool fixed_addr, struct rv_jit_context *ctx) static inline void emit_kcfi(u32 hash, struct rv_jit_context *ctx) { - if (IS_ENABLED(CONFIG_CFI_CLANG)) + if (IS_ENABLED(CONFIG_CFI)) emit(hash, ctx); } diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile index 240592e3f5c2..530e497ca2f9 100644 --- a/arch/riscv/purgatory/Makefile +++ b/arch/riscv/purgatory/Makefile @@ -71,7 +71,7 @@ ifdef CONFIG_STACKPROTECTOR_STRONG PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI) endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100..b6da2d37cfd1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -127,8 +127,8 @@ config X86 select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 - select ARCH_SUPPORTS_CFI_CLANG if X86_64 - select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG + select ARCH_SUPPORTS_CFI if X86_64 + select ARCH_USES_CFI_TRAPS if X86_64 && CFI select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_SUPPORTS_RT @@ -2396,11 +2396,11 @@ config FUNCTION_PADDING_CFI default 3 if FUNCTION_ALIGNMENT_8B default 0 -# Basically: FUNCTION_ALIGNMENT - 5*CFI_CLANG +# Basically: FUNCTION_ALIGNMENT - 5*CFI # except Kconfig can't do arithmetic :/ config FUNCTION_PADDING_BYTES int - default FUNCTION_PADDING_CFI if CFI_CLANG + default FUNCTION_PADDING_CFI if CFI default FUNCTION_ALIGNMENT config CALL_PADDING @@ -2410,7 +2410,7 @@ config CALL_PADDING config FINEIBT def_bool y - depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE + depends on X86_KERNEL_IBT && CFI && MITIGATION_RETPOLINE select CALL_PADDING config FINEIBT_BHI @@ -2427,7 +2427,7 @@ config CALL_THUNKS config PREFIX_SYMBOLS def_bool y - depends on CALL_PADDING && !CFI_CLANG + depends on CALL_PADDING && !CFI menuconfig CPU_MITIGATIONS bool "Mitigations for CPU vulnerabilities" diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 1751f1eb95ef..976b90a3d190 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -113,7 +113,7 @@ extern bhi_thunk __bhi_args_end[]; struct pt_regs; -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); #define __bpfcall @@ -157,7 +157,7 @@ static inline int cfi_get_func_arity(void *func) { return 0; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #if HAS_KERNEL_IBT == 1 #define CFI_NOSEAL(x) asm(IBT_NOSEAL(__stringify(x))) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0d2a6d953be9..bc184dd38d99 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -148,7 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7bde68247b5f..79ae9cb50019 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1170,7 +1170,7 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #ifdef CONFIG_CFI_AUTO_DEFAULT # define __CFI_DEFAULT CFI_AUTO -#elif defined(CONFIG_CFI_CLANG) +#elif defined(CONFIG_CFI) # define __CFI_DEFAULT CFI_KCFI #else # define __CFI_DEFAULT CFI_OFF @@ -1182,7 +1182,7 @@ enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; bool cfi_bhi __ro_after_init = false; #endif -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI u32 cfi_get_func_hash(void *func) { u32 hash; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6079d15dab8c..3863d7709386 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -339,7 +339,7 @@ static bool can_probe(unsigned long paddr) if (is_exception_insn(&insn)) return false; - if (IS_ENABLED(CONFIG_CFI_CLANG)) { + if (IS_ENABLED(CONFIG_CFI)) { /* * The compiler generates the following instruction sequence * for indirect call checks and cfi.c decodes this; diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index e0a607a14e7e..5ce1d4263000 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -57,7 +57,7 @@ ifdef CONFIG_MITIGATION_RETPOLINE PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS) endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI) endif diff --git a/drivers/misc/lkdtm/cfi.c b/drivers/misc/lkdtm/cfi.c index 6a33889d0902..c3971f7caa65 100644 --- a/drivers/misc/lkdtm/cfi.c +++ b/drivers/misc/lkdtm/cfi.c @@ -43,7 +43,7 @@ static void lkdtm_CFI_FORWARD_PROTO(void) lkdtm_indirect_call((void *)lkdtm_increment_int); pr_err("FAIL: survived mismatched prototype function call!\n"); - pr_expected_config(CONFIG_CFI_CLANG); + pr_expected_config(CONFIG_CFI); } /* diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ae2d2359b79e..a65a87366c48 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -157,7 +157,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define PATCHABLE_DISCARDS *(__patchable_function_entries) #endif -#ifndef CONFIG_ARCH_SUPPORTS_CFI_CLANG +#ifndef CONFIG_ARCH_SUPPORTS_CFI /* * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 52a98886a455..1fd22ea6eba4 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,7 +11,7 @@ #include #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI extern bool cfi_warn; enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, @@ -52,7 +52,7 @@ static inline u32 cfi_get_func_hash(void *func) extern u32 cfi_bpf_hash; extern u32 cfi_bpf_subprog_hash; -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ static inline int cfi_get_offset(void) { return 0; } static inline u32 cfi_get_func_hash(void *func) { return 0; } @@ -60,7 +60,7 @@ static inline u32 cfi_get_func_hash(void *func) { return 0; } #define cfi_bpf_hash 0U #define cfi_bpf_subprog_hash 0U -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifdef CONFIG_ARCH_USES_CFI_TRAPS bool is_cfi_trap(unsigned long addr); diff --git a/include/linux/cfi_types.h b/include/linux/cfi_types.h index 685f7181780f..a86af9bc8bdc 100644 --- a/include/linux/cfi_types.h +++ b/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_ type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,12 +29,12 @@ #define SYM_TYPED_START(name, linkage, align...) \ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) \ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ @@ -43,7 +43,7 @@ #else /* __ASSEMBLY__ */ -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI #define DEFINE_CFI_TYPE(name, func) \ /* \ * Force a reference to the function so the compiler generates \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 6f04a1d8c720..fb27da2221ee 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -248,7 +248,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ -#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_CFI) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) /* * Force a reference to the external symbol so the compiler generates * __kcfi_typid. diff --git a/init/Kconfig b/init/Kconfig index 836320251219..67f10d8a33b7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2063,8 +2063,8 @@ config RUST depends on !GCC_PLUGIN_RANDSTRUCT depends on !RANDSTRUCT depends on !DEBUG_INFO_BTF || (PAHOLE_HAS_LANG_EXCLUDE && !LTO) - depends on !CFI_CLANG || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC - select CFI_ICALL_NORMALIZE_INTEGERS if CFI_CLANG + depends on !CFI || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC + select CFI_ICALL_NORMALIZE_INTEGERS if CFI depends on !CALL_PADDING || RUSTC_VERSION >= 108100 depends on !KASAN_SW_TAGS depends on !(MITIGATION_RETHUNK && KASAN) || RUSTC_VERSION >= 108300 diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235..27e0e6a33610 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -122,7 +122,7 @@ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 64caaf997fc0..7c3924614e01 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -93,8 +93,8 @@ CONFIG_SECCOMP_FILTER=y # Provides some protections against SYN flooding. CONFIG_SYN_COOKIES=y -# Enable Kernel Control Flow Integrity (currently Clang only). -CONFIG_CFI_CLANG=y +# Enable Kernel Control Flow Integrity. +CONFIG_CFI=y # CONFIG_CFI_PERMISSIVE is not set # Attack surface reduction: do not autoload TTY line disciplines. diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 39278737bb68..2a1beebf1d37 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -460,6 +460,6 @@ config UNUSED_KSYMS_WHITELIST config MODULES_TREE_LOOKUP def_bool y - depends on PERF_EVENTS || TRACING || CFI_CLANG + depends on PERF_EVENTS || TRACING || CFI endif # MODULES diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index d3204c5c74eb..f8e8c126705c 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -14,7 +14,7 @@ * Use a latched RB-tree for __module_address(); this allows us to use * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * This is conditional on PERF_EVENTS || TRACING || CFI because those can * really hit __module_address() hard by doing a lot of stack unwinding; * potentially from NMI context. */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dc0e0c6ed075..e3e69df19e78 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2894,7 +2894,7 @@ config FORTIFY_KUNIT_TEST config LONGEST_SYM_KUNIT_TEST tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS depends on KUNIT && KPROBES - depends on !PREFIX_SYMBOLS && !CFI_CLANG && !GCOV_KERNEL + depends on !PREFIX_SYMBOLS && !CFI && !GCOV_KERNEL default KUNIT_ALL_TESTS help Tests the longest symbol possible diff --git a/tools/include/linux/cfi_types.h b/tools/include/linux/cfi_types.h index 6b8713675765..2e098274e45c 100644 --- a/tools/include/linux/cfi_types.h +++ b/tools/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_ type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,12 +29,12 @@ #define SYM_TYPED_START(name, linkage, align...) \ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) \ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h index 89979ca23c3f..34e2fdfe7300 100644 --- a/tools/perf/util/include/linux/linkage.h +++ b/tools/perf/util/include/linux/linkage.h @@ -120,7 +120,7 @@ #endif // In the kernel sources (include/linux/cfi_types.h), this has a different -// definition when CONFIG_CFI_CLANG is used, for tools/ just use the !clang +// definition when CONFIG_CFI is used, for tools/ just use the !cfi // definition: #ifndef SYM_TYPED_START #define SYM_TYPED_START(name, linkage, align...) \ -- cgit v1.2.3 From d0ca0df179c4b21e2a6c4a4fb637aa8fa14575cb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 24 Sep 2025 13:18:22 -0700 Subject: crypto: af_alg - Fix incorrect boolean values in af_alg_ctx Commit 1b34cbbf4f01 ("crypto: af_alg - Disallow concurrent writes in af_alg_sendmsg") changed some fields from bool to 1-bit bitfields of type u32. However, some assignments to these fields, specifically 'more' and 'merge', assign values greater than 1. These relied on C's implicit conversion to bool, such that zero becomes false and nonzero becomes true. With a 1-bit bitfields of type u32 instead, mod 2 of the value is taken instead, resulting in 0 being assigned in some cases when 1 was intended. Fix this by restoring the bool type. Fixes: 1b34cbbf4f01 ("crypto: af_alg - Disallow concurrent writes in af_alg_sendmsg") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Linus Torvalds --- include/crypto/if_alg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index 0c70f3a55575..107b797c33ec 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -152,7 +152,7 @@ struct af_alg_ctx { size_t used; atomic_t rcvused; - u32 more:1, + bool more:1, merge:1, enc:1, write:1, -- cgit v1.2.3 From 340974c4f709ce8142a672ab11f1889dff94d6dc Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 18 Aug 2025 09:39:00 +0530 Subject: mailbox: Add common header for RPMI messages sent via mailbox The RPMI based mailbox controller drivers and mailbox clients need to share defines related to RPMI messages over mailbox interface so add a common header for this purpose. Acked-by: Jassi Brar Co-developed-by: Rahul Pathak Signed-off-by: Rahul Pathak Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250818040920.272664-5-apatel@ventanamicro.com Signed-off-by: Paul Walmsley --- include/linux/mailbox/riscv-rpmi-message.h | 214 +++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 include/linux/mailbox/riscv-rpmi-message.h (limited to 'include') diff --git a/include/linux/mailbox/riscv-rpmi-message.h b/include/linux/mailbox/riscv-rpmi-message.h new file mode 100644 index 000000000000..c3a98fc12c0a --- /dev/null +++ b/include/linux/mailbox/riscv-rpmi-message.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Ventana Micro Systems Inc. */ + +#ifndef _LINUX_RISCV_RPMI_MESSAGE_H_ +#define _LINUX_RISCV_RPMI_MESSAGE_H_ + +#include +#include +#include +#include + +/* RPMI version encode/decode macros */ +#define RPMI_VER_MAJOR(__ver) upper_16_bits(__ver) +#define RPMI_VER_MINOR(__ver) lower_16_bits(__ver) +#define RPMI_MKVER(__maj, __min) (((u32)(__maj) << 16) | (u16)(__min)) + +/* RPMI message header */ +struct rpmi_message_header { + __le16 servicegroup_id; + u8 service_id; + u8 flags; + __le16 datalen; + __le16 token; +}; + +/* RPMI message */ +struct rpmi_message { + struct rpmi_message_header header; + u8 data[]; +}; + +/* RPMI notification event */ +struct rpmi_notification_event { + __le16 event_datalen; + u8 event_id; + u8 reserved; + u8 event_data[]; +}; + +/* RPMI error codes */ +enum rpmi_error_codes { + RPMI_SUCCESS = 0, + RPMI_ERR_FAILED = -1, + RPMI_ERR_NOTSUPP = -2, + RPMI_ERR_INVALID_PARAM = -3, + RPMI_ERR_DENIED = -4, + RPMI_ERR_INVALID_ADDR = -5, + RPMI_ERR_ALREADY = -6, + RPMI_ERR_EXTENSION = -7, + RPMI_ERR_HW_FAULT = -8, + RPMI_ERR_BUSY = -9, + RPMI_ERR_INVALID_STATE = -10, + RPMI_ERR_BAD_RANGE = -11, + RPMI_ERR_TIMEOUT = -12, + RPMI_ERR_IO = -13, + RPMI_ERR_NO_DATA = -14, + RPMI_ERR_RESERVED_START = -15, + RPMI_ERR_RESERVED_END = -127, + RPMI_ERR_VENDOR_START = -128, +}; + +static inline int rpmi_to_linux_error(int rpmi_error) +{ + switch (rpmi_error) { + case RPMI_SUCCESS: + return 0; + case RPMI_ERR_INVALID_PARAM: + case RPMI_ERR_BAD_RANGE: + case RPMI_ERR_INVALID_STATE: + return -EINVAL; + case RPMI_ERR_DENIED: + return -EPERM; + case RPMI_ERR_INVALID_ADDR: + case RPMI_ERR_HW_FAULT: + return -EFAULT; + case RPMI_ERR_ALREADY: + return -EALREADY; + case RPMI_ERR_BUSY: + return -EBUSY; + case RPMI_ERR_TIMEOUT: + return -ETIMEDOUT; + case RPMI_ERR_IO: + return -ECOMM; + case RPMI_ERR_FAILED: + case RPMI_ERR_NOTSUPP: + case RPMI_ERR_NO_DATA: + case RPMI_ERR_EXTENSION: + default: + return -EOPNOTSUPP; + } +} + +/* RPMI Linux mailbox attribute IDs */ +enum rpmi_mbox_attribute_id { + RPMI_MBOX_ATTR_SPEC_VERSION, + RPMI_MBOX_ATTR_MAX_MSG_DATA_SIZE, + RPMI_MBOX_ATTR_SERVICEGROUP_ID, + RPMI_MBOX_ATTR_SERVICEGROUP_VERSION, + RPMI_MBOX_ATTR_IMPL_ID, + RPMI_MBOX_ATTR_IMPL_VERSION, + RPMI_MBOX_ATTR_MAX_ID +}; + +/* RPMI Linux mailbox message types */ +enum rpmi_mbox_message_type { + RPMI_MBOX_MSG_TYPE_GET_ATTRIBUTE, + RPMI_MBOX_MSG_TYPE_SET_ATTRIBUTE, + RPMI_MBOX_MSG_TYPE_SEND_WITH_RESPONSE, + RPMI_MBOX_MSG_TYPE_SEND_WITHOUT_RESPONSE, + RPMI_MBOX_MSG_TYPE_NOTIFICATION_EVENT, + RPMI_MBOX_MSG_MAX_TYPE +}; + +/* RPMI Linux mailbox message instance */ +struct rpmi_mbox_message { + enum rpmi_mbox_message_type type; + union { + struct { + enum rpmi_mbox_attribute_id id; + u32 value; + } attr; + + struct { + u32 service_id; + void *request; + unsigned long request_len; + void *response; + unsigned long max_response_len; + unsigned long out_response_len; + } data; + + struct { + u16 event_datalen; + u8 event_id; + u8 *event_data; + } notif; + }; + int error; +}; + +/* RPMI Linux mailbox message helper routines */ +static inline void rpmi_mbox_init_get_attribute(struct rpmi_mbox_message *msg, + enum rpmi_mbox_attribute_id id) +{ + msg->type = RPMI_MBOX_MSG_TYPE_GET_ATTRIBUTE; + msg->attr.id = id; + msg->attr.value = 0; + msg->error = 0; +} + +static inline void rpmi_mbox_init_set_attribute(struct rpmi_mbox_message *msg, + enum rpmi_mbox_attribute_id id, + u32 value) +{ + msg->type = RPMI_MBOX_MSG_TYPE_SET_ATTRIBUTE; + msg->attr.id = id; + msg->attr.value = value; + msg->error = 0; +} + +static inline void rpmi_mbox_init_send_with_response(struct rpmi_mbox_message *msg, + u32 service_id, + void *request, + unsigned long request_len, + void *response, + unsigned long max_response_len) +{ + msg->type = RPMI_MBOX_MSG_TYPE_SEND_WITH_RESPONSE; + msg->data.service_id = service_id; + msg->data.request = request; + msg->data.request_len = request_len; + msg->data.response = response; + msg->data.max_response_len = max_response_len; + msg->data.out_response_len = 0; + msg->error = 0; +} + +static inline void rpmi_mbox_init_send_without_response(struct rpmi_mbox_message *msg, + u32 service_id, + void *request, + unsigned long request_len) +{ + msg->type = RPMI_MBOX_MSG_TYPE_SEND_WITHOUT_RESPONSE; + msg->data.service_id = service_id; + msg->data.request = request; + msg->data.request_len = request_len; + msg->data.response = NULL; + msg->data.max_response_len = 0; + msg->data.out_response_len = 0; + msg->error = 0; +} + +static inline void *rpmi_mbox_get_msg_response(struct rpmi_mbox_message *msg) +{ + return msg ? msg->data.response : NULL; +} + +static inline int rpmi_mbox_send_message(struct mbox_chan *chan, + struct rpmi_mbox_message *msg) +{ + int ret; + + /* Send message for the underlying mailbox channel */ + ret = mbox_send_message(chan, msg); + if (ret < 0) + return ret; + + /* Explicitly signal txdone for mailbox channel */ + ret = msg->error; + mbox_client_txdone(chan, ret); + return ret; +} + +#endif /* _LINUX_RISCV_RPMI_MESSAGE_H_ */ -- cgit v1.2.3 From f32a26fab3672e60f622bd7461bf978fc72f29ec Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Wed, 24 Sep 2025 16:24:41 -0700 Subject: hfs/hfsplus: rework debug output subsystem Currently, HFS/HFS+ has very obsolete and inconvenient debug output subsystem. Also, the code is duplicated in HFS and HFS+ driver. This patch introduces linux/hfs_common.h for gathering common declarations, inline functions, and common short methods. Currently, this file contains only hfs_dbg() function that employs pr_debug() with the goal to print a debug-level messages conditionally. So, now, it is possible to enable the debug output by means of: echo 'file extent.c +p' > /proc/dynamic_debug/control echo 'func hfsplus_evict_inode +p' > /proc/dynamic_debug/control And debug output looks like this: hfs: pid 5831:fs/hfs/catalog.c:228 hfs_cat_delete(): delete_cat: 00,48 hfs: pid 5831:fs/hfs/extent.c:484 hfs_file_truncate(): truncate: 48, 409600 -> 0 hfs: pid 5831:fs/hfs/extent.c:212 hfs_dump_extent(): hfs: pid 5831:fs/hfs/extent.c:214 hfs_dump_extent(): 78:4 hfs: pid 5831:fs/hfs/extent.c:214 hfs_dump_extent(): 0:0 hfs: pid 5831:fs/hfs/extent.c:214 hfs_dump_extent(): 0:0 v4 Debug messages have been reworked and information about new HFS/HFS+ shared declarations file has been added to MAINTAINERS file. v5 Yangtao Li suggested to clean up debug output and fix several typos. Signed-off-by: Viacheslav Dubeyko cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org cc: Johannes Thumshirn Signed-off-by: Viacheslav Dubeyko --- MAINTAINERS | 2 ++ fs/hfs/bfind.c | 4 ++-- fs/hfs/bitmap.c | 4 ++-- fs/hfs/bnode.c | 28 ++++++++++++++-------------- fs/hfs/brec.c | 8 ++++---- fs/hfs/btree.c | 2 +- fs/hfs/catalog.c | 14 +++++++------- fs/hfs/extent.c | 19 ++++++++++--------- fs/hfs/hfs_fs.h | 33 +-------------------------------- fs/hfs/inode.c | 4 ++-- fs/hfsplus/attributes.c | 8 ++++---- fs/hfsplus/bfind.c | 4 ++-- fs/hfsplus/bitmap.c | 10 +++++----- fs/hfsplus/bnode.c | 28 ++++++++++++++-------------- fs/hfsplus/brec.c | 10 +++++----- fs/hfsplus/btree.c | 4 ++-- fs/hfsplus/catalog.c | 6 +++--- fs/hfsplus/extents.c | 27 +++++++++++++-------------- fs/hfsplus/hfsplus_fs.h | 35 +---------------------------------- fs/hfsplus/super.c | 16 ++++++++++++---- fs/hfsplus/xattr.c | 4 ++-- include/linux/hfs_common.h | 20 ++++++++++++++++++++ 22 files changed, 128 insertions(+), 162 deletions(-) create mode 100644 include/linux/hfs_common.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index f5e1540cb702..83579b15ba43 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10794,6 +10794,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git F: Documentation/filesystems/hfs.rst F: fs/hfs/ +F: include/linux/hfs_common.h HFSPLUS FILESYSTEM M: Viacheslav Dubeyko @@ -10804,6 +10805,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git F: Documentation/filesystems/hfsplus.rst F: fs/hfsplus/ +F: include/linux/hfs_common.h HGA FRAMEBUFFER DRIVER M: Ferenc Bakonyi diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index e46f650b5e9c..c2f840c49e60 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -26,7 +26,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFS_CAT_CNID: @@ -48,7 +48,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index 28307bc9ec1e..5e84833a4743 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) } } - hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + hfs_dbg("pos %u, num_bits %u\n", pos, *num_bits); HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: @@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if (!count) return 0; - hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count); + hfs_dbg("start %u, count %u\n", start, count); /* are all of the bits in range? */ if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index e8cd1a31f247..fcfffe75d84e 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -200,7 +200,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, { struct page *src_page, *dst_page; - hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -221,7 +221,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page *page; void *ptr; - hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -243,16 +243,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - hfs_dbg_cont(BNODE_MOD, " %d", key_off); + hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -260,18 +260,18 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; else tmp = node->tree->max_key_len + 1; - hfs_dbg_cont(BNODE_MOD, " (%d,%d", - tmp, hfs_bnode_read_u8(node, key_off)); + hfs_dbg(" (%d,%d", + tmp, hfs_bnode_read_u8(node, key_off)); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u8(node, key_off); - hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + hfs_dbg(" (%d)", tmp); } } - hfs_dbg_cont(BNODE_MOD, "\n"); + hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -361,7 +361,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + hfs_dbg("cnid %d, node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); @@ -401,7 +401,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -546,7 +546,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -559,7 +559,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index b01db1fae147..e49a141c87e5 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -94,7 +94,7 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -192,7 +192,7 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -261,7 +261,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg("this %d, new %d, next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -397,7 +397,7 @@ again: newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1; else fd->keylength = newkeylen = tree->max_key_len + 1; - hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index e86e1e235658..22e62fe7448b 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -364,7 +364,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg("node %u\n", node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index b6b18ee68135..caebabb6642f 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -87,7 +87,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i int entry_size; int err; - hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); if (dir->i_size >= HFS_MAX_VALENCE) return -ENOSPC; @@ -238,7 +238,7 @@ int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) s64 leaf_tail; s64 node_id; - hfs_dbg(CAT_MOD, "correct next unused CNID: cnid %u, next_id %lld\n", + hfs_dbg("cnid %u, next_id %lld\n", cnid, atomic64_read(&HFS_SB(sb)->next_id)); if ((cnid + 1) < atomic64_read(&HFS_SB(sb)->next_id)) { @@ -274,7 +274,7 @@ int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) return -ENOENT; } - hfs_dbg(CAT_MOD, "node_id %lld, leaf_tail %lld, leaf_head %lld\n", + hfs_dbg("node %lld, leaf_tail %lld, leaf_head %lld\n", node_id, leaf_tail, leaf_head); hfs_bnode_dump(node); @@ -309,13 +309,13 @@ int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) if (rec.type == HFS_CDR_DIR) { found_cnid = be32_to_cpu(rec.dir.DirID); - hfs_dbg(CAT_MOD, "found_cnid %u\n", found_cnid); + hfs_dbg("found_cnid %u\n", found_cnid); hfs_set_next_unused_CNID(sb, cnid, found_cnid); hfs_bnode_put(node); return 0; } else if (rec.type == HFS_CDR_FIL) { found_cnid = be32_to_cpu(rec.file.FlNum); - hfs_dbg(CAT_MOD, "found_cnid %u\n", found_cnid); + hfs_dbg("found_cnid %u\n", found_cnid); hfs_set_next_unused_CNID(sb, cnid, found_cnid); hfs_bnode_put(node); return 0; @@ -343,7 +343,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) struct hfs_readdir_data *rd; int res, type; - hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid); sb = dir->i_sb; res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (res) @@ -417,7 +417,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, int entry_size, type; int err; - hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg("cnid %u - (ino %lu, name %s) - (ino %lu, name %s)\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 580c62981dbd..a097908b269d 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -209,12 +209,12 @@ static void hfs_dump_extent(struct hfs_extent *extent) { int i; - hfs_dbg(EXTENT, " "); + hfs_dbg("extent: "); for (i = 0; i < 3; i++) - hfs_dbg_cont(EXTENT, " %u:%u", - be16_to_cpu(extent[i].block), - be16_to_cpu(extent[i].count)); - hfs_dbg_cont(EXTENT, "\n"); + hfs_dbg(" block %u, count %u", + be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + hfs_dbg("\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, @@ -411,10 +411,11 @@ int hfs_extend_file(struct inode *inode) goto out; } - hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { - hfs_dbg(EXTENT, "first extents\n"); + hfs_dbg("first_extent: start %u, len %u\n", + start, len); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); @@ -456,7 +457,7 @@ out: return res; insert_extent: - hfs_dbg(EXTENT, "insert new extent\n"); + hfs_dbg("insert new extent\n"); res = hfs_ext_write_extent(inode); if (res) goto out; @@ -481,7 +482,7 @@ void hfs_file_truncate(struct inode *inode) u32 size; int res; - hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", + hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)HFS_I(inode)->phys_size, inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index c732a066de78..fff149af89da 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -9,12 +9,6 @@ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include @@ -24,35 +18,10 @@ #include #include +#include #include "hfs.h" -#define DBG_BNODE_REFS 0x00000001 -#define DBG_BNODE_MOD 0x00000002 -#define DBG_CAT_MOD 0x00000004 -#define DBG_INODE 0x00000008 -#define DBG_SUPER 0x00000010 -#define DBG_EXTENT 0x00000020 -#define DBG_BITMAP 0x00000040 - -//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP) -//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) -#define DBG_MASK (0) - -#define hfs_dbg(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - -#define hfs_dbg_cont(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - pr_cont(fmt, ##__VA_ARGS__); \ -} while (0) - - /* * struct hfs_inode_info * diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 490b6ad5d981..9cd449913dc8 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -249,7 +249,7 @@ void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; - hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX); atomic64_dec(&HFS_SB(sb)->folder_count); @@ -436,7 +436,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_cat_rec rec; int res; - hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); res = hfs_ext_write_extent(inode); if (res) return res; diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index eeebe80c6be4..ba26980cc503 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -139,7 +139,7 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid, { int err = 0; - hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + hfs_dbg("name %s, cnid %d\n", name ? name : NULL, cnid); if (!HFSPLUS_SB(sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); @@ -201,7 +201,7 @@ int hfsplus_create_attr(struct inode *inode, int entry_size; int err; - hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n", + hfs_dbg("name %s, ino %ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -310,7 +310,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) struct super_block *sb = inode->i_sb; struct hfs_find_data fd; - hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n", + hfs_dbg("name %s, ino %ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -356,7 +356,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) int err = 0; struct hfs_find_data fd; - hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid); + hfs_dbg("cnid %d\n", cnid); if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 26ebac4c6042..afc9c89e8c6a 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); mutex_lock_nested(&tree->tree_lock, hfsplus_btree_lock_class(tree)); @@ -34,7 +34,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index bd8dcea85588..1b3af8c87cad 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -31,7 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, if (!len) return size; - hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + hfs_dbg("size %u, offset %u, len %u\n", size, offset, len); mutex_lock(&sbi->alloc_mutex); mapping = sbi->alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -90,14 +90,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, else end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; } - hfs_dbg(BITMAP, "bitmap full\n"); + hfs_dbg("bitmap full\n"); start = size; goto out; found: start = offset + (curr - pptr) * 32 + i; if (start >= size) { - hfs_dbg(BITMAP, "bitmap full\n"); + hfs_dbg("bitmap full\n"); goto out; } /* do any partial u32 at the start */ @@ -155,7 +155,7 @@ done: *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); - hfs_dbg(BITMAP, "-> %u,%u\n", start, *max); + hfs_dbg("start %u, max %u\n", start, *max); out: mutex_unlock(&sbi->alloc_mutex); return start; @@ -174,7 +174,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) return 0; - hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count); + hfs_dbg("offset %u, count %u\n", offset, count); /* are all of the bits in range? */ if ((offset + count) > sbi->total_blocks) return -ENOENT; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 407d5152eb41..63e652ad1e0d 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -173,7 +173,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct page **src_page, **dst_page; int l; - hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -231,7 +231,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) void *src_ptr, *dst_ptr; int l; - hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -351,16 +351,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - hfs_dbg(BNODE_MOD, " %d", key_off); + hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -369,17 +369,17 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; - hfs_dbg_cont(BNODE_MOD, " (%d", tmp); + hfs_dbg(" (%d", tmp); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u16(node, key_off); - hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + hfs_dbg(" (%d)", tmp); } } - hfs_dbg_cont(BNODE_MOD, "\n"); + hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -415,7 +415,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node) /* move down? */ if (!node->prev && !node->next) - hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n"); + hfs_dbg("btree delete level\n"); if (!node->parent) { tree->root = 0; tree->depth = 0; @@ -470,7 +470,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + hfs_dbg("cnid %d, node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); @@ -510,7 +510,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -656,7 +656,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -669,7 +669,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 1918544a7871..b4645102feec 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -92,7 +92,7 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -193,7 +193,7 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -246,7 +246,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg("this %d - new %d - next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -383,7 +383,7 @@ again: newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; - hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; @@ -395,7 +395,7 @@ again: end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - hfs_dbg(BNODE_MOD, "splitting index node\n"); + hfs_dbg("splitting index node\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index fe6a54c4083c..7cc5aea14572 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -434,7 +434,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap_local(data); nidx = node->next; if (!nidx) { - hfs_dbg(BNODE_MOD, "create new bmap node\n"); + hfs_dbg("create new bmap node\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -460,7 +460,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg("node %u\n", node->this); BUG_ON(!node->this); tree = node->tree; nidx = node->this; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 1995bafee839..02c1eee4a4b8 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -259,7 +259,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, int entry_size; int err; - hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) @@ -336,7 +336,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) int err, off; u16 type; - hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; @@ -441,7 +441,7 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err; - hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg("cnid %u - ino %lu, name %s - ino %lu, name %s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index b1699b3c246a..8e886514d27f 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -275,7 +275,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, mutex_unlock(&hip->extents_lock); done: - hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n", + hfs_dbg("ino %lu, iblock %llu - dblock %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; @@ -298,12 +298,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent) { int i; - hfs_dbg(EXTENT, " "); + hfs_dbg("extent "); for (i = 0; i < 8; i++) - hfs_dbg_cont(EXTENT, " %u:%u", - be32_to_cpu(extent[i].start_block), - be32_to_cpu(extent[i].block_count)); - hfs_dbg_cont(EXTENT, "\n"); + hfs_dbg(" start_block %u, block_count %u", + be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + hfs_dbg("\n"); } static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, @@ -359,8 +359,7 @@ found: if (count <= block_nr) { err = hfsplus_block_free(sb, start, count); if (err) { - pr_err("can't free extent\n"); - hfs_dbg(EXTENT, " start: %u count: %u\n", + pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = 0; @@ -370,8 +369,7 @@ found: count -= block_nr; err = hfsplus_block_free(sb, start + count, block_nr); if (err) { - pr_err("can't free extent\n"); - hfs_dbg(EXTENT, " start: %u count: %u\n", + pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = cpu_to_be32(count); @@ -478,11 +476,12 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) goto out; } - hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { - hfs_dbg(EXTENT, "first extents\n"); + hfs_dbg("first_extent: start %u, len %u\n", + start, len); /* no extents yet */ hip->first_extents[0].start_block = cpu_to_be32(start); hip->first_extents[0].block_count = cpu_to_be32(len); @@ -521,7 +520,7 @@ out: return res; insert_extent: - hfs_dbg(EXTENT, "insert new extent\n"); + hfs_dbg("insert new extent\n"); res = hfsplus_ext_write_extent_locked(inode); if (res) goto out; @@ -546,7 +545,7 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n", + hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 9dd18de0bc89..89e8b19c127b 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -11,47 +11,14 @@ #ifndef _LINUX_HFSPLUS_FS_H #define _LINUX_HFSPLUS_FS_H -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include #include #include +#include #include "hfsplus_raw.h" -#define DBG_BNODE_REFS 0x00000001 -#define DBG_BNODE_MOD 0x00000002 -#define DBG_CAT_MOD 0x00000004 -#define DBG_INODE 0x00000008 -#define DBG_SUPER 0x00000010 -#define DBG_EXTENT 0x00000020 -#define DBG_BITMAP 0x00000040 -#define DBG_ATTR_MOD 0x00000080 - -#if 0 -#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) -#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) -#endif -#define DBG_MASK (0) - -#define hfs_dbg(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - -#define hfs_dbg_cont(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - pr_cont(fmt, ##__VA_ARGS__); \ -} while (0) - /* Runtime config options */ #define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */ diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 77ec048021a0..16bc4abc67e0 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -163,7 +163,7 @@ static int hfsplus_write_inode(struct inode *inode, { int err; - hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); err = hfsplus_ext_write_extent(inode); if (err) @@ -178,7 +178,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { - hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { @@ -197,7 +197,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - hfs_dbg(SUPER, "hfsplus_sync_fs\n"); + hfs_dbg("starting...\n"); /* * Explicitly write out the special metadata inodes. @@ -228,6 +228,10 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) vhdr->folder_count = cpu_to_be32(sbi->folder_count); vhdr->file_count = cpu_to_be32(sbi->file_count); + hfs_dbg("free_blocks %u, next_cnid %u, folder_count %u, file_count %u\n", + sbi->free_blocks, sbi->next_cnid, + sbi->folder_count, sbi->file_count); + if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr)); write_backup = 1; @@ -253,6 +257,8 @@ out: if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(sb->s_bdev); + hfs_dbg("finished: err %d\n", error); + return error; } @@ -301,7 +307,7 @@ static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); - hfs_dbg(SUPER, "hfsplus_put_super\n"); + hfs_dbg("starting...\n"); cancel_delayed_work_sync(&sbi->sync_work); @@ -323,6 +329,8 @@ static void hfsplus_put_super(struct super_block *sb) kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); call_rcu(&sbi->rcu, delayed_free); + + hfs_dbg("finished\n"); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index c951fa9835aa..ece4d29c0ab9 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -64,7 +64,7 @@ static void hfsplus_init_header_node(struct inode *attr_file, u32 used_bmp_bytes; u64 tmp; - hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n", + hfs_dbg("clump %u, node_size %u\n", clump_size, node_size); /* The end of the node contains list of record offsets */ @@ -132,7 +132,7 @@ static int hfsplus_create_attributes_file(struct super_block *sb) struct page *page; int old_state = HFSPLUS_EMPTY_ATTR_TREE; - hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID); + hfs_dbg("ino %d\n", HFSPLUS_ATTR_CNID); check_attr_tree_state_again: switch (atomic_read(&sbi->attr_tree_state)) { diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h new file mode 100644 index 000000000000..8838ca2f3d08 --- /dev/null +++ b/include/linux/hfs_common.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * HFS/HFS+ common definitions, inline functions, + * and shared functionality. + */ + +#ifndef _HFS_COMMON_H_ +#define _HFS_COMMON_H_ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define hfs_dbg(fmt, ...) \ + pr_debug("pid %d:%s:%d %s(): " fmt, \ + current->pid, __FILE__, __LINE__, __func__, ##__VA_ARGS__) \ + +#endif /* _HFS_COMMON_H_ */ -- cgit v1.2.3 From ba879dfc0574878f3e08f217b2b4fdf845c426c0 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 18 Aug 2025 09:39:01 +0530 Subject: mailbox: Allow controller specific mapping using fwnode Introduce optional fw_node() callback which allows a mailbox controller driver to provide controller specific mapping using fwnode. The Linux OF framework already implements fwnode operations for the Linux DD framework so the fw_xlate() callback works fine with device tree as well. Acked-by: Jassi Brar Reviewed-by: Andy Shevchenko Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250818040920.272664-6-apatel@ventanamicro.com Signed-off-by: Paul Walmsley --- drivers/mailbox/mailbox.c | 65 +++++++++++++++++++++++--------------- include/linux/mailbox_controller.h | 3 ++ 2 files changed, 43 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index 5cd8ae222073..2acc6ec229a4 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "mailbox.h" @@ -383,34 +384,56 @@ EXPORT_SYMBOL_GPL(mbox_bind_client); */ struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index) { - struct device *dev = cl->dev; + struct fwnode_reference_args fwspec; + struct fwnode_handle *fwnode; struct mbox_controller *mbox; struct of_phandle_args spec; struct mbox_chan *chan; + struct device *dev; + unsigned int i; int ret; - if (!dev || !dev->of_node) { - pr_debug("%s: No owner device node\n", __func__); + dev = cl->dev; + if (!dev) { + pr_debug("No owner device\n"); return ERR_PTR(-ENODEV); } - ret = of_parse_phandle_with_args(dev->of_node, "mboxes", "#mbox-cells", - index, &spec); + fwnode = dev_fwnode(dev); + if (!fwnode) { + dev_dbg(dev, "No owner fwnode\n"); + return ERR_PTR(-ENODEV); + } + + ret = fwnode_property_get_reference_args(fwnode, "mboxes", "#mbox-cells", + 0, index, &fwspec); if (ret) { - dev_err(dev, "%s: can't parse \"mboxes\" property\n", __func__); + dev_err(dev, "%s: can't parse \"%s\" property\n", __func__, "mboxes"); return ERR_PTR(ret); } + spec.np = to_of_node(fwspec.fwnode); + spec.args_count = fwspec.nargs; + for (i = 0; i < spec.args_count; i++) + spec.args[i] = fwspec.args[i]; + scoped_guard(mutex, &con_mutex) { chan = ERR_PTR(-EPROBE_DEFER); - list_for_each_entry(mbox, &mbox_cons, node) - if (mbox->dev->of_node == spec.np) { - chan = mbox->of_xlate(mbox, &spec); - if (!IS_ERR(chan)) - break; + list_for_each_entry(mbox, &mbox_cons, node) { + if (device_match_fwnode(mbox->dev, fwspec.fwnode)) { + if (mbox->fw_xlate) { + chan = mbox->fw_xlate(mbox, &fwspec); + if (!IS_ERR(chan)) + break; + } else if (mbox->of_xlate) { + chan = mbox->of_xlate(mbox, &spec); + if (!IS_ERR(chan)) + break; + } } + } - of_node_put(spec.np); + fwnode_handle_put(fwspec.fwnode); if (IS_ERR(chan)) return chan; @@ -427,15 +450,8 @@ EXPORT_SYMBOL_GPL(mbox_request_channel); struct mbox_chan *mbox_request_channel_byname(struct mbox_client *cl, const char *name) { - struct device_node *np = cl->dev->of_node; - int index; - - if (!np) { - dev_err(cl->dev, "%s() currently only supports DT\n", __func__); - return ERR_PTR(-EINVAL); - } + int index = device_property_match_string(cl->dev, "mbox-names", name); - index = of_property_match_string(np, "mbox-names", name); if (index < 0) { dev_err(cl->dev, "%s() could not locate channel named \"%s\"\n", __func__, name); @@ -470,9 +486,8 @@ void mbox_free_channel(struct mbox_chan *chan) } EXPORT_SYMBOL_GPL(mbox_free_channel); -static struct mbox_chan * -of_mbox_index_xlate(struct mbox_controller *mbox, - const struct of_phandle_args *sp) +static struct mbox_chan *fw_mbox_index_xlate(struct mbox_controller *mbox, + const struct fwnode_reference_args *sp) { int ind = sp->args[0]; @@ -523,8 +538,8 @@ int mbox_controller_register(struct mbox_controller *mbox) spin_lock_init(&chan->lock); } - if (!mbox->of_xlate) - mbox->of_xlate = of_mbox_index_xlate; + if (!mbox->fw_xlate && !mbox->of_xlate) + mbox->fw_xlate = fw_mbox_index_xlate; scoped_guard(mutex, &con_mutex) list_add_tail(&mbox->node, &mbox_cons); diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index ad01c4082358..80a427c7ca29 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -66,6 +66,7 @@ struct mbox_chan_ops { * no interrupt rises. Ignored if 'txdone_irq' is set. * @txpoll_period: If 'txdone_poll' is in effect, the API polls for * last TX's status after these many millisecs + * @fw_xlate: Controller driver specific mapping of channel via fwnode * @of_xlate: Controller driver specific mapping of channel via DT * @poll_hrt: API private. hrtimer used to poll for TXDONE on all * channels. @@ -79,6 +80,8 @@ struct mbox_controller { bool txdone_irq; bool txdone_poll; unsigned txpoll_period; + struct mbox_chan *(*fw_xlate)(struct mbox_controller *mbox, + const struct fwnode_reference_args *sp); struct mbox_chan *(*of_xlate)(struct mbox_controller *mbox, const struct of_phandle_args *sp); /* Internal to API */ -- cgit v1.2.3 From 6f01c24f3a7525e953d514367a6d91b39c8f1f6a Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 18 Aug 2025 09:39:02 +0530 Subject: byteorder: Add memcpy_to_le32() and memcpy_from_le32() Add common memcpy APIs for copying u32 array to/from __le32 array. Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Signed-off-by: Anup Patel Reviewed-by: Linus Walleij Acked-by: Jassi Brar Link: https://lore.kernel.org/r/20250818040920.272664-7-apatel@ventanamicro.com Signed-off-by: Paul Walmsley --- include/linux/byteorder/generic.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index c9a4c96c9943..b3705e8bbe2b 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -173,6 +173,22 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words) } } +static inline void memcpy_from_le32(u32 *dst, const __le32 *src, size_t words) +{ + size_t i; + + for (i = 0; i < words; i++) + dst[i] = le32_to_cpu(src[i]); +} + +static inline void memcpy_to_le32(__le32 *dst, const u32 *src, size_t words) +{ + size_t i; + + for (i = 0; i < words; i++) + dst[i] = cpu_to_le32(src[i]); +} + static inline void be16_add_cpu(__be16 *var, u16 val) { *var = cpu_to_be16(be16_to_cpu(*var) + val); -- cgit v1.2.3 From 253757797973c54ea967f8fd8f40d16e4a78e6d4 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Wed, 24 Sep 2025 17:16:19 +0800 Subject: scsi: ufs: core: Change MCQ interrupt enable flow Move the MCQ interrupt enable process to ufshcd_mcq_make_queues_operational() to ensure that interrupts are set correctly when making queues operational, similar to ufshcd_make_hba_operational(). This change addresses the issue where ufshcd_mcq_make_queues_operational() was not fully operational due to missing interrupt enablement. This change only affects host drivers that call ufshcd_mcq_make_queues_operational(), i.e. ufs-mediatek. Signed-off-by: Peter Wang Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs-mcq.c | 11 +++++++++++ drivers/ufs/core/ufshcd.c | 12 +----------- include/ufs/ufshcd.h | 1 + 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 1e50675772fe..d33173aa0b9a 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -29,6 +29,10 @@ #define MCQ_ENTRY_SIZE_IN_DWORD 8 #define CQE_UCD_BA GENMASK_ULL(63, 7) +#define UFSHCD_ENABLE_MCQ_INTRS (UTP_TASK_REQ_COMPL |\ + UFSHCD_ERROR_MASK |\ + MCQ_CQ_EVENT_STATUS) + /* Max mcq register polling time in microseconds */ #define MCQ_POLL_US 500000 @@ -355,9 +359,16 @@ EXPORT_SYMBOL_GPL(ufshcd_mcq_poll_cqe_lock); void ufshcd_mcq_make_queues_operational(struct ufs_hba *hba) { struct ufs_hw_queue *hwq; + u32 intrs; u16 qsize; int i; + /* Enable required interrupts */ + intrs = UFSHCD_ENABLE_MCQ_INTRS; + if (hba->quirks & UFSHCD_QUIRK_MCQ_BROKEN_INTR) + intrs &= ~MCQ_CQ_EVENT_STATUS; + ufshcd_enable_intr(hba, intrs); + for (i = 0; i < hba->nr_hw_queues; i++) { hwq = &hba->uhq[i]; hwq->id = i; diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index cfc149f8238e..fdccab0454b2 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -45,11 +45,6 @@ UTP_TASK_REQ_COMPL |\ UFSHCD_ERROR_MASK) -#define UFSHCD_ENABLE_MCQ_INTRS (UTP_TASK_REQ_COMPL |\ - UFSHCD_ERROR_MASK |\ - MCQ_CQ_EVENT_STATUS) - - /* UIC command timeout, unit: ms */ enum { UIC_CMD_TIMEOUT_DEFAULT = 500, @@ -372,7 +367,7 @@ EXPORT_SYMBOL_GPL(ufshcd_disable_irq); * @hba: per adapter instance * @intrs: interrupt bits */ -static void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs) +void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs) { u32 old_val = ufshcd_readl(hba, REG_INTERRUPT_ENABLE); u32 new_val = old_val | intrs; @@ -8909,16 +8904,11 @@ err: static void ufshcd_config_mcq(struct ufs_hba *hba) { int ret; - u32 intrs; ret = ufshcd_mcq_vops_config_esi(hba); hba->mcq_esi_enabled = !ret; dev_info(hba->dev, "ESI %sconfigured\n", ret ? "is not " : ""); - intrs = UFSHCD_ENABLE_MCQ_INTRS; - if (hba->quirks & UFSHCD_QUIRK_MCQ_BROKEN_INTR) - intrs &= ~MCQ_CQ_EVENT_STATUS; - ufshcd_enable_intr(hba, intrs); ufshcd_mcq_make_queues_operational(hba); ufshcd_mcq_config_mac(hba, hba->nutrs); diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index ea0021f067c9..d8e06de0afbb 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1292,6 +1292,7 @@ static inline void ufshcd_rmwl(struct ufs_hba *hba, u32 mask, u32 val, u32 reg) void ufshcd_enable_irq(struct ufs_hba *hba); void ufshcd_disable_irq(struct ufs_hba *hba); +void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs); int ufshcd_alloc_host(struct device *, struct ufs_hba **); int ufshcd_hba_enable(struct ufs_hba *hba); int ufshcd_init(struct ufs_hba *, void __iomem *, unsigned int); -- cgit v1.2.3 From 79dde5f7dc7c038eec903745dc1550cd4139980e Mon Sep 17 00:00:00 2001 From: Zhongqiu Han Date: Wed, 17 Sep 2025 17:41:43 +0800 Subject: scsi: ufs: core: Fix data race in CPU latency PM QoS request handling The cpu_latency_qos_add/remove/update_request interfaces lack internal synchronization by design, requiring the caller to ensure thread safety. The current implementation relies on the 'pm_qos_enabled' flag, which is insufficient to prevent concurrent access and cannot serve as a proper synchronization mechanism. This has led to data races and list corruption issues. A typical race condition call trace is: [Thread A] ufshcd_pm_qos_exit() --> cpu_latency_qos_remove_request() --> cpu_latency_qos_apply(); --> pm_qos_update_target() --> plist_del <--(1) delete plist node --> memset(req, 0, sizeof(*req)); --> hba->pm_qos_enabled = false; [Thread B] ufshcd_devfreq_target --> ufshcd_devfreq_scale --> ufshcd_scale_clks --> ufshcd_pm_qos_update <--(2) pm_qos_enabled is true --> cpu_latency_qos_update_request --> pm_qos_update_target --> plist_del <--(3) plist node use-after-free Introduces a dedicated mutex to serialize PM QoS operations, preventing data races and ensuring safe access to PM QoS resources, including sysfs interface reads. Fixes: 2777e73fc154 ("scsi: ufs: core: Add CPU latency QoS support for UFS driver") Signed-off-by: Zhongqiu Han Reviewed-by: Bart Van Assche Tested-by: Huan Tang Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs-sysfs.c | 2 ++ drivers/ufs/core/ufshcd.c | 9 +++++++++ include/ufs/ufshcd.h | 3 +++ 3 files changed, 14 insertions(+) (limited to 'include') diff --git a/drivers/ufs/core/ufs-sysfs.c b/drivers/ufs/core/ufs-sysfs.c index 4bd7d491e3c5..0086816b27cd 100644 --- a/drivers/ufs/core/ufs-sysfs.c +++ b/drivers/ufs/core/ufs-sysfs.c @@ -512,6 +512,8 @@ static ssize_t pm_qos_enable_show(struct device *dev, { struct ufs_hba *hba = dev_get_drvdata(dev); + guard(mutex)(&hba->pm_qos_mutex); + return sysfs_emit(buf, "%d\n", hba->pm_qos_enabled); } diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index fdccab0454b2..b2e103aa4e62 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -1045,6 +1045,7 @@ EXPORT_SYMBOL_GPL(ufshcd_is_hba_active); */ void ufshcd_pm_qos_init(struct ufs_hba *hba) { + guard(mutex)(&hba->pm_qos_mutex); if (hba->pm_qos_enabled) return; @@ -1061,6 +1062,8 @@ void ufshcd_pm_qos_init(struct ufs_hba *hba) */ void ufshcd_pm_qos_exit(struct ufs_hba *hba) { + guard(mutex)(&hba->pm_qos_mutex); + if (!hba->pm_qos_enabled) return; @@ -1075,6 +1078,8 @@ void ufshcd_pm_qos_exit(struct ufs_hba *hba) */ static void ufshcd_pm_qos_update(struct ufs_hba *hba, bool on) { + guard(mutex)(&hba->pm_qos_mutex); + if (!hba->pm_qos_enabled) return; @@ -10743,6 +10748,10 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) mutex_init(&hba->ee_ctrl_mutex); mutex_init(&hba->wb_mutex); + + /* Initialize mutex for PM QoS request synchronization */ + mutex_init(&hba->pm_qos_mutex); + init_rwsem(&hba->clk_scaling_lock); ufshcd_init_clk_gating(hba); diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index d8e06de0afbb..9425cfd9d00e 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -938,6 +938,7 @@ enum ufshcd_mcq_opr { * @ufs_rtc_update_work: A work for UFS RTC periodic update * @pm_qos_req: PM QoS request handle * @pm_qos_enabled: flag to check if pm qos is enabled + * @pm_qos_mutex: synchronizes PM QoS request and status updates * @critical_health_count: count of critical health exceptions * @dev_lvl_exception_count: count of device level exceptions since last reset * @dev_lvl_exception_id: vendor specific information about the @@ -1110,6 +1111,8 @@ struct ufs_hba { struct delayed_work ufs_rtc_update_work; struct pm_qos_request pm_qos_req; bool pm_qos_enabled; + /* synchronizes PM QoS request and status updates */ + struct mutex pm_qos_mutex; int critical_health_count; atomic_t dev_lvl_exception_count; -- cgit v1.2.3 From edcc8a38b5ac1a3dbd05e113a38a25b937ebefe5 Mon Sep 17 00:00:00 2001 From: Qi Xi Date: Tue, 9 Sep 2025 19:29:10 +0800 Subject: once: fix race by moving DO_ONCE to separate section The commit c2c60ea37e5b ("once: use __section(".data.once")") moved DO_ONCE's ___done variable to .data.once section, which conflicts with DO_ONCE_LITE() that also uses the same section. This creates a race condition when clear_warn_once is used: Thread 1 (DO_ONCE) Thread 2 (DO_ONCE) __do_once_start read ___done (false) acquire once_lock execute func __do_once_done write ___done (true) __do_once_start release once_lock // Thread 3 clear_warn_once reset ___done read ___done (false) acquire once_lock execute func schedule once_work __do_once_done once_deferred: OK write ___done (true) static_branch_disable release once_lock schedule once_work once_deferred: BUG_ON(!static_key_enabled) DO_ONCE_LITE() in once_lite.h is used by WARN_ON_ONCE() and other warning macros. Keep its ___done flag in the .data..once section and allow resetting by clear_warn_once, as originally intended. In contrast, DO_ONCE() is used for functions like get_random_once() and relies on its ___done flag for internal synchronization. We should not reset DO_ONCE() by clear_warn_once. Fix it by isolating DO_ONCE's ___done into a separate .data..do_once section, shielding it from clear_warn_once. Fixes: c2c60ea37e5b ("once: use __section(".data.once")") Reported-by: Hulk Robot Signed-off-by: Qi Xi Signed-off-by: Arnd Bergmann --- include/asm-generic/vmlinux.lds.h | 1 + include/linux/once.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ae2d2359b79e..8efbe8c4874e 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -361,6 +361,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __start_once = .; \ *(.data..once) \ __end_once = .; \ + *(.data..do_once) \ STRUCT_ALIGN(); \ *(__tracepoints) \ /* implement dynamic printk debug */ \ diff --git a/include/linux/once.h b/include/linux/once.h index 30346fcdc799..449a0e34ad5a 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -46,7 +46,7 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, #define DO_ONCE(func, ...) \ ({ \ bool ___ret = false; \ - static bool __section(".data..once") ___done = false; \ + static bool __section(".data..do_once") ___done = false; \ static DEFINE_STATIC_KEY_TRUE(___once_key); \ if (static_branch_unlikely(&___once_key)) { \ unsigned long ___flags; \ @@ -64,7 +64,7 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, #define DO_ONCE_SLEEPABLE(func, ...) \ ({ \ bool ___ret = false; \ - static bool __section(".data..once") ___done = false; \ + static bool __section(".data..do_once") ___done = false; \ static DEFINE_STATIC_KEY_TRUE(___once_key); \ if (static_branch_unlikely(&___once_key)) { \ ___ret = __do_once_sleepable_start(&___done); \ -- cgit v1.2.3 From a19239ba14525c26ad097d59fd52cd9198b5bcdb Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 24 Sep 2025 13:49:49 +0100 Subject: afs: Add support for RENAME_NOREPLACE and RENAME_EXCHANGE Add support for RENAME_NOREPLACE and RENAME_EXCHANGE, if the server supports them. The default is translated to YFS.Rename_Replace, falling back to YFS.Rename; RENAME_NOREPLACE is translated to YFS.Rename_NoReplace and RENAME_EXCHANGE to YFS.Rename_Exchange, both of which fall back to reporting EINVAL. Signed-off-by: David Howells Link: https://lore.kernel.org/740476.1758718189@warthog.procyon.org.uk cc: Marc Dionne cc: Dan Carpenter cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/dir.c | 223 +++++++++++++++++++++++++++++++--------- fs/afs/dir_edit.c | 18 ++-- fs/afs/dir_silly.c | 11 ++ fs/afs/internal.h | 15 ++- fs/afs/misc.c | 1 + fs/afs/protocol_yfs.h | 3 + fs/afs/rotate.c | 17 +++- fs/afs/yfsclient.c | 249 +++++++++++++++++++++++++++++++++++++++++++++ fs/dcache.c | 1 + include/trace/events/afs.h | 6 ++ 10 files changed, 480 insertions(+), 64 deletions(-) (limited to 'include') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index bfb69e066672..89d36e3e5c79 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1823,7 +1823,8 @@ error: static void afs_rename_success(struct afs_operation *op) { - struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + struct afs_vnode *vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; _enter("op=%08x", op->debug_id); @@ -1834,22 +1835,40 @@ static void afs_rename_success(struct afs_operation *op) op->ctime = op->file[1].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[1]); } + if (op->more_files[0].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[0]); + if (op->more_files[1].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[1]); /* If we're moving a subdir between dirs, we need to update * its DV counter too as the ".." will be altered. */ - if (S_ISDIR(vnode->netfs.inode.i_mode) && - op->file[0].vnode != op->file[1].vnode) { - u64 new_dv; + if (op->file[0].vnode != op->file[1].vnode) { + if (S_ISDIR(vnode->netfs.inode.i_mode)) { + u64 new_dv; - write_seqlock(&vnode->cb_lock); + write_seqlock(&vnode->cb_lock); - new_dv = vnode->status.data_version + 1; - trace_afs_set_dv(vnode, new_dv); - vnode->status.data_version = new_dv; - inode_set_iversion_raw(&vnode->netfs.inode, new_dv); + new_dv = vnode->status.data_version + 1; + trace_afs_set_dv(vnode, new_dv); + vnode->status.data_version = new_dv; + inode_set_iversion_raw(&vnode->netfs.inode, new_dv); - write_sequnlock(&vnode->cb_lock); + write_sequnlock(&vnode->cb_lock); + } + + if ((op->rename.rename_flags & RENAME_EXCHANGE) && + S_ISDIR(new_vnode->netfs.inode.i_mode)) { + u64 new_dv; + + write_seqlock(&new_vnode->cb_lock); + + new_dv = new_vnode->status.data_version + 1; + new_vnode->status.data_version = new_dv; + inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv); + + write_sequnlock(&new_vnode->cb_lock); + } } } @@ -1900,8 +1919,8 @@ static void afs_rename_edit_dir(struct afs_operation *op) if (S_ISDIR(vnode->netfs.inode.i_mode) && new_dvnode != orig_dvnode && test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - afs_edit_dir_update_dotdot(vnode, new_dvnode, - afs_edit_dir_for_rename_sub); + afs_edit_dir_update(vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); new_inode = d_inode(new_dentry); if (new_inode) { @@ -1915,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op) /* Now we can update d_fsdata on the dentries to reflect their * new parent's data_version. - * - * Note that if we ever implement RENAME_EXCHANGE, we'll have - * to update both dentries with opposing dir versions. */ afs_update_dentry_version(op, new_dvp, op->dentry); afs_update_dentry_version(op, new_dvp, op->dentry_2); @@ -1930,6 +1946,67 @@ static void afs_rename_edit_dir(struct afs_operation *op) fscache_end_operation(&new_cres); } +static void afs_rename_exchange_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode *orig_dvnode = orig_dvp->vnode; + struct afs_vnode *new_dvnode = new_dvp->vnode; + struct afs_vnode *old_vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; + struct dentry *old_dentry = op->dentry; + struct dentry *new_dentry = op->dentry_2; + + _enter("op=%08x", op->debug_id); + + if (new_dvnode == orig_dvnode) { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) { + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + afs_edit_dir_update(orig_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + } + + d_exchange(old_dentry, new_dentry); + up_write(&orig_dvnode->validate_lock); + } else { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + + up_write(&orig_dvnode->validate_lock); + down_write(&new_dvnode->validate_lock); + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) && + new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta) + afs_edit_dir_update(new_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + + if (S_ISDIR(old_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags)) + afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); + + if (S_ISDIR(new_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags)) + afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode, + afs_edit_dir_for_rename_sub); + + /* Now we can update d_fsdata on the dentries to reflect their + * new parents' data_version. + */ + afs_update_dentry_version(op, new_dvp, old_dentry); + afs_update_dentry_version(op, orig_dvp, new_dentry); + + d_exchange(old_dentry, new_dentry); + up_write(&new_dvnode->validate_lock); + } +} + static void afs_rename_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); @@ -1948,6 +2025,32 @@ static const struct afs_operation_ops afs_rename_operation = { .put = afs_rename_put, }; +#if 0 /* Autoswitched in yfs_fs_rename_replace(). */ +static const struct afs_operation_ops afs_rename_replace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_replace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; +#endif + +static const struct afs_operation_ops afs_rename_noreplace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_noreplace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; + +static const struct afs_operation_ops afs_rename_exchange_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_exchange, + .success = afs_rename_success, + .edit_dir = afs_rename_exchange_edit_dir, + .put = afs_rename_put, +}; + /* * rename a file in an AFS filesystem and/or move it between directories */ @@ -1956,10 +2059,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *new_dentry, unsigned int flags) { struct afs_operation *op; - struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; + struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL; int ret; - if (flags) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; /* Don't allow silly-rename files be moved around. */ @@ -1969,6 +2072,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); + if (d_is_positive(new_dentry)) + new_vnode = AFS_FS_I(d_inode(new_dentry)); _enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}", orig_dvnode->fid.vid, orig_dvnode->fid.vnode, @@ -1989,6 +2094,11 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (ret < 0) goto error; + ret = -ENOMEM; + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) + goto error; + afs_op_set_vnode(op, 0, orig_dvnode); afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; @@ -1997,46 +2107,63 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = vnode; + op->more_files[0].speculative = true; + op->more_files[1].vnode = new_vnode; + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old_dentry; op->dentry_2 = new_dentry; + op->rename.rename_flags = flags; op->rename.new_negative = d_is_negative(new_dentry); - op->ops = &afs_rename_operation; - /* For non-directories, check whether the target is busy and if so, - * make a copy of the dentry and then do a silly-rename. If the - * silly-rename succeeds, the copied dentry is hashed and becomes the - * new target. - */ - if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { - /* To prevent any new references to the target during the - * rename, we unhash the dentry in advance. + if (flags & RENAME_NOREPLACE) { + op->ops = &afs_rename_noreplace_operation; + } else if (flags & RENAME_EXCHANGE) { + op->ops = &afs_rename_exchange_operation; + d_drop(new_dentry); + } else { + /* If we might displace the target, we might need to do silly + * rename. */ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - op->rename.rehash = new_dentry; - } + op->ops = &afs_rename_operation; - if (d_count(new_dentry) > 2) { - /* copy the target dentry's name */ - op->rename.tmp = d_alloc(new_dentry->d_parent, - &new_dentry->d_name); - if (!op->rename.tmp) { - afs_op_nomem(op); - goto error; + /* For non-directories, check whether the target is busy and if + * so, make a copy of the dentry and then do a silly-rename. + * If the silly-rename succeeds, the copied dentry is hashed + * and becomes the new target. + */ + if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { + /* To prevent any new references to the target during + * the rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + op->rename.rehash = new_dentry; } - ret = afs_sillyrename(new_dvnode, - AFS_FS_I(d_inode(new_dentry)), - new_dentry, op->key); - if (ret) { - afs_op_set_error(op, ret); - goto error; + if (d_count(new_dentry) > 2) { + /* copy the target dentry's name */ + op->rename.tmp = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!op->rename.tmp) { + afs_op_nomem(op); + goto error; + } + + ret = afs_sillyrename(new_dvnode, + AFS_FS_I(d_inode(new_dentry)), + new_dentry, op->key); + if (ret) { + afs_op_set_error(op, ret); + goto error; + } + + op->dentry_2 = op->rename.tmp; + op->rename.rehash = NULL; + op->rename.new_negative = true; } - - op->dentry_2 = op->rename.tmp; - op->rename.rehash = NULL; - op->rename.new_negative = true; } } @@ -2052,6 +2179,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, d_drop(old_dentry); ret = afs_do_sync_operation(op); + if (ret == -ENOTSUPP) + ret = -EINVAL; out: afs_dir_unuse_cookie(orig_dvnode, ret); if (new_dvnode != orig_dvnode) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 60a549f1d9c5..4b1342c72089 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -522,11 +522,11 @@ error: } /* - * Edit a subdirectory that has been moved between directories to update the - * ".." entry. + * Edit an entry in a directory to update the vnode it refers to. This is also + * used to update the ".." entry in a directory. */ -void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, - enum afs_edit_dir_reason why) +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *block; union afs_xdr_dirent *de; @@ -557,7 +557,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) goto already_invalidated; - slot = afs_dir_scan_block(block, &dotdot_name, b); + slot = afs_dir_scan_block(block, name, b); if (slot >= 0) goto found_dirent; @@ -566,7 +566,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d /* Didn't find the dirent to clobber. Download the directory again. */ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd); goto out; @@ -576,7 +576,7 @@ found_dirent: de->u.unique = htonl(new_dvnode->fid.unique); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot, - ntohl(de->u.vnode), ntohl(de->u.unique), ".."); + ntohl(de->u.vnode), ntohl(de->u.unique), name->name); kunmap_local(block); netfs_single_mark_inode_dirty(&vnode->netfs.inode); @@ -589,12 +589,12 @@ out: already_invalidated: kunmap_local(block); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); goto out; error: trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); goto out; } diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 0b80eb93fa40..014495d4b868 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode if (IS_ERR(op)) return PTR_ERR(op); + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) { + afs_put_operation(op); + return -ENOMEM; + } + afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, dvnode); op->file[0].dv_delta = 1; @@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = AFS_FS_I(d_inode(old)); + op->more_files[0].speculative = true; + op->more_files[1].vnode = AFS_FS_I(d_inode(new)); + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old; op->dentry_2 = new; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1124ea4000cb..444a3ea4fdf6 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -562,6 +562,7 @@ struct afs_server { #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ +#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */ refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ @@ -891,9 +892,10 @@ struct afs_operation { bool need_rehash; } unlink; struct { - struct dentry *rehash; - struct dentry *tmp; - bool new_negative; + struct dentry *rehash; + struct dentry *tmp; + unsigned int rename_flags; + bool new_negative; } rename; struct { struct netfs_io_subrequest *subreq; @@ -1100,8 +1102,8 @@ int afs_single_writepages(struct address_space *mapping, extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); -void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, - enum afs_edit_dir_reason why); +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); /* @@ -1693,6 +1695,9 @@ extern void yfs_fs_remove_dir(struct afs_operation *); extern void yfs_fs_link(struct afs_operation *); extern void yfs_fs_symlink(struct afs_operation *); extern void yfs_fs_rename(struct afs_operation *); +void yfs_fs_rename_replace(struct afs_operation *op); +void yfs_fs_rename_noreplace(struct afs_operation *op); +void yfs_fs_rename_exchange(struct afs_operation *op); extern void yfs_fs_store_data(struct afs_operation *); extern void yfs_fs_setattr(struct afs_operation *); extern void yfs_fs_get_volume_status(struct afs_operation *); diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 8f2b3a177690..c8a7f266080d 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -131,6 +131,7 @@ int afs_abort_to_error(u32 abort_code) case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; case RXGEN_OPCODE: return -ENOTSUPP; + case RX_INVALID_OPERATION: return -ENOTSUPP; default: return -EREMOTEIO; } diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index e4cd89c44c46..b2f06c1917c2 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -50,6 +50,9 @@ enum YFS_FS_Operations { YFSREMOVEACL = 64171, YFSREMOVEFILE2 = 64173, YFSSTOREOPAQUEACL2 = 64174, + YFSRENAME_REPLACE = 64176, + YFSRENAME_NOREPLACE = 64177, + YFSRENAME_EXCHANGE = 64187, YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */ YFSFETCHDATA64 = 64537, /* YFS Fetch file data */ YFSSTOREDATA64 = 64538, /* YFS Store file data */ diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index a1c24f589d9e..6a4e7da10fc4 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -432,6 +432,16 @@ bool afs_select_fileserver(struct afs_operation *op) afs_op_set_error(op, -EDQUOT); goto failed_but_online; + case RX_INVALID_OPERATION: + case RXGEN_OPCODE: + /* Handle downgrading to an older operation. */ + afs_op_set_error(op, -ENOTSUPP); + if (op->flags & AFS_OPERATION_DOWNGRADE) { + op->flags &= ~AFS_OPERATION_DOWNGRADE; + goto go_again; + } + goto failed_but_online; + default: afs_op_accumulate_error(op, error, abort_code); failed_but_online: @@ -620,12 +630,13 @@ iterate_address: op->addr_index = addr_index; set_bit(addr_index, &op->addr_tried); - op->volsync.creation = TIME64_MIN; - op->volsync.update = TIME64_MIN; - op->call_responded = false; _debug("address [%u] %u/%u %pISp", op->server_index, addr_index, alist->nr_addrs, rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer)); +go_again: + op->volsync.creation = TIME64_MIN; + op->volsync.update = TIME64_MIN; + op->call_responded = false; _leave(" = t"); return true; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 257af259c04a..febf13a49f0b 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -1042,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op) _enter(""); + if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags)) + return yfs_fs_rename_replace(op); + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + @@ -1070,6 +1073,252 @@ void yfs_fs_rename(struct afs_operation *op) afs_make_op_call(op, call, GFP_NOFS); } +/* + * Deliver reply data to a YFS.Rename_NoReplace operation. This does not + * return the status of a displaced target inode as there cannot be one. + */ +static int yfs_deliver_fs_rename_1(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +/* + * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange + * operation. These return the status of the displaced target inode if there + * was one. + */ +static int yfs_deliver_fs_rename_2(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + struct afs_vnode_param *new_vp = &op->more_files[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSFid(&bp, &new_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +static void yfs_done_fs_rename_replace(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags); + call->op->flags |= AFS_OPERATION_DOWNGRADE; + } +} + +/* + * YFS.Rename_Replace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Replace = { + .name = "FS.Rename_Replace", + .op = yfs_FS_Rename_Replace, + .deliver = yfs_deliver_fs_rename_2, + .done = yfs_done_fs_rename_replace, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_NoReplace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_NoReplace = { + .name = "FS.Rename_NoReplace", + .op = yfs_FS_Rename_NoReplace, + .deliver = yfs_deliver_fs_rename_1, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_Exchange operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Exchange = { + .name = "FS.Rename_Exchange", + .op = yfs_FS_Rename_Exchange, + .deliver = yfs_deliver_fs_rename_2, + .destructor = afs_flat_call_destructor, +}; + +/* + * Rename a file or directory, replacing the target if it exists. The status + * of a displaced target is returned. + */ +void yfs_fs_rename_replace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_REPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Rename a file or directory, failing if the target dirent exists. + */ +void yfs_fs_rename_noreplace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Exchange a pair of files directories. + */ +void yfs_fs_rename_exchange(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + /* * YFS.StoreData64 operation type. */ diff --git a/fs/dcache.c b/fs/dcache.c index 60046ae23d51..9e2bd890b728 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2922,6 +2922,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) write_sequnlock(&rename_lock); } +EXPORT_SYMBOL(d_exchange); /** * d_ancestor - search for an ancestor diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 7f83d242c8e9..1b3c48b5591d 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -69,6 +69,9 @@ enum afs_fs_operation { yfs_FS_RemoveACL = 64171, yfs_FS_RemoveFile2 = 64173, yfs_FS_StoreOpaqueACL2 = 64174, + yfs_FS_Rename_Replace = 64176, + yfs_FS_Rename_NoReplace = 64177, + yfs_FS_Rename_Exchange = 64187, yfs_FS_InlineBulkStatus = 64536, /* YFS Fetch multiple file statuses with errors */ yfs_FS_FetchData64 = 64537, /* YFS Fetch file data */ yfs_FS_StoreData64 = 64538, /* YFS Store file data */ @@ -300,6 +303,9 @@ enum yfs_cm_operation { EM(yfs_FS_RemoveACL, "YFS.RemoveACL") \ EM(yfs_FS_RemoveFile2, "YFS.RemoveFile2") \ EM(yfs_FS_StoreOpaqueACL2, "YFS.StoreOpaqueACL2") \ + EM(yfs_FS_Rename_Replace, "YFS.Rename_Replace") \ + EM(yfs_FS_Rename_NoReplace, "YFS.Rename_NoReplace") \ + EM(yfs_FS_Rename_Exchange, "YFS.Rename_Exchange") \ EM(yfs_FS_InlineBulkStatus, "YFS.InlineBulkStatus") \ EM(yfs_FS_FetchData64, "YFS.FetchData64") \ EM(yfs_FS_StoreData64, "YFS.StoreData64") \ -- cgit v1.2.3 From 10cdfcd37ade7ce736bc4a1927680f390a6b1f7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:33:58 +0200 Subject: nstree: make struct ns_tree private Don't expose it directly. There's no need to do that. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/nstree.h | 13 ------------- kernel/nstree.c | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 29ad6402260c..8b8636690473 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -9,19 +9,6 @@ #include #include -/** - * struct ns_tree - Namespace tree - * @ns_tree: Rbtree of namespaces of a particular type - * @ns_list: Sequentially walkable list of all namespaces of this type - * @ns_tree_lock: Seqlock to protect the tree and list - */ -struct ns_tree { - struct rb_root ns_tree; - struct list_head ns_list; - seqlock_t ns_tree_lock; - int type; -}; - extern struct ns_tree cgroup_ns_tree; extern struct ns_tree ipc_ns_tree; extern struct ns_tree mnt_ns_tree; diff --git a/kernel/nstree.c b/kernel/nstree.c index bbe8bedc924c..ecc88b013eff 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -4,6 +4,20 @@ #include #include +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + * @type: type of namespaces in this tree + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + struct ns_tree mnt_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), -- cgit v1.2.3 From 4055526d35746ce8b04bfa5e14e14f28bb163186 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:33:59 +0200 Subject: ns: move ns type into struct ns_common It's misplaced in struct proc_ns_operations and ns->ops might be NULL if the namespace is compiled out but we still want to know the type of the namespace for the initial namespace struct. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 6 +++--- fs/nsfs.c | 18 +++++++++--------- include/linux/ns_common.h | 30 +++++++++++++++++++++++++----- include/linux/proc_ns.h | 1 - init/version-timestamp.c | 1 + ipc/msgutil.c | 1 + ipc/namespace.c | 1 - kernel/cgroup/cgroup.c | 1 + kernel/cgroup/namespace.c | 1 - kernel/nscommon.c | 5 +++-- kernel/nsproxy.c | 4 ++-- kernel/nstree.c | 8 ++++---- kernel/pid.c | 1 + kernel/pid_namespace.c | 2 -- kernel/time/namespace.c | 3 +-- kernel/user.c | 1 + kernel/user_namespace.c | 1 - kernel/utsname.c | 1 - net/core/net_namespace.c | 1 - 19 files changed, 52 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index d65917ec5544..01334d5038a2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4927,7 +4927,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* @@ -5830,7 +5830,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); @@ -6016,6 +6016,7 @@ struct mnt_namespace init_mnt_ns = { .ns.ops = &mntns_operations, .user_ns = &init_user_ns, .ns.__ns_ref = REFCOUNT_INIT(1), + .ns.ns_type = ns_common_type(&init_mnt_ns), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), @@ -6333,7 +6334,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns) const struct proc_ns_operations mntns_operations = { .name = "mnt", - .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, diff --git a/fs/nsfs.c b/fs/nsfs.c index dc0a4404b971..e7fd8a790aaa 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -219,9 +219,9 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); case NS_GET_NSTYPE: - return ns->ops->type; + return ns->ns_type; case NS_GET_OWNER_UID: - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; user_ns = container_of(ns, struct user_namespace, ns); argp = (uid_t __user *) arg; @@ -234,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, case NS_GET_PID_IN_PIDNS: fallthrough; case NS_GET_TGID_IN_PIDNS: { - if (ns->ops->type != CLONE_NEWPID) + if (ns->ns_type != CLONE_NEWPID) return -EINVAL; ret = -ESRCH; @@ -273,7 +273,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return ret; } case NS_GET_MNTNS_ID: - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; fallthrough; case NS_GET_ID: { @@ -293,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (!uinfo) @@ -314,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct file *f __free(fput) = NULL; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (usize < MNT_NS_INFO_SIZE_VER0) @@ -453,7 +453,7 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, } fid->ns_id = ns->ns_id; - fid->ns_type = ns->ops->type; + fid->ns_type = ns->ns_type; fid->ns_inum = inode->i_ino; return FILEID_NSFS; } @@ -489,14 +489,14 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return NULL; VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); - VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); if (!__ns_ref_get(ns)) return NULL; } - switch (ns->ops->type) { + switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: if (!current_in_namespace(to_cg_ns(ns))) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 56492cd9ff8d..f5b68b8abb54 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -4,6 +4,7 @@ #include #include +#include struct proc_ns_operations; @@ -37,6 +38,7 @@ extern const struct proc_ns_operations timens_operations; extern const struct proc_ns_operations timens_for_children_operations; struct ns_common { + u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; @@ -51,7 +53,7 @@ struct ns_common { }; }; -int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); #define to_ns_common(__ns) \ @@ -106,10 +108,28 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) -#define ns_common_init(__ns) \ - __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) - -#define ns_common_init_inum(__ns, __inum) __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), __inum) +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) + +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 08016f6e0e6f..e81b8e596e4f 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -17,7 +17,6 @@ struct inode; struct proc_ns_operations { const char *name; const char *real_ns_name; - int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); int (*install)(struct nsset *nsset, struct ns_common *ns); diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 376b7c856d4d..d071835121c2 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,6 +8,7 @@ #include struct uts_namespace init_uts_ns = { + .ns.ns_type = ns_common_type(&init_uts_ns), .ns.__ns_ref = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, diff --git a/ipc/msgutil.c b/ipc/msgutil.c index dca6c8ec8f5f..7a03f6d03de3 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -33,6 +33,7 @@ struct ipc_namespace init_ipc_ns = { #ifdef CONFIG_IPC_NS .ns.ops = &ipcns_operations, #endif + .ns.ns_type = ns_common_type(&init_ipc_ns), }; struct msg_msgseg { diff --git a/ipc/namespace.c b/ipc/namespace.c index d89dfd718d2b..76abac74a5c3 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -248,7 +248,6 @@ static struct user_namespace *ipcns_owner(struct ns_common *ns) const struct proc_ns_operations ipcns_operations = { .name = "ipc", - .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 245b43ff2fa4..9b75102e81cb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -224,6 +224,7 @@ struct cgroup_namespace init_cgroup_ns = { .ns.ops = &cgroupns_operations, .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, + .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 04c98338ac08..241ca05f07c8 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -137,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns) const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", - .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 3cef89ddef41..92c9df1e8774 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -7,7 +7,7 @@ #ifdef CONFIG_DEBUG_VFS static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) { - switch (ns->ops->type) { + switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: VFS_WARN_ON_ONCE(ops != &cgroupns_operations); @@ -52,12 +52,13 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) } #endif -int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; + ns->ns_type = ns_type; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38..8d62449237b6 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(fd_file(f))) { ns = get_proc_ns(file_inode(fd_file(f))); - if (flags && (ns->ops->type != flags)) + if (flags && (ns->ns_type != flags)) err = -EINVAL; - flags = ns->ops->type; + flags = ns->ns_type; } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { err = check_setns_flags(flags); } else { diff --git a/kernel/nstree.c b/kernel/nstree.c index ecc88b013eff..b24a320a11a6 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -106,7 +106,7 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_seqlock(&ns_tree->ns_tree_lock); - VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); /* @@ -128,7 +128,7 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) { VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); - VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); write_seqlock(&ns_tree->ns_tree_lock); rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); @@ -197,7 +197,7 @@ struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) if (!node) return NULL; - VFS_WARN_ON_ONCE(node_to_ns(node)->ops->type != ns_type); + VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); return node_to_ns(node); } @@ -225,7 +225,7 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, if (list_is_head(list, &ns_tree->ns_list)) return ERR_PTR(-ENOENT); - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); return list_entry_rcu(list, struct ns_common, ns_list_node); } diff --git a/kernel/pid.c b/kernel/pid.c index 7e8c66e0bf67..0c2dcddb317a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = { #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif + .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a262a3f19443..f5b222c8ac39 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -443,7 +443,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns) const struct proc_ns_operations pidns_operations = { .name = "pid", - .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, @@ -454,7 +453,6 @@ const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", - .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 9f26e61be044..530cf99c2212 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -462,7 +462,6 @@ out: const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -472,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = timens_install, @@ -480,6 +478,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { + .ns.ns_type = ns_common_type(&init_time_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = ns_init_inum(&init_time_ns), diff --git a/kernel/user.c b/kernel/user.c index b2a53674d506..0163665914c9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,6 +65,7 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, + .ns.ns_type = ns_common_type(&init_user_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e1559e8a8a02..03cb63883d04 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1400,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns) const struct proc_ns_operations userns_operations = { .name = "user", - .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, diff --git a/kernel/utsname.c b/kernel/utsname.c index 00001592ad13..a8cdc84648ee 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -146,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns) const struct proc_ns_operations utsns_operations = { .name = "uts", - .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index bdea7d5fac56..dfe84bd35f98 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1543,7 +1543,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns) const struct proc_ns_operations netns_operations = { .name = "net", - .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, -- cgit v1.2.3 From 4ae8d9aa9f9dc7137ea5e564d79c5aa5af1bc45c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Sep 2025 23:02:41 +0200 Subject: sched/deadline: Fix dl_server getting stuck John found it was easy to hit lockup warnings when running locktorture on a 2 CPU VM, which he bisected down to: commit cccb45d7c429 ("sched/deadline: Less agressive dl_server handling"). While debugging it seems there is a chance where we end up with the dl_server dequeued, with dl_se->dl_server_active. This causes dl_server_start() to return without enqueueing the dl_server, thus it fails to run when RT tasks starve the cpu. When this happens, dl_server_timer() catches the '!dl_se->server_has_tasks(dl_se)' case, which then calls replenish_dl_entity() and dl_server_stopped() and finally return HRTIMER_NO_RESTART. This ends in no new timer and also no enqueue, leaving the dl_server 'dead', allowing starvation. What should have happened is for the bandwidth timer to start the zero-laxity timer, which in turn would enqueue the dl_server and cause dl_se->server_pick_task() to be called -- which will stop the dl_server if no fair tasks are observed for a whole period. IOW, it is totally irrelevant if there are fair tasks at the moment of bandwidth refresh. This removes all dl_se->server_has_tasks() users, so remove the whole thing. Fixes: cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling") Reported-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Tested-by: John Stultz --- include/linux/sched.h | 1 - kernel/sched/deadline.c | 12 +----------- kernel/sched/fair.c | 7 +------ kernel/sched/sched.h | 4 ---- 4 files changed, 2 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f8188b833350..f89313b150e6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -733,7 +733,6 @@ struct sched_dl_entity { * runnable task. */ struct rq *rq; - dl_server_has_tasks_f server_has_tasks; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f25301267e47..5a5080b3a670 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_se->dl_defer && !dl_se->dl_defer_running && dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { - if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + if (!is_dl_boosted(dl_se)) { /* * Set dl_se->dl_defer_armed and dl_throttled variables to @@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; -static bool dl_server_stopped(struct sched_dl_entity *dl_se); - static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) { struct rq *rq = rq_of_dl_se(dl_se); @@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ if (!dl_se->dl_runtime) return HRTIMER_NORESTART; - if (!dl_se->server_has_tasks(dl_se)) { - replenish_dl_entity(dl_se); - dl_server_stopped(dl_se); - return HRTIMER_NORESTART; - } - if (dl_se->dl_defer_armed) { /* * First check if the server could consume runtime in background. @@ -1625,11 +1617,9 @@ static bool dl_server_stopped(struct sched_dl_entity *dl_se) } void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task) { dl_se->rq = rq; - dl_se->server_has_tasks = has_tasks; dl_se->server_pick_task = pick_task; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..8ce56a8d507f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8859,11 +8859,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru return pick_next_task_fair(rq, prev, NULL); } -static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) -{ - return !!dl_se->rq->cfs.nr_queued; -} - static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) { return pick_task_fair(dl_se->rq); @@ -8875,7 +8870,7 @@ void fair_server_init(struct rq *rq) init_dl_entity(dl_se); - dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); + dl_server_init(dl_se, rq, fair_server_pick_task); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f7..f10d6277dca1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -365,9 +365,6 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * * dl_se::rq -- runqueue we belong to. * - * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the - * server when it runs out of tasks to run. - * * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this * returns NULL. * @@ -383,7 +380,6 @@ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task); extern void sched_init_dl_servers(void); -- cgit v1.2.3 From a3a70caf7906708bf9bbc80018752a6b36543808 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Sep 2025 12:03:20 +0200 Subject: sched/deadline: Fix dl_server behaviour John reported undesirable behaviour with the dl_server since commit: cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling"). When starving fair tasks on purpose (starting spinning FIFO tasks), his fair workload, which often goes (briefly) idle, would delay fair invocations for a second, running one invocation per second was both unexpected and terribly slow. The reason this happens is that when dl_se->server_pick_task() returns NULL, indicating no runnable tasks, it would yield, pushing any later jobs out a whole period (1 second). Instead simply stop the server. This should restore behaviour in that a later wakeup (which restarts the server) will be able to continue running (subject to the CBS wakeup rules). Notably, this does not re-introduce the behaviour cccb45d7c4295 set out to solve, any start/stop cycle is naturally throttled by the timer period (no active cancel). Fixes: cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling") Reported-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Tested-by: John Stultz --- include/linux/sched.h | 1 - kernel/sched/deadline.c | 23 ++--------------------- kernel/sched/sched.h | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f89313b150e6..e4ce0a76831e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -706,7 +706,6 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; - unsigned int dl_server_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5a5080b3a670..72c1f72463c7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1571,10 +1571,8 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) { - dl_se->dl_server_idle = 0; + if (dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); - } } void dl_server_start(struct sched_dl_entity *dl_se) @@ -1602,20 +1600,6 @@ void dl_server_stop(struct sched_dl_entity *dl_se) dl_se->dl_server_active = 0; } -static bool dl_server_stopped(struct sched_dl_entity *dl_se) -{ - if (!dl_se->dl_server_active) - return true; - - if (dl_se->dl_server_idle) { - dl_server_stop(dl_se); - return true; - } - - dl_se->dl_server_idle = 1; - return false; -} - void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_pick_f pick_task) { @@ -2384,10 +2368,7 @@ again: if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - if (!dl_server_stopped(dl_se)) { - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); - } + dl_server_stop(dl_se); goto again; } rq->dl_server = dl_se; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f10d6277dca1..cf2109b67f9a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -371,10 +371,39 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * dl_server_update() -- called from update_curr_common(), propagates runtime * to the server. * - * dl_server_start() - * dl_server_stop() -- start/stop the server when it has (no) tasks. + * dl_server_start() -- start the server when it has tasks; it will stop + * automatically when there are no more tasks, per + * dl_se::server_pick() returning NULL. + * + * dl_server_stop() -- (force) stop the server; use when updating + * parameters. * * dl_server_init() -- initializes the server. + * + * When started the dl_server will (per dl_defer) schedule a timer for its + * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a + * server will run at the very end of its period. + * + * This is done such that any runtime from the target class can be accounted + * against the server -- through dl_server_update() above -- such that when it + * becomes time to run, it might already be out of runtime and get deferred + * until the next period. In this case dl_server_timer() will alternate + * between defer and replenish but never actually enqueue the server. + * + * Only when the target class does not manage to exhaust the server's runtime + * (there's actualy starvation in the given period), will the dl_server get on + * the runqueue. Once queued it will pick tasks from the target class and run + * them until either its runtime is exhaused, at which point its back to + * dl_server_timer, or until there are no more tasks to run, at which point + * the dl_server stops itself. + * + * By stopping at this point the dl_server retains bandwidth, which, if a new + * task wakes up imminently (starting the server again), can be used -- + * subject to CBS wakeup rules -- without having to wait for the next period. + * + * Additionally, because of the dl_defer behaviour the start/stop behaviour is + * naturally thottled to once per period, avoiding high context switch + * workloads from spamming the hrtimer program/cancel paths. */ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); -- cgit v1.2.3 From 88a90315a99a9120cd471bf681515cc77cd7cdb8 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 17 Sep 2025 14:09:14 +0800 Subject: rcu: Replace preempt.h with sched.h in include/linux/rcupdate.h In the next commit, we will move the definition of migrate_enable() and migrate_disable() to linux/sched.h. However, migrate_enable/migrate_disable will be used in commit 1b93c03fb319 ("rcu: add rcu_read_lock_dont_migrate()") in bpf-next tree. In order to fix potential compiling error, replace linux/preempt.h with linux/sched.h in include/linux/rcupdate.h. Signed-off-by: Menglong Dong Signed-off-by: Peter Zijlstra (Intel) --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb..8f346c847ee5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 378b7708194fff77c9020392067329931c3fcc04 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 17 Sep 2025 14:09:15 +0800 Subject: sched: Make migrate_{en,dis}able() inline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For now, migrate_enable and migrate_disable are global, which makes them become hotspots in some case. Take BPF for example, the function calling to migrate_enable and migrate_disable in BPF trampoline can introduce significant overhead, and following is the 'perf top' of FENTRY's benchmark (./tools/testing/selftests/bpf/bench trig-fentry): 54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k] bpf_prog_2dcccf652aac1793_bench_trigger_fentry 10.43% [kernel] [k] migrate_enable 10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037 8.06% [kernel] [k] __bpf_prog_exit_recur 4.11% libc.so.6 [.] syscall 2.15% [kernel] [k] entry_SYSCALL_64 1.48% [kernel] [k] memchr_inv 1.32% [kernel] [k] fput 1.16% [kernel] [k] _copy_to_user 0.73% [kernel] [k] bpf_prog_test_run_raw_tp So in this commit, we make migrate_enable/migrate_disable inline to obtain better performance. The struct rq is defined internally in kernel/sched/sched.h, and the field "nr_pinned" is accessed in migrate_enable/migrate_disable, which makes it hard to make them inline. Alexei Starovoitov suggests to generate the offset of "nr_pinned" in [1], so we can define the migrate_enable/migrate_disable in include/linux/sched.h and access "this_rq()->nr_pinned" with "(void *)this_rq() + RQ_nr_pinned". The offset of "nr_pinned" is generated in include/generated/rq-offsets.h by kernel/sched/rq-offsets.c. Generally speaking, we move the definition of migrate_enable and migrate_disable to include/linux/sched.h from kernel/sched/core.c. The calling to __set_cpus_allowed_ptr() is leaved in ___migrate_enable(). The "struct rq" is not available in include/linux/sched.h, so we can't access the "runqueues" with this_cpu_ptr(), as the compilation will fail in this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): typeof((ptr) + 0) So we introduce the this_rq_raw() and access the runqueues with arch_raw_cpu_ptr/PERCPU_PTR directly. The variable "runqueues" is not visible in the kernel modules, and export it is not a good idea. As Peter Zijlstra advised in [2], we define and export migrate_enable/migrate_disable in kernel/sched/core.c too, and use them for the modules. Before this patch, the performance of BPF FENTRY is: fentry : 113.030 ± 0.149M/s fentry : 112.501 ± 0.187M/s fentry : 112.828 ± 0.267M/s fentry : 115.287 ± 0.241M/s After this patch, the performance of BPF FENTRY increases to: fentry : 143.644 ± 0.670M/s fentry : 149.764 ± 0.362M/s fentry : 149.642 ± 0.156M/s fentry : 145.263 ± 0.221M/s Signed-off-by: Menglong Dong Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/CAADnVQ+5sEDKHdsJY5ZsfGDO_1SEhhQWHrt2SMBG5SYyQ+jt7w@mail.gmail.com/ [1] Link: https://lore.kernel.org/all/20250819123214.GH4067720@noisy.programming.kicks-ass.net/ [2] --- Kbuild | 13 +++++- include/linux/preempt.h | 3 -- include/linux/sched.h | 113 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 1 + kernel/sched/core.c | 63 ++++++-------------------- kernel/sched/rq-offsets.c | 12 +++++ 6 files changed, 152 insertions(+), 53 deletions(-) create mode 100644 kernel/sched/rq-offsets.c (limited to 'include') diff --git a/Kbuild b/Kbuild index f327ca86990c..13324b4bbe23 100644 --- a/Kbuild +++ b/Kbuild @@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s: $(timeconst-file) $(bounds-file) $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE $(call filechk,offsets,__ASM_OFFSETS_H__) +# Generate rq-offsets.h + +rq-offsets-file := include/generated/rq-offsets.h + +targets += kernel/sched/rq-offsets.s + +kernel/sched/rq-offsets.s: $(offsets-file) + +$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE + $(call filechk,offsets,__RQ_OFFSETS_H__) + # Check for missing system calls quiet_cmd_syscalls = CALL $< cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags) PHONY += missing-syscalls -missing-syscalls: scripts/checksyscalls.sh $(offsets-file) +missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file) $(call cmd,syscalls) # Check the manual modification of atomic headers diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 1fad1c8a4c76..92237c319035 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -424,8 +424,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * work-conserving schedulers. * */ -extern void migrate_disable(void); -extern void migrate_enable(void); /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section @@ -471,7 +469,6 @@ static __always_inline void preempt_enable_nested(void) DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) -DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #ifdef CONFIG_PREEMPT_DYNAMIC diff --git a/include/linux/sched.h b/include/linux/sched.h index 644a01bdae70..d60ecaccdffc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -49,6 +49,9 @@ #include #include #include +#ifndef COMPILE_OFFSETS +#include +#endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -2317,4 +2320,114 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +#ifndef MODULE +#ifndef COMPILE_OFFSETS + +extern void ___migrate_enable(void); + +struct rq; +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +/* + * The "struct rq" is not available here, so we can't access the + * "runqueues" with this_cpu_ptr(), as the compilation will fail in + * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): + * typeof((ptr) + 0) + * + * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. + */ +#ifdef CONFIG_SMP +#define this_rq_raw() arch_raw_cpu_ptr(&runqueues) +#else +#define this_rq_raw() PERCPU_PTR(&runqueues) +#endif +#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) + +static inline void __migrate_enable(void) +{ + struct task_struct *p = current; + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + guard(preempt)(); + if (unlikely(p->cpus_ptr != &p->cpus_mask)) + ___migrate_enable(); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq_pinned()--; +} + +static inline void __migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif + p->migration_disabled++; + return; + } + + guard(preempt)(); + this_rq_pinned()++; + p->migration_disabled = 1; +} +#else /* !COMPILE_OFFSETS */ +static inline void __migrate_disable(void) { } +static inline void __migrate_enable(void) { } +#endif /* !COMPILE_OFFSETS */ + +/* + * So that it is possible to not export the runqueues variable, define and + * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use + * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will + * be defined in kernel/sched/core.c. + */ +#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE +static inline void migrate_disable(void) +{ + __migrate_disable(); +} + +static inline void migrate_enable(void) +{ + __migrate_enable(); +} +#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ + +#else /* MODULE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* MODULE */ + +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af6..de9078a9df3a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23855,6 +23855,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, BTF_SET_START(btf_id_deny) BTF_ID_UNUSED #ifdef CONFIG_SMP +BTF_ID(func, ___migrate_enable) BTF_ID(func, migrate_disable) BTF_ID(func, migrate_enable) #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index feb750aae71b..7f1e5cb94c53 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,8 @@ * Copyright (C) 1991-2002 Linus Torvalds * Copyright (C) 1998-2024 Ingo Molnar, Red Hat */ +#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE +#include #include #include #include @@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) __do_set_cpus_allowed(p, &ac); } -void migrate_disable(void) -{ - struct task_struct *p = current; - - if (p->migration_disabled) { -#ifdef CONFIG_DEBUG_PREEMPT - /* - *Warn about overflow half-way through the range. - */ - WARN_ON_ONCE((s16)p->migration_disabled < 0); -#endif - p->migration_disabled++; - return; - } - - guard(preempt)(); - this_rq()->nr_pinned++; - p->migration_disabled = 1; -} -EXPORT_SYMBOL_GPL(migrate_disable); - -void migrate_enable(void) +void ___migrate_enable(void) { struct task_struct *p = current; struct affinity_context ac = { @@ -2410,35 +2391,19 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Check both overflow from migrate_disable() and superfluous - * migrate_enable(). - */ - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) - return; -#endif + __set_cpus_allowed_ptr(p, &ac); +} +EXPORT_SYMBOL_GPL(___migrate_enable); - if (p->migration_disabled > 1) { - p->migration_disabled--; - return; - } +void migrate_disable(void) +{ + __migrate_disable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); - /* - * Ensure stop_task runs either before or after this, and that - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). - */ - guard(preempt)(); - if (p->cpus_ptr != &p->cpus_mask) - __set_cpus_allowed_ptr(p, &ac); - /* - * Mustn't clear migration_disabled() until cpus_ptr points back at the - * regular cpus_mask, otherwise things that race (eg. - * select_fallback_rq) get confused. - */ - barrier(); - p->migration_disabled = 0; - this_rq()->nr_pinned--; +void migrate_enable(void) +{ + __migrate_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c new file mode 100644 index 000000000000..a23747bbe25b --- /dev/null +++ b/kernel/sched/rq-offsets.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#define COMPILE_OFFSETS +#include +#include +#include "sched.h" + +int main(void) +{ + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned)); + + return 0; +} -- cgit v1.2.3 From 45b7f780739a3145aeef24d2dfa02517a6c82ed6 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 17 Sep 2025 14:09:16 +0800 Subject: sched: Fix some typos in include/linux/preempt.h There are some typos in the comments of migrate in include/linux/preempt.h: elegible -> eligible it's -> its migirate_disable -> migrate_disable abritrary -> arbitrary Just fix them. Signed-off-by: Menglong Dong Signed-off-by: Peter Zijlstra (Intel) --- include/linux/preempt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 92237c319035..102202185d7a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -372,7 +372,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, /* * Migrate-Disable and why it is undesired. * - * When a preempted task becomes elegible to run under the ideal model (IOW it + * When a preempted task becomes eligible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() @@ -387,7 +387,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be - * in a migrate_disable() section, reducing it's available bandwidth. + * in a migrate_disable() section, reducing its available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully @@ -399,7 +399,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, - * all these primitives employ migirate_disable() to restore this implicit + * all these primitives employ migrate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. The correct solution is getting @@ -407,7 +407,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need - * a schedulability theory that does not depend on abritrary migration. + * a schedulability theory that does not depend on arbitrary migration. * * * Notes on the implementation. -- cgit v1.2.3 From 25c550464acd40803d63868dfa4a42506df48b88 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:04 +0200 Subject: net: gro: remove is_ipv6 from napi_gro_cb Remove is_ipv6 from napi_gro_cb and use sk->sk_family instead. This frees up space for another ip_fixedid bit that will be added in the next commit. udp_sock_create always creates either a AF_INET or a AF_INET6 socket, so using sk->sk_family is reliable. In IPv6-FOU, cfg->ipv6_v6only is always enabled. Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-2-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- include/net/gro.h | 3 --- net/ipv4/fou_core.c | 32 ++++++++++++++------------------ net/ipv4/udp_offload.c | 2 -- net/ipv6/udp_offload.c | 2 -- 4 files changed, 14 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index a0fca7ac6e7e..87c68007f949 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -71,9 +71,6 @@ struct napi_gro_cb { /* Free the skb? */ u8 free:2; - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; - /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1; diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 3e30745e2c09..3970b6b7ace5 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -228,21 +228,27 @@ drop: return 0; } +static const struct net_offload *fou_gro_ops(const struct sock *sk, + int proto) +{ + const struct net_offload __rcu **offloads; + + /* FOU doesn't allow IPv4 on IPv6 sockets. */ + offloads = sk->sk_family == AF_INET6 ? inet6_offloads : inet_offloads; + return rcu_dereference(offloads[proto]); +} + static struct sk_buff *fou_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; struct sk_buff *pp = NULL; - u8 proto; if (!fou) goto out; - proto = fou->protocol; - /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply @@ -254,8 +260,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, fou->protocol); if (!ops || !ops->callbacks.gro_receive) goto out; @@ -268,10 +273,8 @@ out: static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { - const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; - u8 proto; int err; if (!fou) { @@ -279,10 +282,7 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, goto out; } - proto = fou->protocol; - - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, fou->protocol); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) { err = -ENOSYS; goto out; @@ -323,7 +323,6 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload __rcu **offloads; const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; @@ -450,8 +449,7 @@ next_proto: /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, proto); if (!ops || !ops->callbacks.gro_receive) goto out; @@ -467,7 +465,6 @@ out: static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); - const struct net_offload __rcu **offloads; const struct net_offload *ops; unsigned int guehlen = 0; u8 proto; @@ -494,8 +491,7 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) return err; } - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, proto); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b1f3fd302e9d..19d0b5b09ffa 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -891,8 +891,6 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) skb_gro_checksum_try_convert(skb, IPPROTO_UDP, inet_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 0; - if (static_branch_unlikely(&udp_encap_needed_key)) sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index d8445ac1b2e4..046f13b1d77a 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -154,8 +154,6 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) ip6_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 1; - if (static_branch_unlikely(&udpv6_encap_needed_key)) sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest); -- cgit v1.2.3 From 21f7484220ace6c355cb0023d14d83da6fe5843d Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:05 +0200 Subject: net: gro: only merge packets with incrementing or fixed outer ids Only merge encapsulated packets if their outer IDs are either incrementing or fixed, just like for inner IDs and IDs of non-encapsulated packets. Add another ip_fixedid bit for a total of two bits: one for outer IDs (and for unencapsulated packets) and one for inner IDs. This commit preserves the current behavior of GSO where only the IDs of the inner-most headers are restored correctly. Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-3-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- include/net/gro.h | 26 +++++++++++--------------- net/ipv4/tcp_offload.c | 5 ++++- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index 87c68007f949..e7997a9fb30b 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -75,7 +75,7 @@ struct napi_gro_cb { u8 is_fou:1; /* Used to determine if ipid_offset can be ignored */ - u8 ip_fixedid:1; + u8 ip_fixedid:2; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; @@ -442,29 +442,26 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, } static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, - struct sk_buff *p, bool outer) + struct sk_buff *p, bool inner) { const u32 id = ntohl(*(__be32 *)&iph->id); const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; const u32 df = id & IP_DF; - int flush; /* All fields must match except length and checksum. */ - flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF)); - - if (flush | (outer && df)) - return flush; + if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF))) + return true; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. */ if (count == 1 && df && !ipid_offset) - NAPI_GRO_CB(p)->ip_fixedid = true; + NAPI_GRO_CB(p)->ip_fixedid |= 1 << inner; - return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid); + return ipid_offset ^ (count * !(NAPI_GRO_CB(p)->ip_fixedid & (1 << inner))); } static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) @@ -479,7 +476,7 @@ static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr static inline int __gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p, const u16 diff, - bool outer) + bool inner) { const void *nh = th - diff; const void *nh2 = th2 - diff; @@ -487,19 +484,18 @@ static inline int __gro_receive_network_flush(const void *th, const void *th2, if (((struct iphdr *)nh)->version == 6) return ipv6_gro_flush(nh, nh2); else - return inet_gro_flush(nh, nh2, p, outer); + return inet_gro_flush(nh, nh2, p, inner); } static inline int gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p) { - const bool encap_mark = NAPI_GRO_CB(p)->encap_mark; int off = skb_transport_offset(p); int flush; - flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark); - if (encap_mark) - flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false); + flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, false); + if (NAPI_GRO_CB(p)->encap_mark) + flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, true); return flush; } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index e6612bd84d09..1949eede9ec9 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -471,6 +471,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); + bool is_fixedid; if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; @@ -484,8 +485,10 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); + is_fixedid = (NAPI_GRO_CB(skb)->ip_fixedid >> skb->encapsulation) & 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | - (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); + (is_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0; -- cgit v1.2.3 From 3271f19bf7b9df665549666d789b9f126b4420c7 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:06 +0200 Subject: net: gso: restore ids of outer ip headers correctly Currently, NETIF_F_TSO_MANGLEID indicates that the inner-most ID can be mangled. Outer IDs can always be mangled. Make GSO preserve outer IDs by default, with NETIF_F_TSO_MANGLEID allowing both inner and outer IDs to be mangled. This commit also modifies a few drivers that use SKB_GSO_FIXEDID directly. Signed-off-by: Richard Gobert Reviewed-by: Edward Cree # for sfc Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-4-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/segmentation-offloads.rst | 22 ++++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 8 ++++++-- drivers/net/ethernet/sfc/ef100_tx.c | 17 +++++++++++++---- include/linux/netdevice.h | 9 +++++++-- include/linux/skbuff.h | 8 +++++++- net/core/dev.c | 10 ++++++++-- net/ipv4/af_inet.c | 13 ++++++------- net/ipv4/tcp_offload.c | 6 ++---- 8 files changed, 63 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/Documentation/networking/segmentation-offloads.rst b/Documentation/networking/segmentation-offloads.rst index 085e8fab03fd..72f69b22b28c 100644 --- a/Documentation/networking/segmentation-offloads.rst +++ b/Documentation/networking/segmentation-offloads.rst @@ -43,10 +43,19 @@ also point to the TCP header of the packet. For IPv4 segmentation we support one of two types in terms of the IP ID. The default behavior is to increment the IP ID with every segment. If the GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP -ID and all segments will use the same IP ID. If a device has -NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO -and we will either increment the IP ID for all frames, or leave it at a -static value based on driver preference. +ID and all segments will use the same IP ID. + +For encapsulated packets, SKB_GSO_TCP_FIXEDID refers only to the outer header. +SKB_GSO_TCP_FIXEDID_INNER can be used to specify the same for the inner header. +Any combination of these two GSO types is allowed. + +If a device has NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when +performing TSO and we will either increment the IP ID for all frames, or leave +it at a static value based on driver preference. For encapsulated packets, +NETIF_F_TSO_MANGLEID is relevant for both outer and inner headers, unless the +DF bit is not set on the outer header, in which case the device driver must +guarantee that the IP ID field is incremented in the outer header with every +segment. UDP Fragmentation Offload @@ -124,10 +133,7 @@ Generic Receive Offload Generic receive offload is the complement to GSO. Ideally any frame assembled by GRO should be segmented to create an identical sequence of frames using GSO, and any sequence of frames segmented by GSO should be -able to be reassembled back to the original by GRO. The only exception to -this is IPv4 ID in the case that the DF bit is set for a given IP header. -If the value of the IPv4 ID is not sequentially incrementing it will be -altered so that it is when a frame assembled via GRO is segmented via GSO. +able to be reassembled back to the original by GRO. Partial Generic Segmentation Offload diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 4ed43ee9aa35..263d5628ee44 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1290,8 +1290,12 @@ static void mlx5e_shampo_update_ipv4_tcp_hdr(struct mlx5e_rq *rq, struct iphdr * tcp->check = ~tcp_v4_check(skb->len - tcp_off, ipv4->saddr, ipv4->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; - if (ntohs(ipv4->id) == rq->hw_gro_data->second_ip_id) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; + if (ntohs(ipv4->id) == rq->hw_gro_data->second_ip_id) { + bool encap = rq->hw_gro_data->fk.control.flags & FLOW_DIS_ENCAPSULATION; + + skb_shinfo(skb)->gso_type |= encap ? SKB_GSO_TCP_FIXEDID_INNER : + SKB_GSO_TCP_FIXEDID; + } skb->csum_start = (unsigned char *)tcp - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); diff --git a/drivers/net/ethernet/sfc/ef100_tx.c b/drivers/net/ethernet/sfc/ef100_tx.c index e6b6be549581..03005757c060 100644 --- a/drivers/net/ethernet/sfc/ef100_tx.c +++ b/drivers/net/ethernet/sfc/ef100_tx.c @@ -189,6 +189,7 @@ static void ef100_make_tso_desc(struct efx_nic *efx, { bool gso_partial = skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL; unsigned int len, ip_offset, tcp_offset, payload_segs; + u32 mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_INC_MOD16; u32 mangleid = ESE_GZ_TX_DESC_IP4_ID_INC_MOD16; unsigned int outer_ip_offset, outer_l4_offset; u16 vlan_tci = skb_vlan_tag_get(skb); @@ -200,8 +201,17 @@ static void ef100_make_tso_desc(struct efx_nic *efx, bool outer_csum; u32 paylen; - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) - mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + if (encap) { + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID_INNER) + mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) + mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + } else { + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) + mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + } + if (efx->net_dev->features & NETIF_F_HW_VLAN_CTAG_TX) vlan_enable = skb_vlan_tag_present(skb); @@ -245,8 +255,7 @@ static void ef100_make_tso_desc(struct efx_nic *efx, ESF_GZ_TX_TSO_OUTER_L4_OFF_W, outer_l4_offset >> 1, ESF_GZ_TX_TSO_ED_OUTER_UDP_LEN, udp_encap && !gso_partial, ESF_GZ_TX_TSO_ED_OUTER_IP_LEN, encap && !gso_partial, - ESF_GZ_TX_TSO_ED_OUTER_IP4_ID, encap ? mangleid : - ESE_GZ_TX_DESC_IP4_ID_NO_OP, + ESF_GZ_TX_TSO_ED_OUTER_IP4_ID, mangleid_outer, ESF_GZ_TX_TSO_VLAN_INSERT_EN, vlan_enable, ESF_GZ_TX_TSO_VLAN_INSERT_TCI, vlan_tci ); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1c54d44805fa..1b85454116f6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5320,13 +5320,18 @@ void skb_warn_bad_offload(const struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { - netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT; + netdev_features_t feature; + + if (gso_type & (SKB_GSO_TCP_FIXEDID | SKB_GSO_TCP_FIXEDID_INNER)) + gso_type |= __SKB_GSO_TCP_FIXEDID; + + feature = ((netdev_features_t)gso_type << NETIF_F_GSO_SHIFT) & NETIF_F_GSO_MASK; /* check flags correspondence */ BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); - BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(__SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT)); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 78ecfa7d00d0..fb3fec9affaa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -674,7 +674,7 @@ enum { /* This indicates the tcp segment has CWR set. */ SKB_GSO_TCP_ECN = 1 << 2, - SKB_GSO_TCP_FIXEDID = 1 << 3, + __SKB_GSO_TCP_FIXEDID = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, @@ -707,6 +707,12 @@ enum { SKB_GSO_FRAGLIST = 1 << 18, SKB_GSO_TCP_ACCECN = 1 << 19, + + /* These indirectly map onto the same netdev feature. + * If NETIF_F_TSO_MANGLEID is set it may mangle both inner and outer IDs. + */ + SKB_GSO_TCP_FIXEDID = 1 << 30, + SKB_GSO_TCP_FIXEDID_INNER = 1 << 31, }; #if BITS_PER_LONG > 32 diff --git a/net/core/dev.c b/net/core/dev.c index fc4993526ead..8b54fdf0289a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3768,8 +3768,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; - /* Make sure to clear the IPv4 ID mangling feature if the - * IPv4 header has the potential to be fragmented. + /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header + * has the potential to be fragmented so that TSO does not generate + * segments with the same ID. For encapsulated packets, the ID mangling + * feature is guaranteed not to use the same ID for the outer IPv4 + * headers of the generated segments if the headers have the potential + * to be fragmented, so there is no need to clear the IPv4 ID mangling + * feature (see the section about NETIF_F_TSO_MANGLEID in + * segmentation-offloads.rst). */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e298dacb4a06..804c51296c55 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1395,14 +1395,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - if (!skb->encapsulation || encap) { - udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); + /* fixed ID is invalid if DF bit is not set */ + fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap)); + if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) + goto out; - /* fixed ID is invalid if DF bit is not set */ - if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto out; - } + if (!skb->encapsulation || encap) + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 1949eede9ec9..2cb93da93abc 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -471,7 +471,6 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); - bool is_fixedid; if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; @@ -485,10 +484,9 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); - is_fixedid = (NAPI_GRO_CB(skb)->ip_fixedid >> skb->encapsulation) & 1; - + BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID << 1 != SKB_GSO_TCP_FIXEDID_INNER); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | - (is_fixedid * SKB_GSO_TCP_FIXEDID); + (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0; -- cgit v1.2.3 From f095a358faf263bf1d8ae712bd38e13b71286819 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:07 +0200 Subject: net: gro: remove unnecessary df checks Currently, packets with fixed IDs will be merged only if their don't-fragment bit is set. This restriction is unnecessary since packets without the don't-fragment bit will be forwarded as-is even if they were merged together. The merged packets will be segmented into their original forms before being forwarded, either by GSO or by TSO. The IDs will also remain identical unless NETIF_F_TSO_MANGLEID is set, in which case the IDs can become incrementing, which is also fine. Clean up the code by removing the unnecessary don't-fragment checks. Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-5-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- include/net/gro.h | 5 ++--- net/ipv4/af_inet.c | 3 --- tools/testing/selftests/net/gro.c | 9 ++++----- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index e7997a9fb30b..e3affb2e2ca8 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -448,17 +448,16 @@ static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *ip const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; - const u32 df = id & IP_DF; /* All fields must match except length and checksum. */ - if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF))) + if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | ((id ^ id2) & IP_DF)) return true; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. */ - if (count == 1 && df && !ipid_offset) + if (count == 1 && !ipid_offset) NAPI_GRO_CB(p)->ip_fixedid |= 1 << inner; return ipid_offset ^ (count * !(NAPI_GRO_CB(p)->ip_fixedid & (1 << inner))); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 804c51296c55..3109c5ec38f3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1395,10 +1395,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - /* fixed ID is invalid if DF bit is not set */ fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap)); - if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto out; if (!skb->encapsulation || encap) udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index d5824eadea10..3d4a82a2607c 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -670,7 +670,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(9); break; - case 3: /* DF=0, Fixed - should not coalesce */ + case 3: /* DF=0, Fixed - should coalesce */ iph1->frag_off &= ~htons(IP_DF); iph1->id = htons(8); @@ -1188,10 +1188,9 @@ static void gro_receiver(void) correct_payload[0] = PAYLOAD_LEN * 2; check_recv_pkts(rxfd, correct_payload, 1); - printf("DF=0, Fixed - should not coalesce: "); - correct_payload[0] = PAYLOAD_LEN; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); + printf("DF=0, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); correct_payload[0] = PAYLOAD_LEN * 2; -- cgit v1.2.3 From 60ac65a31041b0e5dfd736a79027314b9d533ef5 Mon Sep 17 00:00:00 2001 From: Sung-Chi Li Date: Thu, 11 Sep 2025 06:56:34 +0000 Subject: platform/chrome: update pwm fan control host commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update cros_ec_commands.h to include definitions for getting PWM fan duty, getting and setting the fan control mode. Signed-off-by: Sung-Chi Li Acked-by: Tzung-Bi Shih Reviewed-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250911-cros_ec_fan-v6-1-a1446cc098af@google.com Signed-off-by: Guenter Roeck --- include/linux/platform_data/cros_ec_commands.h | 29 +++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index c19b404e3d8d..69294f79cc88 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -1825,6 +1825,16 @@ struct ec_response_pwm_get_duty { uint16_t duty; /* Duty cycle, EC_PWM_MAX_DUTY = 100% */ } __ec_align2; +#define EC_CMD_PWM_GET_FAN_DUTY 0x0027 + +struct ec_params_pwm_get_fan_duty { + uint8_t fan_idx; +} __ec_align1; + +struct ec_response_pwm_get_fan_duty { + uint32_t percent; /* Percentage of duty cycle, ranging from 0 ~ 100 */ +} __ec_align4; + /*****************************************************************************/ /* * Lightbar commands. This looks worse than it is. Since we only use one HOST @@ -3127,14 +3137,31 @@ struct ec_params_thermal_set_threshold_v1 { /****************************************************************************/ -/* Toggle automatic fan control */ +/* Set or get fan control mode */ #define EC_CMD_THERMAL_AUTO_FAN_CTRL 0x0052 +enum ec_auto_fan_ctrl_cmd { + EC_AUTO_FAN_CONTROL_CMD_SET = 0, + EC_AUTO_FAN_CONTROL_CMD_GET, +}; + /* Version 1 of input params */ struct ec_params_auto_fan_ctrl_v1 { uint8_t fan_idx; } __ec_align1; +/* Version 2 of input params */ +struct ec_params_auto_fan_ctrl_v2 { + uint8_t fan_idx; + uint8_t cmd; /* enum ec_auto_fan_ctrl_cmd */ + uint8_t set_auto; /* only used with EC_AUTO_FAN_CONTROL_CMD_SET - bool + */ +} __ec_align4; + +struct ec_response_auto_fan_control { + uint8_t is_auto; /* bool */ +} __ec_align1; + /* Get/Set TMP006 calibration data */ #define EC_CMD_TMP006_GET_CALIBRATION 0x0053 #define EC_CMD_TMP006_SET_CALIBRATION 0x0054 -- cgit v1.2.3 From 5ba9f520f41a33c99fd5d1eb81b5650ed3517b88 Mon Sep 17 00:00:00 2001 From: Rahul Pathak Date: Mon, 18 Aug 2025 09:39:06 +0530 Subject: clk: Add clock driver for the RISC-V RPMI clock service group The RPMI specification defines a clock service group which can be accessed via SBI MPXY extension or dedicated S-mode RPMI transport. Add mailbox client based clock driver for the RISC-V RPMI clock service group. Reviewed-by: Stephen Boyd Reviewed-by: Andy Shevchenko Co-developed-by: Anup Patel Signed-off-by: Anup Patel Signed-off-by: Rahul Pathak Link: https://lore.kernel.org/r/20250818040920.272664-11-apatel@ventanamicro.com [pjw@kernel.org: converted rpmi_clkrate_u64 macro to a function; replaced bare constant with a macro] Signed-off-by: Paul Walmsley --- drivers/clk/Kconfig | 8 + drivers/clk/Makefile | 1 + drivers/clk/clk-rpmi.c | 620 +++++++++++++++++++++++++++++ include/linux/mailbox/riscv-rpmi-message.h | 16 + 4 files changed, 645 insertions(+) create mode 100644 drivers/clk/clk-rpmi.c (limited to 'include') diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index 4d56475f94fc..4d51fa589c1e 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -501,6 +501,14 @@ config COMMON_CLK_SP7021 Not all features of the PLL are currently supported by the driver. +config COMMON_CLK_RPMI + tristate "Clock driver based on RISC-V RPMI" + depends on MAILBOX + default RISCV + help + Support for clocks based on the clock service group defined by + the RISC-V platform management interface (RPMI) specification. + source "drivers/clk/actions/Kconfig" source "drivers/clk/analogbits/Kconfig" source "drivers/clk/baikal-t1/Kconfig" diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile index 18ed29cfdc11..b74a1767ca27 100644 --- a/drivers/clk/Makefile +++ b/drivers/clk/Makefile @@ -86,6 +86,7 @@ obj-$(CONFIG_COMMON_CLK_PWM) += clk-pwm.o obj-$(CONFIG_CLK_QORIQ) += clk-qoriq.o obj-$(CONFIG_COMMON_CLK_RK808) += clk-rk808.o obj-$(CONFIG_COMMON_CLK_RP1) += clk-rp1.o +obj-$(CONFIG_COMMON_CLK_RPMI) += clk-rpmi.o obj-$(CONFIG_COMMON_CLK_HI655X) += clk-hi655x.o obj-$(CONFIG_COMMON_CLK_S2MPS11) += clk-s2mps11.o obj-$(CONFIG_COMMON_CLK_SCMI) += clk-scmi.o diff --git a/drivers/clk/clk-rpmi.c b/drivers/clk/clk-rpmi.c new file mode 100644 index 000000000000..921296aafa68 --- /dev/null +++ b/drivers/clk/clk-rpmi.c @@ -0,0 +1,620 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V MPXY Based Clock Driver + * + * Copyright (C) 2025 Ventana Micro Systems Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RPMI_CLK_DISCRETE_MAX_NUM_RATES 16 +#define RPMI_CLK_NAME_LEN 16 + +#define to_rpmi_clk(clk) container_of(clk, struct rpmi_clk, hw) + +enum rpmi_clk_config { + RPMI_CLK_DISABLE = 0, + RPMI_CLK_ENABLE = 1, + RPMI_CLK_CONFIG_MAX_IDX +}; + +#define RPMI_CLK_TYPE_MASK GENMASK(1, 0) +enum rpmi_clk_type { + RPMI_CLK_DISCRETE = 0, + RPMI_CLK_LINEAR = 1, + RPMI_CLK_TYPE_MAX_IDX +}; + +struct rpmi_clk_context { + struct device *dev; + struct mbox_chan *chan; + struct mbox_client client; + u32 max_msg_data_size; +}; + +/* + * rpmi_clk_rates represents the rates format + * as specified by the RPMI specification. + * No other data format (e.g., struct linear_range) + * is required to avoid to and from conversion. + */ +union rpmi_clk_rates { + u64 discrete[RPMI_CLK_DISCRETE_MAX_NUM_RATES]; + struct { + u64 min; + u64 max; + u64 step; + } linear; +}; + +struct rpmi_clk { + struct rpmi_clk_context *context; + u32 id; + u32 num_rates; + u32 transition_latency; + enum rpmi_clk_type type; + union rpmi_clk_rates *rates; + char name[RPMI_CLK_NAME_LEN]; + struct clk_hw hw; +}; + +struct rpmi_clk_rate_discrete { + __le32 lo; + __le32 hi; +}; + +struct rpmi_clk_rate_linear { + __le32 min_lo; + __le32 min_hi; + __le32 max_lo; + __le32 max_hi; + __le32 step_lo; + __le32 step_hi; +}; + +struct rpmi_get_num_clocks_rx { + __le32 status; + __le32 num_clocks; +}; + +struct rpmi_get_attrs_tx { + __le32 clkid; +}; + +struct rpmi_get_attrs_rx { + __le32 status; + __le32 flags; + __le32 num_rates; + __le32 transition_latency; + char name[RPMI_CLK_NAME_LEN]; +}; + +struct rpmi_get_supp_rates_tx { + __le32 clkid; + __le32 clk_rate_idx; +}; + +struct rpmi_get_supp_rates_rx { + __le32 status; + __le32 flags; + __le32 remaining; + __le32 returned; + __le32 rates[]; +}; + +struct rpmi_get_rate_tx { + __le32 clkid; +}; + +struct rpmi_get_rate_rx { + __le32 status; + __le32 lo; + __le32 hi; +}; + +struct rpmi_set_rate_tx { + __le32 clkid; + __le32 flags; + __le32 lo; + __le32 hi; +}; + +struct rpmi_set_rate_rx { + __le32 status; +}; + +struct rpmi_set_config_tx { + __le32 clkid; + __le32 config; +}; + +struct rpmi_set_config_rx { + __le32 status; +}; + +static inline u64 rpmi_clkrate_u64(u32 __hi, u32 __lo) +{ + return (((u64)(__hi) << 32) | (u32)(__lo)); +} + +static u32 rpmi_clk_get_num_clocks(struct rpmi_clk_context *context) +{ + struct rpmi_get_num_clocks_rx rx, *resp; + struct rpmi_mbox_message msg; + int ret; + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_GET_NUM_CLOCKS, + NULL, 0, &rx, sizeof(rx)); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return 0; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp || resp->status) + return 0; + + return le32_to_cpu(resp->num_clocks); +} + +static int rpmi_clk_get_attrs(u32 clkid, struct rpmi_clk *rpmi_clk) +{ + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_mbox_message msg; + struct rpmi_get_attrs_tx tx; + struct rpmi_get_attrs_rx rx, *resp; + u8 format; + int ret; + + tx.clkid = cpu_to_le32(clkid); + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_GET_ATTRIBUTES, + &tx, sizeof(tx), &rx, sizeof(rx)); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + + rpmi_clk->id = clkid; + rpmi_clk->num_rates = le32_to_cpu(resp->num_rates); + rpmi_clk->transition_latency = le32_to_cpu(resp->transition_latency); + strscpy(rpmi_clk->name, resp->name, RPMI_CLK_NAME_LEN); + + format = le32_to_cpu(resp->flags) & RPMI_CLK_TYPE_MASK; + if (format >= RPMI_CLK_TYPE_MAX_IDX) + return -EINVAL; + + rpmi_clk->type = format; + + return 0; +} + +static int rpmi_clk_get_supported_rates(u32 clkid, struct rpmi_clk *rpmi_clk) +{ + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_clk_rate_discrete *rate_discrete; + struct rpmi_clk_rate_linear *rate_linear; + struct rpmi_get_supp_rates_tx tx; + struct rpmi_get_supp_rates_rx *resp; + struct rpmi_mbox_message msg; + size_t clk_rate_idx; + int ret, rateidx, j; + + tx.clkid = cpu_to_le32(clkid); + tx.clk_rate_idx = 0; + + /* + * Make sure we allocate rx buffer sufficient to be accommodate all + * the rates sent in one RPMI message. + */ + struct rpmi_get_supp_rates_rx *rx __free(kfree) = + kzalloc(context->max_msg_data_size, GFP_KERNEL); + if (!rx) + return -ENOMEM; + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_GET_SUPPORTED_RATES, + &tx, sizeof(tx), rx, context->max_msg_data_size); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + if (!le32_to_cpu(resp->returned)) + return -EINVAL; + + if (rpmi_clk->type == RPMI_CLK_DISCRETE) { + rate_discrete = (struct rpmi_clk_rate_discrete *)resp->rates; + + for (rateidx = 0; rateidx < le32_to_cpu(resp->returned); rateidx++) { + rpmi_clk->rates->discrete[rateidx] = + rpmi_clkrate_u64(le32_to_cpu(rate_discrete[rateidx].hi), + le32_to_cpu(rate_discrete[rateidx].lo)); + } + + /* + * Keep sending the request message until all + * the rates are received. + */ + clk_rate_idx = 0; + while (le32_to_cpu(resp->remaining)) { + clk_rate_idx += le32_to_cpu(resp->returned); + tx.clk_rate_idx = cpu_to_le32(clk_rate_idx); + + rpmi_mbox_init_send_with_response(&msg, + RPMI_CLK_SRV_GET_SUPPORTED_RATES, + &tx, sizeof(tx), + rx, context->max_msg_data_size); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + if (!le32_to_cpu(resp->returned)) + return -EINVAL; + + for (j = 0; j < le32_to_cpu(resp->returned); j++) { + if (rateidx >= clk_rate_idx + le32_to_cpu(resp->returned)) + break; + rpmi_clk->rates->discrete[rateidx++] = + rpmi_clkrate_u64(le32_to_cpu(rate_discrete[j].hi), + le32_to_cpu(rate_discrete[j].lo)); + } + } + } else if (rpmi_clk->type == RPMI_CLK_LINEAR) { + rate_linear = (struct rpmi_clk_rate_linear *)resp->rates; + + rpmi_clk->rates->linear.min = rpmi_clkrate_u64(le32_to_cpu(rate_linear->min_hi), + le32_to_cpu(rate_linear->min_lo)); + rpmi_clk->rates->linear.max = rpmi_clkrate_u64(le32_to_cpu(rate_linear->max_hi), + le32_to_cpu(rate_linear->max_lo)); + rpmi_clk->rates->linear.step = rpmi_clkrate_u64(le32_to_cpu(rate_linear->step_hi), + le32_to_cpu(rate_linear->step_lo)); + } + + return 0; +} + +static unsigned long rpmi_clk_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct rpmi_clk *rpmi_clk = to_rpmi_clk(hw); + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_mbox_message msg; + struct rpmi_get_rate_tx tx; + struct rpmi_get_rate_rx rx, *resp; + int ret; + + tx.clkid = cpu_to_le32(rpmi_clk->id); + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_GET_RATE, + &tx, sizeof(tx), &rx, sizeof(rx)); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + + return rpmi_clkrate_u64(le32_to_cpu(resp->hi), le32_to_cpu(resp->lo)); +} + +static int rpmi_clk_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) +{ + struct rpmi_clk *rpmi_clk = to_rpmi_clk(hw); + u64 fmin, fmax, ftmp; + + /* + * Keep the requested rate if the clock format + * is of discrete type. Let the platform which + * is actually controlling the clock handle that. + */ + if (rpmi_clk->type == RPMI_CLK_DISCRETE) + return 0; + + fmin = rpmi_clk->rates->linear.min; + fmax = rpmi_clk->rates->linear.max; + + if (req->rate <= fmin) { + req->rate = fmin; + return 0; + } else if (req->rate >= fmax) { + req->rate = fmax; + return 0; + } + + ftmp = req->rate - fmin; + ftmp += rpmi_clk->rates->linear.step - 1; + do_div(ftmp, rpmi_clk->rates->linear.step); + + req->rate = ftmp * rpmi_clk->rates->linear.step + fmin; + + return 0; +} + +static int rpmi_clk_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct rpmi_clk *rpmi_clk = to_rpmi_clk(hw); + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_mbox_message msg; + struct rpmi_set_rate_tx tx; + struct rpmi_set_rate_rx rx, *resp; + int ret; + + tx.clkid = cpu_to_le32(rpmi_clk->id); + tx.lo = cpu_to_le32(lower_32_bits(rate)); + tx.hi = cpu_to_le32(upper_32_bits(rate)); + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_SET_RATE, + &tx, sizeof(tx), &rx, sizeof(rx)); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + + return 0; +} + +static int rpmi_clk_enable(struct clk_hw *hw) +{ + struct rpmi_clk *rpmi_clk = to_rpmi_clk(hw); + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_mbox_message msg; + struct rpmi_set_config_tx tx; + struct rpmi_set_config_rx rx, *resp; + int ret; + + tx.config = cpu_to_le32(RPMI_CLK_ENABLE); + tx.clkid = cpu_to_le32(rpmi_clk->id); + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_SET_CONFIG, + &tx, sizeof(tx), &rx, sizeof(rx)); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + + return 0; +} + +static void rpmi_clk_disable(struct clk_hw *hw) +{ + struct rpmi_clk *rpmi_clk = to_rpmi_clk(hw); + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_mbox_message msg; + struct rpmi_set_config_tx tx; + struct rpmi_set_config_rx rx; + + tx.config = cpu_to_le32(RPMI_CLK_DISABLE); + tx.clkid = cpu_to_le32(rpmi_clk->id); + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_SET_CONFIG, + &tx, sizeof(tx), &rx, sizeof(rx)); + + rpmi_mbox_send_message(context->chan, &msg); +} + +static const struct clk_ops rpmi_clk_ops = { + .recalc_rate = rpmi_clk_recalc_rate, + .determine_rate = rpmi_clk_determine_rate, + .set_rate = rpmi_clk_set_rate, + .prepare = rpmi_clk_enable, + .unprepare = rpmi_clk_disable, +}; + +static struct clk_hw *rpmi_clk_enumerate(struct rpmi_clk_context *context, u32 clkid) +{ + struct device *dev = context->dev; + unsigned long min_rate, max_rate; + union rpmi_clk_rates *rates; + struct rpmi_clk *rpmi_clk; + struct clk_init_data init = {}; + struct clk_hw *clk_hw; + int ret; + + rates = devm_kzalloc(dev, sizeof(*rates), GFP_KERNEL); + if (!rates) + return ERR_PTR(-ENOMEM); + + rpmi_clk = devm_kzalloc(dev, sizeof(*rpmi_clk), GFP_KERNEL); + if (!rpmi_clk) + return ERR_PTR(-ENOMEM); + + rpmi_clk->context = context; + rpmi_clk->rates = rates; + + ret = rpmi_clk_get_attrs(clkid, rpmi_clk); + if (ret) + return dev_err_ptr_probe(dev, ret, + "Failed to get clk-%u attributes\n", + clkid); + + ret = rpmi_clk_get_supported_rates(clkid, rpmi_clk); + if (ret) + return dev_err_ptr_probe(dev, ret, + "Get supported rates failed for clk-%u\n", + clkid); + + init.flags = CLK_GET_RATE_NOCACHE; + init.num_parents = 0; + init.ops = &rpmi_clk_ops; + init.name = rpmi_clk->name; + clk_hw = &rpmi_clk->hw; + clk_hw->init = &init; + + ret = devm_clk_hw_register(dev, clk_hw); + if (ret) + return dev_err_ptr_probe(dev, ret, + "Unable to register clk-%u\n", + clkid); + + if (rpmi_clk->type == RPMI_CLK_DISCRETE) { + min_rate = rpmi_clk->rates->discrete[0]; + max_rate = rpmi_clk->rates->discrete[rpmi_clk->num_rates - 1]; + } else { + min_rate = rpmi_clk->rates->linear.min; + max_rate = rpmi_clk->rates->linear.max; + } + + clk_hw_set_rate_range(clk_hw, min_rate, max_rate); + + return clk_hw; +} + +static void rpmi_clk_mbox_chan_release(void *data) +{ + struct mbox_chan *chan = data; + + mbox_free_channel(chan); +} + +static int rpmi_clk_probe(struct platform_device *pdev) +{ + int ret; + unsigned int num_clocks, i; + struct clk_hw_onecell_data *clk_data; + struct rpmi_clk_context *context; + struct rpmi_mbox_message msg; + struct clk_hw *hw_ptr; + struct device *dev = &pdev->dev; + + context = devm_kzalloc(dev, sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + context->dev = dev; + platform_set_drvdata(pdev, context); + + context->client.dev = context->dev; + context->client.rx_callback = NULL; + context->client.tx_block = false; + context->client.knows_txdone = true; + context->client.tx_tout = 0; + + context->chan = mbox_request_channel(&context->client, 0); + if (IS_ERR(context->chan)) + return PTR_ERR(context->chan); + + ret = devm_add_action_or_reset(dev, rpmi_clk_mbox_chan_release, context->chan); + if (ret) + return dev_err_probe(dev, ret, "Failed to add rpmi mbox channel cleanup\n"); + + rpmi_mbox_init_get_attribute(&msg, RPMI_MBOX_ATTR_SPEC_VERSION); + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return dev_err_probe(dev, ret, "Failed to get spec version\n"); + if (msg.attr.value < RPMI_MKVER(1, 0)) { + return dev_err_probe(dev, -EINVAL, + "msg protocol version mismatch, expected 0x%x, found 0x%x\n", + RPMI_MKVER(1, 0), msg.attr.value); + } + + rpmi_mbox_init_get_attribute(&msg, RPMI_MBOX_ATTR_SERVICEGROUP_ID); + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return dev_err_probe(dev, ret, "Failed to get service group ID\n"); + if (msg.attr.value != RPMI_SRVGRP_CLOCK) { + return dev_err_probe(dev, -EINVAL, + "service group match failed, expected 0x%x, found 0x%x\n", + RPMI_SRVGRP_CLOCK, msg.attr.value); + } + + rpmi_mbox_init_get_attribute(&msg, RPMI_MBOX_ATTR_SERVICEGROUP_VERSION); + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return dev_err_probe(dev, ret, "Failed to get service group version\n"); + if (msg.attr.value < RPMI_MKVER(1, 0)) { + return dev_err_probe(dev, -EINVAL, + "service group version failed, expected 0x%x, found 0x%x\n", + RPMI_MKVER(1, 0), msg.attr.value); + } + + rpmi_mbox_init_get_attribute(&msg, RPMI_MBOX_ATTR_MAX_MSG_DATA_SIZE); + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return dev_err_probe(dev, ret, "Failed to get max message data size\n"); + + context->max_msg_data_size = msg.attr.value; + num_clocks = rpmi_clk_get_num_clocks(context); + if (!num_clocks) + return dev_err_probe(dev, -ENODEV, "No clocks found\n"); + + clk_data = devm_kzalloc(dev, struct_size(clk_data, hws, num_clocks), + GFP_KERNEL); + if (!clk_data) + return dev_err_probe(dev, -ENOMEM, "No memory for clock data\n"); + clk_data->num = num_clocks; + + for (i = 0; i < clk_data->num; i++) { + hw_ptr = rpmi_clk_enumerate(context, i); + if (IS_ERR(hw_ptr)) { + return dev_err_probe(dev, PTR_ERR(hw_ptr), + "Failed to register clk-%d\n", i); + } + clk_data->hws[i] = hw_ptr; + } + + ret = devm_of_clk_add_hw_provider(dev, of_clk_hw_onecell_get, clk_data); + if (ret) + return dev_err_probe(dev, ret, "Failed to register clock HW provider\n"); + + return 0; +} + +static const struct of_device_id rpmi_clk_of_match[] = { + { .compatible = "riscv,rpmi-clock" }, + { } +}; +MODULE_DEVICE_TABLE(of, rpmi_clk_of_match); + +static struct platform_driver rpmi_clk_driver = { + .driver = { + .name = "riscv-rpmi-clock", + .of_match_table = rpmi_clk_of_match, + }, + .probe = rpmi_clk_probe, +}; +module_platform_driver(rpmi_clk_driver); + +MODULE_AUTHOR("Rahul Pathak "); +MODULE_DESCRIPTION("Clock Driver based on RPMI message protocol"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mailbox/riscv-rpmi-message.h b/include/linux/mailbox/riscv-rpmi-message.h index c3a98fc12c0a..8176d33747fe 100644 --- a/include/linux/mailbox/riscv-rpmi-message.h +++ b/include/linux/mailbox/riscv-rpmi-message.h @@ -90,6 +90,22 @@ static inline int rpmi_to_linux_error(int rpmi_error) } } +/* RPMI service group IDs */ +#define RPMI_SRVGRP_CLOCK 0x00008 + +/* RPMI clock service IDs */ +enum rpmi_clock_service_id { + RPMI_CLK_SRV_ENABLE_NOTIFICATION = 0x01, + RPMI_CLK_SRV_GET_NUM_CLOCKS = 0x02, + RPMI_CLK_SRV_GET_ATTRIBUTES = 0x03, + RPMI_CLK_SRV_GET_SUPPORTED_RATES = 0x04, + RPMI_CLK_SRV_SET_CONFIG = 0x05, + RPMI_CLK_SRV_GET_CONFIG = 0x06, + RPMI_CLK_SRV_SET_RATE = 0x07, + RPMI_CLK_SRV_GET_RATE = 0x08, + RPMI_CLK_SRV_ID_MAX_COUNT +}; + /* RPMI Linux mailbox attribute IDs */ enum rpmi_mbox_attribute_id { RPMI_MBOX_ATTR_SPEC_VERSION, -- cgit v1.2.3 From 495c8d35035edb66e3284113bef01f3b1b843832 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 25 Sep 2025 13:51:07 -0500 Subject: PM: hibernate: Add pm_hibernation_mode_is_suspend() Some drivers have different flows for hibernation and suspend. If the driver opportunistically will skip thaw() then it needs a hint to know what is happening after the hibernate. Introduce a new symbol pm_hibernation_mode_is_suspend() that drivers can call to determine if suspending the system for this purpose. Tested-by: Ionut Nechita Tested-by: Kenneth Crudup Acked-by: Alex Deucher Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 2 ++ kernel/power/hibernate.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 317ae31e89b3..0664c685f0b2 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -276,6 +276,7 @@ extern void arch_suspend_enable_irqs(void); extern int pm_suspend(suspend_state_t state); extern bool sync_on_suspend_enabled; +bool pm_hibernation_mode_is_suspend(void); #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL @@ -288,6 +289,7 @@ static inline bool pm_suspend_via_firmware(void) { return false; } static inline bool pm_resume_via_firmware(void) { return false; } static inline bool pm_suspend_no_platform(void) { return false; } static inline bool pm_suspend_default_s2idle(void) { return false; } +static inline bool pm_hibernation_mode_is_suspend(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e0cdb8d9fd45..5d146218cae8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -80,6 +80,17 @@ static const struct platform_hibernation_ops *hibernation_ops; static atomic_t hibernate_atomic = ATOMIC_INIT(1); +#ifdef CONFIG_SUSPEND +/** + * pm_hibernation_mode_is_suspend - Check if hibernation has been set to suspend + */ +bool pm_hibernation_mode_is_suspend(void) +{ + return hibernation_mode == HIBERNATION_SUSPEND; +} +EXPORT_SYMBOL_GPL(pm_hibernation_mode_is_suspend); +#endif + bool hibernate_acquire(void) { return atomic_add_unless(&hibernate_atomic, -1, 0); -- cgit v1.2.3 From 10e1dcb62a7e874af43e7dfbef13ef8e3a2ad4a9 Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Thu, 28 Aug 2025 08:51:02 +0300 Subject: dt-bindings: thermal: add Tegra114 soctherm header This adds header for the Tegra114 SOCTHERM device tree node. Signed-off-by: Svyatoslav Ryhel Acked-by: Conor Dooley Reviewed-by: Mikko Perttunen Link: https://lore.kernel.org/r/20250828055104.8073-5-clamor95@gmail.com Signed-off-by: Daniel Lezcano --- include/dt-bindings/thermal/tegra114-soctherm.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/dt-bindings/thermal/tegra114-soctherm.h (limited to 'include') diff --git a/include/dt-bindings/thermal/tegra114-soctherm.h b/include/dt-bindings/thermal/tegra114-soctherm.h new file mode 100644 index 000000000000..b766a61cd1ce --- /dev/null +++ b/include/dt-bindings/thermal/tegra114-soctherm.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * This header provides constants for binding nvidia,tegra114-soctherm. + */ + +#ifndef _DT_BINDINGS_THERMAL_TEGRA114_SOCTHERM_H +#define _DT_BINDINGS_THERMAL_TEGRA114_SOCTHERM_H + +#define TEGRA114_SOCTHERM_SENSOR_CPU 0 +#define TEGRA114_SOCTHERM_SENSOR_MEM 1 +#define TEGRA114_SOCTHERM_SENSOR_GPU 2 +#define TEGRA114_SOCTHERM_SENSOR_PLLX 3 + +#define TEGRA114_SOCTHERM_THROT_LEVEL_NONE 0 +#define TEGRA114_SOCTHERM_THROT_LEVEL_LOW 1 +#define TEGRA114_SOCTHERM_THROT_LEVEL_MED 2 +#define TEGRA114_SOCTHERM_THROT_LEVEL_HIGH 3 + +#endif -- cgit v1.2.3 From aa43953e862c031ff66e44353c88beb7a449e80d Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 18 Aug 2025 09:39:09 +0530 Subject: irqchip: Add driver for the RPMI system MSI service group The RPMI specification defines a system MSI service group which allows application processors to receive MSIs upon system events such as graceful shutdown/reboot request, CPU hotplug event, memory hotplug event, etc. Add an irqchip driver for the RISC-V RPMI system MSI service group to directly receive system MSIs in Linux kernel. Reviewed-by: Thomas Gleixner Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250818040920.272664-14-apatel@ventanamicro.com Signed-off-by: Paul Walmsley --- drivers/irqchip/Kconfig | 7 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-riscv-rpmi-sysmsi.c | 287 +++++++++++++++++++++++++++++ include/linux/mailbox/riscv-rpmi-message.h | 13 ++ 4 files changed, 308 insertions(+) create mode 100644 drivers/irqchip/irq-riscv-rpmi-sysmsi.c (limited to 'include') diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 6d12c6ab9ea4..e047ba36df16 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -634,6 +634,13 @@ config RISCV_IMSIC select GENERIC_MSI_IRQ select IRQ_MSI_LIB +config RISCV_RPMI_SYSMSI + bool + depends on MAILBOX + select IRQ_DOMAIN_HIERARCHY + select GENERIC_MSI_IRQ + default RISCV + config SIFIVE_PLIC bool depends on RISCV diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 93e3ced023bb..3de083f5484c 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -106,6 +106,7 @@ obj-$(CONFIG_RISCV_INTC) += irq-riscv-intc.o obj-$(CONFIG_RISCV_APLIC) += irq-riscv-aplic-main.o irq-riscv-aplic-direct.o obj-$(CONFIG_RISCV_APLIC_MSI) += irq-riscv-aplic-msi.o obj-$(CONFIG_RISCV_IMSIC) += irq-riscv-imsic-state.o irq-riscv-imsic-early.o irq-riscv-imsic-platform.o +obj-$(CONFIG_RISCV_RPMI_SYSMSI) += irq-riscv-rpmi-sysmsi.o obj-$(CONFIG_SIFIVE_PLIC) += irq-sifive-plic.o obj-$(CONFIG_STARFIVE_JH8100_INTC) += irq-starfive-jh8100-intc.o obj-$(CONFIG_ACLINT_SSWI) += irq-aclint-sswi.o diff --git a/drivers/irqchip/irq-riscv-rpmi-sysmsi.c b/drivers/irqchip/irq-riscv-rpmi-sysmsi.c new file mode 100644 index 000000000000..92e8847dfccc --- /dev/null +++ b/drivers/irqchip/irq-riscv-rpmi-sysmsi.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Ventana Micro Systems Inc. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct rpmi_sysmsi_get_attrs_rx { + __le32 status; + __le32 sys_num_msi; + __le32 flag0; + __le32 flag1; +}; + +#define RPMI_SYSMSI_MSI_ATTRIBUTES_FLAG0_PREF_PRIV BIT(0) + +struct rpmi_sysmsi_set_msi_state_tx { + __le32 sys_msi_index; + __le32 sys_msi_state; +}; + +struct rpmi_sysmsi_set_msi_state_rx { + __le32 status; +}; + +#define RPMI_SYSMSI_MSI_STATE_ENABLE BIT(0) +#define RPMI_SYSMSI_MSI_STATE_PENDING BIT(1) + +struct rpmi_sysmsi_set_msi_target_tx { + __le32 sys_msi_index; + __le32 sys_msi_address_low; + __le32 sys_msi_address_high; + __le32 sys_msi_data; +}; + +struct rpmi_sysmsi_set_msi_target_rx { + __le32 status; +}; + +struct rpmi_sysmsi_priv { + struct device *dev; + struct mbox_client client; + struct mbox_chan *chan; + u32 nr_irqs; + u32 gsi_base; +}; + +static int rpmi_sysmsi_get_num_msi(struct rpmi_sysmsi_priv *priv) +{ + struct rpmi_sysmsi_get_attrs_rx rx; + struct rpmi_mbox_message msg; + int ret; + + rpmi_mbox_init_send_with_response(&msg, RPMI_SYSMSI_SRV_GET_ATTRIBUTES, + NULL, 0, &rx, sizeof(rx)); + ret = rpmi_mbox_send_message(priv->chan, &msg); + if (ret) + return ret; + if (rx.status) + return rpmi_to_linux_error(le32_to_cpu(rx.status)); + + return le32_to_cpu(rx.sys_num_msi); +} + +static int rpmi_sysmsi_set_msi_state(struct rpmi_sysmsi_priv *priv, + u32 sys_msi_index, u32 sys_msi_state) +{ + struct rpmi_sysmsi_set_msi_state_tx tx; + struct rpmi_sysmsi_set_msi_state_rx rx; + struct rpmi_mbox_message msg; + int ret; + + tx.sys_msi_index = cpu_to_le32(sys_msi_index); + tx.sys_msi_state = cpu_to_le32(sys_msi_state); + rpmi_mbox_init_send_with_response(&msg, RPMI_SYSMSI_SRV_SET_MSI_STATE, + &tx, sizeof(tx), &rx, sizeof(rx)); + ret = rpmi_mbox_send_message(priv->chan, &msg); + if (ret) + return ret; + if (rx.status) + return rpmi_to_linux_error(le32_to_cpu(rx.status)); + + return 0; +} + +static int rpmi_sysmsi_set_msi_target(struct rpmi_sysmsi_priv *priv, + u32 sys_msi_index, struct msi_msg *m) +{ + struct rpmi_sysmsi_set_msi_target_tx tx; + struct rpmi_sysmsi_set_msi_target_rx rx; + struct rpmi_mbox_message msg; + int ret; + + tx.sys_msi_index = cpu_to_le32(sys_msi_index); + tx.sys_msi_address_low = cpu_to_le32(m->address_lo); + tx.sys_msi_address_high = cpu_to_le32(m->address_hi); + tx.sys_msi_data = cpu_to_le32(m->data); + rpmi_mbox_init_send_with_response(&msg, RPMI_SYSMSI_SRV_SET_MSI_TARGET, + &tx, sizeof(tx), &rx, sizeof(rx)); + ret = rpmi_mbox_send_message(priv->chan, &msg); + if (ret) + return ret; + if (rx.status) + return rpmi_to_linux_error(le32_to_cpu(rx.status)); + + return 0; +} + +static void rpmi_sysmsi_irq_mask(struct irq_data *d) +{ + struct rpmi_sysmsi_priv *priv = irq_data_get_irq_chip_data(d); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + int ret; + + ret = rpmi_sysmsi_set_msi_state(priv, hwirq, 0); + if (ret) + dev_warn(priv->dev, "Failed to mask hwirq %lu (error %d)\n", hwirq, ret); + irq_chip_mask_parent(d); +} + +static void rpmi_sysmsi_irq_unmask(struct irq_data *d) +{ + struct rpmi_sysmsi_priv *priv = irq_data_get_irq_chip_data(d); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + int ret; + + irq_chip_unmask_parent(d); + ret = rpmi_sysmsi_set_msi_state(priv, hwirq, RPMI_SYSMSI_MSI_STATE_ENABLE); + if (ret) + dev_warn(priv->dev, "Failed to unmask hwirq %lu (error %d)\n", hwirq, ret); +} + +static void rpmi_sysmsi_write_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct rpmi_sysmsi_priv *priv = irq_data_get_irq_chip_data(d); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + int ret; + + /* For zeroed MSI, do nothing as of now */ + if (!msg->address_hi && !msg->address_lo && !msg->data) + return; + + ret = rpmi_sysmsi_set_msi_target(priv, hwirq, msg); + if (ret) + dev_warn(priv->dev, "Failed to set target for hwirq %lu (error %d)\n", hwirq, ret); +} + +static void rpmi_sysmsi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +{ + arg->desc = desc; + arg->hwirq = desc->data.icookie.value; +} + +static int rpmi_sysmsi_translate(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *hwirq, unsigned int *type) +{ + struct msi_domain_info *info = d->host_data; + struct rpmi_sysmsi_priv *priv = info->data; + + if (WARN_ON(fwspec->param_count < 1)) + return -EINVAL; + + /* For DT, gsi_base is always zero. */ + *hwirq = fwspec->param[0] - priv->gsi_base; + *type = IRQ_TYPE_NONE; + return 0; +} + +static const struct msi_domain_template rpmi_sysmsi_template = { + .chip = { + .name = "RPMI-SYSMSI", + .irq_mask = rpmi_sysmsi_irq_mask, + .irq_unmask = rpmi_sysmsi_irq_unmask, +#ifdef CONFIG_SMP + .irq_set_affinity = irq_chip_set_affinity_parent, +#endif + .irq_write_msi_msg = rpmi_sysmsi_write_msg, + .flags = IRQCHIP_SET_TYPE_MASKED | + IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_MASK_ON_SUSPEND, + }, + + .ops = { + .set_desc = rpmi_sysmsi_set_desc, + .msi_translate = rpmi_sysmsi_translate, + }, + + .info = { + .bus_token = DOMAIN_BUS_WIRED_TO_MSI, + .flags = MSI_FLAG_USE_DEV_FWNODE, + .handler = handle_simple_irq, + .handler_name = "simple", + }, +}; + +static int rpmi_sysmsi_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rpmi_sysmsi_priv *priv; + int rc; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + priv->dev = dev; + + /* Setup mailbox client */ + priv->client.dev = priv->dev; + priv->client.rx_callback = NULL; + priv->client.tx_block = false; + priv->client.knows_txdone = true; + priv->client.tx_tout = 0; + + /* Request mailbox channel */ + priv->chan = mbox_request_channel(&priv->client, 0); + if (IS_ERR(priv->chan)) + return PTR_ERR(priv->chan); + + /* Get number of system MSIs */ + rc = rpmi_sysmsi_get_num_msi(priv); + if (rc < 1) { + mbox_free_channel(priv->chan); + if (rc) + return dev_err_probe(dev, rc, "Failed to get number of system MSIs\n"); + else + return dev_err_probe(dev, -ENODEV, "No system MSIs found\n"); + } + priv->nr_irqs = rc; + + /* + * The device MSI domain for platform devices on RISC-V architecture + * is only available after the MSI controller driver is probed so, + * explicitly configure here. + */ + if (!dev_get_msi_domain(dev)) { + /* + * The device MSI domain for OF devices is only set at the + * time of populating/creating OF device. If the device MSI + * domain is discovered later after the OF device is created + * then we need to set it explicitly before using any platform + * MSI functions. + */ + if (dev_of_node(dev)) + of_msi_configure(dev, dev_of_node(dev)); + + if (!dev_get_msi_domain(dev)) { + mbox_free_channel(priv->chan); + return -EPROBE_DEFER; + } + } + + if (!msi_create_device_irq_domain(dev, MSI_DEFAULT_DOMAIN, + &rpmi_sysmsi_template, + priv->nr_irqs, priv, priv)) { + mbox_free_channel(priv->chan); + return dev_err_probe(dev, -ENOMEM, "failed to create MSI irq domain\n"); + } + + dev_info(dev, "%u system MSIs registered\n", priv->nr_irqs); + return 0; +} + +static const struct of_device_id rpmi_sysmsi_match[] = { + { .compatible = "riscv,rpmi-system-msi" }, + {} +}; + +static struct platform_driver rpmi_sysmsi_driver = { + .driver = { + .name = "rpmi-sysmsi", + .of_match_table = rpmi_sysmsi_match, + }, + .probe = rpmi_sysmsi_probe, +}; +builtin_platform_driver(rpmi_sysmsi_driver); diff --git a/include/linux/mailbox/riscv-rpmi-message.h b/include/linux/mailbox/riscv-rpmi-message.h index 8176d33747fe..e135c6564d0c 100644 --- a/include/linux/mailbox/riscv-rpmi-message.h +++ b/include/linux/mailbox/riscv-rpmi-message.h @@ -91,6 +91,7 @@ static inline int rpmi_to_linux_error(int rpmi_error) } /* RPMI service group IDs */ +#define RPMI_SRVGRP_SYSTEM_MSI 0x00002 #define RPMI_SRVGRP_CLOCK 0x00008 /* RPMI clock service IDs */ @@ -106,6 +107,18 @@ enum rpmi_clock_service_id { RPMI_CLK_SRV_ID_MAX_COUNT }; +/* RPMI system MSI service IDs */ +enum rpmi_sysmsi_service_id { + RPMI_SYSMSI_SRV_ENABLE_NOTIFICATION = 0x01, + RPMI_SYSMSI_SRV_GET_ATTRIBUTES = 0x02, + RPMI_SYSMSI_SRV_GET_MSI_ATTRIBUTES = 0x03, + RPMI_SYSMSI_SRV_SET_MSI_STATE = 0x04, + RPMI_SYSMSI_SRV_GET_MSI_STATE = 0x05, + RPMI_SYSMSI_SRV_SET_MSI_TARGET = 0x06, + RPMI_SYSMSI_SRV_GET_MSI_TARGET = 0x07, + RPMI_SYSMSI_SRV_ID_MAX_COUNT +}; + /* RPMI Linux mailbox attribute IDs */ enum rpmi_mbox_attribute_id { RPMI_MBOX_ATTR_SPEC_VERSION, -- cgit v1.2.3 From 1a2b423be6a89dd07d5fc27ea042be68697a6a49 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 14 Sep 2025 22:24:13 +0200 Subject: i2c: boardinfo: Annotate code used in init phase only Annotate two places in boardinfo code: - __i2c_first_dynamic_bus_num is set in init phase. Annotate it as __ro_after_init to prevent later changes. - i2c_register_board_info() is used in init phase only, so annotate it as __init, allowing to free the memory after init phase. This is safe, see comment: "done in board-specific init code near arch_initcall() time" Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-boardinfo.c | 4 ++-- include/linux/i2c.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/i2c/i2c-boardinfo.c b/drivers/i2c/i2c-boardinfo.c index 4df8ad092df3..338800321f8b 100644 --- a/drivers/i2c/i2c-boardinfo.c +++ b/drivers/i2c/i2c-boardinfo.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL_GPL(__i2c_board_lock); LIST_HEAD(__i2c_board_list); EXPORT_SYMBOL_GPL(__i2c_board_list); -int __i2c_first_dynamic_bus_num; +int __i2c_first_dynamic_bus_num __ro_after_init; EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); @@ -48,7 +48,7 @@ EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); * The board info passed can safely be __initdata, but be careful of embedded * pointers (for platform_data, functions, etc) since that won't be copied. */ -int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) +int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) { int status; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 20fd41b51d5c..11a19241e360 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -499,7 +499,7 @@ static inline struct i2c_client *i2c_verify_client(struct device *dev) * Modules for add-on boards must use other calls. */ #ifdef CONFIG_I2C_BOARDINFO -int +int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n); #else -- cgit v1.2.3 From 14967a9c7d247841b0312c48dcf8cd29e55a4cc8 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 15 Sep 2025 18:45:20 -0600 Subject: mm/hugetlb: fix copy_hugetlb_page_range() to use ->pt_share_count commit 59d9094df3d79 ("mm: hugetlb: independent PMD page table shared count") introduced ->pt_share_count dedicated to hugetlb PMD share count tracking, but omitted fixing copy_hugetlb_page_range(), leaving the function relying on page_count() for tracking that no longer works. When lazy page table copy for hugetlb is disabled, that is, revert commit bcd51a3c679d ("hugetlb: lazy page table copies in fork()") fork()'ing with hugetlb PMD sharing quickly lockup - [ 239.446559] watchdog: BUG: soft lockup - CPU#75 stuck for 27s! [ 239.446611] RIP: 0010:native_queued_spin_lock_slowpath+0x7e/0x2e0 [ 239.446631] Call Trace: [ 239.446633] [ 239.446636] _raw_spin_lock+0x3f/0x60 [ 239.446639] copy_hugetlb_page_range+0x258/0xb50 [ 239.446645] copy_page_range+0x22b/0x2c0 [ 239.446651] dup_mmap+0x3e2/0x770 [ 239.446654] dup_mm.constprop.0+0x5e/0x230 [ 239.446657] copy_process+0xd17/0x1760 [ 239.446660] kernel_clone+0xc0/0x3e0 [ 239.446661] __do_sys_clone+0x65/0xa0 [ 239.446664] do_syscall_64+0x82/0x930 [ 239.446668] ? count_memcg_events+0xd2/0x190 [ 239.446671] ? syscall_trace_enter+0x14e/0x1f0 [ 239.446676] ? syscall_exit_work+0x118/0x150 [ 239.446677] ? arch_exit_to_user_mode_prepare.constprop.0+0x9/0xb0 [ 239.446681] ? clear_bhb_loop+0x30/0x80 [ 239.446684] ? clear_bhb_loop+0x30/0x80 [ 239.446686] entry_SYSCALL_64_after_hwframe+0x76/0x7e There are two options to resolve the potential latent issue: 1. warn against PMD sharing in copy_hugetlb_page_range(), 2. fix it. This patch opts for the second option. While at it, simplify the comment, the details are not actually relevant anymore. Link: https://lkml.kernel.org/r/20250916004520.1604530-1-jane.chu@oracle.com Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: Jane Chu Reviewed-by: Harry Yoo Acked-by: Oscar Salvador Acked-by: David Hildenbrand Cc: Jann Horn Cc: Liu Shixin Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++++ mm/hugetlb.c | 15 +++++---------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db93..a643fae8a349 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -631,6 +631,11 @@ static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) { return atomic_read(&ptdesc->pt_share_count); } + +static inline bool ptdesc_pmd_is_shared(struct ptdesc *ptdesc) +{ + return !!ptdesc_pmd_pts_count(ptdesc); +} #else static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eed59cfb5d21..6cfe0b43ab8f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5594,18 +5594,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, break; } - /* - * If the pagetables are shared don't copy or take references. - * - * dst_pte == src_pte is the common case of src/dest sharing. - * However, src could have 'unshared' and dst shares with - * another vma. So page_count of ptep page is checked instead - * to reliably determine whether pte is shared. - */ - if (page_count(virt_to_page(dst_pte)) > 1) { +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + /* If the pagetables are shared, there is nothing to do */ + if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) { addr |= last_addr_mask; continue; } +#endif dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); @@ -7602,7 +7597,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, hugetlb_vma_assert_locked(vma); if (sz != PMD_SIZE) return 0; - if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) + if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); -- cgit v1.2.3 From 7e89979f6695fb56e8739b7d19614256e637131d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 13 Sep 2025 17:03:39 -0700 Subject: include/linux/pgtable.h: convert arch_enter_lazy_mmu_mode() and friends to static inlines commit c519c3c0a113 ("mm/kasan: avoid lazy MMU mode hazards") introduced the use of arch_enter_lazy_mmu_mode(), which results in the compiler complaining about "statement has no effect", when __HAVE_ARCH_LAZY_MMU_MODE is not defined in include/linux/pgtable.h The exact warning/error is: In file included from ./include/linux/kasan.h:37, from mm/kasan/shadow.c:14: mm/kasan/shadow.c: In function kasan_populate_vmalloc_pte: ./include/linux/pgtable.h:247:41: error: statement with no effect [-Werror=unused-value] 247 | #define arch_enter_lazy_mmu_mode() (LAZY_MMU_DEFAULT) | ^ mm/kasan/shadow.c:322:9: note: in expansion of macro arch_enter_lazy_mmu_mode> 322 | arch_enter_lazy_mmu_mode(); | ^~~~~~~~~~~~~~~~~~~~~~~~ switching these "functions" to static inlines fixes this up. Fixes: c519c3c0a113 ("mm/kasan: avoid lazy MMU mode hazards") Reported-by: Balbir Singh Closes: https://lkml.kernel.org/r/20250912235515.367061-1-balbirs@nvidia.com Cc: Alexander Gordeev Cc: Andrey Ryabinin Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2b80fd456c8b..25a7257052ff 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -232,9 +232,9 @@ static inline int pmd_dirty(pmd_t pmd) * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() do {} while (0) -#define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_flush_lazy_mmu_mode() do {} while (0) +static inline void arch_enter_lazy_mmu_mode(void) {} +static inline void arch_leave_lazy_mmu_mode(void) {} +static inline void arch_flush_lazy_mmu_mode(void) {} #endif #ifndef pte_batch_hint -- cgit v1.2.3 From 17f0d1f6321caa95699b8f96baf12e654d7b8d60 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Fri, 26 Sep 2025 01:50:28 +0800 Subject: bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE The stacktrace map can be easily full, which will lead to failure in obtaining the stack. In addition to increasing the size of the map, another solution is to delete the stack_id after looking it up from the user, so extend the existing bpf_map_lookup_and_delete_elem() functionality to stacktrace map types. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250925175030.1615837-1-chen.dylane@linux.dev --- include/linux/bpf.h | 2 +- kernel/bpf/stackmap.c | 16 ++++++++++++++-- kernel/bpf/syscall.c | 8 +++++--- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ea2ed6771cc6..6338e54a9b1f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2724,7 +2724,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); -int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); +int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, bool delete); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa..2e182a3ac4ce 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -646,7 +646,15 @@ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall */ -int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return bpf_stackmap_extract(map, key, value, true); +} + +/* Called from syscall */ +int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, + bool delete) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *old_bucket; @@ -663,7 +671,10 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); - old_bucket = xchg(&smap->buckets[id], bucket); + if (delete) + old_bucket = bucket; + else + old_bucket = xchg(&smap->buckets[id], bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; @@ -754,6 +765,7 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, .map_lookup_elem = stack_map_lookup_elem, + .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index adb05d235011..a48fa86f82a7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -320,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { - err = bpf_stackmap_copy(map, key, value); + err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { @@ -1666,7 +1666,8 @@ struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); -int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, + bool delete) { return -ENOTSUPP; } @@ -2197,7 +2198,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_STACK_TRACE) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); rcu_read_lock(); -- cgit v1.2.3 From 212b0f07cf021575ec25e0b2336df77c7a4d2e68 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 3 Sep 2025 14:59:43 +0200 Subject: locking/local_lock: Expose dep_map in local_trylock_t. lockdep_is_held() macro assumes that "struct lockdep_map dep_map;" is a top level field of any lock that participates in LOCKDEP. Make it so for local_trylock_t. Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Alexei Starovoitov Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- include/linux/local_lock_internal.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index d80b5306a2c0..949de37700db 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -17,7 +17,10 @@ typedef struct { /* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */ typedef struct { - local_lock_t llock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + struct task_struct *owner; +#endif u8 acquired; } local_trylock_t; @@ -31,7 +34,7 @@ typedef struct { .owner = NULL, # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \ - .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) }, + LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { @@ -81,7 +84,7 @@ do { \ local_lock_debug_init(lock); \ } while (0) -#define __local_trylock_init(lock) __local_lock_init(lock.llock) +#define __local_trylock_init(lock) __local_lock_init((local_lock_t *)lock) #define __spinlock_nested_bh_init(lock) \ do { \ -- cgit v1.2.3 From 2d517aa09bbc4203f10cdee7e1d42f3bbdc1b1cd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:45 +0200 Subject: slab: add opt-in caching layer of percpu sheaves Specifying a non-zero value for a new struct kmem_cache_args field sheaf_capacity will setup a caching layer of percpu arrays called sheaves of given capacity for the created cache. Allocations from the cache will allocate via the percpu sheaves (main or spare) as long as they have no NUMA node preference. Frees will also put the object back into one of the sheaves. When both percpu sheaves are found empty during an allocation, an empty sheaf may be replaced with a full one from the per-node barn. If none are available and the allocation is allowed to block, an empty sheaf is refilled from slab(s) by an internal bulk alloc operation. When both percpu sheaves are full during freeing, the barn can replace a full one with an empty one, unless over a full sheaves limit. In that case a sheaf is flushed to slab(s) by an internal bulk free operation. Flushing sheaves and barns is also wired to the existing cpu flushing and cache shrinking operations. The sheaves do not distinguish NUMA locality of the cached objects. If an allocation is requested with kmem_cache_alloc_node() (or a mempolicy with strict_numa mode enabled) with a specific node (not NUMA_NO_NODE), the sheaves are bypassed. The bulk operations exposed to slab users also try to utilize the sheaves as long as the necessary (full or empty) sheaves are available on the cpu or in the barn. Once depleted, they will fallback to bulk alloc/free to slabs directly to avoid double copying. The sheaf_capacity value is exported in sysfs for observability. Sysfs CONFIG_SLUB_STATS counters alloc_cpu_sheaf and free_cpu_sheaf count objects allocated or freed using the sheaves (and thus not counting towards the other alloc/free path counters). Counters sheaf_refill and sheaf_flush count objects filled or flushed from or to slab pages, and can be used to assess how effective the caching is. The refill and flush operations will also count towards the usual alloc_fastpath/slowpath, free_fastpath/slowpath and other counters for the backing slabs. For barn operations, barn_get and barn_put count how many full sheaves were get from or put to the barn, the _fail variants count how many such requests could not be satisfied mainly because the barn was either empty or full. While the barn also holds empty sheaves to make some operations easier, these are not as critical to mandate own counters. Finally, there are sheaf_alloc/sheaf_free counters. Access to the percpu sheaves is protected by local_trylock() when potential callers include irq context, and local_lock() otherwise (such as when we already know the gfp flags allow blocking). The trylock failures should be rare and we can easily fallback. Each per-NUMA-node barn has a spin_lock. When slub_debug is enabled for a cache with sheaf_capacity also specified, the latter is ignored so that allocations and frees reach the slow path where debugging hooks are processed. Similarly, we ignore it with CONFIG_SLUB_TINY which prefers low memory usage to performance. [boot failure: https://lore.kernel.org/all/583eacf5-c971-451a-9f76-fed0e341b815@linux.ibm.com/ ] Reported-and-tested-by: Venkat Rao Bagalkote Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 31 ++ mm/slab.h | 2 + mm/slab_common.c | 5 +- mm/slub.c | 1164 +++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 1142 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index d5a8ab98035c..49acbcdc6696 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -335,6 +335,37 @@ struct kmem_cache_args { * %NULL means no constructor. */ void (*ctor)(void *); + /** + * @sheaf_capacity: Enable sheaves of given capacity for the cache. + * + * With a non-zero value, allocations from the cache go through caching + * arrays called sheaves. Each cpu has a main sheaf that's always + * present, and a spare sheaf that may be not present. When both become + * empty, there's an attempt to replace an empty sheaf with a full sheaf + * from the per-node barn. + * + * When no full sheaf is available, and gfp flags allow blocking, a + * sheaf is allocated and filled from slab(s) using bulk allocation. + * Otherwise the allocation falls back to the normal operation + * allocating a single object from a slab. + * + * Analogically when freeing and both percpu sheaves are full, the barn + * may replace it with an empty sheaf, unless it's over capacity. In + * that case a sheaf is bulk freed to slab pages. + * + * The sheaves do not enforce NUMA placement of objects, so allocations + * via kmem_cache_alloc_node() with a node specified other than + * NUMA_NO_NODE will bypass them. + * + * Bulk allocation and free operations also try to use the cpu sheaves + * and barn, but fallback to using slab pages directly. + * + * When slub_debug is enabled for the cache, the sheaf_capacity argument + * is ignored. + * + * %0 means no sheaves will be created. + */ + unsigned int sheaf_capacity; }; struct kmem_cache *__kmem_cache_create_args(const char *name, diff --git a/mm/slab.h b/mm/slab.h index 248b34c839b7..206987ce44a4 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -235,6 +235,7 @@ struct kmem_cache { #ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; #endif + struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; @@ -248,6 +249,7 @@ struct kmem_cache { /* Number of per cpu partial slabs to keep around */ unsigned int cpu_partial_slabs; #endif + unsigned int sheaf_capacity; struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ diff --git a/mm/slab_common.c b/mm/slab_common.c index bfe7c40eeee1..e2b197e47866 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -163,6 +163,9 @@ int slab_unmergeable(struct kmem_cache *s) return 1; #endif + if (s->cpu_sheaves) + return 1; + /* * We may have set a slab to be unmergeable during bootstrap. */ @@ -321,7 +324,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, object_size - args->usersize < args->useroffset)) args->usersize = args->useroffset = 0; - if (!args->usersize) + if (!args->usersize && !args->sheaf_capacity) s = __kmem_cache_alias(name, object_size, args->align, flags, args->ctor); if (s) diff --git a/mm/slub.c b/mm/slub.c index 9f671ec76131..cba188b7e04d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -363,8 +363,10 @@ static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif enum stat_item { + ALLOC_PCS, /* Allocation from percpu sheaf */ ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_PCS, /* Free to percpu sheaf */ FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ FREE_FROZEN, /* Freeing to frozen slab */ @@ -389,6 +391,14 @@ enum stat_item { CPU_PARTIAL_FREE, /* Refill cpu partial on free */ CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + SHEAF_FLUSH, /* Objects flushed from a sheaf */ + SHEAF_REFILL, /* Objects refilled to a sheaf */ + SHEAF_ALLOC, /* Allocation of an empty sheaf */ + SHEAF_FREE, /* Freeing of an empty sheaf */ + BARN_GET, /* Got full sheaf from barn */ + BARN_GET_FAIL, /* Failed to get full sheaf from barn */ + BARN_PUT, /* Put full sheaf to barn */ + BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ NR_SLUB_STAT_ITEMS }; @@ -435,6 +445,32 @@ void stat_add(const struct kmem_cache *s, enum stat_item si, int v) #endif } +#define MAX_FULL_SHEAVES 10 +#define MAX_EMPTY_SHEAVES 10 + +struct node_barn { + spinlock_t lock; + struct list_head sheaves_full; + struct list_head sheaves_empty; + unsigned int nr_full; + unsigned int nr_empty; +}; + +struct slab_sheaf { + union { + struct rcu_head rcu_head; + struct list_head barn_list; + }; + unsigned int size; + void *objects[]; +}; + +struct slub_percpu_sheaves { + local_trylock_t lock; + struct slab_sheaf *main; /* never NULL when unlocked */ + struct slab_sheaf *spare; /* empty or full, may be NULL */ +}; + /* * The slab lists for all objects. */ @@ -447,6 +483,7 @@ struct kmem_cache_node { atomic_long_t total_objects; struct list_head full; #endif + struct node_barn *barn; }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) @@ -454,6 +491,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) return s->node[node]; } +/* Get the barn of the current cpu's memory node */ +static inline struct node_barn *get_barn(struct kmem_cache *s) +{ + return get_node(s, numa_mem_id())->barn; +} + /* * Iterator over all nodes. The body will be executed for each node that has * a kmem_cache_node structure allocated (which is true for all online nodes) @@ -470,12 +513,19 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) */ static nodemask_t slab_nodes; -#ifndef CONFIG_SLUB_TINY /* * Workqueue used for flush_cpu_slab(). */ static struct workqueue_struct *flushwq; -#endif + +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); /******************************************************************** * Core slab cache functions @@ -2473,6 +2523,360 @@ static void *setup_object(struct kmem_cache *s, void *object) return object; } +static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, + s->sheaf_capacity), gfp); + + if (unlikely(!sheaf)) + return NULL; + + stat(s, SHEAF_ALLOC); + + return sheaf; +} + +static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + kfree(sheaf); + + stat(s, SHEAF_FREE); +} + +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + + +static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, + gfp_t gfp) +{ + int to_fill = s->sheaf_capacity - sheaf->size; + int filled; + + if (!to_fill) + return 0; + + filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, + &sheaf->objects[sheaf->size]); + + sheaf->size += filled; + + stat_add(s, SHEAF_REFILL, filled); + + if (filled < to_fill) + return -ENOMEM; + + return 0; +} + + +static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); + + if (!sheaf) + return NULL; + + if (refill_sheaf(s, sheaf, gfp)) { + free_empty_sheaf(s, sheaf); + return NULL; + } + + return sheaf; +} + +/* + * Maximum number of objects freed during a single flush of main pcs sheaf. + * Translates directly to an on-stack array size. + */ +#define PCS_BATCH_MAX 32U + +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); + +/* + * Free all objects from the main sheaf. In order to perform + * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where + * object pointers are moved to a on-stack array under the lock. To bound the + * stack usage, limit each batch to PCS_BATCH_MAX. + * + * returns true if at least partially flushed + */ +static bool sheaf_flush_main(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + unsigned int batch, remaining; + void *objects[PCS_BATCH_MAX]; + struct slab_sheaf *sheaf; + bool ret = false; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return ret; + + pcs = this_cpu_ptr(s->cpu_sheaves); + sheaf = pcs->main; + + batch = min(PCS_BATCH_MAX, sheaf->size); + + sheaf->size -= batch; + memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *)); + + remaining = sheaf->size; + + local_unlock(&s->cpu_sheaves->lock); + + __kmem_cache_free_bulk(s, batch, &objects[0]); + + stat_add(s, SHEAF_FLUSH, batch); + + ret = true; + + if (remaining) + goto next_batch; + + return ret; +} + +/* + * Free all objects from a sheaf that's unused, i.e. not linked to any + * cpu_sheaves, so we need no locking and batching. The locking is also not + * necessary when flushing cpu's sheaves (both spare and main) during cpu + * hotremove as the cpu is not executing anymore. + */ +static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + if (!sheaf->size) + return; + + stat_add(s, SHEAF_FLUSH, sheaf->size); + + __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); + + sheaf->size = 0; +} + +/* + * Caller needs to make sure migration is disabled in order to fully flush + * single cpu's sheaves + * + * must not be called from an irq + * + * flushing operations are rare so let's keep it simple and flush to slabs + * directly, skipping the barn + */ +static void pcs_flush_all(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *spare; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + spare = pcs->spare; + pcs->spare = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (spare) { + sheaf_flush_unused(s, spare); + free_empty_sheaf(s, spare); + } + + sheaf_flush_main(s); +} + +static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) +{ + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* The cpu is not executing anymore so we don't need pcs->lock */ + sheaf_flush_unused(s, pcs->main); + if (pcs->spare) { + sheaf_flush_unused(s, pcs->spare); + free_empty_sheaf(s, pcs->spare); + pcs->spare = NULL; + } +} + +static void pcs_destroy(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* can happen when unwinding failed create */ + if (!pcs->main) + continue; + + /* + * We have already passed __kmem_cache_shutdown() so everything + * was flushed and there should be no objects allocated from + * slabs, otherwise kmem_cache_destroy() would have aborted. + * Therefore something would have to be really wrong if the + * warnings here trigger, and we should rather leave objects and + * sheaves to leak in that case. + */ + + WARN_ON(pcs->spare); + + if (!WARN_ON(pcs->main->size)) { + free_empty_sheaf(s, pcs->main); + pcs->main = NULL; + } + } + + free_percpu(s->cpu_sheaves); + s->cpu_sheaves = NULL; +} + +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *empty = NULL; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_empty) { + empty = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&empty->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +/* + * The following two functions are used mainly in cases where we have to undo an + * intended action due to a race or cpu migration. Thus they do not check the + * empty or full sheaf limits for simplicity. + */ + +static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_empty); + barn->nr_empty++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_full); + barn->nr_full++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +/* + * If a full sheaf is available, return it and put the supplied empty one to + * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't + * change. + */ +static struct slab_sheaf * +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +{ + struct slab_sheaf *full = NULL; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&full->barn_list); + list_add(&empty->barn_list, &barn->sheaves_empty); + barn->nr_full--; + barn->nr_empty++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return full; +} + +/* + * If an empty sheaf is available, return it and put the supplied full one to + * barn. But if there are too many full sheaves, reject this with -E2BIG. + */ +static struct slab_sheaf * +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +{ + struct slab_sheaf *empty; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full >= MAX_FULL_SHEAVES) { + empty = ERR_PTR(-E2BIG); + } else if (!barn->nr_empty) { + empty = ERR_PTR(-ENOMEM); + } else { + empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, + barn_list); + list_del(&empty->barn_list); + list_add(&full->barn_list, &barn->sheaves_full); + barn->nr_empty--; + barn->nr_full++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +static void barn_init(struct node_barn *barn) +{ + spin_lock_init(&barn->lock); + INIT_LIST_HEAD(&barn->sheaves_full); + INIT_LIST_HEAD(&barn->sheaves_empty); + barn->nr_full = 0; + barn->nr_empty = 0; +} + +static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) +{ + struct list_head empty_list; + struct list_head full_list; + struct slab_sheaf *sheaf, *sheaf2; + unsigned long flags; + + INIT_LIST_HEAD(&empty_list); + INIT_LIST_HEAD(&full_list); + + spin_lock_irqsave(&barn->lock, flags); + + list_splice_init(&barn->sheaves_full, &full_list); + barn->nr_full = 0; + list_splice_init(&barn->sheaves_empty, &empty_list); + barn->nr_empty = 0; + + spin_unlock_irqrestore(&barn->lock, flags); + + list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + } + + list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list) + free_empty_sheaf(s, sheaf); +} + /* * Slab allocation and freeing */ @@ -3344,11 +3748,40 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) put_partials_cpu(s, c); } -struct slub_flush_work { - struct work_struct work; - struct kmem_cache *s; - bool skip; -}; +static inline void flush_this_cpu_slab(struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + + if (c->slab) + flush_slab(s, c); + + put_partials(s); +} + +static bool has_cpu_slab(int cpu, struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return c->slab || slub_percpu_partial(c); +} + +#else /* CONFIG_SLUB_TINY */ +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } +static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } +static inline void flush_this_cpu_slab(struct kmem_cache *s) { } +#endif /* CONFIG_SLUB_TINY */ + +static bool has_pcs_used(int cpu, struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + + if (!s->cpu_sheaves) + return false; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + return (pcs->spare || pcs->main->size); +} /* * Flush cpu slab. @@ -3358,30 +3791,18 @@ struct slub_flush_work { static void flush_cpu_slab(struct work_struct *w) { struct kmem_cache *s; - struct kmem_cache_cpu *c; struct slub_flush_work *sfw; sfw = container_of(w, struct slub_flush_work, work); s = sfw->s; - c = this_cpu_ptr(s->cpu_slab); - - if (c->slab) - flush_slab(s, c); - - put_partials(s); -} -static bool has_cpu_slab(int cpu, struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + if (s->cpu_sheaves) + pcs_flush_all(s); - return c->slab || slub_percpu_partial(c); + flush_this_cpu_slab(s); } -static DEFINE_MUTEX(flush_lock); -static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); - static void flush_all_cpus_locked(struct kmem_cache *s) { struct slub_flush_work *sfw; @@ -3392,7 +3813,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s)) { + if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { sfw->skip = true; continue; } @@ -3428,19 +3849,15 @@ static int slub_cpu_dead(unsigned int cpu) struct kmem_cache *s; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) + list_for_each_entry(s, &slab_caches, list) { __flush_cpu_slab(s, cpu); + if (s->cpu_sheaves) + __pcs_flush_all_cpu(s, cpu); + } mutex_unlock(&slab_mutex); return 0; } -#else /* CONFIG_SLUB_TINY */ -static inline void flush_all_cpus_locked(struct kmem_cache *s) { } -static inline void flush_all(struct kmem_cache *s) { } -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } -static inline int slub_cpu_dead(unsigned int cpu) { return 0; } -#endif /* CONFIG_SLUB_TINY */ - /* * Check if the objects in a per cpu structure fit numa * locality expectations. @@ -4191,30 +4608,240 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, } /* - * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) - * have the fastpath folded into their functions. So no function call - * overhead for requests that can be satisfied on the fastpath. - * - * The fastpath works by first checking if the lockless freelist can be used. - * If not then __slab_alloc is called for slow processing. + * Replace the empty main sheaf with a (at least partially) full sheaf. * - * Otherwise we can simply pick the next object from the lockless free list. + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. */ -static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +static struct slub_percpu_sheaves * +__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp) { - void *object; - bool init = false; + struct slab_sheaf *empty = NULL; + struct slab_sheaf *full; + struct node_barn *barn; + bool can_alloc; - s = slab_pre_alloc_hook(s, gfpflags); - if (unlikely(!s)) - return NULL; + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + return pcs; + } + + barn = get_barn(s); + + full = barn_replace_empty_sheaf(barn, pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + return pcs; + } + + stat(s, BARN_GET_FAIL); + + can_alloc = gfpflags_allow_blocking(gfp); + + if (can_alloc) { + if (pcs->spare) { + empty = pcs->spare; + pcs->spare = NULL; + } else { + empty = barn_get_empty_sheaf(barn); + } + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!can_alloc) + return NULL; + + if (empty) { + if (!refill_sheaf(s, empty, gfp)) { + full = empty; + } else { + /* + * we must be very low on memory so don't bother + * with the barn + */ + free_empty_sheaf(s, empty); + } + } else { + full = alloc_full_sheaf(s, gfp); + } + + if (!full) + return NULL; + + /* + * we can reach here only when gfpflags_allow_blocking + * so this must not be an irq + */ + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * If we are returning empty sheaf, we either got it from the + * barn or had to allocate one. If we are returning a full + * sheaf, it's due to racing or being migrated to a different + * cpu. Breaching the barn's sheaf limits should be thus rare + * enough so just ignore them to simplify the recovery. + */ + + if (pcs->main->size == 0) { + barn_put_empty_sheaf(barn, pcs->main); + pcs->main = full; + return pcs; + } + + if (!pcs->spare) { + pcs->spare = full; + return pcs; + } + + if (pcs->spare->size == 0) { + barn_put_empty_sheaf(barn, pcs->spare); + pcs->spare = full; + return pcs; + } + + barn_put_full_sheaf(barn, full); + stat(s, BARN_PUT); + + return pcs; +} + +static __fastpath_inline +void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp) +{ + struct slub_percpu_sheaves *pcs; + void *object; + +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa)) { + if (current->mempolicy) + return NULL; + } +#endif + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + pcs = __pcs_replace_empty_main(s, pcs, gfp); + if (unlikely(!pcs)) + return NULL; + } + + object = pcs->main->objects[--pcs->main->size]; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, ALLOC_PCS); + + return object; +} + +static __fastpath_inline +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main; + unsigned int allocated = 0; + unsigned int batch; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return allocated; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + + struct slab_sheaf *full; + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + goto do_alloc; + } + + full = barn_replace_empty_sheaf(get_barn(s), pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + goto do_alloc; + } + + stat(s, BARN_GET_FAIL); + + local_unlock(&s->cpu_sheaves->lock); + + /* + * Once full sheaves in barn are depleted, let the bulk + * allocation continue from slab pages, otherwise we would just + * be copying arrays of pointers twice. + */ + return allocated; + } + +do_alloc: + + main = pcs->main; + batch = min(size, main->size); + + main->size -= batch; + memcpy(p, main->objects + main->size, batch * sizeof(void *)); + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, ALLOC_PCS, batch); + + allocated += batch; + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return allocated; +} + + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + void *object; + bool init = false; + + s = slab_pre_alloc_hook(s, gfpflags); + if (unlikely(!s)) + return NULL; object = kfence_alloc(s, orig_size, gfpflags); if (unlikely(object)) goto out; - object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + if (s->cpu_sheaves && node == NUMA_NO_NODE) + object = alloc_from_pcs(s, gfpflags); + + if (!object) + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); @@ -4591,6 +5218,295 @@ slab_empty: discard_slab(s, slab); } +/* + * pcs is locked. We should have get rid of the spare sheaf and obtained an + * empty sheaf, while the main sheaf is full. We want to install the empty sheaf + * as a main sheaf, and make the current main sheaf a spare sheaf. + * + * However due to having relinquished the cpu_sheaves lock when obtaining + * the empty sheaf, we need to handle some unlikely but possible cases. + * + * If we put any sheaf to barn here, it's because we were interrupted or have + * been migrated to a different cpu, which should be rare enough so just ignore + * the barn's limits to simplify the handling. + * + * An alternative scenario that gets us here is when we fail + * barn_replace_full_sheaf(), because there's no empty sheaf available in the + * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the + * limit on full sheaves was not exceeded, we assume it didn't change and just + * put the full sheaf there. + */ +static void __pcs_install_empty_sheaf(struct kmem_cache *s, + struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty) +{ + struct node_barn *barn; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + /* This is what we expect to find if nobody interrupted us. */ + if (likely(!pcs->spare)) { + pcs->spare = pcs->main; + pcs->main = empty; + return; + } + + barn = get_barn(s); + + /* + * Unlikely because if the main sheaf had space, we would have just + * freed to it. Get rid of our empty sheaf. + */ + if (pcs->main->size < s->sheaf_capacity) { + barn_put_empty_sheaf(barn, empty); + return; + } + + /* Also unlikely for the same reason */ + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + barn_put_empty_sheaf(barn, empty); + return; + } + + /* + * We probably failed barn_replace_full_sheaf() due to no empty sheaf + * available there, but we allocated one, so finish the job. + */ + barn_put_full_sheaf(barn, pcs->main); + stat(s, BARN_PUT); + pcs->main = empty; +} + +/* + * Replace the full main sheaf with a (at least partially) empty sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +{ + struct slab_sheaf *empty; + struct node_barn *barn; + bool put_fail; + +restart: + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + barn = get_barn(s); + put_fail = false; + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (empty) { + pcs->spare = pcs->main; + pcs->main = empty; + return pcs; + } + goto alloc_empty; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + return pcs; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + + if (!IS_ERR(empty)) { + stat(s, BARN_PUT); + pcs->main = empty; + return pcs; + } + + if (PTR_ERR(empty) == -E2BIG) { + /* Since we got here, spare exists and is full */ + struct slab_sheaf *to_flush = pcs->spare; + + stat(s, BARN_PUT_FAIL); + + pcs->spare = NULL; + local_unlock(&s->cpu_sheaves->lock); + + sheaf_flush_unused(s, to_flush); + empty = to_flush; + goto got_empty; + } + + /* + * We could not replace full sheaf because barn had no empty + * sheaves. We can still allocate it and put the full sheaf in + * __pcs_install_empty_sheaf(), but if we fail to allocate it, + * make sure to count the fail. + */ + put_fail = true; + +alloc_empty: + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + if (empty) + goto got_empty; + + if (put_fail) + stat(s, BARN_PUT_FAIL); + + if (!sheaf_flush_main(s)) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * we flushed the main sheaf so it should be empty now, + * but in case we got preempted or migrated, we need to + * check again + */ + if (pcs->main->size == s->sheaf_capacity) + goto restart; + + return pcs; + +got_empty: + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + return NULL; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + __pcs_install_empty_sheaf(s, pcs, empty); + + return pcs; +} + +/* + * Free an object to the percpu sheaves. + * The object is expected to have passed slab_free_hook() already. + */ +static __fastpath_inline +bool free_to_pcs(struct kmem_cache *s, void *object) +{ + struct slub_percpu_sheaves *pcs; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return false; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == s->sheaf_capacity)) { + + pcs = __pcs_replace_full_main(s, pcs); + if (unlikely(!pcs)) + return false; + } + + pcs->main->objects[pcs->main->size++] = object; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_PCS); + + return true; +} + +/* + * Bulk free objects to the percpu sheaves. + * Unlike free_to_pcs() this includes the calls to all necessary hooks + * and the fallback to freeing to slab pages. + */ +static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main, *empty; + bool init = slab_want_init_on_free(s); + unsigned int batch, i = 0; + struct node_barn *barn; + + while (i < size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, false))) { + p[i] = p[--size]; + if (!size) + return; + continue; + } + + i++; + } + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fallback; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (likely(pcs->main->size < s->sheaf_capacity)) + goto do_free; + + barn = get_barn(s); + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (!empty) + goto no_empty; + + pcs->spare = pcs->main; + pcs->main = empty; + goto do_free; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + goto do_free; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + if (IS_ERR(empty)) { + stat(s, BARN_PUT_FAIL); + goto no_empty; + } + + stat(s, BARN_PUT); + pcs->main = empty; + +do_free: + main = pcs->main; + batch = min(size, s->sheaf_capacity - main->size); + + memcpy(main->objects + main->size, p, batch * sizeof(void *)); + main->size += batch; + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, FREE_PCS, batch); + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return; + +no_empty: + local_unlock(&s->cpu_sheaves->lock); + + /* + * if we depleted all empty sheaves in the barn or there are too + * many full sheaves, free the rest to slab pages + */ +fallback: + __kmem_cache_free_bulk(s, size, p); +} + #ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that @@ -4677,7 +5593,10 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, memcg_slab_free_hook(s, slab, &object, 1); alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) + return; + + if (!s->cpu_sheaves || !free_to_pcs(s, object)) do_slab_free(s, slab, object, object, 1, addr); } @@ -5273,6 +6192,15 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!size) return; + /* + * freeing to sheaves is so incompatible with the detached freelist so + * once we go that way, we have to do everything differently + */ + if (s && s->cpu_sheaves) { + free_to_pcs_bulk(s, size, p); + return; + } + do { struct detached_freelist df; @@ -5391,7 +6319,7 @@ error: int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { - int i; + unsigned int i = 0; if (!size) return 0; @@ -5400,9 +6328,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; - i = __kmem_cache_alloc_bulk(s, flags, size, p); - if (unlikely(i == 0)) - return 0; + if (s->cpu_sheaves) + i = alloc_from_pcs_bulk(s, size, p); + + if (i < size) { + /* + * If we ran out of memory, don't bother with freeing back to + * the percpu sheaves, we have bigger problems. + */ + if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { + if (i > 0) + __kmem_cache_free_bulk(s, i, p); + return 0; + } + } /* * memcg and kmem_cache debug support and memory initialization. @@ -5412,11 +6351,11 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, slab_want_init_on_alloc(flags, s), s->object_size))) { return 0; } - return i; + + return size; } EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); - /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -5550,7 +6489,7 @@ static inline int calculate_order(unsigned int size) } static void -init_kmem_cache_node(struct kmem_cache_node *n) +init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -5560,6 +6499,9 @@ init_kmem_cache_node(struct kmem_cache_node *n) atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif + n->barn = barn; + if (barn) + barn_init(barn); } #ifndef CONFIG_SLUB_TINY @@ -5590,6 +6532,26 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) } #endif /* CONFIG_SLUB_TINY */ +static int init_percpu_sheaves(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + local_trylock_init(&pcs->lock); + + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + + if (!pcs->main) + return -ENOMEM; + } + + return 0; +} + static struct kmem_cache *kmem_cache_node; /* @@ -5625,7 +6587,7 @@ static void early_kmem_cache_node_alloc(int node) slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; kmem_cache_node->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, NULL); inc_slabs_node(kmem_cache_node, node, slab->objects); /* @@ -5641,6 +6603,13 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) { + if (n->barn) { + WARN_ON(n->barn->nr_full); + WARN_ON(n->barn->nr_empty); + kfree(n->barn); + n->barn = NULL; + } + s->node[node] = NULL; kmem_cache_free(kmem_cache_node, n); } @@ -5649,6 +6618,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); + if (s->cpu_sheaves) + pcs_destroy(s); #ifndef CONFIG_SLUB_TINY free_percpu(s->cpu_slab); #endif @@ -5661,18 +6632,29 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; + struct node_barn *barn = NULL; if (slab_state == DOWN) { early_kmem_cache_node_alloc(node); continue; } + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) + return 0; + } + n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node); - - if (!n) + if (!n) { + kfree(barn); return 0; + } + + init_kmem_cache_node(n, barn); - init_kmem_cache_node(n); s->node[node] = n; } return 1; @@ -5929,6 +6911,8 @@ int __kmem_cache_shutdown(struct kmem_cache *s) flush_all_cpus_locked(s); /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { + if (n->barn) + barn_shrink(s, n->barn); free_partial(s, n); if (n->nr_partial || node_nr_slabs(n)) return 1; @@ -6132,6 +7116,9 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); + if (n->barn) + barn_shrink(s, n->barn); + spin_lock_irqsave(&n->list_lock, flags); /* @@ -6211,12 +7198,24 @@ static int slab_mem_going_online_callback(int nid) */ mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { + struct node_barn *barn = NULL; + /* * The structure may already exist if the node was previously * onlined and offlined. */ if (get_node(s, nid)) continue; + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); + + if (!barn) { + ret = -ENOMEM; + goto out; + } + } + /* * XXX: kmem_cache_alloc_node will fallback to other nodes * since memory is not yet available from the node that @@ -6224,10 +7223,13 @@ static int slab_mem_going_online_callback(int nid) */ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { + kfree(barn); ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + + init_kmem_cache_node(n, barn); + s->node[nid] = n; } /* @@ -6440,6 +7442,17 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, set_cpu_partial(s); + if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) + && !(s->flags & SLAB_DEBUG_FLAGS)) { + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; + } + // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? + s->sheaf_capacity = args->sheaf_capacity; + } + #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -6456,6 +7469,12 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!alloc_kmem_cache_cpus(s)) goto out; + if (s->cpu_sheaves) { + err = init_percpu_sheaves(s); + if (err) + goto out; + } + err = 0; /* Mutex is not taken during early boot */ @@ -6908,6 +7927,12 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(order); +static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->sheaf_capacity); +} +SLAB_ATTR_RO(sheaf_capacity); + static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%lu\n", s->min_partial); @@ -7255,8 +8280,10 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ +STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_PCS, free_cpu_sheaf); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); STAT_ATTR(FREE_FROZEN, free_frozen); @@ -7281,6 +8308,14 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); +STAT_ATTR(SHEAF_FLUSH, sheaf_flush); +STAT_ATTR(SHEAF_REFILL, sheaf_refill); +STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); +STAT_ATTR(SHEAF_FREE, sheaf_free); +STAT_ATTR(BARN_GET, barn_get); +STAT_ATTR(BARN_GET_FAIL, barn_get_fail); +STAT_ATTR(BARN_PUT, barn_put); +STAT_ATTR(BARN_PUT_FAIL, barn_put_fail); #endif /* CONFIG_SLUB_STATS */ #ifdef CONFIG_KFENCE @@ -7311,6 +8346,7 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &sheaf_capacity_attr.attr, &min_partial_attr.attr, &cpu_partial_attr.attr, &objects_partial_attr.attr, @@ -7342,8 +8378,10 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS + &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, + &free_cpu_sheaf_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, &free_frozen_attr.attr, @@ -7368,6 +8406,14 @@ static struct attribute *slab_attrs[] = { &cpu_partial_free_attr.attr, &cpu_partial_node_attr.attr, &cpu_partial_drain_attr.attr, + &sheaf_flush_attr.attr, + &sheaf_refill_attr.attr, + &sheaf_alloc_attr.attr, + &sheaf_free_attr.attr, + &barn_get_attr.attr, + &barn_get_fail_attr.attr, + &barn_put_attr.attr, + &barn_put_fail_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, -- cgit v1.2.3 From 04a91570ac67760301e5458d65eaf1342ecca314 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 16 Sep 2025 23:22:49 -0400 Subject: ext4: implemet new ioctls to set and get superblock parameters Implement the EXT4_IOC_GET_TUNE_SB_PARAM and EXT4_IOC_SET_TUNE_SB_PARAM ioctls, which allow certains superblock parameters to be set while the file system is mounted, without needing write access to the block device. Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Message-ID: <20250916-tune2fs-v2-3-d594dc7486f0@mit.edu> Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 312 ++++++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/ext4.h | 53 ++++++++ 2 files changed, 358 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 84e3c73952d7..a93a7baae990 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -27,14 +27,16 @@ #include "fsmap.h" #include -typedef void ext4_update_sb_callback(struct ext4_super_block *es, - const void *arg); +typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi, + struct ext4_super_block *es, + const void *arg); /* * Superblock modification callback function for changing file system * label */ -static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setlabel(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { /* Sanity check, this should never happen */ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); @@ -46,7 +48,8 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) * Superblock modification callback function for changing file system * UUID. */ -static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setuuid(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); } @@ -71,7 +74,7 @@ int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, goto out_err; lock_buffer(bh); - func(es, arg); + func(sbi, es, arg); ext4_superblock_csum_set(sb); unlock_buffer(bh); @@ -149,7 +152,7 @@ static int ext4_update_backup_sb(struct super_block *sb, unlock_buffer(bh); goto out_bh; } - func(es, arg); + func(EXT4_SB(sb), es, arg); if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); @@ -1230,6 +1233,295 @@ static int ext4_ioctl_setuuid(struct file *filp, return ret; } + +#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \ + EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \ + EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \ + EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \ + EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \ + EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \ + EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \ + EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \ + EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \ + EXT4_TUNE_FL_ENCODING_FLAGS) + +#define EXT4_TUNE_SET_COMPAT_SUPP \ + (EXT4_FEATURE_COMPAT_DIR_INDEX | \ + EXT4_FEATURE_COMPAT_STABLE_INODES) +#define EXT4_TUNE_SET_INCOMPAT_SUPP \ + (EXT4_FEATURE_INCOMPAT_EXTENTS | \ + EXT4_FEATURE_INCOMPAT_EA_INODE | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD) +#define EXT4_TUNE_SET_RO_COMPAT_SUPP \ + (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_PROJECT | \ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0) + +#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \ + SB_ENC_NO_COMPAT_FALLBACK_FL) + +static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi, + struct ext4_tune_sb_params __user *params) +{ + struct ext4_tune_sb_params ret; + struct ext4_super_block *es = sbi->s_es; + + memset(&ret, 0, sizeof(ret)); + ret.set_flags = TUNE_OPS_SUPPORTED; + ret.errors_behavior = le16_to_cpu(es->s_errors); + ret.mnt_count = le16_to_cpu(es->s_mnt_count); + ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count); + ret.checkinterval = le32_to_cpu(es->s_checkinterval); + ret.last_check_time = le32_to_cpu(es->s_lastcheck); + ret.reserved_blocks = ext4_r_blocks_count(es); + ret.blocks_count = ext4_blocks_count(es); + ret.reserved_uid = ext4_get_resuid(es); + ret.reserved_gid = ext4_get_resgid(es); + ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts); + ret.def_hash_alg = es->s_def_hash_version; + ret.raid_stride = le16_to_cpu(es->s_raid_stride); + ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width); + ret.encoding = le16_to_cpu(es->s_encoding); + ret.encoding_flags = le16_to_cpu(es->s_encoding_flags); + strscpy_pad(ret.mount_opts, es->s_mount_opts); + ret.feature_compat = le32_to_cpu(es->s_feature_compat); + ret.feature_incompat = le32_to_cpu(es->s_feature_incompat); + ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat); + ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP; + ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP; + ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP; + ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP; + ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP; + ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP; + if (copy_to_user(params, &ret, sizeof(ret))) + return -EFAULT; + return 0; +} + +static void ext4_sb_setparams(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + const struct ext4_tune_sb_params *params = arg; + + if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) + es->s_errors = cpu_to_le16(params->errors_behavior); + if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT) + es->s_mnt_count = cpu_to_le16(params->mnt_count); + if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT) + es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count); + if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL) + es->s_checkinterval = cpu_to_le32(params->checkinterval); + if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME) + es->s_lastcheck = cpu_to_le32(params->last_check_time); + if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) { + ext4_fsblk_t blk = params->reserved_blocks; + + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) { + int uid = params->reserved_uid; + + es->s_def_resuid = cpu_to_le16(uid & 0xFFFF); + es->s_def_resuid_hi = cpu_to_le16(uid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) { + int gid = params->reserved_gid; + + es->s_def_resgid = cpu_to_le16(gid & 0xFFFF); + es->s_def_resgid_hi = cpu_to_le16(gid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS) + es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts); + if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + es->s_def_hash_version = params->def_hash_alg; + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE) + es->s_raid_stride = cpu_to_le16(params->raid_stride); + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH) + es->s_raid_stripe_width = + cpu_to_le32(params->raid_stripe_width); + if (params->set_flags & EXT4_TUNE_FL_ENCODING) + es->s_encoding = cpu_to_le16(params->encoding); + if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) + es->s_encoding_flags = cpu_to_le16(params->encoding_flags); + strscpy_pad(es->s_mount_opts, params->mount_opts); + if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + es->s_feature_compat |= + cpu_to_le32(params->set_feature_compat_mask); + es->s_feature_incompat |= + cpu_to_le32(params->set_feature_incompat_mask); + es->s_feature_ro_compat |= + cpu_to_le32(params->set_feature_ro_compat_mask); + es->s_feature_compat &= + ~cpu_to_le32(params->clear_feature_compat_mask); + es->s_feature_incompat &= + ~cpu_to_le32(params->clear_feature_incompat_mask); + es->s_feature_ro_compat &= + ~cpu_to_le32(params->clear_feature_ro_compat_mask); + if (params->set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX) + es->s_def_hash_version = sbi->s_def_hash_version; + if (params->set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CSUM_SEED) + es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed); + } + if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK) + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); +} + +static int ext4_ioctl_set_tune_sb(struct file *filp, + struct ext4_tune_sb_params __user *in) +{ + struct ext4_tune_sb_params params; + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int enabling_casefold = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(¶ms, in, sizeof(params))) + return -EFAULT; + + if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) + return -EOPNOTSUPP; + + if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) && + (params.errors_behavior > EXT4_ERRORS_PANIC)) + return -EINVAL; + + if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) && + (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2)) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) && + ((params.def_hash_alg > DX_HASH_LAST) || + (params.def_hash_alg == DX_HASH_SIPHASH))) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_FEATURES) && + (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES)) + return -EINVAL; + + if (params.set_flags & EXT4_TUNE_FL_FEATURES) { + params.set_feature_compat_mask = + params.feature_compat & + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask = + params.feature_incompat & + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask = + params.feature_ro_compat & + ~le32_to_cpu(es->s_feature_ro_compat); + params.clear_feature_compat_mask = + ~params.feature_compat & + le32_to_cpu(es->s_feature_compat); + params.clear_feature_incompat_mask = + ~params.feature_incompat & + le32_to_cpu(es->s_feature_incompat); + params.clear_feature_ro_compat_mask = + ~params.feature_ro_compat & + le32_to_cpu(es->s_feature_ro_compat); + params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES; + } + if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + if ((params.set_feature_compat_mask & + ~EXT4_TUNE_SET_COMPAT_SUPP) || + (params.set_feature_incompat_mask & + ~EXT4_TUNE_SET_INCOMPAT_SUPP) || + (params.set_feature_ro_compat_mask & + ~EXT4_TUNE_SET_RO_COMPAT_SUPP) || + (params.clear_feature_compat_mask & + ~EXT4_TUNE_CLEAR_COMPAT_SUPP) || + (params.clear_feature_incompat_mask & + ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) || + (params.clear_feature_ro_compat_mask & + ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP)) + return -EOPNOTSUPP; + + /* + * Filter out the features that are already set from + * the set_mask. + */ + params.set_feature_compat_mask &= + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask &= + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask &= + ~le32_to_cpu(es->s_feature_ro_compat); + if ((params.set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CASEFOLD)) { + enabling_casefold = 1; + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) { + params.encoding = EXT4_ENC_UTF8_12_1; + params.set_flags |= EXT4_TUNE_FL_ENCODING; + } + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) { + params.encoding_flags = 0; + params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS; + } + } + if ((params.set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX)) { + uuid_t uu; + + memcpy(&uu, sbi->s_hash_seed, UUID_SIZE); + if (uuid_is_null(&uu)) + generate_random_uuid((char *) + &sbi->s_hash_seed); + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + else if (sbi->s_def_hash_version == 0) + sbi->s_def_hash_version = DX_HASH_HALF_MD4; + if (!(es->s_flags & + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) && + !(es->s_flags & + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) { +#ifdef __CHAR_UNSIGNED__ + sbi->s_hash_unsigned = 3; +#else + sbi->s_hash_unsigned = 0; +#endif + } + } + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding == 0) + params.encoding = EXT4_ENC_UTF8_12_1; + else if (params.encoding != EXT4_ENC_UTF8_12_1) + return -EINVAL; + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding_flags & ~SB_ENC_SUPP_MASK) + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, ¶ms); + mnt_drop_write_file(filp); + + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + + return ret; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1616,6 +1908,11 @@ resizefs_out: return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); case EXT4_IOC_SETFSUUID: return ext4_ioctl_setuuid(filp, (const void __user *)arg); + case EXT4_IOC_GET_TUNE_SB_PARAM: + return ext4_ioctl_get_tune_sb(EXT4_SB(sb), + (void __user *)arg); + case EXT4_IOC_SET_TUNE_SB_PARAM: + return ext4_ioctl_set_tune_sb(filp, (void __user *)arg); default: return -ENOTTY; } @@ -1703,7 +2000,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } #endif -static void set_overhead(struct ext4_super_block *es, const void *arg) +static void set_overhead(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } diff --git a/include/uapi/linux/ext4.h b/include/uapi/linux/ext4.h index 1c4c2dd29112..411dcc1e4a35 100644 --- a/include/uapi/linux/ext4.h +++ b/include/uapi/linux/ext4.h @@ -33,6 +33,8 @@ #define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) #define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid) #define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid) +#define EXT4_IOC_GET_TUNE_SB_PARAM _IOR('f', 45, struct ext4_tune_sb_params) +#define EXT4_IOC_SET_TUNE_SB_PARAM _IOW('f', 46, struct ext4_tune_sb_params) #define EXT4_IOC_SHUTDOWN _IOR('X', 125, __u32) @@ -108,6 +110,57 @@ struct ext4_new_group_input { __u16 unused; }; +struct ext4_tune_sb_params { + __u32 set_flags; + __u32 checkinterval; + __u16 errors_behavior; + __u16 mnt_count; + __u16 max_mnt_count; + __u16 raid_stride; + __u64 last_check_time; + __u64 reserved_blocks; + __u64 blocks_count; + __u32 default_mnt_opts; + __u32 reserved_uid; + __u32 reserved_gid; + __u32 raid_stripe_width; + __u16 encoding; + __u16 encoding_flags; + __u8 def_hash_alg; + __u8 pad_1; + __u16 pad_2; + __u32 feature_compat; + __u32 feature_incompat; + __u32 feature_ro_compat; + __u32 set_feature_compat_mask; + __u32 set_feature_incompat_mask; + __u32 set_feature_ro_compat_mask; + __u32 clear_feature_compat_mask; + __u32 clear_feature_incompat_mask; + __u32 clear_feature_ro_compat_mask; + __u8 mount_opts[64]; + __u8 pad[64]; +}; + +#define EXT4_TUNE_FL_ERRORS_BEHAVIOR 0x00000001 +#define EXT4_TUNE_FL_MNT_COUNT 0x00000002 +#define EXT4_TUNE_FL_MAX_MNT_COUNT 0x00000004 +#define EXT4_TUNE_FL_CHECKINTRVAL 0x00000008 +#define EXT4_TUNE_FL_LAST_CHECK_TIME 0x00000010 +#define EXT4_TUNE_FL_RESERVED_BLOCKS 0x00000020 +#define EXT4_TUNE_FL_RESERVED_UID 0x00000040 +#define EXT4_TUNE_FL_RESERVED_GID 0x00000080 +#define EXT4_TUNE_FL_DEFAULT_MNT_OPTS 0x00000100 +#define EXT4_TUNE_FL_DEF_HASH_ALG 0x00000200 +#define EXT4_TUNE_FL_RAID_STRIDE 0x00000400 +#define EXT4_TUNE_FL_RAID_STRIPE_WIDTH 0x00000800 +#define EXT4_TUNE_FL_MOUNT_OPTS 0x00001000 +#define EXT4_TUNE_FL_FEATURES 0x00002000 +#define EXT4_TUNE_FL_EDIT_FEATURES 0x00004000 +#define EXT4_TUNE_FL_FORCE_FSCK 0x00008000 +#define EXT4_TUNE_FL_ENCODING 0x00010000 +#define EXT4_TUNE_FL_ENCODING_FLAGS 0x00020000 + /* * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. * It indicates that the entry in extent status cache is for a hole. -- cgit v1.2.3 From bbfe987c5a2854705393ad79813074e5eadcbde6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 13:10:22 +0200 Subject: PM: hibernate: Fix pm_hibernation_mode_is_suspend() build breakage Commit 495c8d35035e ("PM: hibernate: Add pm_hibernation_mode_is_suspend()") that introduced pm_hibernation_mode_is_suspend() did not define it in the case when CONFIG_HIBERNATION is unset, but CONFIG_SUSPEND is set. Subsequent commit 0a6e9e098fcc ("drm/amd: Fix hybrid sleep") made the amdgpu driver use that function which led to kernel build breakage in the case mentioned above [1]. Address this by using appropriate #ifdeffery around the definition of pm_hibernation_mode_is_suspend(). Fixes: 0a6e9e098fcc ("drm/amd: Fix hybrid sleep") Reported-by: KernelCI bot Closes: https://groups.io/g/kernelci-results/topic/regression_pm_testing/115439919 [1] Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) --- include/linux/suspend.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 0664c685f0b2..b02876f1ae38 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -276,7 +276,6 @@ extern void arch_suspend_enable_irqs(void); extern int pm_suspend(suspend_state_t state); extern bool sync_on_suspend_enabled; -bool pm_hibernation_mode_is_suspend(void); #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL @@ -289,7 +288,6 @@ static inline bool pm_suspend_via_firmware(void) { return false; } static inline bool pm_resume_via_firmware(void) { return false; } static inline bool pm_suspend_no_platform(void) { return false; } static inline bool pm_suspend_default_s2idle(void) { return false; } -static inline bool pm_hibernation_mode_is_suspend(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } @@ -420,6 +418,12 @@ static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) { } #endif /* CONFIG_HIBERNATION */ +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_SUSPEND) +bool pm_hibernation_mode_is_suspend(void); +#else +static inline bool pm_hibernation_mode_is_suspend(void) { return false; } +#endif + int arch_resume_nosmt(void); #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV -- cgit v1.2.3 From a036bb0e60ad2828c3498bff7465bcbb247b7436 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 25 Sep 2025 14:56:30 -0500 Subject: of: base: Add of_get_next_child_with_prefix() stub 1fcc67e3a354 ("of: base: Add for_each_child_of_node_with_prefix()") added of_get_next_child_with_prefix() but did not add a stub for the !CONFIG_OF case. Add a of_get_next_child_with_prefix() stub so users of for_each_child_of_node_with_prefix() can be built for compile testing even when !CONFIG_OF. Fixes: 1fcc67e3a354 ("of: base: Add for_each_child_of_node_with_prefix()") Signed-off-by: Bjorn Helgaas Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index a62154aeda1b..5e2c6ed9370a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -550,6 +550,13 @@ static inline struct device_node *of_get_next_child( return NULL; } +static inline struct device_node *of_get_next_child_with_prefix( + const struct device_node *node, struct device_node *prev, + const char *prefix) +{ + return NULL; +} + static inline struct device_node *of_get_next_available_child( const struct device_node *node, struct device_node *prev) { -- cgit v1.2.3 From 4e9510f16218802b5fc0d593d8707d4e7ebf9774 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Sep 2025 01:27:07 -0400 Subject: ptr_ring: drop duplicated tail zeroing code We have some rather subtle code around zeroing tail entries, minimizing cache bouncing. Let's put it all in one place. Doing this also reduces the text size slightly, e.g. for drivers/vhost/net.o Before: text: 15,114 bytes After: text: 15,082 bytes Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://patch.msgid.link/adb9d941de4a2b619ddb2be271a9939849e70687.1758690291.git.mst@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/ptr_ring.h | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 551329220e4f..a736b16859a6 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -243,6 +243,24 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r) return ret; } +/* Zero entries from tail to specified head. + * NB: if consumer_head can be >= r->size need to fixup tail later. + */ +static inline void __ptr_ring_zero_tail(struct ptr_ring *r, int consumer_head) +{ + int head = consumer_head - 1; + + /* Zero out entries in the reverse order: this way we touch the + * cache line that producer might currently be reading the last; + * producer won't make progress and touch other cache lines + * besides the first one until we write out all entries. + */ + while (likely(head >= r->consumer_tail)) + r->queue[head--] = NULL; + + r->consumer_tail = consumer_head; +} + /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { @@ -261,8 +279,7 @@ static inline void __ptr_ring_discard_one(struct ptr_ring *r) /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty * to work correctly. */ - int consumer_head = r->consumer_head; - int head = consumer_head++; + int consumer_head = r->consumer_head + 1; /* Once we have processed enough entries invalidate them in * the ring all at once so producer can reuse their space in the ring. @@ -270,16 +287,9 @@ static inline void __ptr_ring_discard_one(struct ptr_ring *r) * but helps keep the implementation simple. */ if (unlikely(consumer_head - r->consumer_tail >= r->batch || - consumer_head >= r->size)) { - /* Zero out entries in the reverse order: this way we touch the - * cache line that producer might currently be reading the last; - * producer won't make progress and touch other cache lines - * besides the first one until we write out all entries. - */ - while (likely(head >= r->consumer_tail)) - r->queue[head--] = NULL; - r->consumer_tail = consumer_head; - } + consumer_head >= r->size)) + __ptr_ring_zero_tail(r, consumer_head); + if (unlikely(consumer_head >= r->size)) { consumer_head = 0; r->consumer_tail = 0; @@ -513,7 +523,6 @@ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, void (*destroy)(void *)) { unsigned long flags; - int head; spin_lock_irqsave(&r->consumer_lock, flags); spin_lock(&r->producer_lock); @@ -525,17 +534,14 @@ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, * Clean out buffered entries (for simplicity). This way following code * can test entries for NULL and if not assume they are valid. */ - head = r->consumer_head - 1; - while (likely(head >= r->consumer_tail)) - r->queue[head--] = NULL; - r->consumer_tail = r->consumer_head; + __ptr_ring_zero_tail(r, r->consumer_head); /* * Go over entries in batch, start moving head back and copy entries. * Stop when we run into previously unconsumed entries. */ while (n) { - head = r->consumer_head - 1; + int head = r->consumer_head - 1; if (head < 0) head = r->size - 1; if (r->queue[head]) { -- cgit v1.2.3 From cc2f08129925b437bf28f7f7822f20dac083a87c Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 24 Sep 2025 12:40:33 +0000 Subject: ethtool: add FEC bins histogram report IEEE 802.3ck-2022 defines counters for FEC bins and 802.3df-2024 clarifies it a bit further. Implement reporting interface through as addition to FEC stats available in ethtool. Drivers can leave bin counter uninitialized if per-lane values are provided. In this case the core will recalculate summ for the bin. Signed-off-by: Vadim Fedorenko Reviewed-by: Aleksandr Loktionov Link: https://patch.msgid.link/20250924124037.1508846-2-vadim.fedorenko@linux.dev Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 29 +++++++++ Documentation/networking/ethtool-netlink.rst | 5 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 3 +- .../net/ethernet/fungible/funeth/funeth_ethtool.c | 3 +- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 3 +- drivers/net/ethernet/intel/ice/ice_ethtool.c | 4 +- .../ethernet/marvell/octeontx2/nic/otx2_ethtool.c | 3 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 +- drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c | 3 +- drivers/net/ethernet/sfc/ethtool.c | 3 +- drivers/net/ethernet/sfc/siena/ethtool.c | 3 +- drivers/net/netdevsim/ethtool.c | 25 +++++++- include/linux/ethtool.h | 25 +++++++- include/uapi/linux/ethtool_netlink_generated.h | 12 ++++ net/ethtool/fec.c | 75 +++++++++++++++++++++- 15 files changed, 186 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 7a7594713f1f..6a0fb1974513 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1219,6 +1219,30 @@ attribute-sets: name: udp-ports type: nest nested-attributes: tunnel-udp + - + name: fec-hist + attr-cnt-name: --ethtool-a-fec-hist-cnt + attributes: + - + name: pad + type: pad + - + name: bin-low + type: u32 + doc: Low bound of FEC bin (inclusive) + - + name: bin-high + type: u32 + doc: High bound of FEC bin (inclusive) + - + name: bin-val + type: uint + doc: Error count in the bin (optional if per-lane values exist) + - + name: bin-val-per-lane + type: binary + sub-type: u64 + doc: An array of per-lane error counters in the bin (optional) - name: fec-stat attr-cnt-name: __ethtool-a-fec-stat-cnt @@ -1242,6 +1266,11 @@ attribute-sets: name: corr-bits type: binary sub-type: u64 + - + name: hist + type: nest + multi-attr: True + nested-attributes: fec-hist - name: fec attr-cnt-name: __ethtool-a-fec-cnt diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index ab20c644af24..b270886c5f5d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1541,6 +1541,11 @@ Drivers fill in the statistics in the following structure: .. kernel-doc:: include/linux/ethtool.h :identifiers: ethtool_fec_stats +Statistics may have FEC bins histogram attribute ``ETHTOOL_A_FEC_STAT_HIST`` +as defined in IEEE 802.3ck-2022 and 802.3df-2024. Nested attributes will have +the range of FEC errors in the bin (inclusive) and the amount of error events +in the bin. + FEC_SET ======= diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index be32ef8f5c96..41686a6f84b5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -3208,7 +3208,8 @@ static int bnxt_get_fecparam(struct net_device *dev, } static void bnxt_get_fec_stats(struct net_device *dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct bnxt *bp = netdev_priv(dev); u64 *rx; diff --git a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c index ba83dbf4ed22..1966dba512f8 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c @@ -930,7 +930,8 @@ static void fun_get_rmon_stats(struct net_device *netdev, } static void fun_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *stats) + struct ethtool_fec_stats *stats, + struct ethtool_fec_hist *hist) { const struct funeth_priv *fp = netdev_priv(netdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index a752d0e3db3a..a5eefa28454c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1659,7 +1659,8 @@ static void hns3_set_msglevel(struct net_device *netdev, u32 msg_level) } static void hns3_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct hnae3_handle *handle = hns3_get_handle(netdev); struct hnae3_ae_dev *ae_dev = hns3_get_ae_dev(handle); diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 348acd46a0ef..dc131779d426 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -4624,10 +4624,12 @@ static int ice_get_port_fec_stats(struct ice_hw *hw, u16 pcs_quad, u16 pcs_port, * ice_get_fec_stats - returns FEC correctable, uncorrectable stats per netdev * @netdev: network interface device structure * @fec_stats: buffer to hold FEC statistics for given port + * @hist: buffer to put FEC histogram statistics for given port * */ static void ice_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_port_topology port_topology; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c index 998c734ff839..b90e23dc49de 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c @@ -1283,7 +1283,8 @@ end: } static void otx2_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct otx2_nic *pfvf = netdev_priv(netdev); struct cgx_fw_data *rsp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index d507366d773e..bcc3bbb78cc9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1927,7 +1927,8 @@ static int mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) } static void mlx5e_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct mlx5e_priv *priv = netdev_priv(netdev); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index fecb8c602024..d55d2ac1c3b9 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -1718,7 +1718,8 @@ fbnic_get_pause_stats(struct net_device *netdev, static void fbnic_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct fbnic_net *fbn = netdev_priv(netdev); struct fbnic_phy_stats *phy_stats; diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 23c6a7df78d0..18fe5850a978 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -217,7 +217,8 @@ static int efx_ethtool_set_wol(struct net_device *net_dev, } static void efx_ethtool_get_fec_stats(struct net_device *net_dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct efx_nic *efx = efx_netdev_priv(net_dev); diff --git a/drivers/net/ethernet/sfc/siena/ethtool.c b/drivers/net/ethernet/sfc/siena/ethtool.c index 994909789bfe..8c3ebd0617fb 100644 --- a/drivers/net/ethernet/sfc/siena/ethtool.c +++ b/drivers/net/ethernet/sfc/siena/ethtool.c @@ -217,7 +217,8 @@ static int efx_ethtool_set_wol(struct net_device *net_dev, } static void efx_ethtool_get_fec_stats(struct net_device *net_dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct efx_nic *efx = netdev_priv(net_dev); diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index f631d90c428a..36a201533aae 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -165,11 +165,34 @@ nsim_set_fecparam(struct net_device *dev, struct ethtool_fecparam *fecparam) return 0; } +static const struct ethtool_fec_hist_range netdevsim_fec_ranges[] = { + { 0, 0}, + { 1, 3}, + { 4, 7}, + { 0, 0} +}; + static void -nsim_get_fec_stats(struct net_device *dev, struct ethtool_fec_stats *fec_stats) +nsim_get_fec_stats(struct net_device *dev, struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { + struct ethtool_fec_hist_value *values = hist->values; + + hist->ranges = netdevsim_fec_ranges; + fec_stats->corrected_blocks.total = 123; fec_stats->uncorrectable_blocks.total = 4; + + values[0].per_lane[0] = 125; + values[0].per_lane[1] = 120; + values[0].per_lane[2] = 100; + values[0].per_lane[3] = 100; + values[1].sum = 12; + values[2].sum = 2; + values[2].per_lane[0] = 2; + values[2].per_lane[1] = 0; + values[2].per_lane[2] = 0; + values[2].per_lane[3] = 0; } static int nsim_get_ts_info(struct net_device *dev, diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c869b7f8bce8..c2d8b4ec62eb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -492,7 +492,29 @@ struct ethtool_pause_stats { }; #define ETHTOOL_MAX_LANES 8 +/** + * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for + * the end-of-list marker, total 17 items + */ +#define ETHTOOL_FEC_HIST_MAX 17 +/** + * struct ethtool_fec_hist_range - error bits range for FEC histogram + * statistics + * @low: low bound of the bin (inclusive) + * @high: high bound of the bin (inclusive) + */ +struct ethtool_fec_hist_range { + u16 low; + u16 high; +}; +struct ethtool_fec_hist { + struct ethtool_fec_hist_value { + u64 sum; + u64 per_lane[ETHTOOL_MAX_LANES]; + } values[ETHTOOL_FEC_HIST_MAX]; + const struct ethtool_fec_hist_range *ranges; +}; /** * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC * @corrected_blocks: number of received blocks corrected by FEC @@ -1214,7 +1236,8 @@ struct ethtool_ops { int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); void (*get_fec_stats)(struct net_device *dev, - struct ethtool_fec_stats *fec_stats); + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist); int (*get_fecparam)(struct net_device *, struct ethtool_fecparam *); int (*set_fecparam)(struct net_device *, diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index e3b8813465d7..0e8ac0d974e2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -561,12 +561,24 @@ enum { ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) }; +enum { + ETHTOOL_A_FEC_HIST_PAD = 1, + ETHTOOL_A_FEC_HIST_BIN_LOW, + ETHTOOL_A_FEC_HIST_BIN_HIGH, + ETHTOOL_A_FEC_HIST_BIN_VAL, + ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + + __ETHTOOL_A_FEC_HIST_CNT, + ETHTOOL_A_FEC_HIST_MAX = (__ETHTOOL_A_FEC_HIST_CNT - 1) +}; + enum { ETHTOOL_A_FEC_STAT_UNSPEC, ETHTOOL_A_FEC_STAT_PAD, ETHTOOL_A_FEC_STAT_CORRECTED, ETHTOOL_A_FEC_STAT_UNCORR, ETHTOOL_A_FEC_STAT_CORR_BITS, + ETHTOOL_A_FEC_STAT_HIST, __ETHTOOL_A_FEC_STAT_CNT, ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index e7d3f2c352a3..4669e74cbcaa 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -17,6 +17,7 @@ struct fec_reply_data { u64 stats[1 + ETHTOOL_MAX_LANES]; u8 cnt; } corr, uncorr, corr_bits; + struct ethtool_fec_hist fec_stat_hist; }; #define FEC_REPDATA(__reply_base) \ @@ -113,7 +114,10 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethtool_fec_stats stats; ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); - dev->ethtool_ops->get_fec_stats(dev, &stats); + ethtool_stats_init((u64 *)data->fec_stat_hist.values, + sizeof(data->fec_stat_hist.values) / 8); + dev->ethtool_ops->get_fec_stats(dev, &stats, + &data->fec_stat_hist); fec_stats_recalc(&data->corr, &stats.corrected_blocks); fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); @@ -157,13 +161,77 @@ static int fec_reply_size(const struct ethnl_req_info *req_base, len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ - if (req_base->flags & ETHTOOL_FLAG_STATS) + if (req_base->flags & ETHTOOL_FLAG_STATS) { len += 3 * nla_total_size_64bit(sizeof(u64) * (1 + ETHTOOL_MAX_LANES)); + /* add FEC bins information */ + len += (nla_total_size(0) + /* _A_FEC_HIST */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_LOW */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_HI */ + /* _A_FEC_HIST_BIN_VAL + per-lane values */ + nla_total_size_64bit(sizeof(u64)) + + nla_total_size_64bit(sizeof(u64) * ETHTOOL_MAX_LANES)) * + ETHTOOL_FEC_HIST_MAX; + } return len; } +static int fec_put_hist(struct sk_buff *skb, + const struct ethtool_fec_hist *hist) +{ + const struct ethtool_fec_hist_range *ranges = hist->ranges; + const struct ethtool_fec_hist_value *values = hist->values; + struct nlattr *nest; + int i, j; + u64 sum; + + if (!ranges) + return 0; + + for (i = 0; i < ETHTOOL_FEC_HIST_MAX; i++) { + if (i && !ranges[i].low && !ranges[i].high) + break; + + if (WARN_ON_ONCE(values[i].sum == ETHTOOL_STAT_NOT_SET && + values[i].per_lane[0] == ETHTOOL_STAT_NOT_SET)) + break; + + nest = nla_nest_start(skb, ETHTOOL_A_FEC_STAT_HIST); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_LOW, + ranges[i].low) || + nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_HIGH, + ranges[i].high)) + goto err_cancel_hist; + sum = 0; + for (j = 0; j < ETHTOOL_MAX_LANES; j++) { + if (values[i].per_lane[j] == ETHTOOL_STAT_NOT_SET) + break; + sum += values[i].per_lane[j]; + } + if (nla_put_uint(skb, ETHTOOL_A_FEC_HIST_BIN_VAL, + values[i].sum == ETHTOOL_STAT_NOT_SET ? + sum : values[i].sum)) + goto err_cancel_hist; + if (j && nla_put_64bit(skb, ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + sizeof(u64) * j, + values[i].per_lane, + ETHTOOL_A_FEC_HIST_PAD)) + goto err_cancel_hist; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel_hist: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) { struct nlattr *nest; @@ -183,6 +251,9 @@ static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) goto err_cancel; + if (fec_put_hist(skb, &data->fec_stat_hist)) + goto err_cancel; + nla_nest_end(skb, nest); return 0; -- cgit v1.2.3 From 105ce7ad57e492b75ab40f2dc591db645fadbaa2 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 24 Sep 2025 23:14:53 +0200 Subject: net: airoha: npu: Add a NPU callback to initialize flow stats Introduce a NPU callback to initialize flow stats and remove NPU stats initialization from airoha_npu_get routine. Add num_stats_entries to airoha_npu_ppe_stats_setup routine. This patch makes the code more readable since NPU statistic are now initialized on demand by the NPU consumer (at the moment NPU statistic are configured just by the airoha_eth driver). Moreover this patch allows the NPU consumer (PPE module) to explicitly enable/disable NPU flow stats. Signed-off-by: Lorenzo Bianconi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250924-airoha-npu-init-stats-callback-v1-1-88bdf3c941b2@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_npu.c | 24 ++++++------------------ drivers/net/ethernet/airoha/airoha_ppe.c | 19 +++++++++++++------ include/linux/soc/airoha/airoha_offload.h | 7 ++++--- 3 files changed, 23 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/airoha/airoha_npu.c b/drivers/net/ethernet/airoha/airoha_npu.c index e1d131d6115c..8c883f2b2d36 100644 --- a/drivers/net/ethernet/airoha/airoha_npu.c +++ b/drivers/net/ethernet/airoha/airoha_npu.c @@ -379,15 +379,13 @@ out: return err; } -static int airoha_npu_stats_setup(struct airoha_npu *npu, - dma_addr_t foe_stats_addr) +static int airoha_npu_ppe_stats_setup(struct airoha_npu *npu, + dma_addr_t foe_stats_addr, + u32 num_stats_entries) { - int err, size = PPE_STATS_NUM_ENTRIES * sizeof(*npu->stats); + int err, size = num_stats_entries * sizeof(*npu->stats); struct ppe_mbox_data *ppe_data; - if (!size) /* flow stats are disabled */ - return 0; - ppe_data = kzalloc(sizeof(*ppe_data), GFP_ATOMIC); if (!ppe_data) return -ENOMEM; @@ -542,7 +540,7 @@ static void airoha_npu_wlan_irq_disable(struct airoha_npu *npu, int q) regmap_clear_bits(npu->regmap, REG_IRQ_RXDONE(q), NPU_IRQ_RX_MASK(q)); } -struct airoha_npu *airoha_npu_get(struct device *dev, dma_addr_t *stats_addr) +struct airoha_npu *airoha_npu_get(struct device *dev) { struct platform_device *pdev; struct device_node *np; @@ -581,17 +579,6 @@ struct airoha_npu *airoha_npu_get(struct device *dev, dma_addr_t *stats_addr) goto error_module_put; } - if (stats_addr) { - int err; - - err = airoha_npu_stats_setup(npu, *stats_addr); - if (err) { - dev_err(dev, "failed to allocate npu stats buffer\n"); - npu = ERR_PTR(err); - goto error_module_put; - } - } - return npu; error_module_put: @@ -643,6 +630,7 @@ static int airoha_npu_probe(struct platform_device *pdev) npu->dev = dev; npu->ops.ppe_init = airoha_npu_ppe_init; npu->ops.ppe_deinit = airoha_npu_ppe_deinit; + npu->ops.ppe_init_stats = airoha_npu_ppe_stats_setup; npu->ops.ppe_flush_sram_entries = airoha_npu_ppe_flush_sram_entries; npu->ops.ppe_foe_commit_entry = airoha_npu_foe_commit_entry; npu->ops.wlan_init_reserved_memory = airoha_npu_wlan_init_memory; diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 78473527ff50..691361b25407 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -1243,12 +1243,11 @@ static int airoha_ppe_flush_sram_entries(struct airoha_ppe *ppe, static struct airoha_npu *airoha_ppe_npu_get(struct airoha_eth *eth) { - struct airoha_npu *npu = airoha_npu_get(eth->dev, - ð->ppe->foe_stats_dma); + struct airoha_npu *npu = airoha_npu_get(eth->dev); if (IS_ERR(npu)) { request_module("airoha-npu"); - npu = airoha_npu_get(eth->dev, ð->ppe->foe_stats_dma); + npu = airoha_npu_get(eth->dev); } return npu; @@ -1257,6 +1256,7 @@ static struct airoha_npu *airoha_ppe_npu_get(struct airoha_eth *eth) static int airoha_ppe_offload_setup(struct airoha_eth *eth) { struct airoha_npu *npu = airoha_ppe_npu_get(eth); + struct airoha_ppe *ppe = eth->ppe; int err; if (IS_ERR(npu)) @@ -1266,12 +1266,19 @@ static int airoha_ppe_offload_setup(struct airoha_eth *eth) if (err) goto error_npu_put; - airoha_ppe_hw_init(eth->ppe); - err = airoha_ppe_flush_sram_entries(eth->ppe, npu); + if (PPE_STATS_NUM_ENTRIES) { + err = npu->ops.ppe_init_stats(npu, ppe->foe_stats_dma, + PPE_STATS_NUM_ENTRIES); + if (err) + goto error_npu_put; + } + + airoha_ppe_hw_init(ppe); + err = airoha_ppe_flush_sram_entries(ppe, npu); if (err) goto error_npu_put; - airoha_ppe_foe_flow_stats_reset(eth->ppe, npu); + airoha_ppe_foe_flow_stats_reset(ppe, npu); rcu_assign_pointer(eth->npu, npu); synchronize_rcu(); diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index 1dc5b4e35ef9..6f66eb339b3f 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -181,6 +181,8 @@ struct airoha_npu { struct { int (*ppe_init)(struct airoha_npu *npu); int (*ppe_deinit)(struct airoha_npu *npu); + int (*ppe_init_stats)(struct airoha_npu *npu, + dma_addr_t addr, u32 num_stats_entries); int (*ppe_flush_sram_entries)(struct airoha_npu *npu, dma_addr_t foe_addr, int sram_num_entries); @@ -206,7 +208,7 @@ struct airoha_npu { }; #if (IS_BUILTIN(CONFIG_NET_AIROHA_NPU) || IS_MODULE(CONFIG_NET_AIROHA_NPU)) -struct airoha_npu *airoha_npu_get(struct device *dev, dma_addr_t *stats_addr); +struct airoha_npu *airoha_npu_get(struct device *dev); void airoha_npu_put(struct airoha_npu *npu); static inline int airoha_npu_wlan_init_reserved_memory(struct airoha_npu *npu) @@ -256,8 +258,7 @@ static inline void airoha_npu_wlan_disable_irq(struct airoha_npu *npu, int q) npu->ops.wlan_disable_irq(npu, q); } #else -static inline struct airoha_npu *airoha_npu_get(struct device *dev, - dma_addr_t *foe_stats_addr) +static inline struct airoha_npu *airoha_npu_get(struct device *dev) { return NULL; } -- cgit v1.2.3 From c5273f6ca166c4edfaa6a87570e111453a0576ad Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:40 +0200 Subject: mptcp: pm: rename 'subflows' to 'extra_subflows' A few variables linked to the Path-Managers are confusing, and it would help current and future developers, to clarify them. One of them is 'subflows', which in fact represents the number of extra subflows: all the additional subflows created after the initial one, and not the total number of subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_extra_subflows. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-5-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 13 ++++++------ net/mptcp/pm_kernel.c | 24 +++++++++++------------ net/mptcp/pm_userspace.c | 2 +- net/mptcp/protocol.h | 6 +++--- net/mptcp/sockopt.c | 4 ++-- tools/testing/selftests/bpf/progs/mptcp_subflow.c | 2 +- 7 files changed, 27 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 15eef878690b..f807c8dba56e 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -42,6 +42,7 @@ struct mptcp_info { __u8 mptcpi_subflows; + #define mptcpi_extra_subflows mptcpi_subflows __u8 mptcpi_add_addr_signal; __u8 mptcpi_add_addr_accepted; __u8 mptcpi_subflows_max; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 584cab90aa6e..332e96bdadc0 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -489,7 +489,7 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) if (mptcp_pm_is_userspace(msk)) { if (mptcp_userspace_pm_active(msk)) { spin_lock_bh(&pm->lock); - pm->subflows++; + pm->extra_subflows++; spin_unlock_bh(&pm->lock); return true; } @@ -498,8 +498,9 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) subflows_max = mptcp_pm_get_subflows_max(msk); - pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows, - subflows_max, READ_ONCE(pm->accept_subflow)); + pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, + pm->extra_subflows, subflows_max, + READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ if (!READ_ONCE(pm->accept_subflow)) @@ -507,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->subflows < subflows_max; - if (ret && ++pm->subflows == subflows_max) + ret = pm->extra_subflows < subflows_max; + if (ret && ++pm->extra_subflows == subflows_max) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -594,7 +595,7 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, if (mptcp_pm_is_userspace(msk)) { if (update_subflows) { spin_lock_bh(&pm->lock); - pm->subflows--; + pm->extra_subflows--; spin_unlock_bh(&pm->lock); } return; diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index a82c077b8a20..20bee6fc0625 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -175,7 +175,7 @@ fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local, if (!mptcp_pm_addr_families_match(sk, local, &remote)) return 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; *addrs = remote; return 1; @@ -218,10 +218,10 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, /* forbid creating multiple address towards this id */ __set_bit(addrs[i].id, unavail_id); - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } @@ -313,7 +313,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.subflows, subflows_max); + msk->pm.extra_subflows, subflows_max); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { @@ -353,7 +353,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { + msk->pm.extra_subflows < subflows_max) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; @@ -441,10 +441,10 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, if (is_id0) local->addr.id = 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } rcu_read_unlock(); @@ -483,10 +483,10 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, continue; msk->pm.local_addr_used++; - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } @@ -510,7 +510,7 @@ fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote, if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) return 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; return 1; } @@ -586,7 +586,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (remote.id) msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) + msk->pm.extra_subflows >= subflows_max) WRITE_ONCE(msk->pm.accept_addr, false); } } @@ -1427,7 +1427,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || + if (msk->pm.extra_subflows == mptcp_pm_get_subflows_max(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a715dcbe0146..8cbc1920afb4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -419,7 +419,7 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) if (err) mptcp_userspace_pm_delete_local_addr(msk, &entry); else - msk->pm.subflows++; + msk->pm.extra_subflows++; spin_unlock_bh(&msk->pm.lock); create_err: diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index cbe54331e5c7..ca68f9a75801 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -235,7 +235,7 @@ struct mptcp_pm_data { u8 add_addr_accepted; u8 local_addr_used; u8 pm_type; - u8 subflows; + u8 extra_subflows; u8 status; ); @@ -1188,7 +1188,7 @@ unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { - if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) + if (--msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } @@ -1204,7 +1204,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && mptcp_pm_get_add_addr_accept_max(msk) == 0 && - msk->pm.subflows < mptcp_pm_get_subflows_max(msk); + msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2abe6f1e9940..17966da80239 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -962,7 +962,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) memset(info, 0, sizeof(*info)); - info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); @@ -996,7 +996,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_bytes_sent = msk->bytes_sent; info->mptcpi_bytes_received = msk->bytes_received; info->mptcpi_bytes_retrans = msk->bytes_retrans; - info->mptcpi_subflows_total = info->mptcpi_subflows + + info->mptcpi_subflows_total = info->mptcpi_extra_subflows + __mptcp_has_initial_subflow(msk); now = tcp_jiffies32; info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent); diff --git a/tools/testing/selftests/bpf/progs/mptcp_subflow.c b/tools/testing/selftests/bpf/progs/mptcp_subflow.c index 70302477e326..41389e579578 100644 --- a/tools/testing/selftests/bpf/progs/mptcp_subflow.c +++ b/tools/testing/selftests/bpf/progs/mptcp_subflow.c @@ -117,7 +117,7 @@ int _getsockopt_subflow(struct bpf_sockopt *ctx) return 1; msk = bpf_core_cast(sk, struct mptcp_sock); - if (msk->pm.subflows != 1) { + if (msk->pm.extra_subflows != 1) { ctx->retval = -1; return 1; } -- cgit v1.2.3 From 3eb3c9a9596a53880f7d7eff28ac5622f3e0ba37 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:41 +0200 Subject: mptcp: pm: in-kernel: rename 'subflows_max' to 'limit_extra_subflows' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'subflows_max', which in fact represents the limit of extra subflows: the limit set via 'ip mptcp limit subflows X' for example. It is not linked to the maximum number of created / possible subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_limit_extra_subflows. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-6-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 12 ++++++------ net/mptcp/pm_kernel.c | 48 ++++++++++++++++++++++++---------------------- net/mptcp/protocol.h | 6 +++--- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 37 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index f807c8dba56e..314200c61f15 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -46,6 +46,7 @@ struct mptcp_info { __u8 mptcpi_add_addr_signal; __u8 mptcpi_add_addr_accepted; __u8 mptcpi_subflows_max; + #define mptcpi_limit_extra_subflows mptcpi_subflows_max __u8 mptcpi_add_addr_signal_max; __u8 mptcpi_add_addr_accepted_max; __u32 mptcpi_flags; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 332e96bdadc0..502f6c235e06 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -483,7 +483,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; - unsigned int subflows_max; + unsigned int limit_extra_subflows; int ret = 0; if (mptcp_pm_is_userspace(msk)) { @@ -496,10 +496,10 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) return false; } - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, - pm->extra_subflows, subflows_max, + pm->extra_subflows, limit_extra_subflows, READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ @@ -508,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->extra_subflows < subflows_max; - if (ret && ++pm->extra_subflows == subflows_max) + ret = pm->extra_subflows < limit_extra_subflows; + if (ret && ++pm->extra_subflows == limit_extra_subflows) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -1029,7 +1029,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) WRITE_ONCE(pm->pm_type, pm_type); if (pm_type == MPTCP_PM_TYPE_KERNEL) { - bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk); + bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk); /* pm->work_pending must be only be set to 'true' when * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 20bee6fc0625..db0d254d0e6b 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -23,7 +23,7 @@ struct pm_nl_pernet { unsigned int add_addr_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; - unsigned int subflows_max; + unsigned int limit_extra_subflows; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; @@ -62,13 +62,13 @@ unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->subflows_max); + return READ_ONCE(pernet->limit_extra_subflows); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) { @@ -190,10 +190,10 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); struct sock *sk = (struct sock *)msk, *ssk; struct mptcp_subflow_context *subflow; - unsigned int subflows_max; + unsigned int limit_extra_subflows; int i = 0; - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* Forbid creation of new subflows matching existing ones, possibly * already created by incoming ADD_ADDR @@ -221,7 +221,7 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } @@ -274,18 +274,18 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; unsigned int add_addr_signal_max; bool signal_and_subflow = false; unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; - unsigned int subflows_max; pernet = pm_nl_get_pernet(sock_net(sk)); add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* do lazy endpoint usage accounting for the MPC subflows */ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { @@ -313,7 +313,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.extra_subflows, subflows_max); + msk->pm.extra_subflows, limit_extra_subflows); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { @@ -353,7 +353,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && - msk->pm.extra_subflows < subflows_max) { + msk->pm.extra_subflows < limit_extra_subflows) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; @@ -402,14 +402,15 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, bool c_flag_case) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - unsigned int subflows_max = mptcp_pm_get_subflows_max(msk); struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; + unsigned int limit_extra_subflows; struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; mptcp_local_address((struct sock_common *)msk, &mpc_addr); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { @@ -444,7 +445,7 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } rcu_read_unlock(); @@ -459,13 +460,14 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, { unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - unsigned int subflows_max = mptcp_pm_get_subflows_max(msk); struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; mptcp_local_address((struct sock_common *)msk, &mpc_addr); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); while (msk->pm.local_addr_used < local_addr_max) { local = &locals[i]; @@ -486,7 +488,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } @@ -544,14 +546,14 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; unsigned int add_addr_accept_max; struct mptcp_addr_info remote; - unsigned int subflows_max; bool sf_created = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("accepted %d:%d remote family %d\n", msk->pm.add_addr_accepted, add_addr_accept_max, @@ -586,7 +588,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (remote.id) msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.extra_subflows >= subflows_max) + msk->pm.extra_subflows >= limit_extra_subflows) WRITE_ONCE(msk->pm.accept_addr, false); } } @@ -1285,13 +1287,13 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto unlock; - subflows = pernet->subflows_max; + subflows = pernet->limit_extra_subflows; ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); if (ret) goto unlock; WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); - WRITE_ONCE(pernet->subflows_max, subflows); + WRITE_ONCE(pernet->limit_extra_subflows, subflows); unlock: spin_unlock_bh(&pernet->lock); @@ -1318,7 +1320,7 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, - READ_ONCE(pernet->subflows_max))) + READ_ONCE(pernet->limit_extra_subflows))) goto fail; genlmsg_end(msg, reply); @@ -1427,7 +1429,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - if (msk->pm.extra_subflows == mptcp_pm_get_subflows_max(msk) || + if (msk->pm.extra_subflows == mptcp_pm_get_limit_extra_subflows(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); @@ -1462,7 +1464,7 @@ static int __net_init pm_nl_init_net(struct net *net) INIT_LIST_HEAD_RCU(&pernet->local_addr_list); /* Cit. 2 subflows ought to be enough for anybody. */ - pernet->subflows_max = 2; + pernet->limit_extra_subflows = 2; pernet->next_id = 1; pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ca68f9a75801..4c777f87b049 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1182,13 +1182,13 @@ void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { - if (--msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk)) + if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } @@ -1204,7 +1204,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && mptcp_pm_get_add_addr_accept_max(msk) == 0 && - msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk); + msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 17966da80239..4e82bcfcd34e 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -972,8 +972,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { - info->mptcpi_subflows_max = - mptcp_pm_get_subflows_max(msk); + info->mptcpi_limit_extra_subflows = + mptcp_pm_get_limit_extra_subflows(msk); info->mptcpi_add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); info->mptcpi_add_addr_accepted_max = -- cgit v1.2.3 From 45cae570664d58c562e21a3c7409fc02147bba46 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:42 +0200 Subject: mptcp: pm: in-kernel: rename 'add_addr_signal_max' to 'endp_signal_max' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'add_addr_signal_max', which in fact represents the maximum number of 'signal' endpoints that can be used to announced addresses, and not the number of ADD_ADDR that can be signalled. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_endp_signal_max. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-7-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 26 +++++++++++++------------- net/mptcp/protocol.h | 2 +- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 314200c61f15..69fc20db1c2f 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -48,6 +48,7 @@ struct mptcp_info { __u8 mptcpi_subflows_max; #define mptcpi_limit_extra_subflows mptcpi_subflows_max __u8 mptcpi_add_addr_signal_max; + #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; __u32 mptcpi_flags; __u32 mptcpi_token; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 502f6c235e06..1100ba8b1ce8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1037,7 +1037,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows_allowed) || - !!mptcp_pm_get_add_addr_signal_max(msk)); + !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows_allowed); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index db0d254d0e6b..740f0b20b941 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -20,7 +20,7 @@ struct pm_nl_pernet { struct list_head local_addr_list; unsigned int addrs; unsigned int stale_loss_cnt; - unsigned int add_addr_signal_max; + unsigned int endp_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; unsigned int limit_extra_subflows; @@ -46,13 +46,13 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) return pm_nl_get_pernet(genl_info_net(info)); } -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) { const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_signal_max); + return READ_ONCE(pernet->endp_signal_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) { @@ -275,15 +275,15 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; - unsigned int add_addr_signal_max; bool signal_and_subflow = false; + unsigned int endp_signal_max; unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; pernet = pm_nl_get_pernet(sock_net(sk)); - add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); + endp_signal_max = mptcp_pm_get_endp_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); @@ -312,11 +312,11 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, - msk->pm.add_addr_signaled, add_addr_signal_max, + msk->pm.add_addr_signaled, endp_signal_max, msk->pm.extra_subflows, limit_extra_subflows); /* check first for announce */ - if (msk->pm.add_addr_signaled < add_addr_signal_max) { + if (msk->pm.add_addr_signaled < endp_signal_max) { /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -699,8 +699,8 @@ find_next: pernet->next_id = entry->addr.id; if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; @@ -1098,8 +1098,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; @@ -1185,7 +1185,7 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { - WRITE_ONCE(pernet->add_addr_signal_max, 0); + WRITE_ONCE(pernet->endp_signal_max, 0); WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 4c777f87b049..86c30cd6c1f2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1180,7 +1180,7 @@ void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 4e82bcfcd34e..4688e0f25d15 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -974,8 +974,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) if (mptcp_pm_is_kernel(msk)) { info->mptcpi_limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - info->mptcpi_add_addr_signal_max = - mptcp_pm_get_add_addr_signal_max(msk); + info->mptcpi_endp_signal_max = + mptcp_pm_get_endp_signal_max(msk); info->mptcpi_add_addr_accepted_max = mptcp_pm_get_add_addr_accept_max(msk); info->mptcpi_local_addr_max = -- cgit v1.2.3 From 37712d84dfc2e80d4d218ff9be490c86e604aa69 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:43 +0200 Subject: mptcp: pm: in-kernel: rename 'add_addr_accept_max' to 'limit_add_addr_accepted' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'add_addr_accept_max', which in fact represents the limit of ADD_ADDR that can be accepted: the limit set via 'ip mptcp limit add_addr_accepted X' for example. It is not linked to the maximum number of accepted ADD_ADDR. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_limit_add_addr_accepted. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-8-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 27 +++++++++++++++------------ net/mptcp/protocol.h | 4 ++-- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 21 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 69fc20db1c2f..1c275ce96b52 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -50,6 +50,7 @@ struct mptcp_info { __u8 mptcpi_add_addr_signal_max; #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; + #define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max __u32 mptcpi_flags; __u32 mptcpi_token; __u64 mptcpi_write_seq; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 1100ba8b1ce8..e13bfec50ef8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1039,7 +1039,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) subflows_allowed) || !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, - !!mptcp_pm_get_add_addr_accept_max(msk) && + !!mptcp_pm_get_limit_add_addr_accepted(msk) && subflows_allowed); WRITE_ONCE(pm->accept_subflow, subflows_allowed); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 740f0b20b941..92f7419485a8 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,7 +21,7 @@ struct pm_nl_pernet { unsigned int addrs; unsigned int stale_loss_cnt; unsigned int endp_signal_max; - unsigned int add_addr_accept_max; + unsigned int limit_add_addr_accepted; unsigned int local_addr_max; unsigned int limit_extra_subflows; unsigned int next_id; @@ -54,13 +54,13 @@ unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_accept_max); + return READ_ONCE(pernet->limit_add_addr_accepted); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) { @@ -547,16 +547,16 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; - unsigned int add_addr_accept_max; + unsigned int limit_add_addr_accepted; struct mptcp_addr_info remote; bool sf_created = false; int i, nr; - add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); + limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("accepted %d:%d remote family %d\n", - msk->pm.add_addr_accepted, add_addr_accept_max, + msk->pm.add_addr_accepted, limit_add_addr_accepted, msk->pm.remote.family); remote = msk->pm.remote; @@ -587,7 +587,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) /* add_addr_accepted is not decr for ID 0 */ if (remote.id) msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || + if (msk->pm.add_addr_accepted >= limit_add_addr_accepted || msk->pm.extra_subflows >= limit_extra_subflows) WRITE_ONCE(msk->pm.accept_addr, false); } @@ -596,10 +596,13 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) { if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + unsigned int limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); + /* Note: if the subflow has been closed before, this * add_addr_accepted counter will not be decremented. */ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + if (--msk->pm.add_addr_accepted < limit_add_addr_accepted) WRITE_ONCE(msk->pm.accept_addr, true); } } @@ -1282,7 +1285,7 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) int ret; spin_lock_bh(&pernet->lock); - rcv_addrs = pernet->add_addr_accept_max; + rcv_addrs = pernet->limit_add_addr_accepted; ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); if (ret) goto unlock; @@ -1292,7 +1295,7 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto unlock; - WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); + WRITE_ONCE(pernet->limit_add_addr_accepted, rcv_addrs); WRITE_ONCE(pernet->limit_extra_subflows, subflows); unlock: @@ -1316,7 +1319,7 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, - READ_ONCE(pernet->add_addr_accept_max))) + READ_ONCE(pernet->limit_add_addr_accepted))) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 86c30cd6c1f2..114995e1352d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1181,7 +1181,7 @@ void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); @@ -1203,7 +1203,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && - mptcp_pm_get_add_addr_accept_max(msk) == 0 && + mptcp_pm_get_limit_add_addr_accepted(msk) == 0 && msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 4688e0f25d15..5ab9909dbe79 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -976,8 +976,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_limit_extra_subflows(msk); info->mptcpi_endp_signal_max = mptcp_pm_get_endp_signal_max(msk); - info->mptcpi_add_addr_accepted_max = - mptcp_pm_get_add_addr_accept_max(msk); + info->mptcpi_limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); } -- cgit v1.2.3 From e7757b6d3a623671705388be24851af7360b54ba Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:44 +0200 Subject: mptcp: pm: in-kernel: rename 'local_addr_max' to 'endp_subflow_max' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'local_addr_max', which in fact represents the maximum number of 'subflow' endpoints that can be used to create new subflows, and not the number of local addresses that have been used to create subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_endp_subflow_max. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. Also move the variable and function next to the other 'endp_X_max' ones. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-9-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 40 ++++++++++++++++++++-------------------- net/mptcp/protocol.h | 2 +- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 1c275ce96b52..5ec996977b3f 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -58,6 +58,7 @@ struct mptcp_info { __u64 mptcpi_rcv_nxt; __u8 mptcpi_local_addr_used; __u8 mptcpi_local_addr_max; + #define mptcpi_endp_subflow_max mptcpi_local_addr_max __u8 mptcpi_csum_enabled; __u32 mptcpi_retransmits; __u64 mptcpi_bytes_retrans; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index e13bfec50ef8..2ff1b9499568 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1035,7 +1035,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL */ WRITE_ONCE(pm->work_pending, - (!!mptcp_pm_get_local_addr_max(msk) && + (!!mptcp_pm_get_endp_subflow_max(msk) && subflows_allowed) || !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 92f7419485a8..e62e21eb9da1 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,8 +21,8 @@ struct pm_nl_pernet { unsigned int addrs; unsigned int stale_loss_cnt; unsigned int endp_signal_max; + unsigned int endp_subflow_max; unsigned int limit_add_addr_accepted; - unsigned int local_addr_max; unsigned int limit_extra_subflows; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); @@ -54,6 +54,14 @@ unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); +unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->endp_subflow_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); + unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -70,14 +78,6 @@ unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->local_addr_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); - static bool lookup_subflow_by_daddr(const struct list_head *list, const struct mptcp_addr_info *daddr) { @@ -276,15 +276,15 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; bool signal_and_subflow = false; + unsigned int endp_subflow_max; unsigned int endp_signal_max; - unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; pernet = pm_nl_get_pernet(sock_net(sk)); endp_signal_max = mptcp_pm_get_endp_signal_max(msk); - local_addr_max = mptcp_pm_get_local_addr_max(msk); + endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* do lazy endpoint usage accounting for the MPC subflows */ @@ -311,7 +311,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) } pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", - msk->pm.local_addr_used, local_addr_max, + msk->pm.local_addr_used, endp_subflow_max, msk->pm.add_addr_signaled, endp_signal_max, msk->pm.extra_subflows, limit_extra_subflows); @@ -352,7 +352,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) subflow: /* check if should create a new subflow */ - while (msk->pm.local_addr_used < local_addr_max && + while (msk->pm.local_addr_used < endp_subflow_max && msk->pm.extra_subflows < limit_extra_subflows) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; @@ -458,7 +458,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, struct mptcp_addr_info *remote, struct mptcp_pm_local *locals) { - unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); + unsigned int endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; @@ -469,7 +469,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, mptcp_local_address((struct sock_common *)msk, &mpc_addr); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - while (msk->pm.local_addr_used < local_addr_max) { + while (msk->pm.local_addr_used < endp_subflow_max) { local = &locals[i]; if (!select_local_address(pernet, msk, local)) @@ -706,8 +706,8 @@ find_next: WRITE_ONCE(pernet->endp_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max + 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); } pernet->addrs++; @@ -1105,8 +1105,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) WRITE_ONCE(pernet->endp_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max - 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); } pernet->addrs--; @@ -1189,7 +1189,7 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->endp_signal_max, 0); - WRITE_ONCE(pernet->local_addr_max, 0); + WRITE_ONCE(pernet->endp_subflow_max, 0); pernet->addrs = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 114995e1352d..df8f977039d0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1181,9 +1181,9 @@ void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 5ab9909dbe79..92a2a2742627 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -978,8 +978,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_endp_signal_max(msk); info->mptcpi_limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); - info->mptcpi_local_addr_max = - mptcp_pm_get_local_addr_max(msk); + info->mptcpi_endp_subflow_max = + mptcp_pm_get_endp_subflow_max(msk); } if (__mptcp_check_fallback(msk)) -- cgit v1.2.3 From 539f6b9de39ec5d827b16f6f5c8f3cfd58669e93 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:50 +0200 Subject: mptcp: pm: in-kernel: add laminar endpoints Currently, upon the reception of an ADD_ADDR (and when the fullmesh flag is not used), the in-kernel PM will create new subflows using the local address the routing configuration will pick. It would be easier to pick local addresses from a selected list of endpoints, and use it only once, than relying on routing rules. Use case: both the client (C) and the server (S) have two addresses (a and b). The client establishes the connection between C(a) and S(a). Once established, the server announces its additional address S(b). Once received, the client connects to it using its second address C(b). Compared to a situation without the 'laminar' endpoint for C(b), the client didn't use this address C(b) to establish a subflow to the server's primary address S(a). So at the end, we have: C S C(a) --- S(a) C(b) --- S(b) In case of a 3rd address on each side (C(c) and S(c)), upon the reception of an ADD_ADDR with S(c), the client should not pick C(b) because it has already been used. C(c) should then be used. Note that this situation is currently possible if C doesn't add any endpoint, but configure the routing in order to pick C(b) for the route to S(b), and pick C(c) for the route to S(c). That doesn't sound very practical because it means knowing in advance the IP addresses that will be used and announced by the server. 'laminar', like the idea of laminar flows: the different subflows don't mix with each other on an endpoint, unlike the "turbulent" way traffic is mixed by 'fullmesh'. In the code, the new endpoint type is added. Similar to the other subflow types, an MPTCP_INFO counter is added. While at it, hole are now commented in struct mptcp_info, to remember next time that these holes can no longer be used. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/503 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-15-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 6 +++- net/mptcp/pm_kernel.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 1 + net/mptcp/sockopt.c | 2 ++ 4 files changed, 90 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5ec996977b3f..87cfab874e24 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -39,6 +39,7 @@ #define MPTCP_PM_ADDR_FLAG_BACKUP _BITUL(2) #define MPTCP_PM_ADDR_FLAG_FULLMESH _BITUL(3) #define MPTCP_PM_ADDR_FLAG_IMPLICIT _BITUL(4) +#define MPTCP_PM_ADDR_FLAG_LAMINAR _BITUL(5) struct mptcp_info { __u8 mptcpi_subflows; @@ -51,6 +52,7 @@ struct mptcp_info { #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; #define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max + /* 16-bit hole that can no longer be filled */ __u32 mptcpi_flags; __u32 mptcpi_token; __u64 mptcpi_write_seq; @@ -60,13 +62,15 @@ struct mptcp_info { __u8 mptcpi_local_addr_max; #define mptcpi_endp_subflow_max mptcpi_local_addr_max __u8 mptcpi_csum_enabled; + /* 8-bit hole that can no longer be filled */ __u32 mptcpi_retransmits; __u64 mptcpi_bytes_retrans; __u64 mptcpi_bytes_sent; __u64 mptcpi_bytes_received; __u64 mptcpi_bytes_acked; __u8 mptcpi_subflows_total; - __u8 reserved[3]; + __u8 mptcpi_endp_laminar_max; + __u8 reserved[2]; __u32 mptcpi_last_data_sent; __u32 mptcpi_last_data_recv; __u32 mptcpi_last_ack_recv; diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 55dbf89d19b8..e0f44dc232aa 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,6 +21,7 @@ struct pm_nl_pernet { u8 endpoints; u8 endp_signal_max; u8 endp_subflow_max; + u8 endp_laminar_max; u8 limit_add_addr_accepted; u8 limit_extra_subflows; u8 next_id; @@ -61,6 +62,14 @@ u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->endp_laminar_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max); + u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -458,6 +467,66 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, return i; } +static unsigned int +fill_local_laminar_endp(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_pm_local *local; + int found = 0; + + /* Forbid creation of new subflows matching existing ones, possibly + * already created by 'subflow' endpoints + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if ((1 << inet_sk_state_load(ssk)) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | + TCPF_CLOSE)) + continue; + + __set_bit(subflow_get_local_id(subflow), unavail_id); + } + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR)) + continue; + + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; + + if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr), + unavail_id)) + continue; + + local = &locals[0]; + local->addr = entry->addr; + local->flags = entry->flags; + local->ifindex = entry->ifindex; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + + if (local->addr.id != msk->mpc_endpoint_id) + msk->pm.local_addr_used++; + } + + msk->pm.extra_subflows++; + found = 1; + break; + } + rcu_read_unlock(); + + return found; +} + static unsigned int fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, struct mptcp_addr_info *remote, @@ -532,6 +601,10 @@ fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, if (i) return i; + /* If there is at least one MPTCP endpoint with a laminar flag */ + if (mptcp_pm_get_endp_laminar_max(msk)) + return fill_local_laminar_endp(msk, remote, locals); + /* Special case: peer sets the C flag, accept one ADD_ADDR if default * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints */ @@ -707,6 +780,10 @@ find_next: addr_max = pernet->endp_subflow_max; WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1); + } pernet->endpoints++; if (!entry->addr.port) @@ -1100,6 +1177,10 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) addr_max = pernet->endp_subflow_max; WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1); + } pernet->endpoints--; list_del_rcu(&entry->list); @@ -1182,6 +1263,7 @@ static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->endp_signal_max, 0); WRITE_ONCE(pernet->endp_subflow_max, 0); + WRITE_ONCE(pernet->endp_laminar_max, 0); pernet->endpoints = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0cd3333cafaf..371084a3fc22 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1182,6 +1182,7 @@ void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 92a2a2742627..a28a48385885 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -980,6 +980,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_limit_add_addr_accepted(msk); info->mptcpi_endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); + info->mptcpi_endp_laminar_max = + mptcp_pm_get_endp_laminar_max(msk); } if (__mptcp_check_fallback(msk)) -- cgit v1.2.3 From 87608c2a7718dcac5deef801fb3c18cf36fb0233 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 26 Sep 2025 17:52:39 +0800 Subject: bpf: Remove duplicate crypto/sha2.h header ./include/linux/bpf.h: crypto/sha2.h is included more than once. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=25501 Signed-off-by: Jiapeng Chong Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20250926095240.3397539-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6338e54a9b1f..fe2a396d8ac6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -32,7 +32,6 @@ #include #include #include -#include struct bpf_verifier_env; struct bpf_verifier_log; -- cgit v1.2.3 From fed7eaa4f037361fe4f3d4170649d6849a25998d Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 25 Sep 2025 12:42:16 -0700 Subject: PM: runtime: Update kerneldoc return codes APIs based on __pm_runtime_idle() (pm_runtime_idle(), pm_request_idle()) do not return 1 when already suspended. They return -EAGAIN. This is already covered in the docs, so the entry for "1" is redundant and conflicting. (pm_runtime_put() and pm_runtime_put_sync() were previously incorrect, but that's fixed in "PM: runtime: pm_runtime_put{,_sync}() returns 1 when already suspended", to ensure consistency with APIs like pm_runtime_put_autosuspend().) RPM_GET_PUT APIs based on __pm_runtime_suspend() do return 1 when already suspended, but the language is a little unclear -- it's not really an "error", so it seems better to list as a clarification before the 0/success case. Additionally, they only actually return 1 when the refcount makes it to 0; if the usage counter is still non-zero, we return 0. pm_runtime_put(), etc., also don't appear at first like they can ever see "-EAGAIN: Runtime PM usage_count non-zero", because in non-racy conditions, pm_runtime_put() would drop its reference count, see it's non-zero, and return early (in __pm_runtime_idle()). However, it's possible to race with another actor that increments the usage_count afterward, since rpm_idle() is protected by a separate lock; in such a case, we may see -EAGAIN. Because this case is only seen in the presence of concurrent actors, it makes sense to clarify that this is when "usage_count **became** non-zero", by way of some racing actor. Lastly, pm_runtime_put_sync_suspend() duplicated some -EAGAIN language. Fix that. Fixes: 271ff96d6066 ("PM: runtime: Document return values of suspend-related API functions") Link: https://lore.kernel.org/linux-pm/aJ5pkEJuixTaybV4@google.com/ Signed-off-by: Brian Norris Reviewed-by: Sakari Ailus Cc: 6.17+ # 6.17+ Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 56 +++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d88d6b6ccf5b..d1ff76e0e2d0 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -350,13 +350,12 @@ static inline int pm_runtime_force_resume(struct device *dev) { return -ENXIO; } * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing - * or device not in %RPM_ACTIVE state. + * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change + * ongoing or device not in %RPM_ACTIVE state. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM idle and suspend callbacks. */ @@ -370,14 +369,15 @@ static inline int pm_runtime_idle(struct device *dev) * @dev: Target device. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -396,14 +396,15 @@ static inline int pm_runtime_suspend(struct device *dev) * engaging its "idle check" callback. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -433,13 +434,12 @@ static inline int pm_runtime_resume(struct device *dev) * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing - * or device not in %RPM_ACTIVE state. + * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change + * ongoing or device not in %RPM_ACTIVE state. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_request_idle(struct device *dev) { @@ -464,15 +464,16 @@ static inline int pm_request_resume(struct device *dev) * equivalent pm_runtime_autosuspend() for @dev asynchronously. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_request_autosuspend(struct device *dev) { @@ -540,15 +541,16 @@ static inline int pm_runtime_resume_and_get(struct device *dev) * equal to 0, queue up a work item for @dev like in pm_request_idle(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_runtime_put(struct device *dev) { @@ -565,15 +567,16 @@ DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T)) * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int __pm_runtime_put_autosuspend(struct device *dev) { @@ -590,15 +593,16 @@ static inline int __pm_runtime_put_autosuspend(struct device *dev) * in pm_request_autosuspend(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_runtime_put_autosuspend(struct device *dev) { @@ -619,14 +623,15 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -646,15 +651,15 @@ static inline int pm_runtime_put_sync(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. - * * -EAGAIN: usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -677,15 +682,16 @@ static inline int pm_runtime_put_sync_suspend(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ -- cgit v1.2.3 From 4540aed51b12bc13364149bf95f6ecef013197c0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Sep 2025 19:12:00 +0200 Subject: bpf: Enforce expected_attach_type for tailcall compatibility Yinhao et al. recently reported: Our fuzzer tool discovered an uninitialized pointer issue in the bpf_prog_test_run_xdp() function within the Linux kernel's BPF subsystem. This leads to a NULL pointer dereference when a BPF program attempts to deference the txq member of struct xdp_buff object. The test initializes two programs of BPF_PROG_TYPE_XDP: progA acts as the entry point for bpf_prog_test_run_xdp() and its expected_attach_type can neither be of be BPF_XDP_DEVMAP nor BPF_XDP_CPUMAP. progA calls into a slot of a tailcall map it owns. progB's expected_attach_type must be BPF_XDP_DEVMAP to pass xdp_is_valid_access() validation. The program returns struct xdp_md's egress_ifindex, and the latter is only allowed to be accessed under mentioned expected_attach_type. progB is then inserted into the tailcall which progA calls. The underlying issue goes beyond XDP though. Another example are programs of type BPF_PROG_TYPE_CGROUP_SOCK_ADDR. sock_addr_is_valid_access() as well as sock_addr_func_proto() have different logic depending on the programs' expected_attach_type. Similarly, a program attached to BPF_CGROUP_INET4_GETPEERNAME should not be allowed doing a tailcall into a program which calls bpf_bind() out of BPF which is only enabled for BPF_CGROUP_INET4_CONNECT. In short, specifying expected_attach_type allows to open up additional functionality or restrictions beyond what the basic bpf_prog_type enables. The use of tailcalls must not violate these constraints. Fix it by enforcing expected_attach_type in __bpf_prog_map_compatible(). Note that we only enforce this for tailcall maps, but not for BPF devmaps or cpumaps: There, the programs are invoked through dev_map_bpf_prog_run*() and cpu_map_bpf_prog_run*() which set up a new environment / context and therefore these situations are not prone to this issue. Fixes: 5e43f899b03a ("bpf: Check attach type at prog load time") Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Reviewed-by: Dongliang Mu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250926171201.188490-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fe2a396d8ac6..a98c83346134 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -289,6 +289,7 @@ struct bpf_map_owner { bool xdp_has_frags; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; + enum bpf_attach_type expected_attach_type; }; struct bpf_map { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9b64674df16b..d595fe512498 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2361,6 +2361,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->expected_attach_type = fp->expected_attach_type; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { map->owner->storage_cookie[i] = @@ -2372,6 +2373,10 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, ret = map->owner->type == prog_type && map->owner->jited == fp->jited && map->owner->xdp_has_frags == aux->xdp_has_frags; + if (ret && + map->map_type == BPF_MAP_TYPE_PROG_ARRAY && + map->owner->expected_attach_type != fp->expected_attach_type) + ret = false; for_each_cgroup_storage_type(i) { if (!ret) break; -- cgit v1.2.3 From d4e99db3d942c8099006f3b7536bc52f766b475a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 10 Aug 2025 23:53:20 +0200 Subject: Bluetooth: Annotate struct hci_drv_rp_read_info with __counted_by_le() Add the __counted_by_le() compiler attribute to the flexible array member 'supported_commands' to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Signed-off-by: Thorsten Blum Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_drv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_drv.h b/include/net/bluetooth/hci_drv.h index 2f01c44f05ec..3fd6fdbdb02e 100644 --- a/include/net/bluetooth/hci_drv.h +++ b/include/net/bluetooth/hci_drv.h @@ -47,7 +47,7 @@ struct hci_drv_ev_cmd_complete { struct hci_drv_rp_read_info { __u8 driver_name[HCI_DRV_MAX_DRIVER_NAME_LENGTH]; __le16 num_supported_commands; - __le16 supported_commands[]; + __le16 supported_commands[] __counted_by_le(num_supported_commands); } __packed; /* Driver specific OGF (Opcode Group Field) -- cgit v1.2.3 From 339a87883a14d6a818ca436fed41aa5d10e0f4bd Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Aug 2025 15:21:19 -0400 Subject: Bluetooth: ISO: Use sk_sndtimeo as conn_timeout This aligns the usage of socket sk_sndtimeo as conn_timeout when initiating a connection and then use it when scheduling the resulting HCI command, similar to what has been done in bf98feea5b65 ("Bluetooth: hci_conn: Always use sk_timeo as conn_timeout"). Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 10 ++++++---- net/bluetooth/hci_conn.c | 20 ++++++++++++-------- net/bluetooth/iso.c | 16 ++++++++++------ 3 files changed, 28 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6560b32f3125..a068beae9318 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1587,16 +1587,18 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, __u16 setting, struct bt_codec *codec, u16 timeout); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos); + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout); struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base); + __u8 base_len, __u8 *base, u16 timeout); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos); + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout); struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos, - __u8 data_len, __u8 *data); + __u8 data_len, __u8 *data, u16 timeout); struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 091cff2155e6..111f0e37b672 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1539,7 +1539,7 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; @@ -1581,6 +1581,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, conn->state = BT_CONNECT; conn->sid = sid; + conn->conn_timeout = timeout; hci_conn_hold(conn); return conn; @@ -1921,7 +1922,8 @@ done: } struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos) + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout) { struct hci_conn *cis; @@ -1936,6 +1938,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, cis->dst_type = dst_type; cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET; cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET; + cis->conn_timeout = timeout; } if (cis->state == BT_CONNECTED) @@ -2175,7 +2178,7 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; struct hci_conn *parent; @@ -2196,7 +2199,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ - conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir); + conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir, timeout); if (IS_ERR(conn)) return conn; @@ -2249,13 +2252,13 @@ static void bis_mark_per_adv(struct hci_conn *conn, void *data) struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; struct iso_list_data data; - conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base); + conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base, timeout); if (IS_ERR(conn)) return conn; @@ -2298,7 +2301,8 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, } struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos) + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout) { struct hci_conn *le; struct hci_conn *cis; @@ -2322,7 +2326,7 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, hci_iso_qos_setup(hdev, le, &qos->ucast.in, le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys); - cis = hci_bind_cis(hdev, dst, dst_type, qos); + cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout); if (IS_ERR(cis)) { hci_conn_drop(le); return cis; diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index ac6e83313b9b..5c68c0ea7d97 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -91,8 +91,8 @@ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, iso_sock_match_t match, void *data); /* ---- ISO timers ---- */ -#define ISO_CONN_TIMEOUT (HZ * 40) -#define ISO_DISCONN_TIMEOUT (HZ * 2) +#define ISO_CONN_TIMEOUT secs_to_jiffies(20) +#define ISO_DISCONN_TIMEOUT secs_to_jiffies(2) static void iso_conn_free(struct kref *ref) { @@ -367,7 +367,8 @@ static int iso_connect_bis(struct sock *sk) if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, iso_pi(sk)->base_len, - iso_pi(sk)->base); + iso_pi(sk)->base, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -376,7 +377,8 @@ static int iso_connect_bis(struct sock *sk) hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, - iso_pi(sk)->base_len, iso_pi(sk)->base); + iso_pi(sk)->base_len, iso_pi(sk)->base, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -469,7 +471,8 @@ static int iso_connect_cis(struct sock *sk) if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos); + &iso_pi(sk)->qos, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -477,7 +480,8 @@ static int iso_connect_cis(struct sock *sk) } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos); + &iso_pi(sk)->qos, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; -- cgit v1.2.3 From c9beb36c14660713b948e289b1e352cc3d386d44 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Aug 2025 15:57:39 -0400 Subject: Bluetooth: hci_core: Detect if an ISO link has stalled This attempts to detect if an ISO link has been waiting for an ISO buffer for longer than the maximum allowed transport latency then proceed to use hci_link_tx_to which prints an error and disconnects. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 34 ++++++++++++++++++++++++++-------- 3 files changed, 28 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index df1847b74e55..9ecc70baaca9 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -488,6 +488,7 @@ enum { #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_ACL_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ +#define HCI_ISO_TX_TIMEOUT usecs_to_jiffies(0x7fffff) /* 8388607 usecs */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a068beae9318..2924c2bf2a98 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -487,6 +487,7 @@ struct hci_dev { unsigned long acl_last_tx; unsigned long le_last_tx; + unsigned long iso_last_tx; __u8 le_tx_def_phys; __u8 le_rx_def_phys; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e2bffad9816f..4cf4bb1187dc 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3585,24 +3585,37 @@ static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { - unsigned long last_tx; + unsigned long timeout; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return; switch (type) { + case ACL_LINK: + /* tx timeout must be longer than maximum link supervision + * timeout (40.9 seconds) + */ + timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT; + break; case LE_LINK: - last_tx = hdev->le_last_tx; + /* tx timeout must be longer than maximum link supervision + * timeout (40.9 seconds) + */ + timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT; break; - default: - last_tx = hdev->acl_last_tx; + case CIS_LINK: + case BIS_LINK: + case PA_LINK: + /* tx timeout must be longer than the maximum transport latency + * (8.388607 seconds) + */ + timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT; break; + default: + return; } - /* tx timeout must be longer than maximum link supervision timeout - * (40.9 seconds) - */ - if (!cnt && time_after(jiffies, last_tx + HCI_ACL_TX_TIMEOUT)) + if (!cnt && time_after(jiffies, timeout)) hci_link_tx_to(hdev, type); } @@ -3759,10 +3772,15 @@ static void hci_sched_iso(struct hci_dev *hdev, __u8 type) return; cnt = &hdev->iso_cnt; + + __check_timeout(hdev, *cnt, type); + while (*cnt && (conn = hci_low_sent(hdev, type, "e))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); + hci_send_conn_frame(hdev, conn, skb); + hdev->iso_last_tx = jiffies; conn->sent++; if (conn->sent == ~0) -- cgit v1.2.3 From 9eb14331885b09adfa7fe69a5a4603e24909c88f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 18 Aug 2025 16:43:53 -0400 Subject: Bluetooth: Add function and line information to bt_dbg When enabling debug via CONFIG_BT_FEATURE_DEBUG include function and line information by default otherwise it is hard to make any sense of which function the logs comes from. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index e5751f3070b8..d46ed9011ee5 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -272,7 +272,8 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) #if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) -#define BT_DBG(fmt, ...) bt_dbg(fmt "\n", ##__VA_ARGS__) +#define BT_DBG(fmt, ...) \ + bt_dbg("%s:%d: " fmt "\n", __func__, __LINE__, ##__VA_ARGS__) #else #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) #endif -- cgit v1.2.3 From be812ace0378a9db86344ad637c5ed2a5d11f216 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 9 Sep 2025 14:13:35 +0200 Subject: Bluetooth: Avoid a couple dozen -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Use the __struct_group() helper to fix 31 instances of the following type of warnings: 30 net/bluetooth/mgmt_config.c:16:33: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] 1 net/bluetooth/mgmt_config.c:22:33: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 9 +++++++-- net/bluetooth/mgmt_config.c | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 3575cd16049a..74edea06985b 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -53,10 +53,15 @@ struct mgmt_hdr { } __packed; struct mgmt_tlv { - __le16 type; - __u8 length; + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(mgmt_tlv_hdr, __hdr, __packed, + __le16 type; + __u8 length; + ); __u8 value[]; } __packed; +static_assert(offsetof(struct mgmt_tlv, value) == sizeof(struct mgmt_tlv_hdr), + "struct member likely outside of __struct_group()"); struct mgmt_addr_info { bdaddr_t bdaddr; diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c index 6ef701c27da4..c4063d200c0a 100644 --- a/net/bluetooth/mgmt_config.c +++ b/net/bluetooth/mgmt_config.c @@ -13,13 +13,13 @@ #define HDEV_PARAM_U16(_param_name_) \ struct {\ - struct mgmt_tlv entry; \ + struct mgmt_tlv_hdr entry; \ __le16 value; \ } __packed _param_name_ #define HDEV_PARAM_U8(_param_name_) \ struct {\ - struct mgmt_tlv entry; \ + struct mgmt_tlv_hdr entry; \ __u8 value; \ } __packed _param_name_ -- cgit v1.2.3 From 720a485d12c590750f40f4ffbe41e36725f43f3d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 9 Aug 2025 10:19:41 -0700 Subject: KEYS: trusted_tpm1: Move private functionality out of public header Move functionality used only by trusted_tpm1.c out of the public header . Specifically, change the exported functions into static functions, since they are not used outside trusted_tpm1.c, and move various other definitions and inline functions to trusted_tpm1.c. Signed-off-by: Eric Biggers Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/keys/trusted_tpm.h | 79 ------------------------------ security/keys/trusted-keys/trusted_tpm1.c | 80 +++++++++++++++++++++++++++---- 2 files changed, 72 insertions(+), 87 deletions(-) (limited to 'include') diff --git a/include/keys/trusted_tpm.h b/include/keys/trusted_tpm.h index a088b33fd0e3..0fadc6a4f166 100644 --- a/include/keys/trusted_tpm.h +++ b/include/keys/trusted_tpm.h @@ -5,41 +5,8 @@ #include #include -/* implementation specific TPM constants */ -#define TPM_SIZE_OFFSET 2 -#define TPM_RETURN_OFFSET 6 -#define TPM_DATA_OFFSET 10 - -#define LOAD32(buffer, offset) (ntohl(*(uint32_t *)&buffer[offset])) -#define LOAD32N(buffer, offset) (*(uint32_t *)&buffer[offset]) -#define LOAD16(buffer, offset) (ntohs(*(uint16_t *)&buffer[offset])) - extern struct trusted_key_ops trusted_key_tpm_ops; -struct osapsess { - uint32_t handle; - unsigned char secret[SHA1_DIGEST_SIZE]; - unsigned char enonce[TPM_NONCE_SIZE]; -}; - -/* discrete values, but have to store in uint16_t for TPM use */ -enum { - SEAL_keytype = 1, - SRK_keytype = 4 -}; - -int TSS_authhmac(unsigned char *digest, const unsigned char *key, - unsigned int keylen, unsigned char *h1, - unsigned char *h2, unsigned int h3, ...); -int TSS_checkhmac1(unsigned char *buffer, - const uint32_t command, - const unsigned char *ononce, - const unsigned char *key, - unsigned int keylen, ...); - -int trusted_tpm_send(unsigned char *cmd, size_t buflen); -int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce); - int tpm2_seal_trusted(struct tpm_chip *chip, struct trusted_key_payload *payload, struct trusted_key_options *options); @@ -47,50 +14,4 @@ int tpm2_unseal_trusted(struct tpm_chip *chip, struct trusted_key_payload *payload, struct trusted_key_options *options); -#define TPM_DEBUG 0 - -#if TPM_DEBUG -static inline void dump_options(struct trusted_key_options *o) -{ - pr_info("sealing key type %d\n", o->keytype); - pr_info("sealing key handle %0X\n", o->keyhandle); - pr_info("pcrlock %d\n", o->pcrlock); - pr_info("pcrinfo %d\n", o->pcrinfo_len); - print_hex_dump(KERN_INFO, "pcrinfo ", DUMP_PREFIX_NONE, - 16, 1, o->pcrinfo, o->pcrinfo_len, 0); -} - -static inline void dump_sess(struct osapsess *s) -{ - print_hex_dump(KERN_INFO, "trusted-key: handle ", DUMP_PREFIX_NONE, - 16, 1, &s->handle, 4, 0); - pr_info("secret:\n"); - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, - 16, 1, &s->secret, SHA1_DIGEST_SIZE, 0); - pr_info("trusted-key: enonce:\n"); - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, - 16, 1, &s->enonce, SHA1_DIGEST_SIZE, 0); -} - -static inline void dump_tpm_buf(unsigned char *buf) -{ - int len; - - pr_info("\ntpm buffer\n"); - len = LOAD32(buf, TPM_SIZE_OFFSET); - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1, buf, len, 0); -} -#else -static inline void dump_options(struct trusted_key_options *o) -{ -} - -static inline void dump_sess(struct osapsess *s) -{ -} - -static inline void dump_tpm_buf(unsigned char *buf) -{ -} -#endif #endif diff --git a/security/keys/trusted-keys/trusted_tpm1.c b/security/keys/trusted-keys/trusted_tpm1.c index 126437459a74..636acb66a4f6 100644 --- a/security/keys/trusted-keys/trusted_tpm1.c +++ b/security/keys/trusted-keys/trusted_tpm1.c @@ -24,6 +24,74 @@ static struct tpm_chip *chip; static struct tpm_digest *digests; +/* implementation specific TPM constants */ +#define TPM_SIZE_OFFSET 2 +#define TPM_RETURN_OFFSET 6 +#define TPM_DATA_OFFSET 10 + +#define LOAD32(buffer, offset) (ntohl(*(uint32_t *)&buffer[offset])) +#define LOAD32N(buffer, offset) (*(uint32_t *)&buffer[offset]) +#define LOAD16(buffer, offset) (ntohs(*(uint16_t *)&buffer[offset])) + +struct osapsess { + uint32_t handle; + unsigned char secret[SHA1_DIGEST_SIZE]; + unsigned char enonce[TPM_NONCE_SIZE]; +}; + +/* discrete values, but have to store in uint16_t for TPM use */ +enum { + SEAL_keytype = 1, + SRK_keytype = 4 +}; + +#define TPM_DEBUG 0 + +#if TPM_DEBUG +static inline void dump_options(struct trusted_key_options *o) +{ + pr_info("sealing key type %d\n", o->keytype); + pr_info("sealing key handle %0X\n", o->keyhandle); + pr_info("pcrlock %d\n", o->pcrlock); + pr_info("pcrinfo %d\n", o->pcrinfo_len); + print_hex_dump(KERN_INFO, "pcrinfo ", DUMP_PREFIX_NONE, + 16, 1, o->pcrinfo, o->pcrinfo_len, 0); +} + +static inline void dump_sess(struct osapsess *s) +{ + print_hex_dump(KERN_INFO, "trusted-key: handle ", DUMP_PREFIX_NONE, + 16, 1, &s->handle, 4, 0); + pr_info("secret:\n"); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, + 16, 1, &s->secret, SHA1_DIGEST_SIZE, 0); + pr_info("trusted-key: enonce:\n"); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, + 16, 1, &s->enonce, SHA1_DIGEST_SIZE, 0); +} + +static inline void dump_tpm_buf(unsigned char *buf) +{ + int len; + + pr_info("\ntpm buffer\n"); + len = LOAD32(buf, TPM_SIZE_OFFSET); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1, buf, len, 0); +} +#else +static inline void dump_options(struct trusted_key_options *o) +{ +} + +static inline void dump_sess(struct osapsess *s) +{ +} + +static inline void dump_tpm_buf(unsigned char *buf) +{ +} +#endif + static int TSS_rawhmac(unsigned char *digest, const unsigned char *key, unsigned int keylen, ...) { @@ -56,7 +124,7 @@ static int TSS_rawhmac(unsigned char *digest, const unsigned char *key, /* * calculate authorization info fields to send to TPM */ -int TSS_authhmac(unsigned char *digest, const unsigned char *key, +static int TSS_authhmac(unsigned char *digest, const unsigned char *key, unsigned int keylen, unsigned char *h1, unsigned char *h2, unsigned int h3, ...) { @@ -94,12 +162,11 @@ int TSS_authhmac(unsigned char *digest, const unsigned char *key, TPM_NONCE_SIZE, h2, 1, &c, 0, 0); return ret; } -EXPORT_SYMBOL_GPL(TSS_authhmac); /* * verify the AUTH1_COMMAND (Seal) result from TPM */ -int TSS_checkhmac1(unsigned char *buffer, +static int TSS_checkhmac1(unsigned char *buffer, const uint32_t command, const unsigned char *ononce, const unsigned char *key, @@ -159,7 +226,6 @@ int TSS_checkhmac1(unsigned char *buffer, return -EINVAL; return 0; } -EXPORT_SYMBOL_GPL(TSS_checkhmac1); /* * verify the AUTH2_COMMAND (unseal) result from TPM @@ -244,7 +310,7 @@ static int TSS_checkhmac2(unsigned char *buffer, * For key specific tpm requests, we will generate and send our * own TPM command packets using the drivers send function. */ -int trusted_tpm_send(unsigned char *cmd, size_t buflen) +static int trusted_tpm_send(unsigned char *cmd, size_t buflen) { struct tpm_buf buf; int rc; @@ -270,7 +336,6 @@ int trusted_tpm_send(unsigned char *cmd, size_t buflen) tpm_put_ops(chip); return rc; } -EXPORT_SYMBOL_GPL(trusted_tpm_send); /* * Lock a trusted key, by extending a selected PCR. @@ -324,7 +389,7 @@ static int osap(struct tpm_buf *tb, struct osapsess *s, /* * Create an object independent authorisation protocol (oiap) session */ -int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce) +static int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce) { int ret; @@ -341,7 +406,6 @@ int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce) TPM_NONCE_SIZE); return 0; } -EXPORT_SYMBOL_GPL(oiap); struct tpm_digests { unsigned char encauth[SHA1_DIGEST_SIZE]; -- cgit v1.2.3 From 2f7d98f10b8f64525b2c74cae7d70ae5278eb654 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Jul 2025 15:32:31 -0400 Subject: Have cc(1) catch attempts to modify ->f_path There are very few places that have cause to do that - all in core VFS now, and all done to files that are not yet opened (or visible to anybody else, for that matter). Let's turn f_path into a union of struct path __f_path and const struct path f_path. It's C, not C++ - 6.5.2.3[4] in C99 and later explicitly allows that kind of type-punning. That way any attempts to bypass these checks will be either very easy to catch, or (if the bastards get sufficiently creative to make it hard to spot with grep alone) very clearly malicious - and still catchable with a bit of instrumentation for sparse. Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/file_table.c | 4 ++-- fs/namei.c | 8 ++++---- fs/open.c | 10 +++++----- include/linux/fs.h | 7 ++++++- 4 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/fs/file_table.c b/fs/file_table.c index 85b53e39138d..b223d873e48b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -171,7 +171,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * the respective member when opening the file. */ mutex_init(&f->f_pos_lock); - memset(&f->f_path, 0, sizeof(f->f_path)); + memset(&f->__f_path, 0, sizeof(f->f_path)); memset(&f->f_ra, 0, sizeof(f->f_ra)); f->f_flags = flags; @@ -319,7 +319,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) static void file_init_path(struct file *file, const struct path *path, const struct file_operations *fop) { - file->f_path = *path; + file->__f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); diff --git a/fs/namei.c b/fs/namei.c index 3eb0408e3400..ba8bf73d2f9c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3563,8 +3563,8 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; - file->f_path.dentry = DENTRY_NOT_SET; - file->f_path.mnt = nd->path.mnt; + file->__f_path.dentry = DENTRY_NOT_SET; + file->__f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); @@ -3932,8 +3932,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap, child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; - file->f_path.mnt = parentpath->mnt; - file->f_path.dentry = child; + file->__f_path.mnt = parentpath->mnt; + file->__f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); diff --git a/fs/open.c b/fs/open.c index 9655158c3885..f4bdf7693530 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1022,8 +1022,8 @@ cleanup_all: put_file_access(f); cleanup_file: path_put(&f->f_path); - f->f_path.mnt = NULL; - f->f_path.dentry = NULL; + f->__f_path.mnt = NULL; + f->__f_path.dentry = NULL; f->f_inode = NULL; return error; } @@ -1050,7 +1050,7 @@ int finish_open(struct file *file, struct dentry *dentry, { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - file->f_path.dentry = dentry; + file->__f_path.dentry = dentry; return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); @@ -1071,7 +1071,7 @@ EXPORT_SYMBOL(finish_open); */ int finish_no_open(struct file *file, struct dentry *dentry) { - file->f_path.dentry = dentry; + file->__f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); @@ -1091,7 +1091,7 @@ int vfs_open(const struct path *path, struct file *file) { int ret; - file->f_path = *path; + file->__f_path = *path; ret = do_dentry_open(file, NULL); if (!ret) { /* diff --git a/include/linux/fs.h b/include/linux/fs.h index af514fae4e2d..1fb02c76ae09 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1082,6 +1082,8 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_cred: stashed credentials of creator/opener * @f_owner: file owner * @f_path: path of the file + * @__f_path: writable alias for @f_path; *ONLY* for core VFS and only before + * the file gets open * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position @@ -1107,7 +1109,10 @@ struct file { const struct cred *f_cred; struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ - struct path f_path; + union { + const struct path f_path; + struct path __f_path; + }; union { /* regular files (with FMODE_ATOMIC_POS) and directories */ struct mutex f_pos_lock; -- cgit v1.2.3 From ab91835e61ab56d3964f51480955c9661678c269 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 23 Sep 2025 14:03:25 +0100 Subject: ASoC: cs35l56: Set fw_regs table after getting REVID Defer setting the cs35l56_base.fw_regs pointer until after the REVID has been read in cs35l56_hw_init(). Also make the corresponding change to the cs35l56_hda drivers to prevent a build break. This is preparing for firmware registers that change address between revisions of the same device. Signed-off-by: Richard Fitzgerald Signed-off-by: Takashi Iwai --- include/sound/cs35l56.h | 3 -- sound/hda/codecs/side-codecs/cs35l56_hda.c | 1 + sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c | 2 - sound/hda/codecs/side-codecs/cs35l56_hda_spi.c | 2 - sound/soc/codecs/cs35l56-i2c.c | 4 +- sound/soc/codecs/cs35l56-sdw.c | 4 +- sound/soc/codecs/cs35l56-shared.c | 59 +++++++++++++++----------- sound/soc/codecs/cs35l56-spi.c | 2 +- 8 files changed, 41 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 7c8bbe8ad1e2..20dc3ee6378d 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -337,9 +337,6 @@ extern const struct regmap_config cs35l56_regmap_sdw; extern const struct regmap_config cs35l63_regmap_i2c; extern const struct regmap_config cs35l63_regmap_sdw; -extern const struct cs35l56_fw_reg cs35l56_fw_reg; -extern const struct cs35l56_fw_reg cs35l63_fw_reg; - extern const struct cirrus_amp_cal_controls cs35l56_calibration_controls; extern const char * const cs35l56_tx_input_texts[CS35L56_NUM_INPUT_SRC]; diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda.c b/sound/hda/codecs/side-codecs/cs35l56_hda.c index 36fa62a41984..5bb1c4ebeaf3 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda.c @@ -1049,6 +1049,7 @@ int cs35l56_hda_common_probe(struct cs35l56_hda *cs35l56, int hid, int id) goto err; } + cs35l56->base.type = hid & 0xff; cs35l56->base.cal_index = -1; cs35l56_init_cs_dsp(&cs35l56->base, &cs35l56->cs_dsp); diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c b/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c index d10209e4eddd..1072f17385ac 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c @@ -27,8 +27,6 @@ static int cs35l56_hda_i2c_probe(struct i2c_client *clt) cs35l56->base.can_hibernate = true; #endif - cs35l56->base.fw_reg = &cs35l56_fw_reg; - cs35l56->base.regmap = devm_regmap_init_i2c(clt, &cs35l56_regmap_i2c); if (IS_ERR(cs35l56->base.regmap)) { ret = PTR_ERR(cs35l56->base.regmap); diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c b/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c index f57533d3d728..f802c83c57b4 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c @@ -30,8 +30,6 @@ static int cs35l56_hda_spi_probe(struct spi_device *spi) cs35l56->base.can_hibernate = true; #endif - cs35l56->base.fw_reg = &cs35l56_fw_reg; - cs35l56->base.regmap = devm_regmap_init_spi(spi, &cs35l56_regmap_spi); if (IS_ERR(cs35l56->base.regmap)) { ret = PTR_ERR(cs35l56->base.regmap); diff --git a/sound/soc/codecs/cs35l56-i2c.c b/sound/soc/codecs/cs35l56-i2c.c index 073f1796ae29..0492ddc4102d 100644 --- a/sound/soc/codecs/cs35l56-i2c.c +++ b/sound/soc/codecs/cs35l56-i2c.c @@ -35,11 +35,11 @@ static int cs35l56_i2c_probe(struct i2c_client *client) switch (id) { case 0x3556: regmap_config = &cs35l56_regmap_i2c; - cs35l56->base.fw_reg = &cs35l56_fw_reg; + cs35l56->base.type = 0x56; break; case 0x3563: regmap_config = &cs35l63_regmap_i2c; - cs35l56->base.fw_reg = &cs35l63_fw_reg; + cs35l56->base.type = 0x63; break; default: return -ENODEV; diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c index 3905c9cb188a..42d24ac2977f 100644 --- a/sound/soc/codecs/cs35l56-sdw.c +++ b/sound/soc/codecs/cs35l56-sdw.c @@ -527,16 +527,16 @@ static int cs35l56_sdw_probe(struct sdw_slave *peripheral, const struct sdw_devi case 0x3556: case 0x3557: regmap_config = &cs35l56_regmap_sdw; - cs35l56->base.fw_reg = &cs35l56_fw_reg; break; case 0x3563: regmap_config = &cs35l63_regmap_sdw; - cs35l56->base.fw_reg = &cs35l63_fw_reg; break; default: return -ENODEV; } + cs35l56->base.type = ((unsigned int)id->driver_data) & 0xff; + cs35l56->base.regmap = devm_regmap_init(dev, &cs35l56_regmap_bus_sdw, peripheral, regmap_config); if (IS_ERR(cs35l56->base.regmap)) { diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index 95d018ecb953..03ea66f08787 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -309,6 +309,40 @@ static bool cs35l63_volatile_reg(struct device *dev, unsigned int reg) } } +static const struct cs35l56_fw_reg cs35l56_fw_reg = { + .fw_ver = CS35L56_DSP1_FW_VER, + .halo_state = CS35L56_DSP1_HALO_STATE, + .pm_cur_stat = CS35L56_DSP1_PM_CUR_STATE, + .prot_sts = CS35L56_PROTECTION_STATUS, + .transducer_actual_ps = CS35L56_TRANSDUCER_ACTUAL_PS, + .user_mute = CS35L56_MAIN_RENDER_USER_MUTE, + .user_volume = CS35L56_MAIN_RENDER_USER_VOLUME, + .posture_number = CS35L56_MAIN_POSTURE_NUMBER, +}; + +static const struct cs35l56_fw_reg cs35l63_fw_reg = { + .fw_ver = CS35L63_DSP1_FW_VER, + .halo_state = CS35L63_DSP1_HALO_STATE, + .pm_cur_stat = CS35L63_DSP1_PM_CUR_STATE, + .prot_sts = CS35L63_PROTECTION_STATUS, + .transducer_actual_ps = CS35L63_TRANSDUCER_ACTUAL_PS, + .user_mute = CS35L63_MAIN_RENDER_USER_MUTE, + .user_volume = CS35L63_MAIN_RENDER_USER_VOLUME, + .posture_number = CS35L63_MAIN_POSTURE_NUMBER, +}; + +static void cs35l56_set_fw_reg_table(struct cs35l56_base *cs35l56_base) +{ + switch (cs35l56_base->type) { + default: + cs35l56_base->fw_reg = &cs35l56_fw_reg; + break; + case 0x63: + cs35l56_base->fw_reg = &cs35l63_fw_reg; + break; + } +} + int cs35l56_mbox_send(struct cs35l56_base *cs35l56_base, unsigned int command) { unsigned int val; @@ -979,6 +1013,7 @@ int cs35l56_hw_init(struct cs35l56_base *cs35l56_base) return ret; } cs35l56_base->rev = revid & (CS35L56_AREVID_MASK | CS35L56_MTLREVID_MASK); + cs35l56_set_fw_reg_table(cs35l56_base); ret = cs35l56_wait_for_firmware_boot(cs35l56_base); if (ret) @@ -1280,30 +1315,6 @@ const struct regmap_config cs35l63_regmap_sdw = { }; EXPORT_SYMBOL_NS_GPL(cs35l63_regmap_sdw, "SND_SOC_CS35L56_SHARED"); -const struct cs35l56_fw_reg cs35l56_fw_reg = { - .fw_ver = CS35L56_DSP1_FW_VER, - .halo_state = CS35L56_DSP1_HALO_STATE, - .pm_cur_stat = CS35L56_DSP1_PM_CUR_STATE, - .prot_sts = CS35L56_PROTECTION_STATUS, - .transducer_actual_ps = CS35L56_TRANSDUCER_ACTUAL_PS, - .user_mute = CS35L56_MAIN_RENDER_USER_MUTE, - .user_volume = CS35L56_MAIN_RENDER_USER_VOLUME, - .posture_number = CS35L56_MAIN_POSTURE_NUMBER, -}; -EXPORT_SYMBOL_NS_GPL(cs35l56_fw_reg, "SND_SOC_CS35L56_SHARED"); - -const struct cs35l56_fw_reg cs35l63_fw_reg = { - .fw_ver = CS35L63_DSP1_FW_VER, - .halo_state = CS35L63_DSP1_HALO_STATE, - .pm_cur_stat = CS35L63_DSP1_PM_CUR_STATE, - .prot_sts = CS35L63_PROTECTION_STATUS, - .transducer_actual_ps = CS35L63_TRANSDUCER_ACTUAL_PS, - .user_mute = CS35L63_MAIN_RENDER_USER_MUTE, - .user_volume = CS35L63_MAIN_RENDER_USER_VOLUME, - .posture_number = CS35L63_MAIN_POSTURE_NUMBER, -}; -EXPORT_SYMBOL_NS_GPL(cs35l63_fw_reg, "SND_SOC_CS35L56_SHARED"); - MODULE_DESCRIPTION("ASoC CS35L56 Shared"); MODULE_AUTHOR("Richard Fitzgerald "); MODULE_AUTHOR("Simon Trimmer "); diff --git a/sound/soc/codecs/cs35l56-spi.c b/sound/soc/codecs/cs35l56-spi.c index c2ddee22cd23..9bc9b7c98390 100644 --- a/sound/soc/codecs/cs35l56-spi.c +++ b/sound/soc/codecs/cs35l56-spi.c @@ -26,7 +26,7 @@ static int cs35l56_spi_probe(struct spi_device *spi) spi_set_drvdata(spi, cs35l56); - cs35l56->base.fw_reg = &cs35l56_fw_reg; + cs35l56->base.type = 0x56; cs35l56->base.regmap = devm_regmap_init_spi(spi, regmap_config); if (IS_ERR(cs35l56->base.regmap)) { -- cgit v1.2.3 From 33da2d892b6241a3e71f96acdd0e64de5d70b7f3 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 23 Sep 2025 14:03:26 +0100 Subject: ASoC: cs35l56: Add support for CS35L56 B2 silicon This adds support for changed firmware addresses on the B2 revision of CS35L56 silicon. Signed-off-by: Richard Fitzgerald Signed-off-by: Takashi Iwai --- include/sound/cs35l56.h | 2 ++ sound/soc/codecs/cs35l56-shared.c | 40 +++++++++++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 20dc3ee6378d..ab044ce2aa8b 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -85,7 +85,9 @@ #define CS35L56_DSP1_XMEM_UNPACKED24_0 0x2800000 #define CS35L56_DSP1_FW_VER 0x2800010 #define CS35L56_DSP1_HALO_STATE 0x28021E0 +#define CS35L56_B2_DSP1_HALO_STATE 0x2803D20 #define CS35L56_DSP1_PM_CUR_STATE 0x2804308 +#define CS35L56_B2_DSP1_PM_CUR_STATE 0x2804678 #define CS35L56_DSP1_XMEM_UNPACKED24_8191 0x2807FFC #define CS35L56_DSP1_CORE_BASE 0x2B80000 #define CS35L56_DSP1_SCRATCH1 0x2B805C0 diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index 03ea66f08787..9e6b9ca2f354 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -320,6 +320,17 @@ static const struct cs35l56_fw_reg cs35l56_fw_reg = { .posture_number = CS35L56_MAIN_POSTURE_NUMBER, }; +static const struct cs35l56_fw_reg cs35l56_b2_fw_reg = { + .fw_ver = CS35L56_DSP1_FW_VER, + .halo_state = CS35L56_B2_DSP1_HALO_STATE, + .pm_cur_stat = CS35L56_B2_DSP1_PM_CUR_STATE, + .prot_sts = CS35L56_PROTECTION_STATUS, + .transducer_actual_ps = CS35L56_TRANSDUCER_ACTUAL_PS, + .user_mute = CS35L56_MAIN_RENDER_USER_MUTE, + .user_volume = CS35L56_MAIN_RENDER_USER_VOLUME, + .posture_number = CS35L56_MAIN_POSTURE_NUMBER, +}; + static const struct cs35l56_fw_reg cs35l63_fw_reg = { .fw_ver = CS35L63_DSP1_FW_VER, .halo_state = CS35L63_DSP1_HALO_STATE, @@ -335,7 +346,14 @@ static void cs35l56_set_fw_reg_table(struct cs35l56_base *cs35l56_base) { switch (cs35l56_base->type) { default: - cs35l56_base->fw_reg = &cs35l56_fw_reg; + switch (cs35l56_base->rev) { + case 0xb0: + cs35l56_base->fw_reg = &cs35l56_fw_reg; + break; + default: + cs35l56_base->fw_reg = &cs35l56_b2_fw_reg; + break; + } break; case 0x63: cs35l56_base->fw_reg = &cs35l63_fw_reg; @@ -502,6 +520,11 @@ static const struct reg_sequence cs35l56_system_reset_seq[] = { REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET), }; +static const struct reg_sequence cs35l56_b2_system_reset_seq[] = { + REG_SEQ0(CS35L56_B2_DSP1_HALO_STATE, 0), + REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET), +}; + static const struct reg_sequence cs35l63_system_reset_seq[] = { REG_SEQ0(CS35L63_DSP1_HALO_STATE, 0), REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET), @@ -524,9 +547,18 @@ void cs35l56_system_reset(struct cs35l56_base *cs35l56_base, bool is_soundwire) case 0x54: case 0x56: case 0x57: - regmap_multi_reg_write_bypassed(cs35l56_base->regmap, - cs35l56_system_reset_seq, - ARRAY_SIZE(cs35l56_system_reset_seq)); + switch (cs35l56_base->rev) { + case 0xb0: + regmap_multi_reg_write_bypassed(cs35l56_base->regmap, + cs35l56_system_reset_seq, + ARRAY_SIZE(cs35l56_system_reset_seq)); + break; + default: + regmap_multi_reg_write_bypassed(cs35l56_base->regmap, + cs35l56_b2_system_reset_seq, + ARRAY_SIZE(cs35l56_b2_system_reset_seq)); + break; + } break; case 0x63: regmap_multi_reg_write_bypassed(cs35l56_base->regmap, -- cgit v1.2.3 From fa7d16734f9606c396681648618dd76a5af861e6 Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Sat, 27 Sep 2025 14:27:08 +0000 Subject: ALSA: compress: document 'chan_map' member in snd_dec_opus When building kernel docs, the following warning appeared: WARNING: ./include/uapi/sound/compress_params.h:364 struct member 'chan_map' not described in 'snd_dec_opus' The inline struct 'snd_dec_opus_ch_map' inside 'snd_dec_opus' was not properly documented. This patch documents the 'chan_map' member and its fields (stream_count, coupled_count, channel_map), resolving the warning. Fixes: 5d36370f3431 ("ALSA: compress: add raw opus codec define and opus decoder structs") Suggested-by: Bagas Sanjaya Signed-off-by: Kriish Sharma Signed-off-by: Takashi Iwai --- include/uapi/sound/compress_params.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/uapi/sound/compress_params.h b/include/uapi/sound/compress_params.h index faf4fa911f7f..d7db6b4e1166 100644 --- a/include/uapi/sound/compress_params.h +++ b/include/uapi/sound/compress_params.h @@ -336,16 +336,14 @@ struct snd_dec_ape { * @mapping_family: Order and meaning of output channels. Only values 0 and 1 * are expected; values 2..255 are not recommended for playback. * - * Optional channel mapping table. Describes mapping of opus streams to decoded - * channels. - * @struct snd_dec_opus_ch_map - * @stream_count: Number of streams encoded in each Ogg packet. - * @coupled_count: Number of streams whose decoders are used for two - * channels. - * @channel_map: describes which decoded channel to be used for each one. - * See RFC doc for details. - * This supports only mapping families 0 and 1, therefore max - * number of channels is 8. + * @chan_map: Optional channel mapping table. Describes mapping of opus streams + * to decoded channels. Fields: + * @chan_map.stream_count: Number of streams encoded in each Ogg packet. + * @chan_map.coupled_count: Number of streams whose decoders are used + * for two channels. + * @chan_map.channel_map: Which decoded channel to be used for each one. + * Supports only mapping families 0 and 1, + * max number of channels is 8. * * These options were extracted from RFC7845 Section 5. */ -- cgit v1.2.3 From 1ddf1636e0e058adf2231486da0419243eb49539 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 22 Sep 2025 09:06:30 +0300 Subject: net/mlx5: Add IFC bit for TIR/SQ order capability Before this cap, firmware requested a certain creation order between TIR objects and SQs of the same transport domain to properly support the self loopback prevention feature. If order is not preserved, explicit modify_tir operations are necessary after the opening of the SQs. When set, this cap bit indicates that this firmware requirement / limitation no longer holds. Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758521191-814350-2-git-send-email-tariqt@nvidia.com Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 0cf187e13def..c0f5fee7a4a5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1895,7 +1895,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_2a0[0x7]; u8 mkey_pcie_tph[0x1]; - u8 reserved_at_2a8[0x2]; + u8 reserved_at_2a8[0x1]; + u8 tis_tir_td_order[0x1]; u8 psp[0x1]; u8 shampo[0x1]; -- cgit v1.2.3 From 137d1a6355131457723b51a34192320d93d15654 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 22 Sep 2025 09:06:31 +0300 Subject: net/mlx5: IFC add balance ID and LAG per MP group bits Add interface definitions for load balance ID and LAG per multiplane group functionality. This patch introduces the hardware capability bits needed to support balance ID in multiplane LAG configurations. The new fields include: - load_balance_id: 4-bit field for balance identifier. - lag_per_mp_group: capability bit for LAG per multiplane group support. These interface additions are prerequisites for implementing balance ID support in the MLX5 driver. Signed-off-by: Mark Bloch Reviewed-by: Shay Drori Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758521191-814350-3-git-send-email-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c0f5fee7a4a5..07614cd95bed 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2235,12 +2235,16 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_440[0x8]; u8 max_num_eqs_24b[0x18]; - u8 reserved_at_460[0x160]; + u8 reserved_at_460[0x144]; + u8 load_balance_id[0x4]; + u8 reserved_at_5a8[0x18]; u8 query_adjacent_functions_id[0x1]; u8 ingress_egress_esw_vport_connect[0x1]; u8 function_id_type_vhca_id[0x1]; - u8 reserved_at_5c3[0xd]; + u8 reserved_at_5c3[0x1]; + u8 lag_per_mp_group[0x1]; + u8 reserved_at_5c5[0xb]; u8 delegate_vhca_management_profiles[0x10]; u8 delegated_vhca_max[0x10]; -- cgit v1.2.3 From 2db579838296239545554443234fafb8f485cca0 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Tue, 23 Sep 2025 12:07:06 +0100 Subject: mm/page_vma_mapped: track if the page is mapped across page table boundary Patch series "mm: Improve mlock tracking for large folios", v3. The patchset includes several fixes and improvements related to mlock tracking of large folios. The main objective is to reduce the undercount of Mlocked memory in /proc/meminfo and improve the accuracy of the statistics. Patches 1-2: These patches address a minor race condition in folio_referenced_one() related to mlock_vma_folio(). Currently, mlock_vma_folio() is called on large folio without the page table lock, which can result in a race condition with unmap (i.e. MADV_DONTNEED). This can lead to partially mapped folios on the unevictable LRU list. While not a significant issue, I do not believe backporting is necessary. Patch 3: This patch adds mlocking logic similar to folio_referenced_one() to try_to_unmap_one(), allowing for mlocking of large folios where possible. Patch 4-5: These patches modifies finish_fault() and faultaround to map in the entire folio when possible, enabling efficient mlocking upon addition to the rmap. Patch 6: This patch makes rmap mlock large folios if they are fully mapped, addressing the primary source of mlock undercount for large folios. This patch (of 6): Add a PVMW_PGTABLE_CROSSSED flag that page_vma_mapped_walk() will set if the page is mapped across page table boundary. Unlike other PVMW_* flags, this one is result of page_vma_mapped_walk() and not set by the caller. folio_referenced_one() will use it to detect if it safe to mlock the folio. [akpm@linux-foundation.org: s/CROSSSED/CROSSED/] Link: https://lkml.kernel.org/r/20250923110711.690639-1-kirill@shutemov.name Link: https://lkml.kernel.org/r/20250923110711.690639-2-kirill@shutemov.name Signed-off-by: Kiryl Shutsemau Reviewed-by: Shakeel Butt Cc: Baolin Wang Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/rmap.h | 5 +++++ mm/page_vma_mapped.c | 1 + 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e8aff6d2deda..daa92a58585d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -922,6 +922,11 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, /* Look for migration entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) +/* Result flags */ + +/* The page is mapped across page table boundary */ +#define PVMW_PGTABLE_CROSSED (1 << 16) + struct page_vma_mapped_walk { unsigned long pfn; unsigned long nr_pages; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e981a1a292d2..c498a91b6706 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -309,6 +309,7 @@ next_pte: } pte_unmap(pvmw->pte); pvmw->pte = NULL; + pvmw->flags |= PVMW_PGTABLE_CROSSED; goto restart; } pvmw->pte++; -- cgit v1.2.3 From 4d6fc29f36341d7795db1d1819b4c15fe9be7b23 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Wed, 24 Sep 2025 00:16:59 +0530 Subject: mm/ksm: fix incorrect KSM counter handling in mm_struct during fork Patch series "mm/ksm: Fix incorrect accounting of KSM counters during fork", v3. The first patch in this series fixes the incorrect accounting of KSM counters such as ksm_merging_pages, ksm_rmap_items, and the global ksm_zero_pages during fork. The following patch add a selftest to verify the ksm_merging_pages counter was updated correctly during fork. Test Results ============ Without the first patch ----------------------- # [RUN] test_fork_ksm_merging_page_count not ok 10 ksm_merging_page in child: 32 With the first patch -------------------- # [RUN] test_fork_ksm_merging_page_count ok 10 ksm_merging_pages is not inherited after fork This patch (of 2): Currently, the KSM-related counters in `mm_struct`, such as `ksm_merging_pages`, `ksm_rmap_items`, and `ksm_zero_pages`, are inherited by the child process during fork. This results in inconsistent accounting. When a process uses KSM, identical pages are merged and an rmap item is created for each merged page. The `ksm_merging_pages` and `ksm_rmap_items` counters are updated accordingly. However, after a fork, these counters are copied to the child while the corresponding rmap items are not. As a result, when the child later triggers an unmerge, there are no rmap items present in the child, so the counters remain stale, leading to incorrect accounting. A similar issue exists with `ksm_zero_pages`, which maintains both a global counter and a per-process counter. During fork, the per-process counter is inherited by the child, but the global counter is not incremented. Since the child also references zero pages, the global counter should be updated as well. Otherwise, during zero-page unmerge, both the global and per-process counters are decremented, causing the global counter to become inconsistent. To fix this, ksm_merging_pages and ksm_rmap_items are reset to 0 during fork, and the global ksm_zero_pages counter is updated with the per-process ksm_zero_pages value inherited by the child. This ensures that KSM statistics remain accurate and reflect the activity of each process correctly. Link: https://lkml.kernel.org/r/cover.1758648700.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/7b9870eb67ccc0d79593940d9dbd4a0b39b5d396.1758648700.git.donettom@linux.ibm.com Fixes: 7609385337a4 ("ksm: count ksm merging pages for each process") Fixes: cb4df4cae4f2 ("ksm: count allocated ksm rmap_items for each process") Fixes: e2942062e01d ("ksm: count all zero pages placed by KSM") Signed-off-by: Donet Tom Reviewed-by: Chengming Zhou Acked-by: David Hildenbrand Cc: Aboorva Devarajan Cc: David Hildenbrand Cc: Donet Tom Cc: "Ritesh Harjani (IBM)" Cc: Wei Yang Cc: xu xin Cc: [6.6+] Signed-off-by: Andrew Morton --- include/linux/ksm.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 22e67ca7cba3..067538fc4d58 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -56,8 +56,14 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ - if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) + if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) { + long nr_ksm_zero_pages = atomic_long_read(&mm->ksm_zero_pages); + + mm->ksm_merging_pages = 0; + mm->ksm_rmap_items = 0; + atomic_long_add(nr_ksm_zero_pages, &ksm_zero_pages); __ksm_enter(mm); + } } static inline int ksm_execve(struct mm_struct *mm) -- cgit v1.2.3 From 989c2f55ca4839121cbf23b5802f8513dbd54e1e Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 26 Sep 2025 17:24:26 +0800 Subject: mm: silence data-race in update_hiwater_rss KCSAN reports a data race on mm_cluster.hiwater_rss, which can be accessed concurrently from various paths like page migration and memory unmapping without synchronization. Since hiwater_rss is a statistical field for accounting purposes, this data race is benign. Annotate both the read and write accesses with data_race() to make KCSAN happy. Link: https://lkml.kernel.org/r/20250926092426.43312-1-lance.yang@linux.dev Signed-off-by: Lance Yang Reported-by: syzbot+60192c8877d0bc92a92b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/68d6364e.050a0220.3390a8.000d.GAE@google.com Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Marco Elver Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index fcb1e72eea40..06978b4dbeb8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2742,7 +2742,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm) unsigned long _rss = get_mm_rss(mm); if (data_race(mm->hiwater_rss) < _rss) - (mm)->hiwater_rss = _rss; + data_race(mm->hiwater_rss = _rss); } static inline void update_hiwater_vm(struct mm_struct *mm) -- cgit v1.2.3 From 81e78b7ec61e89e8bab9736551839f79b063614c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 23 Sep 2025 16:00:58 +0200 Subject: mm: convert folio_page() back to a macro In commit 73b3294b1152 ("mm: simplify folio_page() and folio_page_idx()") we converted folio_page() into a static inline function. However briefly afterwards in commit a847b17009ec ("mm: constify highmem related functions for improved const-correctness") we had to add some nasty const-away casting to make the compiler happy when checking const correctness. So let's just convert it back to a simple macro so the compiler can check const correctness properly. There is the alternative of using a _Generic() similar to page_folio(), but there is not a lot of benefit compared to just using a simple macro. Link: https://lkml.kernel.org/r/20250923140058.2020023-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Kiryl Shutsemau Reviewed-by: SeongJae Park Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Dev Jain Reviewed-by: Suren Baghdasaryan Reviewed-by: Lance Yang Reviewed-by: Wei Yang Cc: Lorenzo Stoakes Cc: "Liam R. Howlett" Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 568011930e35..48e27768e7ba 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -316,10 +316,7 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ -static inline struct page *folio_page(const struct folio *folio, unsigned long n) -{ - return (struct page *)(&folio->page + n); -} +#define folio_page(folio, n) (&(folio)->page + (n)) static __always_inline int PageTail(const struct page *page) { -- cgit v1.2.3 From c8a935a31bc787db52296944890f300ba9479088 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Tue, 23 Sep 2025 10:52:29 +0100 Subject: lib/string_choices: Add str_assert_deassert() helper Add str_assert_deassert() helper to return "assert" or "deassert" string literal depending on the boolean argument. Also add the inversed variant str_deassert_assert(). Suggested-by: Philipp Zabel Signed-off-by: Lad Prabhakar Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250923095229.2149740-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Kees Cook --- include/linux/string_choices.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index f3ba4f52ff26..6c4077be7742 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -17,6 +17,12 @@ #include +static inline const char *str_assert_deassert(bool v) +{ + return v ? "assert" : "deassert"; +} +#define str_deassert_assert(v) str_assert_deassert(!(v)) + static inline const char *str_enable_disable(bool v) { return v ? "enable" : "disable"; -- cgit v1.2.3 From 3c1ea5c5019ff197aca7e886a3a240c38f6c6f0d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:47 +0200 Subject: slab: sheaf prefilling for guaranteed allocations Add functions for efficient guaranteed allocations e.g. in a critical section that cannot sleep, when the exact number of allocations is not known beforehand, but an upper limit can be calculated. kmem_cache_prefill_sheaf() returns a sheaf containing at least given number of objects. kmem_cache_alloc_from_sheaf() will allocate an object from the sheaf and is guaranteed not to fail until depleted. kmem_cache_return_sheaf() is for giving the sheaf back to the slab allocator after the critical section. This will also attempt to refill it to cache's sheaf capacity for better efficiency of sheaves handling, but it's not stricly necessary to succeed. kmem_cache_refill_sheaf() can be used to refill a previously obtained sheaf to requested size. If the current size is sufficient, it does nothing. If the requested size exceeds cache's sheaf_capacity and the sheaf's current capacity, the sheaf will be replaced with a new one, hence the indirect pointer parameter. kmem_cache_sheaf_size() can be used to query the current size. The implementation supports requesting sizes that exceed cache's sheaf_capacity, but it is not efficient - such "oversize" sheaves are allocated fresh in kmem_cache_prefill_sheaf() and flushed and freed immediately by kmem_cache_return_sheaf(). kmem_cache_refill_sheaf() might be especially ineffective when replacing a sheaf with a new one of a larger capacity. It is therefore better to size cache's sheaf_capacity accordingly to make oversize sheaves exceptional. CONFIG_SLUB_STATS counters are added for sheaf prefill and return operations. A prefill or return is considered _fast when it is able to grab or return a percpu spare sheaf (even if the sheaf needs a refill to satisfy the request, as those should amortize over time), and _slow otherwise (when the barn or even sheaf allocation/freeing has to be involved). sheaf_prefill_oversize is provided to determine how many prefills were oversize (counter for oversize returns is not necessary as all oversize refills result in oversize returns). When slub_debug is enabled for a cache with sheaves, no percpu sheaves exist for it, but the prefill functionality is still provided simply by all prefilled sheaves becoming oversize. If percpu sheaves are not created for a cache due to not passing the sheaf_capacity argument on cache creation, the prefills also work through oversize sheaves, but there's a WARN_ON_ONCE() to indicate the omission. Reviewed-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 16 ++++ mm/slub.c | 263 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 279 insertions(+) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 49acbcdc6696..680193356ac7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -829,6 +829,22 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size); + +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size); + +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf); + +void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp, + struct slab_sheaf *sheaf) __assume_slab_alignment __malloc; +#define kmem_cache_alloc_from_sheaf(...) \ + alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__)) + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf); + /* * These macros allow declaring a kmem_buckets * parameter alongside size, which * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call diff --git a/mm/slub.c b/mm/slub.c index fec0cdc7ef37..4bd67d5d5ff5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -401,6 +401,11 @@ enum stat_item { BARN_GET_FAIL, /* Failed to get full sheaf from barn */ BARN_PUT, /* Put full sheaf to barn */ BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ + SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */ + SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */ + SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */ + SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */ + SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */ NR_SLUB_STAT_ITEMS }; @@ -462,6 +467,8 @@ struct slab_sheaf { union { struct rcu_head rcu_head; struct list_head barn_list; + /* only used for prefilled sheafs */ + unsigned int capacity; }; struct kmem_cache *cache; unsigned int size; @@ -2838,6 +2845,30 @@ static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf spin_unlock_irqrestore(&barn->lock, flags); } +static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *sheaf = NULL; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&sheaf->barn_list); + barn->nr_full--; + } else if (barn->nr_empty) { + sheaf = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&sheaf->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return sheaf; +} + /* * If a full sheaf is available, return it and put the supplied empty one to * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't @@ -5036,6 +5067,228 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod } EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); +/* + * returns a sheaf that has at least the requested size + * when prefilling is needed, do so with given gfp flags + * + * return NULL if sheaf allocation or prefilling failed + */ +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *sheaf = NULL; + + if (unlikely(size > s->sheaf_capacity)) { + + /* + * slab_debug disables cpu sheaves intentionally so all + * prefilled sheaves become "oversize" and we give up on + * performance for the debugging. Same with SLUB_TINY. + * Creating a cache without sheaves and then requesting a + * prefilled sheaf is however not expected, so warn. + */ + WARN_ON_ONCE(s->sheaf_capacity == 0 && + !IS_ENABLED(CONFIG_SLUB_TINY) && + !(s->flags & SLAB_DEBUG_FLAGS)); + + sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); + if (!sheaf) + return NULL; + + stat(s, SHEAF_PREFILL_OVERSIZE); + sheaf->cache = s; + sheaf->capacity = size; + + if (!__kmem_cache_alloc_bulk(s, gfp, size, + &sheaf->objects[0])) { + kfree(sheaf); + return NULL; + } + + sheaf->size = size; + + return sheaf; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (pcs->spare) { + sheaf = pcs->spare; + pcs->spare = NULL; + stat(s, SHEAF_PREFILL_FAST); + } else { + stat(s, SHEAF_PREFILL_SLOW); + sheaf = barn_get_full_or_empty_sheaf(get_barn(s)); + if (sheaf && sheaf->size) + stat(s, BARN_GET); + else + stat(s, BARN_GET_FAIL); + } + + local_unlock(&s->cpu_sheaves->lock); + + + if (!sheaf) + sheaf = alloc_empty_sheaf(s, gfp); + + if (sheaf && sheaf->size < size) { + if (refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + sheaf = NULL; + } + } + + if (sheaf) + sheaf->capacity = s->sheaf_capacity; + + return sheaf; +} + +/* + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf() + * + * If the sheaf cannot simply become the percpu spare sheaf, but there's space + * for a full sheaf in the barn, we try to refill the sheaf back to the cache's + * sheaf_capacity to avoid handling partially full sheaves. + * + * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the + * sheaf is instead flushed and freed. + */ +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + struct slub_percpu_sheaves *pcs; + struct node_barn *barn; + + if (unlikely(sheaf->capacity != s->sheaf_capacity)) { + sheaf_flush_unused(s, sheaf); + kfree(sheaf); + return; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + barn = get_barn(s); + + if (!pcs->spare) { + pcs->spare = sheaf; + sheaf = NULL; + stat(s, SHEAF_RETURN_FAST); + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!sheaf) + return; + + stat(s, SHEAF_RETURN_SLOW); + + /* + * If the barn has too many full sheaves or we fail to refill the sheaf, + * simply flush and free it. + */ + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES || + refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + return; + } + + barn_put_full_sheaf(barn, sheaf); + stat(s, BARN_PUT); +} + +/* + * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least + * the given size + * + * the sheaf might be replaced by a new one when requesting more than + * s->sheaf_capacity objects if such replacement is necessary, but the refill + * fails (returning -ENOMEM), the existing sheaf is left intact + * + * In practice we always refill to full sheaf's capacity. + */ +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size) +{ + struct slab_sheaf *sheaf; + + /* + * TODO: do we want to support *sheaf == NULL to be equivalent of + * kmem_cache_prefill_sheaf() ? + */ + if (!sheafp || !(*sheafp)) + return -EINVAL; + + sheaf = *sheafp; + if (sheaf->size >= size) + return 0; + + if (likely(sheaf->capacity >= size)) { + if (likely(sheaf->capacity == s->sheaf_capacity)) + return refill_sheaf(s, sheaf, gfp); + + if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, + &sheaf->objects[sheaf->size])) { + return -ENOMEM; + } + sheaf->size = sheaf->capacity; + + return 0; + } + + /* + * We had a regular sized sheaf and need an oversize one, or we had an + * oversize one already but need a larger one now. + * This should be a very rare path so let's not complicate it. + */ + sheaf = kmem_cache_prefill_sheaf(s, gfp, size); + if (!sheaf) + return -ENOMEM; + + kmem_cache_return_sheaf(s, gfp, *sheafp); + *sheafp = sheaf; + return 0; +} + +/* + * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf() + * + * Guaranteed not to fail as many allocations as was the requested size. + * After the sheaf is emptied, it fails - no fallback to the slab cache itself. + * + * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT + * memcg charging is forced over limit if necessary, to avoid failure. + */ +void * +kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + void *ret = NULL; + bool init; + + if (sheaf->size == 0) + goto out; + + ret = sheaf->objects[--sheaf->size]; + + init = slab_want_init_on_alloc(gfp, s); + + /* add __GFP_NOFAIL to force successful memcg charging */ + slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size); +out: + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE); + + return ret; +} + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) +{ + return sheaf->size; +} /* * To avoid unnecessary overhead, we pass through large allocation requests * directly to the page allocator. We use __GFP_COMP, because we will need to @@ -8578,6 +8831,11 @@ STAT_ATTR(BARN_GET, barn_get); STAT_ATTR(BARN_GET_FAIL, barn_get_fail); STAT_ATTR(BARN_PUT, barn_put); STAT_ATTR(BARN_PUT_FAIL, barn_put_fail); +STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast); +STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow); +STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize); +STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast); +STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow); #endif /* CONFIG_SLUB_STATS */ #ifdef CONFIG_KFENCE @@ -8678,6 +8936,11 @@ static struct attribute *slab_attrs[] = { &barn_get_fail_attr.attr, &barn_put_attr.attr, &barn_put_fail_attr.attr, + &sheaf_prefill_fast_attr.attr, + &sheaf_prefill_slow_attr.attr, + &sheaf_prefill_oversize_attr.attr, + &sheaf_return_fast_attr.attr, + &sheaf_return_slow_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, -- cgit v1.2.3 From 9b05890a25d9197e39fcf5b2298f0b911c323306 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 3 Sep 2025 15:00:01 +0200 Subject: maple_tree: Prefilled sheaf conversion and testing Use prefilled sheaves instead of bulk allocations. This should speed up the allocations and the return path of unused allocations. Remove the push and pop of nodes from the maple state as this is now handled by the slab layer with sheaves. Testing has been removed as necessary since the features of the tree have been reduced. Signed-off-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- include/linux/maple_tree.h | 6 +- lib/maple_tree.c | 326 ++++++--------------------- tools/testing/radix-tree/maple.c | 461 ++------------------------------------- tools/testing/shared/linux.c | 5 +- 4 files changed, 88 insertions(+), 710 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index bafe143b1f78..0e31b191e3be 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -442,7 +442,8 @@ struct ma_state { struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ - struct maple_alloc *alloc; /* Allocated nodes for this operation */ + struct slab_sheaf *sheaf; /* Allocated nodes for this operation */ + unsigned long node_request; /* The number of nodes to allocate for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; @@ -490,7 +491,8 @@ struct ma_wr_state { .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ - .alloc = NULL, \ + .node_request = 0, \ + .sheaf = NULL, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0439aaacf6cb..b41245f2cc65 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -182,6 +182,22 @@ static inline void mt_free_bulk(size_t size, void __rcu **nodes) kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } +static void mt_return_sheaf(struct slab_sheaf *sheaf) +{ + kmem_cache_return_sheaf(maple_node_cache, GFP_NOWAIT, sheaf); +} + +static struct slab_sheaf *mt_get_sheaf(gfp_t gfp, int count) +{ + return kmem_cache_prefill_sheaf(maple_node_cache, gfp, count); +} + +static int mt_refill_sheaf(gfp_t gfp, struct slab_sheaf **sheaf, + unsigned int size) +{ + return kmem_cache_refill_sheaf(maple_node_cache, gfp, sheaf, size); +} + /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free @@ -574,67 +590,6 @@ static __always_inline bool mte_dead_node(const struct maple_enode *enode) return ma_dead_node(node); } -/* - * mas_allocated() - Get the number of nodes allocated in a maple state. - * @mas: The maple state - * - * The ma_state alloc member is overloaded to hold a pointer to the first - * allocated node or to the number of requested nodes to allocate. If bit 0 is - * set, then the alloc contains the number of requested nodes. If there is an - * allocated node, then the total allocated nodes is in that node. - * - * Return: The total number of nodes allocated - */ -static inline unsigned long mas_allocated(const struct ma_state *mas) -{ - if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) - return 0; - - return mas->alloc->total; -} - -/* - * mas_set_alloc_req() - Set the requested number of allocations. - * @mas: the maple state - * @count: the number of allocations. - * - * The requested number of allocations is either in the first allocated node, - * located in @mas->alloc->request_count, or directly in @mas->alloc if there is - * no allocated node. Set the request either in the node or do the necessary - * encoding to store in @mas->alloc directly. - */ -static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) -{ - if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { - if (!count) - mas->alloc = NULL; - else - mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); - return; - } - - mas->alloc->request_count = count; -} - -/* - * mas_alloc_req() - get the requested number of allocations. - * @mas: The maple state - * - * The alloc count is either stored directly in @mas, or in - * @mas->alloc->request_count if there is at least one node allocated. Decode - * the request count if it's stored directly in @mas->alloc. - * - * Return: The allocation request count. - */ -static inline unsigned int mas_alloc_req(const struct ma_state *mas) -{ - if ((unsigned long)mas->alloc & 0x1) - return (unsigned long)(mas->alloc) >> 1; - else if (mas->alloc) - return mas->alloc->request_count; - return 0; -} - /* * ma_pivots() - Get a pointer to the maple node pivots. * @node: the maple node @@ -1120,77 +1075,15 @@ static int mas_ascend(struct ma_state *mas) */ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { - struct maple_alloc *ret, *node = mas->alloc; - unsigned long total = mas_allocated(mas); - unsigned int req = mas_alloc_req(mas); + struct maple_node *ret; - /* nothing or a request pending. */ - if (WARN_ON(!total)) + if (WARN_ON_ONCE(!mas->sheaf)) return NULL; - if (total == 1) { - /* single allocation in this ma_state */ - mas->alloc = NULL; - ret = node; - goto single_node; - } - - if (node->node_count == 1) { - /* Single allocation in this node. */ - mas->alloc = node->slot[0]; - mas->alloc->total = node->total - 1; - ret = node; - goto new_head; - } - node->total--; - ret = node->slot[--node->node_count]; - node->slot[node->node_count] = NULL; - -single_node: -new_head: - if (req) { - req++; - mas_set_alloc_req(mas, req); - } - + ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf); memset(ret, 0, sizeof(*ret)); - return (struct maple_node *)ret; -} - -/* - * mas_push_node() - Push a node back on the maple state allocation. - * @mas: The maple state - * @used: The used maple node - * - * Stores the maple node back into @mas->alloc for reuse. Updates allocated and - * requested node count as necessary. - */ -static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) -{ - struct maple_alloc *reuse = (struct maple_alloc *)used; - struct maple_alloc *head = mas->alloc; - unsigned long count; - unsigned int requested = mas_alloc_req(mas); - count = mas_allocated(mas); - - reuse->request_count = 0; - reuse->node_count = 0; - if (count) { - if (head->node_count < MAPLE_ALLOC_SLOTS) { - head->slot[head->node_count++] = reuse; - head->total++; - goto done; - } - reuse->slot[0] = head; - reuse->node_count = 1; - } - - reuse->total = count + 1; - mas->alloc = reuse; -done: - if (requested > 1) - mas_set_alloc_req(mas, requested - 1); + return ret; } /* @@ -1200,75 +1093,32 @@ done: */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { - struct maple_alloc *node; - unsigned long allocated = mas_allocated(mas); - unsigned int requested = mas_alloc_req(mas); - unsigned int count; - void **slots = NULL; - unsigned int max_req = 0; - - if (!requested) - return; + if (unlikely(mas->sheaf)) { + unsigned long refill = mas->node_request; - mas_set_alloc_req(mas, 0); - if (mas->mas_flags & MA_STATE_PREALLOC) { - if (allocated) + if (kmem_cache_sheaf_size(mas->sheaf) >= refill) { + mas->node_request = 0; return; - WARN_ON(!allocated); - } - - if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { - node = (struct maple_alloc *)mt_alloc_one(gfp); - if (!node) - goto nomem_one; - - if (allocated) { - node->slot[0] = mas->alloc; - node->node_count = 1; - } else { - node->node_count = 0; } - mas->alloc = node; - node->total = ++allocated; - node->request_count = 0; - requested--; - } + if (mt_refill_sheaf(gfp, &mas->sheaf, refill)) + goto error; - node = mas->alloc; - while (requested) { - max_req = MAPLE_ALLOC_SLOTS - node->node_count; - slots = (void **)&node->slot[node->node_count]; - max_req = min(requested, max_req); - count = mt_alloc_bulk(gfp, max_req, slots); - if (!count) - goto nomem_bulk; - - if (node->node_count == 0) { - node->slot[0]->node_count = 0; - node->slot[0]->request_count = 0; - } + mas->node_request = 0; + return; + } - node->node_count += count; - allocated += count; - /* find a non-full node*/ - do { - node = node->slot[0]; - } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); - requested -= count; + mas->sheaf = mt_get_sheaf(gfp, mas->node_request); + if (likely(mas->sheaf)) { + mas->node_request = 0; + return; } - mas->alloc->total = allocated; - return; -nomem_bulk: - /* Clean up potential freed allocations on bulk failure */ - memset(slots, 0, max_req * sizeof(unsigned long)); - mas->alloc->total = allocated; -nomem_one: - mas_set_alloc_req(mas, requested); +error: mas_set_err(mas, -ENOMEM); } + /* * mas_free() - Free an encoded maple node * @mas: The maple state @@ -1279,42 +1129,7 @@ nomem_one: */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { - struct maple_node *tmp = mte_to_node(used); - - if (mt_in_rcu(mas->tree)) - ma_free_rcu(tmp); - else - mas_push_node(mas, tmp); -} - -/* - * mas_node_count_gfp() - Check if enough nodes are allocated and request more - * if there is not enough nodes. - * @mas: The maple state - * @count: The number of nodes needed - * @gfp: the gfp flags - */ -static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) -{ - unsigned long allocated = mas_allocated(mas); - - if (allocated < count) { - mas_set_alloc_req(mas, count - allocated); - mas_alloc_nodes(mas, gfp); - } -} - -/* - * mas_node_count() - Check if enough nodes are allocated and request more if - * there is not enough nodes. - * @mas: The maple state - * @count: The number of nodes needed - * - * Note: Uses GFP_NOWAIT for gfp flags. - */ -static void mas_node_count(struct ma_state *mas, int count) -{ - return mas_node_count_gfp(mas, count, GFP_NOWAIT); + ma_free_rcu(mte_to_node(used)); } /* @@ -2451,10 +2266,7 @@ static inline void mas_topiary_node(struct ma_state *mas, enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); - if (in_rcu) - ma_free_rcu(tmp); - else - mas_push_node(mas, tmp); + ma_free_rcu(tmp); } /* @@ -3980,7 +3792,7 @@ set_content: * * Return: Number of nodes required for preallocation. */ -static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) +static inline void mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; unsigned char height = mas_mt_height(mas); @@ -4026,7 +3838,7 @@ static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) WARN_ON_ONCE(1); } - return ret; + mas->node_request = ret; } /* @@ -4087,15 +3899,15 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { - int request; + struct ma_state *mas = wr_mas->mas; mas_wr_prealloc_setup(wr_mas); - wr_mas->mas->store_type = mas_wr_store_type(wr_mas); - request = mas_prealloc_calc(wr_mas, entry); - if (!request) + mas->store_type = mas_wr_store_type(wr_mas); + mas_prealloc_calc(wr_mas, entry); + if (!mas->node_request) return; - mas_node_count(wr_mas->mas, request); + mas_alloc_nodes(mas, GFP_NOWAIT); } /** @@ -5208,7 +5020,6 @@ static inline void mte_destroy_walk(struct maple_enode *enode, */ void *mas_store(struct ma_state *mas, void *entry) { - int request; MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); @@ -5238,11 +5049,11 @@ void *mas_store(struct ma_state *mas, void *entry) return wr_mas.content; } - request = mas_prealloc_calc(&wr_mas, entry); - if (!request) + mas_prealloc_calc(&wr_mas, entry); + if (!mas->node_request) goto store; - mas_node_count(mas, request); + mas_alloc_nodes(mas, GFP_NOWAIT); if (mas_is_err(mas)) return NULL; @@ -5330,20 +5141,19 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); - int ret = 0; - int request; mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); - request = mas_prealloc_calc(&wr_mas, entry); - if (!request) + mas_prealloc_calc(&wr_mas, entry); + if (!mas->node_request) goto set_flag; mas->mas_flags &= ~MA_STATE_PREALLOC; - mas_node_count_gfp(mas, request, gfp); + mas_alloc_nodes(mas, gfp); if (mas_is_err(mas)) { - mas_set_alloc_req(mas, 0); - ret = xa_err(mas->node); + int ret = xa_err(mas->node); + + mas->node_request = 0; mas_destroy(mas); mas_reset(mas); return ret; @@ -5351,7 +5161,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) set_flag: mas->mas_flags |= MA_STATE_PREALLOC; - return ret; + return 0; } EXPORT_SYMBOL_GPL(mas_preallocate); @@ -5365,26 +5175,13 @@ EXPORT_SYMBOL_GPL(mas_preallocate); */ void mas_destroy(struct ma_state *mas) { - struct maple_alloc *node; - unsigned long total; - mas->mas_flags &= ~MA_STATE_PREALLOC; - total = mas_allocated(mas); - while (total) { - node = mas->alloc; - mas->alloc = node->slot[0]; - if (node->node_count > 1) { - size_t count = node->node_count - 1; - - mt_free_bulk(count, (void __rcu **)&node->slot[1]); - total -= count; - } - kfree(ma_mnode_ptr(node)); - total--; - } + mas->node_request = 0; + if (mas->sheaf) + mt_return_sheaf(mas->sheaf); - mas->alloc = NULL; + mas->sheaf = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); @@ -6019,7 +5816,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp) mas_alloc_nodes(mas, gfp); } - if (!mas_allocated(mas)) + if (!mas->sheaf) return false; mas->status = ma_start; @@ -7414,8 +7211,9 @@ void mas_dump(const struct ma_state *mas) pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); - pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", - mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); + pr_err(" min=%lx max=%lx sheaf=" PTR_FMT ", request %lu depth=%u, flags=%x\n", + mas->min, mas->max, mas->sheaf, mas->node_request, mas->depth, + mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 4a35e1e7c64b..72a8fe8e832a 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -57,430 +57,6 @@ struct rcu_reader_struct { struct rcu_test_struct2 *test; }; -static int get_alloc_node_count(struct ma_state *mas) -{ - int count = 1; - struct maple_alloc *node = mas->alloc; - - if (!node || ((unsigned long)node & 0x1)) - return 0; - while (node->node_count) { - count += node->node_count; - node = node->slot[0]; - } - return count; -} - -static void check_mas_alloc_node_count(struct ma_state *mas) -{ - mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 1, GFP_KERNEL); - mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 3, GFP_KERNEL); - MT_BUG_ON(mas->tree, get_alloc_node_count(mas) != mas->alloc->total); - mas_destroy(mas); -} - -/* - * check_new_node() - Check the creation of new nodes and error path - * verification. - */ -static noinline void __init check_new_node(struct maple_tree *mt) -{ - - struct maple_node *mn, *mn2, *mn3; - struct maple_alloc *smn; - struct maple_node *nodes[100]; - int i, j, total; - - MA_STATE(mas, mt, 0, 0); - - check_mas_alloc_node_count(&mas); - - /* Try allocating 3 nodes */ - mtree_lock(mt); - mt_set_non_kernel(0); - /* request 3 nodes to be allocated. */ - mas_node_count(&mas, 3); - /* Allocation request of 3. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 3); - /* Allocate failed. */ - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, mas.alloc == NULL); - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); - mas_push_node(&mas, mn); - mas_reset(&mas); - mas_destroy(&mas); - mtree_unlock(mt); - - - /* Try allocating 1 node, then 2 more */ - mtree_lock(mt); - /* Set allocation request to 1. */ - mas_set_alloc_req(&mas, 1); - /* Check Allocation request of 1. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); - mas_set_err(&mas, -ENOMEM); - /* Validate allocation request. */ - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - /* Eat the requested node. */ - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, mn->slot[0] != NULL); - MT_BUG_ON(mt, mn->slot[1] != NULL); - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mas.status = ma_start; - mas_destroy(&mas); - /* Allocate 3 nodes, will fail. */ - mas_node_count(&mas, 3); - /* Drop the lock and allocate 3 nodes. */ - mas_nomem(&mas, GFP_KERNEL); - /* Ensure 3 are allocated. */ - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - /* Allocation request of 0. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 0); - - MT_BUG_ON(mt, mas.alloc == NULL); - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); - MT_BUG_ON(mt, mas.alloc->slot[1] == NULL); - /* Ensure we counted 3. */ - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - /* Free. */ - mas_reset(&mas); - mas_destroy(&mas); - - /* Set allocation request to 1. */ - mas_set_alloc_req(&mas, 1); - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); - mas_set_err(&mas, -ENOMEM); - /* Validate allocation request. */ - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != 1); - /* Check the node is only one node. */ - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, mn->slot[0] != NULL); - MT_BUG_ON(mt, mn->slot[1] != NULL); - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != 1); - MT_BUG_ON(mt, mas.alloc->node_count); - - mas_set_alloc_req(&mas, 2); /* request 2 more. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 2); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - MT_BUG_ON(mt, mas.alloc == NULL); - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); - MT_BUG_ON(mt, mas.alloc->slot[1] == NULL); - for (i = 2; i >= 0; i--) { - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != i); - MT_BUG_ON(mt, !mn); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - - total = 64; - mas_set_alloc_req(&mas, total); /* request 2 more. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != total); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - for (i = total; i > 0; i--) { - unsigned int e = 0; /* expected node_count */ - - if (!MAPLE_32BIT) { - if (i >= 35) - e = i - 34; - else if (i >= 5) - e = i - 4; - else if (i >= 2) - e = i - 1; - } else { - if (i >= 4) - e = i - 3; - else if (i >= 1) - e = i - 1; - else - e = 0; - } - - MT_BUG_ON(mt, mas.alloc->node_count != e); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != i - 1); - MT_BUG_ON(mt, !mn); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - - total = 100; - for (i = 1; i < total; i++) { - mas_set_alloc_req(&mas, i); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - for (j = i; j > 0; j--) { - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); - MT_BUG_ON(mt, !mn); - MT_BUG_ON(mt, not_empty(mn)); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != j); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - - mas_set_alloc_req(&mas, i); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - for (j = 0; j <= i/2; j++) { - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); - nodes[j] = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); - } - - while (j) { - j--; - mas_push_node(&mas, nodes[j]); - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); - } - MT_BUG_ON(mt, mas_allocated(&mas) != i); - for (j = 0; j <= i/2; j++) { - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); - } - mas_reset(&mas); - MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); - mas_destroy(&mas); - - } - - /* Set allocation request. */ - total = 500; - mas_node_count(&mas, total); - /* Drop the lock and allocate the nodes. */ - mas_nomem(&mas, GFP_KERNEL); - MT_BUG_ON(mt, !mas.alloc); - i = 1; - smn = mas.alloc; - while (i < total) { - for (j = 0; j < MAPLE_ALLOC_SLOTS; j++) { - i++; - MT_BUG_ON(mt, !smn->slot[j]); - if (i == total) - break; - } - smn = smn->slot[0]; /* next. */ - } - MT_BUG_ON(mt, mas_allocated(&mas) != total); - mas_reset(&mas); - mas_destroy(&mas); /* Free. */ - - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - for (i = 1; i < 128; i++) { - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != i); /* check request filled */ - for (j = i; j > 0; j--) { /*Free the requests */ - mn = mas_pop_node(&mas); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - } - - for (i = 1; i < MAPLE_NODE_MASK + 1; i++) { - MA_STATE(mas2, mt, 0, 0); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != i); /* check request filled */ - for (j = 1; j <= i; j++) { /* Move the allocations to mas2 */ - mn = mas_pop_node(&mas); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - mas_push_node(&mas2, mn); - MT_BUG_ON(mt, mas_allocated(&mas2) != j); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_allocated(&mas2) != i); - - for (j = i; j > 0; j--) { /*Free the requests */ - MT_BUG_ON(mt, mas_allocated(&mas2) != j); - mn = mas_pop_node(&mas2); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas2) != 0); - } - - - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */ - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - - mn = mas_pop_node(&mas); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); - - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - - /* Check the limit of pop/push/pop */ - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_alloc_req(&mas)); - MT_BUG_ON(mt, mas.alloc->node_count != 1); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas.alloc->node_count != 1); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - for (i = 1; i <= MAPLE_ALLOC_SLOTS + 1; i++) { - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - - - for (i = 3; i < MAPLE_NODE_MASK * 3; i++) { - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mas_push_node(&mas, mn); /* put it back */ - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn2 = mas_pop_node(&mas); /* get the next node. */ - mas_push_node(&mas, mn); /* put them back */ - mas_push_node(&mas, mn2); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn2 = mas_pop_node(&mas); /* get the next node. */ - mn3 = mas_pop_node(&mas); /* get the next node. */ - mas_push_node(&mas, mn); /* put them back */ - mas_push_node(&mas, mn2); - mas_push_node(&mas, mn3); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mas_destroy(&mas); - } - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, 5); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != 5); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, 10); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != 10); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS - 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS - 1); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, 10 + MAPLE_ALLOC_SLOTS - 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 2); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 2); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 1); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 3 + 2); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 3 + 2); - mas_destroy(&mas); - - mtree_unlock(mt); -} - /* * Check erasing including RCU. */ @@ -35507,6 +35083,13 @@ static unsigned char get_vacant_height(struct ma_wr_state *wr_mas, void *entry) return vacant_height; } +static int mas_allocated(struct ma_state *mas) +{ + if (mas->sheaf) + return kmem_cache_sheaf_size(mas->sheaf); + + return 0; +} /* Preallocation testing */ static noinline void __init check_prealloc(struct maple_tree *mt) { @@ -35525,7 +35108,10 @@ static noinline void __init check_prealloc(struct maple_tree *mt) /* Spanning store */ mas_set_range(&mas, 470, 500); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + + mas_wr_preallocate(&wr_mas, ptr); + MT_BUG_ON(mt, mas.store_type != wr_spanning_store); + MT_BUG_ON(mt, mas_is_err(&mas)); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); vacant_height = get_vacant_height(&wr_mas, ptr); @@ -35535,6 +35121,7 @@ static noinline void __init check_prealloc(struct maple_tree *mt) allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); + mas_wr_preallocate(&wr_mas, ptr); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); @@ -35575,20 +35162,6 @@ static noinline void __init check_prealloc(struct maple_tree *mt) mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); - allocated = mas_allocated(&mas); - height = mas_mt_height(&mas); - vacant_height = get_vacant_height(&wr_mas, ptr); - MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != allocated); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); - mas_destroy(&mas); - allocated = mas_allocated(&mas); - MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); @@ -36389,11 +35962,17 @@ static void check_nomem_writer_race(struct maple_tree *mt) check_load(mt, 6, xa_mk_value(0xC)); mtree_unlock(mt); + mt_set_non_kernel(0); /* test for the same race but with mas_store_gfp() */ mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); mas_set_range(&mas, 0, 5); + + /* setup writer 2 that will trigger the race condition */ + mt_set_private(mt); + mt_set_callback(writer2); + mtree_lock(mt); mas_store_gfp(&mas, NULL, GFP_KERNEL); @@ -36508,10 +36087,6 @@ void farmer_tests(void) check_erase_testset(&tree); mtree_destroy(&tree); - mt_init_flags(&tree, 0); - check_new_node(&tree); - mtree_destroy(&tree); - if (!MAPLE_32BIT) { mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_rcu_simulated(&tree); diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 4ceff7969b78..8c7257155958 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -64,7 +64,8 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, if (!(gfp & __GFP_DIRECT_RECLAIM)) { if (!cachep->non_kernel) { - cachep->exec_callback = true; + if (cachep->callback) + cachep->exec_callback = true; return NULL; } @@ -210,6 +211,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, for (i = 0; i < size; i++) __kmem_cache_free_locked(cachep, p[i]); pthread_mutex_unlock(&cachep->lock); + if (cachep->callback) + cachep->exec_callback = true; return 0; } -- cgit v1.2.3 From 6bf377b06c08049d0f4042493df302285e45165e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 3 Sep 2025 15:00:02 +0200 Subject: maple_tree: Add single node allocation support to maple state The fast path through a write will require replacing a single node in the tree. Using a sheaf (32 nodes) is too heavy for the fast path, so special case the node store operation by just allocating one node in the maple state. Signed-off-by: Liam R. Howlett Signed-off-by: Vlastimil Babka --- include/linux/maple_tree.h | 4 +++- lib/maple_tree.c | 47 +++++++++++++++++++++++++++++++++++----- tools/testing/radix-tree/maple.c | 9 ++++++-- 3 files changed, 51 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0e31b191e3be..51a64ff23b88 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -443,6 +443,7 @@ struct ma_state { unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct slab_sheaf *sheaf; /* Allocated nodes for this operation */ + struct maple_node *alloc; /* A single allocated node for fast path writes */ unsigned long node_request; /* The number of nodes to allocate for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ @@ -491,8 +492,9 @@ struct ma_wr_state { .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ - .node_request = 0, \ .sheaf = NULL, \ + .alloc = NULL, \ + .node_request = 0, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b41245f2cc65..2b7299409900 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1073,16 +1073,23 @@ static int mas_ascend(struct ma_state *mas) * * Return: A pointer to a maple node. */ -static inline struct maple_node *mas_pop_node(struct ma_state *mas) +static __always_inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_node *ret; + if (mas->alloc) { + ret = mas->alloc; + mas->alloc = NULL; + goto out; + } + if (WARN_ON_ONCE(!mas->sheaf)) return NULL; ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf); - memset(ret, 0, sizeof(*ret)); +out: + memset(ret, 0, sizeof(*ret)); return ret; } @@ -1093,9 +1100,34 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { - if (unlikely(mas->sheaf)) { - unsigned long refill = mas->node_request; + if (!mas->node_request) + return; + + if (mas->node_request == 1) { + if (mas->sheaf) + goto use_sheaf; + + if (mas->alloc) + return; + mas->alloc = mt_alloc_one(gfp); + if (!mas->alloc) + goto error; + + mas->node_request = 0; + return; + } + +use_sheaf: + if (unlikely(mas->alloc)) { + kfree(mas->alloc); + mas->alloc = NULL; + } + + if (mas->sheaf) { + unsigned long refill; + + refill = mas->node_request; if (kmem_cache_sheaf_size(mas->sheaf) >= refill) { mas->node_request = 0; return; @@ -5180,8 +5212,11 @@ void mas_destroy(struct ma_state *mas) mas->node_request = 0; if (mas->sheaf) mt_return_sheaf(mas->sheaf); - mas->sheaf = NULL; + + if (mas->alloc) + kfree(mas->alloc); + mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); @@ -5816,7 +5851,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp) mas_alloc_nodes(mas, gfp); } - if (!mas->sheaf) + if (!mas->sheaf && !mas->alloc) return false; mas->status = ma_start; diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 72a8fe8e832a..83260f2efb19 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35085,10 +35085,15 @@ static unsigned char get_vacant_height(struct ma_wr_state *wr_mas, void *entry) static int mas_allocated(struct ma_state *mas) { + int total = 0; + + if (mas->alloc) + total++; + if (mas->sheaf) - return kmem_cache_sheaf_size(mas->sheaf); + total += kmem_cache_sheaf_size(mas->sheaf); - return 0; + return total; } /* Preallocation testing */ static noinline void __init check_prealloc(struct maple_tree *mt) -- cgit v1.2.3 From 4957089a23f41f31f8e7e22802a8ef9f5789c191 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:02 -0700 Subject: locking/local_lock: Introduce local_lock_is_locked(). Introduce local_lock_is_locked() that returns true when given local_lock is locked by current cpu (in !PREEMPT_RT) or by current task (in PREEMPT_RT). The goal is to detect a deadlock by the caller. Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Alexei Starovoitov Signed-off-by: Vlastimil Babka --- include/linux/local_lock.h | 2 ++ include/linux/local_lock_internal.h | 7 +++++++ include/linux/rtmutex.h | 10 ++++++++++ kernel/locking/rtmutex_common.h | 9 --------- 4 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 2ba846419524..0d91d060e3e9 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -66,6 +66,8 @@ */ #define local_trylock(lock) __local_trylock(this_cpu_ptr(lock)) +#define local_lock_is_locked(lock) __local_lock_is_locked(lock) + /** * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable * interrupts if acquired diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 949de37700db..a4dc479157b5 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -165,6 +165,9 @@ do { \ !!tl; \ }) +/* preemption or migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(lock) READ_ONCE(this_cpu_ptr(lock)->acquired) + #define __local_lock_release(lock) \ do { \ local_trylock_t *tl; \ @@ -285,4 +288,8 @@ do { \ __local_trylock(lock); \ }) +/* migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(__lock) \ + (rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current) + #endif /* CONFIG_PREEMPT_RT */ diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index fa9f1021541e..ede4c6bf6f22 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -44,6 +44,16 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) return READ_ONCE(lock->owner) != NULL; } +#ifdef CONFIG_RT_MUTEXES +#define RT_MUTEX_HAS_WAITERS 1UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) +{ + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); +} +#endif extern void rt_mutex_base_init(struct rt_mutex_base *rtb); /** diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 78dd3d8c6554..cf6ddd1b23a2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -153,15 +153,6 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) pi_tree.entry); } -#define RT_MUTEX_HAS_WAITERS 1UL - -static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) -{ - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); - - return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); -} - /* * Constants for rt mutex functions which have a selectable deadlock * detection. -- cgit v1.2.3 From 99253de51f80acccc528a9c94e2f4d5f329071f1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:03 -0700 Subject: mm: Allow GFP_ACCOUNT to be used in alloc_pages_nolock(). Change alloc_pages_nolock() to default to __GFP_COMP when allocating pages, since upcoming reentrant alloc_slab_page() needs __GFP_COMP. Also allow __GFP_ACCOUNT flag to be specified, since most of BPF infra needs __GFP_ACCOUNT except BPF streams. Reviewed-by: Vlastimil Babka Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/gfp.h | 2 +- kernel/bpf/stream.c | 2 +- kernel/bpf/syscall.c | 2 +- mm/page_alloc.c | 10 ++++++---- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5ebf26fcdcfa..0ceb4e09306c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -354,7 +354,7 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp, } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) -struct page *alloc_pages_nolock_noprof(int nid, unsigned int order); +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index ab592db4a4bf..eb6c5a21c2ef 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -83,7 +83,7 @@ static struct bpf_stream_page *bpf_stream_page_replace(void) struct bpf_stream_page *stream_page, *old_stream_page; struct page *page; - page = alloc_pages_nolock(NUMA_NO_NODE, 0); + page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0); if (!page) return NULL; stream_page = page_address(page); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0fbfa8532c39..dbf86f8014de 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -581,7 +581,7 @@ static bool can_alloc_pages(void) static struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) - return alloc_pages_nolock(nid, 0); + return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1d037f97c5f..30ccff0283fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7480,6 +7480,7 @@ static bool __free_unaccepted(struct page *page) /** * alloc_pages_nolock - opportunistic reentrant allocation from any context + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed. * @nid: node to allocate from * @order: allocation order size * @@ -7493,7 +7494,7 @@ static bool __free_unaccepted(struct page *page) * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. * It means ENOMEM. There is no reason to call it again and expect !NULL. */ -struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) { /* * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. @@ -7515,12 +7516,13 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) * specify it here to highlight that alloc_pages_nolock() * doesn't want to deplete reserves. */ - gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC - | __GFP_ACCOUNT; + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP + | gfp_flags; unsigned int alloc_flags = ALLOC_TRYLOCK; struct alloc_context ac = { }; struct page *page; + VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT); /* * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is * unsafe in NMI. If spin_trylock() is called from hard IRQ the current @@ -7558,7 +7560,7 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) if (page) set_page_refcounted(page); - if (memcg_kmem_online() && page && + if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) && unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { free_pages_nolock(page, order); page = NULL; -- cgit v1.2.3 From 7612833192d56af86061de8ab51989b75daf5b0d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:06 -0700 Subject: slab: Reuse first bit for OBJEXTS_ALLOC_FAIL Since the combination of valid upper bits in slab->obj_exts with OBJEXTS_ALLOC_FAIL bit can never happen, use OBJEXTS_ALLOC_FAIL == (1ull << 0) as a magic sentinel instead of (1ull << 2) to free up bit 2. Signed-off-by: Alexei Starovoitov Acked-by: Shakeel Butt Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/memcontrol.h | 10 ++++++++-- mm/slub.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 785173aa0739..d254c0b96d0d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -341,17 +341,23 @@ enum page_memcg_data_flags { __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; +#define __OBJEXTS_ALLOC_FAIL MEMCG_DATA_OBJEXTS #define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS #else /* CONFIG_MEMCG */ +#define __OBJEXTS_ALLOC_FAIL (1UL << 0) #define __FIRST_OBJEXT_FLAG (1UL << 0) #endif /* CONFIG_MEMCG */ enum objext_flags { - /* slabobj_ext vector failed to allocate */ - OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, + /* + * Use bit 0 with zero other bits to signal that slabobj_ext vector + * failed to allocate. The same bit 0 with valid upper bits means + * MEMCG_DATA_OBJEXTS. + */ + OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; diff --git a/mm/slub.c b/mm/slub.c index ee575ed9250f..189cd5aa4ac4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2046,7 +2046,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, * objects with no tag reference. Mark all references in this * vector as empty to avoid warnings later on. */ - if (obj_exts & OBJEXTS_ALLOC_FAIL) { + if (obj_exts == OBJEXTS_ALLOC_FAIL) { unsigned int i; for (i = 0; i < objects; i++) -- cgit v1.2.3 From af92793e52c3a99b828ed4bdd277fd3e11c18d08 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:07 -0700 Subject: slab: Introduce kmalloc_nolock() and kfree_nolock(). kmalloc_nolock() relies on ability of local_trylock_t to detect the situation when per-cpu kmem_cache is locked. In !PREEMPT_RT local_(try)lock_irqsave(&s->cpu_slab->lock, flags) disables IRQs and marks s->cpu_slab->lock as acquired. local_lock_is_locked(&s->cpu_slab->lock) returns true when slab is in the middle of manipulating per-cpu cache of that specific kmem_cache. kmalloc_nolock() can be called from any context and can re-enter into ___slab_alloc(): kmalloc() -> ___slab_alloc(cache_A) -> irqsave -> NMI -> bpf -> kmalloc_nolock() -> ___slab_alloc(cache_B) or kmalloc() -> ___slab_alloc(cache_A) -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() -> ___slab_alloc(cache_B) Hence the caller of ___slab_alloc() checks if &s->cpu_slab->lock can be acquired without a deadlock before invoking the function. If that specific per-cpu kmem_cache is busy the kmalloc_nolock() retries in a different kmalloc bucket. The second attempt will likely succeed, since this cpu locked different kmem_cache. Similarly, in PREEMPT_RT local_lock_is_locked() returns true when per-cpu rt_spin_lock is locked by current _task_. In this case re-entrance into the same kmalloc bucket is unsafe, and kmalloc_nolock() tries a different bucket that is most likely is not locked by the current task. Though it may be locked by a different task it's safe to rt_spin_lock() and sleep on it. Similar to alloc_pages_nolock() the kmalloc_nolock() returns NULL immediately if called from hard irq or NMI in PREEMPT_RT. kfree_nolock() defers freeing to irq_work when local_lock_is_locked() and (in_nmi() or in PREEMPT_RT). SLUB_TINY config doesn't use local_lock_is_locked() and relies on spin_trylock_irqsave(&n->list_lock) to allocate, while kfree_nolock() always defers to irq_work. Note, kfree_nolock() must be called _only_ for objects allocated with kmalloc_nolock(). Debug checks (like kmemleak and kfence) were skipped on allocation, hence obj = kmalloc(); kfree_nolock(obj); will miss kmemleak/kfence book keeping and will cause false positives. large_kmalloc is not supported by either kmalloc_nolock() or kfree_nolock(). Signed-off-by: Alexei Starovoitov Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/kasan.h | 13 +- include/linux/memcontrol.h | 2 + include/linux/slab.h | 4 + mm/Kconfig | 1 + mm/kasan/common.c | 5 +- mm/slab.h | 6 + mm/slab_common.c | 3 + mm/slub.c | 504 ++++++++++++++++++++++++++++++++++++++++----- 8 files changed, 483 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 890011071f2b..acdc8cb0152e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -200,7 +200,7 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, } bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, - bool still_accessible); + bool still_accessible, bool no_quarantine); /** * kasan_slab_free - Poison, initialize, and quarantine a slab object. * @object: Object to be freed. @@ -226,11 +226,13 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, * @Return true if KASAN took ownership of the object; false otherwise. */ static __always_inline bool kasan_slab_free(struct kmem_cache *s, - void *object, bool init, - bool still_accessible) + void *object, bool init, + bool still_accessible, + bool no_quarantine) { if (kasan_enabled()) - return __kasan_slab_free(s, object, init, still_accessible); + return __kasan_slab_free(s, object, init, still_accessible, + no_quarantine); return false; } @@ -427,7 +429,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) } static inline bool kasan_slab_free(struct kmem_cache *s, void *object, - bool init, bool still_accessible) + bool init, bool still_accessible, + bool no_quarantine) { return false; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d254c0b96d0d..82563236f35c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -358,6 +358,8 @@ enum objext_flags { * MEMCG_DATA_OBJEXTS. */ OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL, + /* slabobj_ext vector allocated with kmalloc_nolock() */ + OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; diff --git a/include/linux/slab.h b/include/linux/slab.h index 680193356ac7..561597dd2164 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -501,6 +501,7 @@ void * __must_check krealloc_noprof(const void *objp, size_t new_size, #define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__)) void kfree(const void *objp); +void kfree_nolock(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -957,6 +958,9 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f } #define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node); +#define kmalloc_nolock(...) alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__)) + #define kmem_buckets_alloc(_b, _size, _flags) \ alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) diff --git a/mm/Kconfig b/mm/Kconfig index e443fe8cd6cf..202e044f2b4d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -194,6 +194,7 @@ menu "Slab allocator options" config SLUB def_bool y + select IRQ_WORK config KVFREE_RCU_BATCHED def_bool y diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 9142964ab9c9..3264900b942f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -252,7 +252,7 @@ bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, } bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, - bool still_accessible) + bool still_accessible, bool no_quarantine) { if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; @@ -274,6 +274,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, poison_slab_object(cache, object, init); + if (no_quarantine) + return false; + /* * If the object is put into quarantine, do not let slab put the object * onto the freelist for now. The object's metadata is kept until the diff --git a/mm/slab.h b/mm/slab.h index 43245d9207b6..35e533e59b07 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -57,6 +57,10 @@ struct slab { struct { union { struct list_head slab_list; + struct { /* For deferred deactivate_slab() */ + struct llist_node llnode; + void *flush_freelist; + }; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next; @@ -662,6 +666,8 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); +void defer_free_barrier(void); + static inline bool slub_debug_orig_size(struct kmem_cache *s) { return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && diff --git a/mm/slab_common.c b/mm/slab_common.c index b6601e0fe598..932d13ada36c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -510,6 +510,9 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_barrier(); } + /* Wait for deferred work from kmalloc/kfree_nolock() */ + defer_free_barrier(); + cpus_read_lock(); mutex_lock(&slab_mutex); diff --git a/mm/slub.c b/mm/slub.c index 189cd5aa4ac4..f9f7f3942074 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -44,7 +44,8 @@ #include #include #include - +#include +#include #include #include @@ -426,7 +427,7 @@ struct kmem_cache_cpu { #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ #endif - local_lock_t lock; /* Protects the fields above */ + local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif @@ -2079,6 +2080,7 @@ static inline void init_slab_obj_exts(struct slab *slab) int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { + bool allow_spin = gfpflags_allow_spinning(gfp); unsigned int objects = objs_per_slab(s, slab); unsigned long new_exts; unsigned long old_exts; @@ -2087,8 +2089,22 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp &= ~OBJCGS_CLEAR_MASK; /* Prevent recursive extension vector allocation */ gfp |= __GFP_NO_OBJ_EXT; - vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, - slab_nid(slab)); + + /* + * Note that allow_spin may be false during early boot and its + * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting + * architectures with cmpxchg16b, early obj_exts will be missing for + * very early allocations on those. + */ + if (unlikely(!allow_spin)) { + size_t sz = objects * sizeof(struct slabobj_ext); + + vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, + slab_nid(slab)); + } else { + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + } if (!vec) { /* Mark vectors which failed to allocate */ if (new_slab) @@ -2098,6 +2114,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, } new_exts = (unsigned long)vec; + if (unlikely(!allow_spin)) + new_exts |= OBJEXTS_NOSPIN_ALLOC; #ifdef CONFIG_MEMCG new_exts |= MEMCG_DATA_OBJEXTS; #endif @@ -2118,7 +2136,10 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, * objcg vector should be reused. */ mark_objexts_empty(vec); - kfree(vec); + if (unlikely(!allow_spin)) + kfree_nolock(vec); + else + kfree(vec); return 0; } @@ -2142,7 +2163,10 @@ static inline void free_slab_obj_exts(struct slab *slab) * the extension for obj_exts is expected to be NULL. */ mark_objexts_empty(obj_exts); - kfree(obj_exts); + if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC)) + kfree_nolock(obj_exts); + else + kfree(obj_exts); slab->obj_exts = 0; } @@ -2476,7 +2500,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return !kasan_slab_free(s, x, init, still_accessible); + return !kasan_slab_free(s, x, init, still_accessible, false); } static __fastpath_inline @@ -2981,13 +3005,17 @@ static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) * Slab allocation and freeing */ static inline struct slab *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) + struct kmem_cache_order_objects oo, + bool allow_spin) { struct folio *folio; struct slab *slab; unsigned int order = oo_order(oo); - if (node == NUMA_NO_NODE) + if (unlikely(!allow_spin)) + folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + node, order); + else if (node == NUMA_NO_NODE) folio = (struct folio *)alloc_frozen_pages(flags, order); else folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); @@ -3137,6 +3165,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order, static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { + bool allow_spin = gfpflags_allow_spinning(flags); struct slab *slab; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; @@ -3156,7 +3185,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; - slab = alloc_slab_page(alloc_gfp, node, oo); + /* + * __GFP_RECLAIM could be cleared on the first allocation attempt, + * so pass allow_spin flag directly. + */ + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; @@ -3164,7 +3197,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - slab = alloc_slab_page(alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) return NULL; stat(s, ORDER_FALLBACK); @@ -3333,33 +3366,47 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); + /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. Allocate a single object instead of whole freelist * and put the slab to the partial (or full) list. */ -static void *alloc_single_from_new_slab(struct kmem_cache *s, - struct slab *slab, int orig_size) +static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, + int orig_size, gfp_t gfpflags) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); int nid = slab_nid(slab); struct kmem_cache_node *n = get_node(s, nid); unsigned long flags; void *object; + if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + slab->frozen = 1; + defer_deactivate_slab(slab, NULL); + return NULL; + } object = slab->freelist; slab->freelist = get_freepointer(s, object); slab->inuse = 1; - if (!alloc_debug_processing(s, slab, object, orig_size)) + if (!alloc_debug_processing(s, slab, object, orig_size)) { /* * It's not really expected that this would fail on a * freshly allocated slab, but a concurrent memory * corruption in theory could cause that. + * Leak memory of allocated slab. */ + if (!allow_spin) + spin_unlock_irqrestore(&n->list_lock, flags); return NULL; + } - spin_lock_irqsave(&n->list_lock, flags); + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); if (slab->inuse == slab->objects) add_full(s, n, slab); @@ -3400,7 +3447,10 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!n || !n->nr_partial) return NULL; - spin_lock_irqsave(&n->list_lock, flags); + if (gfpflags_allow_spinning(pc->flags)) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return NULL; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { if (!pfmemalloc_match(slab, pc->flags)) continue; @@ -3606,7 +3656,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) lockdep_register_key(&s->lock_key); for_each_possible_cpu(cpu) { c = per_cpu_ptr(s->cpu_slab, cpu); - local_lock_init(&c->lock); + local_trylock_init(&c->lock); if (finegrain_lockdep) lockdep_set_class(&c->lock, &s->lock_key); c->tid = init_tid(cpu); @@ -3699,6 +3749,47 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } } +/* + * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock + * can be acquired without a deadlock before invoking the function. + * + * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is + * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), + * and kmalloc() is not used in an unsupported context. + * + * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). + * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but + * lockdep_assert() will catch a bug in case: + * #1 + * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() + * or + * #2 + * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() + * + * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt + * disabled context. The lock will always be acquired and if needed it + * block and sleep until the lock is available. + * #1 is possible in !PREEMPT_RT only. + * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: + * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> + * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) + * + * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B + */ +#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) +#define local_lock_cpu_slab(s, flags) \ + local_lock_irqsave(&(s)->cpu_slab->lock, flags) +#else +#define local_lock_cpu_slab(s, flags) \ + do { \ + bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ + lockdep_assert(__l); \ + } while (0) +#endif + +#define local_unlock_cpu_slab(s, flags) \ + local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) + #ifdef CONFIG_SLUB_CPU_PARTIAL static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) { @@ -3783,7 +3874,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) unsigned long flags; int slabs = 0; - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); oldslab = this_cpu_read(s->cpu_slab->partial); @@ -3808,7 +3899,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) this_cpu_write(s->cpu_slab->partial, slab); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); if (slab_to_put) { __put_partials(s, slab_to_put); @@ -4323,6 +4414,7 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); void *freelist; struct slab *slab; unsigned long flags; @@ -4348,9 +4440,21 @@ reread_slab: if (unlikely(!node_match(slab, node))) { /* * same as above but node_match() being false already - * implies node != NUMA_NO_NODE + * implies node != NUMA_NO_NODE. + * + * We don't strictly honor pfmemalloc and NUMA preferences + * when !allow_spin because: + * + * 1. Most kmalloc() users allocate objects on the local node, + * so kmalloc_nolock() tries not to interfere with them by + * deactivating the cpu slab. + * + * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause + * unnecessary slab allocations even when n->partial list + * is not empty. */ - if (!node_isset(node, slab_nodes)) { + if (!node_isset(node, slab_nodes) || + !allow_spin) { node = NUMA_NO_NODE; } else { stat(s, ALLOC_NODE_MISMATCH); @@ -4363,13 +4467,14 @@ reread_slab: * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) goto deactivate_slab; /* must check again c->slab in case we got preempted and it changed */ - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); + if (unlikely(slab != c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; @@ -4381,7 +4486,7 @@ reread_slab: if (!freelist) { c->slab = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -4400,34 +4505,34 @@ load_freelist: VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); return freelist; deactivate_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (slab != c->slab) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; c->slab = NULL; c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); deactivate_slab(s, slab, freelist); new_slab: #ifdef CONFIG_SLUB_CPU_PARTIAL while (slub_percpu_partial(c)) { - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } if (unlikely(!slub_percpu_partial(c))) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); /* we were preempted and partial list got empty */ goto new_objects; } @@ -4436,7 +4541,8 @@ new_slab: slub_set_percpu_partial(c, slab); if (likely(node_match(slab, node) && - pfmemalloc_match(slab, gfpflags))) { + pfmemalloc_match(slab, gfpflags)) || + !allow_spin) { c->slab = slab; freelist = get_freelist(s, slab); VM_BUG_ON(!freelist); @@ -4444,7 +4550,7 @@ new_slab: goto load_freelist; } - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); slab->next = NULL; __put_partials(s, slab); @@ -4466,8 +4572,13 @@ new_objects: * allocating new page from other nodes */ if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) - && try_thisnode)) - pc.flags = GFP_NOWAIT | __GFP_THISNODE; + && try_thisnode)) { + if (unlikely(!allow_spin)) + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ + pc.flags = gfpflags | __GFP_THISNODE; + else + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + } pc.orig_size = orig_size; slab = get_partial(s, node, &pc); @@ -4506,7 +4617,7 @@ new_objects: stat(s, ALLOC_SLAB); if (kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab, orig_size); + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); if (unlikely(!freelist)) goto new_objects; @@ -4528,7 +4639,7 @@ new_objects: inc_slabs_node(s, slab_nid(slab), slab->objects); - if (unlikely(!pfmemalloc_match(slab, gfpflags))) { + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. @@ -4539,7 +4650,7 @@ new_objects: retry_load_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { void *flush_freelist = c->freelist; struct slab *flush_slab = c->slab; @@ -4548,9 +4659,14 @@ retry_load_slab: c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); - deactivate_slab(s, flush_slab, flush_freelist); + if (unlikely(!allow_spin)) { + /* Reentrant slub cannot take locks, defer */ + defer_deactivate_slab(flush_slab, flush_freelist); + } else { + deactivate_slab(s, flush_slab, flush_freelist); + } stat(s, CPUSLAB_FLUSH); @@ -4560,6 +4676,19 @@ retry_load_slab: goto load_freelist; } +/* + * We disallow kprobes in ___slab_alloc() to prevent reentrance + * + * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of + * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> + * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() + * manipulating c->freelist without lock. + * + * This does not prevent kprobe in functions called from ___slab_alloc() such as + * local_lock_irqsave() itself, and that is fine, we only need to protect the + * c->freelist manipulation in ___slab_alloc() itself. + */ +NOKPROBE_SYMBOL(___slab_alloc); /* * A wrapper for ___slab_alloc() for contexts where preemption is not yet @@ -4579,8 +4708,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, */ c = slub_get_cpu_ptr(s->cpu_slab); #endif - + if (unlikely(!gfpflags_allow_spinning(gfpflags))) { + if (local_lock_is_locked(&s->cpu_slab->lock)) { + /* + * EBUSY is an internal signal to kmalloc_nolock() to + * retry a different bucket. It's not propagated + * to the caller. + */ + p = ERR_PTR(-EBUSY); + goto out; + } + } p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); +out: #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif @@ -4704,7 +4844,7 @@ static void *__slab_alloc_node(struct kmem_cache *s, return NULL; } - object = alloc_single_from_new_slab(s, slab, orig_size); + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); return object; } @@ -4783,8 +4923,9 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); - kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, init_flags); + if (gfpflags_allow_spinning(flags)) + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); alloc_tagging_slab_alloc_hook(s, p[i], flags); } @@ -5451,6 +5592,96 @@ void *__kmalloc_noprof(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc_noprof); +/** + * kmalloc_nolock - Allocate an object of given size from any context. + * @size: size to allocate + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT + * allowed. + * @node: node number of the target node. + * + * Return: pointer to the new object or NULL in case of error. + * NULL does not mean EBUSY or EAGAIN. It means ENOMEM. + * There is no reason to call it again and expect !NULL. + */ +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) +{ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; + struct kmem_cache *s; + bool can_retry = true; + void *ret = ERR_PTR(-EBUSY); + + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | + __GFP_NO_OBJ_EXT)); + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ + return NULL; +retry: + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; + s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_); + + if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) + /* + * kmalloc_nolock() is not supported on architectures that + * don't implement cmpxchg16b, but debug caches don't use + * per-cpu slab and per-cpu partial slabs. They rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can + * attempt to allocate from debug caches by + * spin_trylock_irqsave(&n->list_lock, ...) + */ + return NULL; + + /* + * Do not call slab_alloc_node(), since trylock mode isn't + * compatible with slab_pre_alloc_hook/should_failslab and + * kfence_alloc. Hence call __slab_alloc_node() (at most twice) + * and slab_post_alloc_hook() directly. + * + * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair + * in irq saved region. It assumes that the same cpu will not + * __update_cpu_freelist_fast() into the same (freelist,tid) pair. + * Therefore use in_nmi() to check whether particular bucket is in + * irq protected section. + * + * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that + * this cpu was interrupted somewhere inside ___slab_alloc() after + * it did local_lock_irqsave(&s->cpu_slab->lock, flags). + * In this case fast path with __update_cpu_freelist_fast() is not safe. + */ +#ifndef CONFIG_SLUB_TINY + if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) +#endif + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + + if (PTR_ERR(ret) == -EBUSY) { + if (can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. + */ + can_retry = false; + goto retry; + } + ret = NULL; + } + + maybe_wipe_obj_freeptr(s, ret); + slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, + slab_want_init_on_alloc(alloc_gfp, s), size); + + ret = kasan_kmalloc(s, ret, size, alloc_gfp); + return ret; +} +EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof); + void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node, unsigned long caller) { @@ -6108,6 +6339,93 @@ flush_remote: } } +struct defer_free { + struct llist_head objects; + struct llist_head slabs; + struct irq_work work; +}; + +static void free_deferred_objects(struct irq_work *work); + +static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { + .objects = LLIST_HEAD_INIT(objects), + .slabs = LLIST_HEAD_INIT(slabs), + .work = IRQ_WORK_INIT(free_deferred_objects), +}; + +/* + * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe + * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). + */ +static void free_deferred_objects(struct irq_work *work) +{ + struct defer_free *df = container_of(work, struct defer_free, work); + struct llist_head *objs = &df->objects; + struct llist_head *slabs = &df->slabs; + struct llist_node *llnode, *pos, *t; + + if (llist_empty(objs) && llist_empty(slabs)) + return; + + llnode = llist_del_all(objs); + llist_for_each_safe(pos, t, llnode) { + struct kmem_cache *s; + struct slab *slab; + void *x = pos; + + slab = virt_to_slab(x); + s = slab->slab_cache; + + /* + * We used freepointer in 'x' to link 'x' into df->objects. + * Clear it to NULL to avoid false positive detection + * of "Freepointer corruption". + */ + *(void **)x = NULL; + + /* Point 'x' back to the beginning of allocated object */ + x -= s->offset; + __slab_free(s, slab, x, x, 1, _THIS_IP_); + } + + llnode = llist_del_all(slabs); + llist_for_each_safe(pos, t, llnode) { + struct slab *slab = container_of(pos, struct slab, llnode); + +#ifdef CONFIG_SLUB_TINY + discard_slab(slab->slab_cache, slab); +#else + deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); +#endif + } +} + +static void defer_free(struct kmem_cache *s, void *head) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + if (llist_add(head + s->offset, &df->objects)) + irq_work_queue(&df->work); +} + +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + slab->flush_freelist = flush_freelist; + if (llist_add(&slab->llnode, &df->slabs)) + irq_work_queue(&df->work); +} + +void defer_free_barrier(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); +} + #ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that @@ -6128,6 +6446,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { + /* cnt == 0 signals that it's called from kfree_nolock() */ + bool allow_spin = cnt; struct kmem_cache_cpu *c; unsigned long tid; void **freelist; @@ -6146,10 +6466,29 @@ redo: barrier(); if (unlikely(slab != c->slab)) { - __slab_free(s, slab, head, tail, cnt, addr); + if (unlikely(!allow_spin)) { + /* + * __slab_free() can locklessly cmpxchg16 into a slab, + * but then it might need to take spin_lock or local_lock + * in put_cpu_partial() for further processing. + * Avoid the complexity and simply add to a deferred list. + */ + defer_free(s, head); + } else { + __slab_free(s, slab, head, tail, cnt, addr); + } return; } + if (unlikely(!allow_spin)) { + if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && + local_lock_is_locked(&s->cpu_slab->lock)) { + defer_free(s, head); + return; + } + cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ + } + if (USE_LOCKLESS_FAST_PATH()) { freelist = READ_ONCE(c->freelist); @@ -6160,11 +6499,13 @@ redo: goto redo; } } else { + __maybe_unused unsigned long flags = 0; + /* Update the free list under the local lock */ - local_lock(&s->cpu_slab->lock); + local_lock_cpu_slab(s, flags); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); goto redo; } tid = c->tid; @@ -6174,7 +6515,7 @@ redo: c->freelist = head; c->tid = next_tid(tid); - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); } stat_add(s, FREE_FASTPATH, cnt); } @@ -6405,6 +6746,71 @@ void kfree(const void *object) } EXPORT_SYMBOL(kfree); +/* + * Can be called while holding raw_spinlock_t or from IRQ and NMI, + * but ONLY for objects allocated by kmalloc_nolock(). + * Debug checks (like kmemleak and kfence) were skipped on allocation, + * hence + * obj = kmalloc(); kfree_nolock(obj); + * will miss kmemleak/kfence book keeping and will cause false positives. + * large_kmalloc is not supported either. + */ +void kfree_nolock(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + + memcg_slab_free_hook(s, slab, &x, 1); + alloc_tagging_slab_free_hook(s, slab, &x, 1); + /* + * Unlike slab_free() do NOT call the following: + * kmemleak_free_recursive(x, s->flags); + * debug_check_no_locks_freed(x, s->object_size); + * debug_check_no_obj_freed(x, s->object_size); + * __kcsan_check_access(x, s->object_size, ..); + * kfence_free(x); + * since they take spinlocks or not safe from any context. + */ + kmsan_slab_free(s, x); + /* + * If KASAN finds a kernel bug it will do kasan_report_invalid_free() + * which will call raw_spin_lock_irqsave() which is technically + * unsafe from NMI, but take chance and report kernel bug. + * The sequence of + * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI + * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU + * is double buggy and deserves to deadlock. + */ + if (kasan_slab_pre_free(s, x)) + return; + /* + * memcg, kasan_slab_pre_free are done for 'x'. + * The only thing left is kasan_poison without quarantine, + * since kasan quarantine takes locks and not supported from NMI. + */ + kasan_slab_free(s, x, false, false, /* skip quarantine */true); +#ifndef CONFIG_SLUB_TINY + do_slab_free(s, slab, x, x, 0, _RET_IP_); +#else + defer_free(s, x); +#endif +} +EXPORT_SYMBOL_GPL(kfree_nolock); + static __always_inline __realloc_size(2) void * __do_krealloc(const void *p, size_t new_size, gfp_t flags) { -- cgit v1.2.3 From 9a0abc39450a3123fd52533a662fbd37e0d1508c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 17:47:14 +0200 Subject: PM: runtime: Add auto-cleanup macros for "resume and get" operations It is generally useful to be able to automatically drop a device's runtime PM usage counter incremented by runtime PM operations that resume a device and bump up its usage counter [1]. To that end, add guard definition macros allowing pm_runtime_put() and pm_runtime_put_autosuspend() to be used for the auto-cleanup in those cases. Simply put, a piece of code like below: pm_runtime_get_sync(dev); ..... pm_runtime_put(dev); return 0; can be transformed with guard() like: guard(pm_runtime_active)(dev); ..... return 0; (see the pm_runtime_put() call is gone). However, it is better to do proper error handling in the majority of cases, so doing something like this instead of the above is recommended: ACQUIRE(pm_runtime_active_try, pm)(dev); if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) return -ENXIO; ..... return 0; In all of the cases in which runtime PM is known to be enabled for the given device or the device can be regarded as operational (and so it can be accessed) with runtime PM disabled, a piece of code like: ret = pm_runtime_resume_and_get(dev); if (ret < 0) return ret; ..... pm_runtime_put(dev); return 0; can be changed as follows: ACQUIRE(pm_runtime_active_try, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_try, &pm); if (ret < 0) return ret; ..... return 0; (again, see the pm_runtime_put() call is gone). Still, if the device cannot be accessed unless runtime PM has been enabled for it, the pm_runtime_active_try_enabled guard variant needs to be used, that is (in the context of the example above): ACQUIRE(pm_runtime_active_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; When the original code calls pm_runtime_put_autosuspend(), use one of the "auto" guard variants, pm_runtime_active_auto/_try/_enabled, so for example, a piece of code like: ret = pm_runtime_resume_and_get(dev); if (ret < 0) return ret; ..... pm_runtime_put_autosuspend(dev); return 0; will become: ACQUIRE(pm_runtime_active_auto_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_auto_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; Note that the cases in which the return value of pm_runtime_get_sync() is checked can also be handled with the help of the new guard macros. For example, a piece of code like: ret = pm_runtime_get_sync(dev); if (ret < 0) { pm_runtime_put(dev); return ret; } ..... pm_runtime_put(dev); return 0; can be rewritten as: ACQUIRE(pm_runtime_active_auto_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_auto_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; or pm_runtime_get_active_try can be used if transparent handling of disabled runtime PM is desirable. Link: https://lore.kernel.org/linux-pm/878qimv24u.wl-tiwai@suse.de/ [1] Link: https://lore.kernel.org/linux-pm/20250926150613.000073a4@huawei.com/ Signed-off-by: Rafael J. Wysocki Acked-by: Dan Williams Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/2238241.irdbgypaU6@rafael.j.wysocki [ rjw: Fixed leftovers from the previous version in the changelog ] Reviewed-by: Jonathan Cameron Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki --- drivers/base/power/runtime.c | 2 ++ include/linux/pm_runtime.h | 44 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 37 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index faa68bf9ef3d..0fc825066ede 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -799,6 +799,8 @@ static int rpm_resume(struct device *dev, int rpmflags) if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; + else if (rpmflags & RPM_TRANSPARENT) + goto out; else retval = -EACCES; } diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d1ff76e0e2d0..e5426bdd0c9f 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -21,6 +21,7 @@ #define RPM_GET_PUT 0x04 /* Increment/decrement the usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ +#define RPM_TRANSPARENT 0x10 /* Succeed if runtime PM is disabled */ /* * Use this for defining a set of PM operations to be used in all situations @@ -512,6 +513,19 @@ static inline int pm_runtime_get_sync(struct device *dev) return __pm_runtime_resume(dev, RPM_GET_PUT); } +static inline int pm_runtime_get_active(struct device *dev, int rpmflags) +{ + int ret; + + ret = __pm_runtime_resume(dev, RPM_GET_PUT | rpmflags); + if (ret < 0) { + pm_runtime_put_noidle(dev); + return ret; + } + + return 0; +} + /** * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. * @dev: Target device. @@ -522,15 +536,7 @@ static inline int pm_runtime_get_sync(struct device *dev) */ static inline int pm_runtime_resume_and_get(struct device *dev) { - int ret; - - ret = __pm_runtime_resume(dev, RPM_GET_PUT); - if (ret < 0) { - pm_runtime_put_noidle(dev); - return ret; - } - - return 0; + return pm_runtime_get_active(dev, 0); } /** @@ -610,6 +616,26 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) return __pm_runtime_put_autosuspend(dev); } +DEFINE_GUARD(pm_runtime_active, struct device *, + pm_runtime_get_sync(_T), pm_runtime_put(_T)); +DEFINE_GUARD(pm_runtime_active_auto, struct device *, + pm_runtime_get_sync(_T), pm_runtime_put_autosuspend(_T)); +/* + * Use the following guards with ACQUIRE()/ACQUIRE_ERR(). + * + * The difference between the "_try" and "_try_enabled" variants is that the + * former do not produce an error when runtime PM is disabled for the given + * device. + */ +DEFINE_GUARD_COND(pm_runtime_active, _try, + pm_runtime_get_active(_T, RPM_TRANSPARENT)) +DEFINE_GUARD_COND(pm_runtime_active, _try_enabled, + pm_runtime_resume_and_get(_T)) +DEFINE_GUARD_COND(pm_runtime_active_auto, _try, + pm_runtime_get_active(_T, RPM_TRANSPARENT)) +DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, + pm_runtime_resume_and_get(_T)) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. -- cgit v1.2.3 From d5e58ce1fb0f13a9a0845851f267ede3551cd9fe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 18:26:40 +0200 Subject: PM: runtime: Drop DEFINE_FREE() for pm_runtime_put() The DEFINE_FREE() for pm_runtime_put has been superseded by recently introduced runtime PM auto-cleanup macros and its only user has been converted to using one of the new macros, so drop it. Signed-off-by: Rafael J. Wysocki Reviewed-by: Dhruva Gole Reviewed-by: Takashi Iwai Reviewed-by: Jonathan Cameron --- include/linux/pm_runtime.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index e5426bdd0c9f..edb8aed5ef62 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -563,8 +563,6 @@ static inline int pm_runtime_put(struct device *dev) return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } -DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T)) - /** * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. -- cgit v1.2.3 From c39d6d4d933381714b6e5d735545256558ec6c05 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sat, 27 Sep 2025 08:29:35 -0400 Subject: ptr_ring: __ptr_ring_zero_tail micro optimization __ptr_ring_zero_tail currently does the - 1 operation twice: - during initialization of head - at each loop iteration Let's just do it in one place, all we need to do is adjust the loop condition. this is better: - a slightly clearer logic with less duplication - uses prefix -- we don't need to save the old value - one less - 1 operation - for example, when ring is empty we now don't do - 1 at all, existing code does it once Text size shrinks from 15081 to 15050 bytes. Signed-off-by: Michael S. Tsirkin Reviewed-by: Simon Horman Link: https://patch.msgid.link/bcd630c7edc628e20d4f8e037341f26c90ab4365.1758976026.git.mst@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/ptr_ring.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index a736b16859a6..534531807d95 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -248,15 +248,15 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r) */ static inline void __ptr_ring_zero_tail(struct ptr_ring *r, int consumer_head) { - int head = consumer_head - 1; + int head = consumer_head; /* Zero out entries in the reverse order: this way we touch the * cache line that producer might currently be reading the last; * producer won't make progress and touch other cache lines * besides the first one until we write out all entries. */ - while (likely(head >= r->consumer_tail)) - r->queue[head--] = NULL; + while (likely(head > r->consumer_tail)) + r->queue[--head] = NULL; r->consumer_tail = consumer_head; } -- cgit v1.2.3 From a7556779745c047efb7b0ce8732889b0cdc80936 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Sep 2025 11:40:38 +0200 Subject: tcp: make tcp_rcvbuf_grow() accessible to mptcp code To leverage the auto-tuning improvements brought by commit 2da35e4b4df9 ("Merge branch 'tcp-receive-side-improvements'"), the MPTCP stack need to access the mentioned helper. Acked-by: Geliang Tang Acked-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-2-5da266aa9c1a@kernel.org Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7c51a0a5ace8..5ca230ed526a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -370,6 +370,7 @@ void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); +void tcp_rcvbuf_grow(struct sock *sk); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0a2511ce34db..b44fdc309633 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -891,7 +891,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, } } -static void tcp_rcvbuf_grow(struct sock *sk) +void tcp_rcvbuf_grow(struct sock *sk) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 7d452516b67add4a53e63bfa496d8df930a66b9a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Sep 2025 18:21:12 +0000 Subject: Revert "net: group sk_backlog and sk_receive_queue" This reverts commit 4effb335b5dab08cb6e2c38d038910f8b527cfc9. This was a benefit for UDP flood case, which was later greatly improved with commits 6471658dc66c ("udp: use skb_attempt_defer_free()") and b650bf0977d3 ("udp: remove busylock and add per NUMA queues"). Apparently blamed commit added a regression for RAW sockets, possibly because they do not use the dual RX queue strategy that UDP has. sock_queue_rcv_skb_reason() and RAW recvmsg() compete for sk_receive_buf and sk_rmem_alloc changes, and them being in the same cache line reduce performance. Fixes: 4effb335b5da ("net: group sk_backlog and sk_receive_queue") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202509281326.f605b4eb-lkp@intel.com Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: David Ahern Cc: Kuniyuki Iwashima Link: https://patch.msgid.link/20250929182112.824154-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 8c5b64f41ab7..60bcb13f045c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -395,6 +395,7 @@ struct sock { atomic_t sk_drops; __s32 sk_peek_off; + struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with @@ -412,7 +413,6 @@ struct sock { } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc - struct sk_buff_head sk_error_queue; __cacheline_group_end(sock_write_rx); __cacheline_group_begin(sock_read_rx); -- cgit v1.2.3 From a680581f6a131fd8c62d284ed4a24d4bc1cc553e Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Sat, 27 Sep 2025 10:49:10 +0200 Subject: dpll: add phase-offset-avg-factor device attribute to netlink spec Add dpll device level attribute DPLL_A_PHASE_OFFSET_AVG_FACTOR to allow control over a calculation of reported phase offset value. Attribute is present, if the driver provides such capability, otherwise attribute shall not be present. Signed-off-by: Ivan Vecera Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250927084912.2343597-2-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/driver-api/dpll.rst | 18 +++++++++++++++++- Documentation/netlink/specs/dpll.yaml | 6 ++++++ drivers/dpll/dpll_nl.c | 5 +++-- include/uapi/linux/dpll.h | 1 + 4 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst index eca72d9b9ed8..be1fc643b645 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -179,7 +179,23 @@ Phase offset measurement and adjustment Device may provide ability to measure a phase difference between signals on a pin and its parent dpll device. If pin-dpll phase offset measurement is supported, it shall be provided with ``DPLL_A_PIN_PHASE_OFFSET`` -attribute for each parent dpll device. +attribute for each parent dpll device. The reported phase offset may be +computed as the average of prior values and the current measurement, using +the following formula: + +.. math:: + curr\_avg = prev\_avg * \frac{2^N-1}{2^N} + new\_val * \frac{1}{2^N} + +where `curr_avg` is the current reported phase offset, `prev_avg` is the +previously reported value, `new_val` is the current measurement, and `N` is +the averaging factor. Configured averaging factor value is provided with +``DPLL_A_PHASE_OFFSET_AVG_FACTOR`` attribute of a device and value change can +be requested with the same attribute with ``DPLL_CMD_DEVICE_SET`` command. + + ================================== ====================================== + ``DPLL_A_PHASE_OFFSET_AVG_FACTOR`` attr configured value of phase offset + averaging factor + ================================== ====================================== Device may also provide ability to adjust a signal phase on a pin. If pin phase adjustment is supported, minimal and maximal values that pin diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index 5decee61a2c4..cafb4ec20447 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -315,6 +315,10 @@ attribute-sets: If enabled, dpll device shall monitor and notify all currently available inputs for changes of their phase offset against the dpll device. + - + name: phase-offset-avg-factor + type: u32 + doc: Averaging factor applied to calculation of reported phase offset. - name: pin enum-name: dpll_a_pin @@ -523,6 +527,7 @@ operations: - clock-id - type - phase-offset-monitor + - phase-offset-avg-factor dump: reply: *dev-attrs @@ -540,6 +545,7 @@ operations: attributes: - id - phase-offset-monitor + - phase-offset-avg-factor - name: device-create-ntf doc: Notification about device appearing diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c index 9f2efaf25268..3c6d570babf8 100644 --- a/drivers/dpll/dpll_nl.c +++ b/drivers/dpll/dpll_nl.c @@ -42,9 +42,10 @@ static const struct nla_policy dpll_device_get_nl_policy[DPLL_A_ID + 1] = { }; /* DPLL_CMD_DEVICE_SET - do */ -static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_PHASE_OFFSET_MONITOR + 1] = { +static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_PHASE_OFFSET_AVG_FACTOR + 1] = { [DPLL_A_ID] = { .type = NLA_U32, }, [DPLL_A_PHASE_OFFSET_MONITOR] = NLA_POLICY_MAX(NLA_U32, 1), + [DPLL_A_PHASE_OFFSET_AVG_FACTOR] = { .type = NLA_U32, }, }; /* DPLL_CMD_PIN_ID_GET - do */ @@ -112,7 +113,7 @@ static const struct genl_split_ops dpll_nl_ops[] = { .doit = dpll_nl_device_set_doit, .post_doit = dpll_post_doit, .policy = dpll_device_set_nl_policy, - .maxattr = DPLL_A_PHASE_OFFSET_MONITOR, + .maxattr = DPLL_A_PHASE_OFFSET_AVG_FACTOR, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index 37b438ce8efc..ab1725a954d7 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -216,6 +216,7 @@ enum dpll_a { DPLL_A_LOCK_STATUS_ERROR, DPLL_A_CLOCK_QUALITY_LEVEL, DPLL_A_PHASE_OFFSET_MONITOR, + DPLL_A_PHASE_OFFSET_AVG_FACTOR, __DPLL_A_MAX, DPLL_A_MAX = (__DPLL_A_MAX - 1) -- cgit v1.2.3 From e28d5a68b6519ec6b2118a3f604295b5534eeb51 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Sat, 27 Sep 2025 10:49:11 +0200 Subject: dpll: add phase_offset_avg_factor_get/set callback ops Add new callback operations for a dpll device: - phase_offset_avg_factor_get(...) - to obtain current phase offset averaging factor from dpll device, - phase_offset_avg_factor_set(...) - to set phase offset averaging factor Obtain the factor value using the get callback and provide it to the user if the device driver implement this callback. Execute the set callback upon user requests, if the driver implement it. Signed-off-by: Ivan Vecera v2: * do not require 'set' callback to retrieve current value * always call 'set' callback regardless of current value Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250927084912.2343597-3-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- drivers/dpll/dpll_netlink.c | 66 ++++++++++++++++++++++++++++++++++++++++----- include/linux/dpll.h | 6 +++++ 2 files changed, 65 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 0a852011653c..74c1f0ca95f2 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -164,6 +164,27 @@ dpll_msg_add_phase_offset_monitor(struct sk_buff *msg, struct dpll_device *dpll, return 0; } +static int +dpll_msg_add_phase_offset_avg_factor(struct sk_buff *msg, + struct dpll_device *dpll, + struct netlink_ext_ack *extack) +{ + const struct dpll_device_ops *ops = dpll_device_ops(dpll); + u32 factor; + int ret; + + if (ops->phase_offset_avg_factor_get) { + ret = ops->phase_offset_avg_factor_get(dpll, dpll_priv(dpll), + &factor, extack); + if (ret) + return ret; + if (nla_put_u32(msg, DPLL_A_PHASE_OFFSET_AVG_FACTOR, factor)) + return -EMSGSIZE; + } + + return 0; +} + static int dpll_msg_add_lock_status(struct sk_buff *msg, struct dpll_device *dpll, struct netlink_ext_ack *extack) @@ -675,6 +696,9 @@ dpll_device_get_one(struct dpll_device *dpll, struct sk_buff *msg, if (nla_put_u32(msg, DPLL_A_TYPE, dpll->type)) return -EMSGSIZE; ret = dpll_msg_add_phase_offset_monitor(msg, dpll, extack); + if (ret) + return ret; + ret = dpll_msg_add_phase_offset_avg_factor(msg, dpll, extack); if (ret) return ret; @@ -839,6 +863,23 @@ dpll_phase_offset_monitor_set(struct dpll_device *dpll, struct nlattr *a, extack); } +static int +dpll_phase_offset_avg_factor_set(struct dpll_device *dpll, struct nlattr *a, + struct netlink_ext_ack *extack) +{ + const struct dpll_device_ops *ops = dpll_device_ops(dpll); + u32 factor = nla_get_u32(a); + + if (!ops->phase_offset_avg_factor_set) { + NL_SET_ERR_MSG_ATTR(extack, a, + "device not capable of changing phase offset average factor"); + return -EOPNOTSUPP; + } + + return ops->phase_offset_avg_factor_set(dpll, dpll_priv(dpll), factor, + extack); +} + static int dpll_pin_freq_set(struct dpll_pin *pin, struct nlattr *a, struct netlink_ext_ack *extack) @@ -1736,14 +1777,25 @@ int dpll_nl_device_get_doit(struct sk_buff *skb, struct genl_info *info) static int dpll_set_from_nlattr(struct dpll_device *dpll, struct genl_info *info) { - int ret; - - if (info->attrs[DPLL_A_PHASE_OFFSET_MONITOR]) { - struct nlattr *a = info->attrs[DPLL_A_PHASE_OFFSET_MONITOR]; + struct nlattr *a; + int rem, ret; - ret = dpll_phase_offset_monitor_set(dpll, a, info->extack); - if (ret) - return ret; + nla_for_each_attr(a, genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + switch (nla_type(a)) { + case DPLL_A_PHASE_OFFSET_MONITOR: + ret = dpll_phase_offset_monitor_set(dpll, a, + info->extack); + if (ret) + return ret; + break; + case DPLL_A_PHASE_OFFSET_AVG_FACTOR: + ret = dpll_phase_offset_avg_factor_set(dpll, a, + info->extack); + if (ret) + return ret; + break; + } } return 0; diff --git a/include/linux/dpll.h b/include/linux/dpll.h index fa1e76920d0e..25be745bf41f 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -38,6 +38,12 @@ struct dpll_device_ops { void *dpll_priv, enum dpll_feature_state *state, struct netlink_ext_ack *extack); + int (*phase_offset_avg_factor_set)(const struct dpll_device *dpll, + void *dpll_priv, u32 factor, + struct netlink_ext_ack *extack); + int (*phase_offset_avg_factor_get)(const struct dpll_device *dpll, + void *dpll_priv, u32 *factor, + struct netlink_ext_ack *extack); }; struct dpll_pin_ops { -- cgit v1.2.3 From 7bd80ed89d72285515db673803b021469ba71ee8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 24 Sep 2025 14:02:41 +0200 Subject: Documentation: net: add flow control guide and document ethtool API Introduce a new document, flow_control.rst, to provide a comprehensive guide on Ethernet Flow Control in Linux. The guide explains how flow control works, how autonegotiation resolves pause capabilities, and how to configure it using ethtool and Netlink. In parallel, document the pause and pause-stat attributes in the ethtool.yaml netlink spec. This enables the ynl tool to generate kernel-doc comments for the corresponding enums in the UAPI header, making the C interface self-documenting. Finally, replace the legacy flow control section in phy.rst with a reference to the new document and add pointers in the relevant C source files. Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20250924120241.724850-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 ++ Documentation/networking/flow_control.rst | 373 +++++++++++++++++++++++++ Documentation/networking/index.rst | 1 + Documentation/networking/phy.rst | 12 +- include/linux/ethtool.h | 45 ++- include/uapi/linux/ethtool_netlink_generated.h | 4 +- net/dcb/dcbnl.c | 2 + net/ethtool/pause.c | 4 + 8 files changed, 453 insertions(+), 15 deletions(-) create mode 100644 Documentation/networking/flow_control.rst (limited to 'include') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 6a0fb1974513..e4852505294f 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -864,7 +864,9 @@ attribute-sets: - name: pause-stat + doc: Statistics counters for link-wide PAUSE frames (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-stat-cnt + enum-name: ethtool-a-pause-stat attributes: - name: unspec @@ -875,13 +877,17 @@ attribute-sets: type: pad - name: tx-frames + doc: Number of PAUSE frames transmitted. type: u64 - name: rx-frames + doc: Number of PAUSE frames received. type: u64 - name: pause + doc: Parameters for link-wide PAUSE (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-cnt + enum-name: ethtool-a-pause attributes: - name: unspec @@ -893,19 +899,40 @@ attribute-sets: nested-attributes: header - name: autoneg + doc: | + Acts as a mode selector for the driver. + On GET: indicates the driver's behavior. If true, the driver will + respect the negotiated outcome; if false, the driver will use a + forced configuration. + On SET: if true, the driver configures the PHY's advertisement based + on the rx and tx attributes. If false, the driver forces the MAC + into the state defined by the rx and tx attributes. type: u8 - name: rx + doc: | + Enable receiving PAUSE frames (pausing local TX). + On GET: reflects the currently preferred configuration state. type: u8 - name: tx + doc: | + Enable transmitting PAUSE frames (pausing peer TX). + On GET: reflects the currently preferred configuration state. type: u8 - name: stats + doc: | + Contains the pause statistics counters. The source of these + statistics is determined by stats-src. type: nest nested-attributes: pause-stat - name: stats-src + doc: | + Selects the source of the MAC statistics, values from + enum ethtool_mac_stats_src. This allows requesting statistics + from the individual components of the MAC Merge layer. type: u32 - name: eee diff --git a/Documentation/networking/flow_control.rst b/Documentation/networking/flow_control.rst new file mode 100644 index 000000000000..48646d54513f --- /dev/null +++ b/Documentation/networking/flow_control.rst @@ -0,0 +1,373 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _ethernet-flow-control: + +===================== +Ethernet Flow Control +===================== + +This document is a practical guide to Ethernet Flow Control in Linux, covering +what it is, how it works, and how to configure it. + +What is Flow Control? +===================== + +Flow control is a mechanism to prevent a fast sender from overwhelming a +slow receiver with data, which would cause buffer overruns and dropped packets. +The receiver can signal the sender to temporarily stop transmitting, giving it +time to process its backlog. + +Standards references +==================== + +Ethernet flow control mechanisms are specified across consolidated IEEE base +standards; some originated as amendments: + +- Collision-based flow control is part of CSMA/CD in **IEEE 802.3** + (half-duplex). +- Link-wide PAUSE is defined in **IEEE 802.3 Annex 31B** + (originally **802.3x**). +- Priority-based Flow Control (PFC) is defined in **IEEE 802.1Q Clause 36** + (originally **802.1Qbb**). + +In the remainder of this document, the consolidated clause numbers are used. + +How It Works: The Mechanisms +============================ + +The method used for flow control depends on the link's duplex mode. + +.. note:: + The user-visible ``ethtool`` pause API described in this document controls + **link-wide PAUSE** (IEEE 802.3 Annex 31B) only. It does not control the + collision-based behavior that exists on half-duplex links. + +1. Half-Duplex: Collision-Based Flow Control +-------------------------------------------- +On half-duplex links, a device cannot send and receive simultaneously, so PAUSE +frames are not used. Flow control is achieved by leveraging the CSMA/CD +(Carrier Sense Multiple Access with Collision Detection) protocol itself. + +* **How it works**: To inhibit incoming data, a receiving device can force a + collision on the line. When the sending station detects this collision, it + terminates its transmission, sends a "jam" signal, and then executes the + "Collision backoff and retransmission" procedure as defined in IEEE 802.3, + Section 4.2.3.2.5. This algorithm makes the sender wait for a random + period before attempting to retransmit. By repeatedly forcing collisions, + the receiver can effectively throttle the sender's transmission rate. + +.. note:: + While this mechanism is part of the IEEE standard, there is currently no + generic kernel API to configure or control it. Drivers should not enable + this feature until a standardized interface is available. + +.. warning:: + On shared-medium networks (e.g. 10BASE2, or twisted-pair networks using a + hub rather than a switch) forcing collisions inhibits traffic **across the + entire shared segment**, not just a single point-to-point link. Enabling + such behavior is generally undesirable. + +2. Full-Duplex: Link-wide PAUSE (IEEE 802.3 Annex 31B) +------------------------------------------------------ +On full-duplex links, devices can send and receive at the same time. Flow +control is achieved by sending a special **PAUSE frame**, defined by IEEE +802.3 Annex 31B. This mechanism pauses all traffic on the link and is therefore +called *link-wide PAUSE*. + +* **What it is**: A standard Ethernet frame with a globally reserved + destination MAC address (``01-80-C2-00-00-01``). This address is in a range + that standard IEEE 802.1D-compliant bridges do not forward. However, some + unmanaged or misconfigured bridges have been reported to forward these + frames, which can disrupt flow control across a network. + +* **How it works**: The frame contains a MAC Control opcode for PAUSE + (``0x0001``) and a ``pause_time`` value, telling the sender how long to + wait before sending more data frames. This time is specified in units of + "pause quantum", where one quantum is the time it takes to transmit 512 bits. + For example, one pause quantum is 51.2 microseconds on a 10 Mbit/s link, + and 512 nanoseconds on a 1 Gbit/s link. A ``pause_time`` of zero indicates + that the transmitter can resume transmission, even if a previous non-zero + pause time has not yet elapsed. + +* **Who uses it**: Any full-duplex link, from 10 Mbit/s to multi-gigabit speeds. + +3. Full-Duplex: Priority-based Flow Control (PFC) (IEEE 802.1Q Clause 36) +------------------------------------------------------------------------- +Priority-based Flow Control is an enhancement to the standard PAUSE mechanism +that allows flow control to be applied independently to different classes of +traffic, identified by their priority level. + +* **What it is**: PFC allows a receiver to pause traffic for one or more of the + 8 standard priority levels without stopping traffic for other priorities. + This is critical in data center environments for protocols that cannot + tolerate packet loss due to congestion (e.g., Fibre Channel over Ethernet + or RoCE). + +* **How it works**: PFC uses a specific PAUSE frame format. It shares the same + globally reserved destination MAC address (``01-80-C2-00-00-01``) as legacy + PAUSE frames but uses a unique opcode (``0x0101``). The frame payload + contains two key fields: + + - **``priority_enable_vector``**: An 8-bit mask where each bit corresponds to + one of the 8 priorities. If a bit is set to 1, it means the pause time + for that priority is active. + - **``time_vector``**: A list of eight 2-octet fields, one for each priority. + Each field specifies the ``pause_time`` for its corresponding priority, + measured in units of ``pause_quanta`` (the time to transmit 512 bits). + +.. note:: + When PFC is enabled for at least one priority on a port, the standard + **link-wide PAUSE** (IEEE 802.3 Annex 31B) must be disabled for that port. + The two mechanisms are mutually exclusive (IEEE 802.1Q Clause 36). + +Configuring Flow Control +======================== + +Link-wide PAUSE and Priority-based Flow Control are configured with different +tools. + +Configuring Link-wide PAUSE with ``ethtool`` (IEEE 802.3 Annex 31B) +------------------------------------------------------------------- +Use ``ethtool -a `` to view and ``ethtool -A `` to change +the link-wide PAUSE settings. + +.. code-block:: bash + + # View current link-wide PAUSE settings + ethtool -a eth0 + + # Enable RX and TX pause, with autonegotiation + ethtool -A eth0 autoneg on rx on tx on + +**Key Configuration Concepts**: + +* **Pause Autoneg vs Generic Autoneg**: ``ethtool -A ... autoneg {on,off}`` + controls **Pause Autoneg** (Annex 31B) only. It is independent from the + **Generic link autonegotiation** configured with ``ethtool -s``. A device can + have Generic autoneg **on** while Pause Autoneg is **off**, and vice versa. + +* **If Pause Autoneg is off** (``-A ... autoneg off``): the device will **not** + advertise pause in the PHY. The MAC PAUSE state is **forced** according to + ``rx``/``tx`` and does not depend on partner capabilities or resolution. + Ensure the peer is configured complementarily for PAUSE to be effective. + +* **If generic autoneg is off** but **Pause Autoneg is on**, the pause policy + is **remembered** by the kernel and applied later when Generic autoneg is + enabled again. + +* **Autonegotiation Mode**: The PHY will *advertise* the ``rx`` and ``tx`` + capabilities. The final active state is determined by what both sides of the + link agree on. See the "PHY (Physical Layer Transceiver)" section below, + especially the *Resolution* subsection, for details of the negotiation rules. + +* **Forced Mode**: This mode is necessary when autonegotiation is not used or + not possible. This includes links where one or both partners have + autonegotiation disabled, or in setups without a PHY (e.g., direct + MAC-to-MAC connections). The driver bypasses PHY advertisement and + directly forces the MAC into the specified ``rx``/``tx`` state. The + configuration on both sides of the link must be complementary. For + example, if one side is set to ``tx on`` ``rx off``, the link partner must be + set to ``tx off`` ``rx on`` for flow control to function correctly. + +Configuring PFC with ``dcb`` (IEEE 802.1Q Clause 36) +---------------------------------------------------- +PFC is part of the Data Center Bridging (DCB) subsystem and is managed with the +``dcb`` tool (iproute2). Some deployments use ``dcbtool`` (lldpad) instead; this +document shows ``dcb(8)`` examples. + +**Viewing PFC Settings**: + +.. code-block:: text + + $ dcb pfc show dev eth0 + pfc-cap 8 macsec-bypass off delay 4096 + prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on + +This shows the PFC state (on/off) for each priority (0-7). + +**Changing PFC Settings**: + +.. code-block:: bash + + # Enable PFC on priorities 6 and 7, leaving others as they are + $ dcb pfc set dev eth0 prio-pfc 6:on 7:on + + # Disable PFC for all priorities except 6 and 7 + $ dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on + +Monitoring Flow Control +======================= + +The standard way to check if flow control is actively being used is to view the +pause-related statistics. + +**Monitoring Link-wide PAUSE**: +Use ``ethtool --include-statistics -a ``. + +.. code-block:: text + + $ ethtool --include-statistics -a eth0 + Pause parameters for eth0: + ... + Statistics: + tx_pause_frames: 0 + rx_pause_frames: 0 + +**Monitoring PFC**: +PFC statistics (sent and received frames per priority) are available +through the ``dcb`` tool. + +.. code-block:: text + + $ dcb pfc show dev eth0 requests indications + requests 0:0 1:0 2:0 3:1024 4:2048 5:0 6:0 7:0 + indications 0:0 1:0 2:0 3:512 4:4096 5:0 6:0 7:0 + +The ``requests`` counters track transmitted PFC frames (TX), and the +``indications`` counters track received PFC frames (RX). + +Link-wide PAUSE Autonegotiation Details +======================================= + +The autonegotiation process for link-wide PAUSE is managed by the PHY and +involves advertising capabilities and resolving the outcome. + +* Terminology (link-wide PAUSE): + + - **Symmetric pause**: both directions are paused when requested (TX+RX + enabled). + - **Asymmetric pause**: only one direction is paused (e.g., RX-only or + TX-only). + + In IEEE 802.3 advertisement/resolution, symmetric/asymmetric are encoded + using two bits (Pause/Asym) and resolved per the standard truth tables + below. + +* **Advertisement**: The PHY advertises the MAC's flow control capabilities. + This is done using two bits in the advertisement register: "Symmetric + Pause" (Pause) and "Asymmetric Pause" (Asym). These bits should be + interpreted as a combined value, not as independent flags. The kernel + converts the user's ``rx`` and ``tx`` settings into this two-bit value as + follows: + + .. code-block:: text + + tx rx | Pause Asym + -------+------------- + 0 0 | 0 0 + 0 1 | 1 1 + 1 0 | 0 1 + 1 1 | 1 0 + +* **Resolution**: After negotiation, the PHY reports the link partner's + advertised Pause and Asym bits. The final flow control mode is determined + by the combination of the local and partner advertisements, according to + the IEEE 802.3 standard: + + .. code-block:: text + + Local Device | Link Partner | Result + Pause Asym | Pause Asym | + -------------------+--------------------+--------- + 0 X | 0 X | Disabled + 0 1 | 1 0 | Disabled + 0 1 | 1 1 | TX only + 1 0 | 0 X | Disabled + 1 X | 1 X | TX + RX + 1 1 | 0 1 | RX only + + It is important to note that the advertised bits reflect the *current + configuration* of the MAC, which may not represent its full hardware + capabilities. + +Kernel Policy: "Set and Trust" +============================== + +The ethtool pause API is defined as a **wish policy** for +IEEE 802.3 link-wide PAUSE only. A user request is always accepted +as the preferred configuration, but it may not be possible to apply +it in all link states. + +Key constraints: + +- Link-wide PAUSE is not valid on half-duplex links. +- Link-wide PAUSE cannot be used together with Priority-based Flow Control + (PFC, IEEE 802.1Q Clause 36). +- If autonegotiation is active and the link is currently down, the future + mode is not yet known. + +Because of these constraints, the kernel stores the requested setting +and applies it only when the link is in a compatible state. + +Implications for userspace: + +1. Set once (the "wish"): the requested Rx/Tx PAUSE policy is + remembered even if it cannot be applied immediately. +2. Applied conditionally: when the link comes up, the kernel enables + PAUSE only if the active mode allows it. + +Component Roles in Flow Control +=============================== + +The configuration of flow control involves several components, each with a +distinct role. + +The MAC (Media Access Controller) +--------------------------------- +The MAC is the hardware component that actually sends and receives PAUSE +frames. Its capabilities define the upper limit of what the driver can support. +For link-wide PAUSE, MACs can vary in their support for symmetric (both +directions) or asymmetric (independent TX/RX) flow control. + +For PFC, the MAC must be capable of generating and interpreting the +priority-based PAUSE frames and managing separate pause states for each +traffic class. + +Many MACs also implement automatic PAUSE frame transmission based on the fill +level of their internal RX FIFO. This is typically configured with two +thresholds: + +* **FLOW_ON (High Water Mark)**: When the RX FIFO usage reaches this + threshold, the MAC automatically transmits a PAUSE frame to stop the sender. + +* **FLOW_OFF (Low Water Mark)**: When the RX FIFO usage drops below this + threshold, the MAC transmits a PAUSE frame with a quantum of zero to tell + the sender it can resume transmission. + +The PHY (Physical Layer Transceiver) +------------------------------------ +The PHY's role is distinct for each flow control mechanism: + +* **Link-wide PAUSE**: During the autonegotiation process, the PHY is + responsible for advertising the device's flow control capabilities. See the + "Link-wide PAUSE Autonegotiation Details" section for more information. + +* **Half-Duplex Collision-Based Flow Control**: The PHY is fundamental to the + CSMA/CD process. It performs carrier sensing (checking if the line is idle) + and collision detection, which is the mechanism leveraged to throttle the + sender. + +* **Priority-based Flow Control (PFC)**: The PHY is not directly involved in + negotiating PFC capabilities. Its role is to establish the physical link. + PFC negotiation happens at a higher layer via the Data Center Bridging + Capability Exchange Protocol (DCBX). + +User Space Interface +==================== +The primary user space tools are ``ethtool`` for link-wide PAUSE and ``dcb`` for +PFC. They communicate with the kernel to configure the network device driver +and underlying hardware. + +**Link-wide PAUSE Netlink Interface (``ethtool``)** + +See the ethtool Netlink spec (``Documentation/netlink/specs/ethtool.yaml``) +for the authoritative definition of the Pause control and Pause statistics +attributes. The generated UAPI is in +``include/uapi/linux/ethtool_netlink_generated.h``. + +**PFC Netlink Interface (``dcb``)** + +The authoritative definitions for DCB/PFC netlink attributes and commands are in +``include/uapi/linux/dcbnl.h``. See also the ``dcb(8)`` manual page and the DCB +subsystem documentation for userspace configuration details. + diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c775cababc8c..52aafdc85f6a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,6 +55,7 @@ Contents: eql fib_trie filter + flow_control generic-hdlc generic_netlink ../netlink/specs/index diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index b0f2ef83735d..40cc0a988d60 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -343,16 +343,8 @@ Some of the interface modes are described below: Pause frames / flow control =========================== -The PHY does not participate directly in flow control/pause frames except by -making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in -MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC -controller supports such a thing. Since flow control/pause frames generation -involves the Ethernet MAC driver, it is recommended that this driver takes care -of properly indicating advertisement and support for such features by setting -the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done -either before or after phy_connect() and/or as a result of implementing the -ethtool::set_pauseparam feature. - +For detailed link-wide PAUSE and PFC behavior and configuration, see +flow_control.rst. Keeping Close Tabs on the PAL ============================= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c2d8b4ec62eb..eeed1ea50369 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -953,9 +953,48 @@ struct kernel_ethtool_ts_info { * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. - * @get_pauseparam: Report pause parameters - * @set_pauseparam: Set pause parameters. Returns a negative error code - * or zero. + * + * @get_pauseparam: Report the configured policy for link-wide PAUSE + * (IEEE 802.3 Annex 31B). Drivers must fill struct ethtool_pauseparam + * such that: + * @autoneg: + * This refers to **Pause Autoneg** (IEEE 802.3 Annex 31B) only + * and is independent of generic link autonegotiation configured + * via ethtool -s. + * true -> the device follows the negotiated result of pause + * autonegotiation (Pause/Asym); + * false -> the device uses a forced MAC state independent of + * negotiation. + * @rx_pause/@tx_pause: + * represent the desired policy (preferred configuration). + * In autoneg mode they describe what is to be advertised; + * in forced mode they describe the MAC state to apply. + * + * Drivers (and/or frameworks) should persist this policy across link + * changes and reapply appropriate MAC programming when link parameters + * change. + * + * @set_pauseparam: Apply a policy for link-wide PAUSE (IEEE 802.3 Annex 31B). + * If @autoneg is true: + * Arrange for pause advertisement (Pause/Asym) based on + * @rx_pause/@tx_pause and program the MAC to follow the + * negotiated result (which may be symmetric, asymmetric, or off + * depending on the link partner). + * If @autoneg is false: + * Do not rely on autonegotiation; force the MAC RX/TX pause + * state directly per @rx_pause/@tx_pause. + * + * Implementations that integrate with PHYLIB/PHYLINK should cooperate + * with those frameworks for advertisement and resolution; MAC drivers are + * still responsible for applying the required MAC state. + * + * Return: 0 on success or a negative errno. Return -EOPNOTSUPP if + * link-wide PAUSE is unsupported. If only symmetric pause is supported, + * reject unsupported asymmetric requests with -EINVAL (or document any + * coercion policy). + * + * See also: Documentation/networking/flow_control.rst + * * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. by flashing an LED diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 0e8ac0d974e2..3dd9d7cde86e 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -375,7 +375,7 @@ enum { ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -enum { +enum ethtool_a_pause_stat { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, @@ -385,7 +385,7 @@ enum { ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; -enum { +enum ethtool_a_pause { ETHTOOL_A_PAUSE_UNSPEC, ETHTOOL_A_PAUSE_HEADER, ETHTOOL_A_PAUSE_AUTONEG, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 03eb1d941fca..91ee22f53774 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -27,6 +27,8 @@ * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. + * See Documentation/networking/flow_control.rst for a high level description + * of the user space interface for Priority-based Flow Control (PFC). * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 0f9af1e66548..eacf6a4859bf 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only +/* See Documentation/networking/flow_control.rst for a high level description of + * the userspace interface. + */ + #include "netlink.h" #include "common.h" -- cgit v1.2.3 From 5b66169f6be4847008c0aea50885ff0632151479 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 25 Sep 2025 02:33:03 +0000 Subject: bonding: fix xfrm offload feature setup on active-backup mode The active-backup bonding mode supports XFRM ESP offload. However, when a bond is added using command like `ip link add bond0 type bond mode 1 miimon 100`, the `ethtool -k` command shows that the XFRM ESP offload is disabled. This occurs because, in bond_newlink(), we change bond link first and register bond device later. So the XFRM feature update in bond_option_mode_set() is not called as the bond device is not yet registered, leading to the offload feature not being set successfully. To resolve this issue, we can modify the code order in bond_newlink() to ensure that the bond device is registered first before changing the bond link parameters. This change will allow the XFRM ESP offload feature to be correctly enabled. Fixes: 007ab5345545 ("bonding: fix feature flag setting at init time") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250925023304.472186-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/bonding/bond_netlink.c | 16 +++++++++------- include/net/bonding.h | 1 + 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 57be04f6cb11..f4f0feddd9fa 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4411,7 +4411,7 @@ void bond_work_init_all(struct bonding *bond) INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler); } -static void bond_work_cancel_all(struct bonding *bond) +void bond_work_cancel_all(struct bonding *bond) { cancel_delayed_work_sync(&bond->mii_work); cancel_delayed_work_sync(&bond->arp_work); diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 57fff2421f1b..7a9d73ec8e91 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -579,20 +579,22 @@ static int bond_newlink(struct net_device *bond_dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct bonding *bond = netdev_priv(bond_dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; int err; - err = bond_changelink(bond_dev, tb, data, extack); - if (err < 0) + err = register_netdevice(bond_dev); + if (err) return err; - err = register_netdevice(bond_dev); - if (!err) { - struct bonding *bond = netdev_priv(bond_dev); + netif_carrier_off(bond_dev); + bond_work_init_all(bond); - netif_carrier_off(bond_dev); - bond_work_init_all(bond); + err = bond_changelink(bond_dev, tb, data, extack); + if (err) { + bond_work_cancel_all(bond); + unregister_netdevice(bond_dev); } return err; diff --git a/include/net/bonding.h b/include/net/bonding.h index e06f0d63b2c1..bd56ad976cfb 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -711,6 +711,7 @@ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave); void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay); void bond_work_init_all(struct bonding *bond); +void bond_work_cancel_all(struct bonding *bond); #ifdef CONFIG_PROC_FS void bond_create_proc_entry(struct bonding *bond); -- cgit v1.2.3 From e211c463b748c4e2e8364c10bc216ca775fcc943 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 27 Sep 2025 21:52:30 +0200 Subject: net: phy: stop exporting phy_driver_unregister After 42e2a9e11a1d ("net: phy: dp83640: improve phydev and driver removal handling") we can stop exporting also phy_driver_unregister(). Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/2bab950e-4b70-4030-b997-03f48379586f@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/phy/phy_device.c | 11 +++++------ include/linux/phy.h | 1 - 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 01269b865f5e..1c02640412f7 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3589,6 +3589,11 @@ static int phy_driver_register(struct phy_driver *new_driver, return 0; } +static void phy_driver_unregister(struct phy_driver *drv) +{ + driver_unregister(&drv->mdiodrv.driver); +} + int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner) { @@ -3606,12 +3611,6 @@ int phy_drivers_register(struct phy_driver *new_driver, int n, } EXPORT_SYMBOL(phy_drivers_register); -void phy_driver_unregister(struct phy_driver *drv) -{ - driver_unregister(&drv->mdiodrv.driver); -} -EXPORT_SYMBOL(phy_driver_unregister); - void phy_drivers_unregister(struct phy_driver *drv, int n) { int i; diff --git a/include/linux/phy.h b/include/linux/phy.h index b377dfaa6801..7a54a8b4d277 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2030,7 +2030,6 @@ static inline int phy_read_status(struct phy_device *phydev) return genphy_read_status(phydev); } -void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); -- cgit v1.2.3 From 9c94ae6bb0b2895024b6e29fcc1cbec968b4776a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:32 +0000 Subject: net: make softnet_data.defer_count an atomic This is preparation work to remove the softnet_data.defer_lock, as it is contended on hosts with large number of cores. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- net/core/dev.c | 2 +- net/core/skbuff.c | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1b85454116f6..27e3fa69253f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3538,7 +3538,7 @@ struct softnet_data { /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; - int defer_count; + atomic_t defer_count; int defer_ipi_scheduled; struct sk_buff *defer_list; call_single_data_t defer_csd; diff --git a/net/core/dev.c b/net/core/dev.c index 8b54fdf0289a..8566678d8344 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6726,7 +6726,7 @@ static void skb_defer_free_flush(struct softnet_data *sd) spin_lock(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; - sd->defer_count = 0; + atomic_set(&sd->defer_count, 0); spin_unlock(&sd->defer_lock); while (skb != NULL) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 618afd59afff..16cd357d62a6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7202,14 +7202,12 @@ nodefer: kfree_skb_napi_cache(skb); sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - if (READ_ONCE(sd->defer_count) >= defer_max) + if (atomic_read(&sd->defer_count) >= defer_max) goto nodefer; spin_lock_bh(&sd->defer_lock); /* Send an IPI every time queue reaches half capacity. */ - kick = sd->defer_count == (defer_max >> 1); - /* Paired with the READ_ONCE() few lines above */ - WRITE_ONCE(sd->defer_count, sd->defer_count + 1); + kick = (atomic_inc_return(&sd->defer_count) - 1) == (defer_max >> 1); skb->next = sd->defer_list; /* Paired with READ_ONCE() in skb_defer_free_flush() */ -- cgit v1.2.3 From 844c9db7f7f5fe1b0b53ed9f1c2bc7313b3021c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:33 +0000 Subject: net: use llist for sd->defer_list Get rid of sd->defer_lock and adopt llist operations. We optimize skb_attempt_defer_free() for the common case, where the packet is queued. Otherwise sd->defer_count is increasing, until skb_defer_free_flush() clears it. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-3-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 8 ++++---- net/core/dev.c | 18 ++++++------------ net/core/skbuff.c | 15 +++++++-------- 3 files changed, 17 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 27e3fa69253f..5c9aa16933d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3537,10 +3537,10 @@ struct softnet_data { struct numa_drop_counters drop_counters; /* Another possibly contended cache line */ - spinlock_t defer_lock ____cacheline_aligned_in_smp; - atomic_t defer_count; - int defer_ipi_scheduled; - struct sk_buff *defer_list; + struct llist_head defer_list ____cacheline_aligned_in_smp; + atomic_long_t defer_count; + + int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; diff --git a/net/core/dev.c b/net/core/dev.c index 8566678d8344..fb67372774de 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6717,22 +6717,16 @@ EXPORT_SYMBOL(napi_complete_done); static void skb_defer_free_flush(struct softnet_data *sd) { + struct llist_node *free_list; struct sk_buff *skb, *next; - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) + if (llist_empty(&sd->defer_list)) return; + atomic_long_set(&sd->defer_count, 0); + free_list = llist_del_all(&sd->defer_list); - spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - atomic_set(&sd->defer_count, 0); - spin_unlock(&sd->defer_lock); - - while (skb != NULL) { - next = skb->next; + llist_for_each_entry_safe(skb, next, free_list, ll_node) { napi_consume_skb(skb, 1); - skb = next; } } @@ -12995,7 +12989,7 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - spin_lock_init(&sd->defer_lock); + init_llist_head(&sd->defer_list); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 16cd357d62a6..17455fc1e692 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7185,6 +7185,7 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct sk_buff *skb) { + unsigned long defer_count; int cpu = skb->alloc_cpu; struct softnet_data *sd; unsigned int defer_max; @@ -7202,17 +7203,15 @@ nodefer: kfree_skb_napi_cache(skb); sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - if (atomic_read(&sd->defer_count) >= defer_max) + defer_count = atomic_long_inc_return(&sd->defer_count); + + if (defer_count >= defer_max) goto nodefer; - spin_lock_bh(&sd->defer_lock); - /* Send an IPI every time queue reaches half capacity. */ - kick = (atomic_inc_return(&sd->defer_count) - 1) == (defer_max >> 1); + llist_add(&skb->ll_node, &sd->defer_list); - skb->next = sd->defer_list; - /* Paired with READ_ONCE() in skb_defer_free_flush() */ - WRITE_ONCE(sd->defer_list, skb); - spin_unlock_bh(&sd->defer_lock); + /* Send an IPI every time queue reaches half capacity. */ + kick = (defer_count - 1) == (defer_max >> 1); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). -- cgit v1.2.3 From 5628f3fe3b16114e8424bbfcf0594caef8958a06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:34 +0000 Subject: net: add NUMA awareness to skb_attempt_defer_free() Instead of sharing sd->defer_list & sd->defer_count with many cpus, add one pair for each NUMA node. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 4 ---- include/net/hotdata.h | 7 +++++++ net/core/dev.c | 35 +++++++++++++++++++++++------------ net/core/dev.h | 2 +- net/core/skbuff.c | 11 ++++++----- 5 files changed, 37 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5c9aa16933d1..d1a687444b27 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3536,10 +3536,6 @@ struct softnet_data { struct numa_drop_counters drop_counters; - /* Another possibly contended cache line */ - struct llist_head defer_list ____cacheline_aligned_in_smp; - atomic_long_t defer_count; - int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; diff --git a/include/net/hotdata.h b/include/net/hotdata.h index fda94b2647ff..4acec191c54a 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -2,10 +2,16 @@ #ifndef _NET_HOTDATA_H #define _NET_HOTDATA_H +#include #include #include #include +struct skb_defer_node { + struct llist_head defer_list; + atomic_long_t defer_count; +} ____cacheline_aligned_in_smp; + /* Read mostly data used in network fast paths. */ struct net_hotdata { #if IS_ENABLED(CONFIG_INET) @@ -30,6 +36,7 @@ struct net_hotdata { struct rps_sock_flow_table __rcu *rps_sock_flow_table; u32 rps_cpu_mask; #endif + struct skb_defer_node __percpu *skb_defer_nodes; int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; diff --git a/net/core/dev.c b/net/core/dev.c index fb67372774de..a64cef2c537e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5180,8 +5180,9 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +void kick_defer_list_purge(unsigned int cpu) { + struct softnet_data *sd = &per_cpu(softnet_data, cpu); unsigned long flags; if (use_backlog_threads()) { @@ -6715,18 +6716,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -static void skb_defer_free_flush(struct softnet_data *sd) +static void skb_defer_free_flush(void) { struct llist_node *free_list; struct sk_buff *skb, *next; + struct skb_defer_node *sdn; + int node; - if (llist_empty(&sd->defer_list)) - return; - atomic_long_set(&sd->defer_count, 0); - free_list = llist_del_all(&sd->defer_list); + for_each_node(node) { + sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; + + if (llist_empty(&sdn->defer_list)) + continue; + atomic_long_set(&sdn->defer_count, 0); + free_list = llist_del_all(&sdn->defer_list); - llist_for_each_entry_safe(skb, next, free_list, ll_node) { - napi_consume_skb(skb, 1); + llist_for_each_entry_safe(skb, next, free_list, ll_node) { + napi_consume_skb(skb, 1); + } } } @@ -6854,7 +6861,7 @@ count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); - skb_defer_free_flush(this_cpu_ptr(&softnet_data)); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7713,7 +7720,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) local_irq_disable(); net_rps_action_and_irq_enable(sd); } - skb_defer_free_flush(sd); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7755,7 +7762,7 @@ start: for (;;) { struct napi_struct *n; - skb_defer_free_flush(sd); + skb_defer_free_flush(); if (list_empty(&list)) { if (list_empty(&repoll)) { @@ -12989,7 +12996,6 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - init_llist_head(&sd->defer_list); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; @@ -12999,6 +13005,11 @@ static int __init net_dev_init(void) if (net_page_pool_create(i)) goto out; } + net_hotdata.skb_defer_nodes = + __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, + __alignof__(struct skb_defer_node)); + if (!net_hotdata.skb_defer_nodes) + goto out; if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); diff --git a/net/core/dev.h b/net/core/dev.h index d6b08d435479..900880e8b5b4 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -357,7 +357,7 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi) WARN_ON(READ_ONCE(napi->list_owner) != -1); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); +void kick_defer_list_purge(unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 17455fc1e692..bc12790017b0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7185,9 +7185,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct sk_buff *skb) { + struct skb_defer_node *sdn; unsigned long defer_count; int cpu = skb->alloc_cpu; - struct softnet_data *sd; unsigned int defer_max; bool kick; @@ -7201,14 +7201,15 @@ nodefer: kfree_skb_napi_cache(skb); DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); - sd = &per_cpu(softnet_data, cpu); + sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - defer_count = atomic_long_inc_return(&sd->defer_count); + defer_count = atomic_long_inc_return(&sdn->defer_count); if (defer_count >= defer_max) goto nodefer; - llist_add(&skb->ll_node, &sd->defer_list); + llist_add(&skb->ll_node, &sdn->defer_list); /* Send an IPI every time queue reaches half capacity. */ kick = (defer_count - 1) == (defer_max >> 1); @@ -7217,7 +7218,7 @@ nodefer: kfree_skb_napi_cache(skb); * if we are unlucky enough (this seems very unlikely). */ if (unlikely(kick)) - kick_defer_list_purge(sd, cpu); + kick_defer_list_purge(cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, -- cgit v1.2.3 From 20c48920583675e67b3824f147726e0fbda735ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:33:00 -0700 Subject: KVM: Export KVM-internal symbols for sub-modules only Rework the vast majority of KVM's exports to expose symbols only to KVM submodules, i.e. to x86's kvm-{amd,intel}.ko and PPC's kvm-{pr,hv}.ko. With few exceptions, KVM's exported APIs are intended (and safe) for KVM- internal usage only. Keep kvm_get_kvm(), kvm_get_kvm_safe(), and kvm_put_kvm() as normal exports, as they are needed by VFIO, and are generally safe for external usage (though ideally even the get/put APIs would be KVM-internal, and VFIO would pin a VM by grabbing a reference to its associated file). Implement a framework in kvm_types.h in anticipation of providing a macro to restrict KVM-specific kernel exports, i.e. to provide symbol exports for KVM if and only if KVM is built as one or more modules. Link: https://lore.kernel.org/r/20250919003303.1355064-3-seanjc@google.com Cc: Nathan Chancellor Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/include/asm/Kbuild | 1 - arch/powerpc/include/asm/kvm_types.h | 15 ++++ arch/x86/include/asm/kvm_types.h | 10 +++ include/linux/kvm_types.h | 25 +++++-- virt/kvm/eventfd.c | 2 +- virt/kvm/guest_memfd.c | 4 +- virt/kvm/kvm_main.c | 128 +++++++++++++++++------------------ 7 files changed, 110 insertions(+), 75 deletions(-) create mode 100644 arch/powerpc/include/asm/kvm_types.h (limited to 'include') diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index e5fdc336c9b2..2e23533b67e3 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -3,7 +3,6 @@ generated-y += syscall_table_32.h generated-y += syscall_table_64.h generated-y += syscall_table_spu.h generic-y += agp.h -generic-y += kvm_types.h generic-y += mcs_spinlock.h generic-y += qrwlock.h generic-y += early_ioremap.h diff --git a/arch/powerpc/include/asm/kvm_types.h b/arch/powerpc/include/asm/kvm_types.h new file mode 100644 index 000000000000..5d4bffea7d47 --- /dev/null +++ b/arch/powerpc/include/asm/kvm_types.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_PPC_KVM_TYPES_H +#define _ASM_PPC_KVM_TYPES_H + +#if IS_MODULE(CONFIG_KVM_BOOK3S_64_PR) && IS_MODULE(CONFIG_KVM_BOOK3S_64_HV) +#define KVM_SUB_MODULES kvm-pr,kvm-hv +#elif IS_MODULE(CONFIG_KVM_BOOK3S_64_PR) +#define KVM_SUB_MODULES kvm-pr +#elif IS_MODULE(CONFIG_KVM_BOOK3S_64_HV) +#define KVM_SUB_MODULES kvm-hv +#else +#undef KVM_SUB_MODULES +#endif + +#endif diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h index 08f1b57d3b62..23268a188e70 100644 --- a/arch/x86/include/asm/kvm_types.h +++ b/arch/x86/include/asm/kvm_types.h @@ -2,6 +2,16 @@ #ifndef _ASM_X86_KVM_TYPES_H #define _ASM_X86_KVM_TYPES_H +#if IS_MODULE(CONFIG_KVM_AMD) && IS_MODULE(CONFIG_KVM_INTEL) +#define KVM_SUB_MODULES kvm-amd,kvm-intel +#elif IS_MODULE(CONFIG_KVM_AMD) +#define KVM_SUB_MODULES kvm-amd +#elif IS_MODULE(CONFIG_KVM_INTEL) +#define KVM_SUB_MODULES kvm-intel +#else +#undef KVM_SUB_MODULES +#endif + #define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 #endif /* _ASM_X86_KVM_TYPES_H */ diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 827ecc0b7e10..490464c205b4 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -3,6 +3,23 @@ #ifndef __KVM_TYPES_H__ #define __KVM_TYPES_H__ +#include +#include +#include +#include + +#ifdef KVM_SUB_MODULES +#define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) \ + EXPORT_SYMBOL_FOR_MODULES(symbol, __stringify(KVM_SUB_MODULES)) +#else +#define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) +#endif + +#ifndef __ASSEMBLER__ + +#include +#include + struct kvm; struct kvm_async_pf; struct kvm_device_ops; @@ -19,13 +36,6 @@ struct kvm_memslots; enum kvm_mr_change; -#include -#include -#include -#include - -#include - /* * Address types: * @@ -116,5 +126,6 @@ struct kvm_vcpu_stat_generic { }; #define KVM_STATS_NAME_SIZE 48 +#endif /* !__ASSEMBLER__ */ #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 6b1133a6617f..a7794ffdb976 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -525,7 +525,7 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) return false; } -EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_irq_has_notifier); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi) { diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 1d323ca178cb..94bafd6c558c 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -702,7 +702,7 @@ out: fput(file); return r; } -EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, @@ -785,5 +785,5 @@ put_folio_and_exit: fput(file); return ret && !i ? ret : i; } -EXPORT_SYMBOL_GPL(kvm_gmem_populate); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c0c5626ab71d..226faeaa8e56 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -77,22 +77,22 @@ MODULE_LICENSE("GPL"); /* Architectures should define their poll value according to the halt latency */ unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; module_param(halt_poll_ns, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns); /* Default doubles per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_grow = 2; module_param(halt_poll_ns_grow, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_grow); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow); /* The start value to grow halt_poll_ns from */ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ module_param(halt_poll_ns_grow_start, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow_start); /* Default halves per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink); /* * Allow direct access (from KVM or the CPU) without MMU notifier protection @@ -170,7 +170,7 @@ void vcpu_load(struct kvm_vcpu *vcpu) kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); } -EXPORT_SYMBOL_GPL(vcpu_load); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_load); void vcpu_put(struct kvm_vcpu *vcpu) { @@ -180,7 +180,7 @@ void vcpu_put(struct kvm_vcpu *vcpu) __this_cpu_write(kvm_running_vcpu, NULL); preempt_enable(); } -EXPORT_SYMBOL_GPL(vcpu_put); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_put); /* TODO: merge with kvm_arch_vcpu_should_kick */ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) @@ -288,7 +288,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_make_all_cpus_request); void kvm_flush_remote_tlbs(struct kvm *kvm) { @@ -309,7 +309,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.generic.remote_tlb_flush; } -EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_flush_remote_tlbs); void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { @@ -499,7 +499,7 @@ void kvm_destroy_vcpus(struct kvm *kvm) atomic_set(&kvm->online_vcpus, 0); } -EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_destroy_vcpus); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) @@ -1365,7 +1365,7 @@ void kvm_put_kvm_no_destroy(struct kvm *kvm) { WARN_ON(refcount_dec_and_test(&kvm->users_count)); } -EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_put_kvm_no_destroy); static int kvm_vm_release(struct inode *inode, struct file *filp) { @@ -1397,7 +1397,7 @@ out_unlock: } return -EINTR; } -EXPORT_SYMBOL_GPL(kvm_trylock_all_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_trylock_all_vcpus); int kvm_lock_all_vcpus(struct kvm *kvm) { @@ -1422,7 +1422,7 @@ out_unlock: } return r; } -EXPORT_SYMBOL_GPL(kvm_lock_all_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lock_all_vcpus); void kvm_unlock_all_vcpus(struct kvm *kvm) { @@ -1434,7 +1434,7 @@ void kvm_unlock_all_vcpus(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) mutex_unlock(&vcpu->mutex); } -EXPORT_SYMBOL_GPL(kvm_unlock_all_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_unlock_all_vcpus); /* * Allocation size is twice as large as the actual dirty bitmap size. @@ -2142,7 +2142,7 @@ int kvm_set_internal_memslot(struct kvm *kvm, return kvm_set_memory_region(kvm, mem); } -EXPORT_SYMBOL_GPL(kvm_set_internal_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_internal_memslot); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region2 *mem) @@ -2201,7 +2201,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, *is_dirty = 1; return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dirty_log); #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ /** @@ -2636,7 +2636,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); } -EXPORT_SYMBOL_GPL(gfn_to_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -2670,7 +2670,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn return NULL; } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_memslot); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { @@ -2678,7 +2678,7 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) return kvm_is_visible_memslot(memslot); } -EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_visible_gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -2686,7 +2686,7 @@ bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) return kvm_is_visible_memslot(memslot); } -EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_visible_gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -2743,19 +2743,19 @@ unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, { return gfn_to_hva_many(slot, gfn, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva_memslot); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_hva); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) { return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_hva); /* * Return the hva of a @gfn and the R/W attribute if possible. @@ -2819,7 +2819,7 @@ void kvm_release_page_clean(struct page *page) kvm_set_page_accessed(page); put_page(page); } -EXPORT_SYMBOL_GPL(kvm_release_page_clean); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_clean); void kvm_release_page_dirty(struct page *page) { @@ -2829,7 +2829,7 @@ void kvm_release_page_dirty(struct page *page) kvm_set_page_dirty(page); kvm_release_page_clean(page); } -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_dirty); static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, struct follow_pfnmap_args *map, bool writable) @@ -3073,7 +3073,7 @@ kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, return kvm_follow_pfn(&kfp); } -EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_faultin_pfn); int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages) @@ -3090,7 +3090,7 @@ int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } -EXPORT_SYMBOL_GPL(kvm_prefetch_pages); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prefetch_pages); /* * Don't use this API unless you are absolutely, positively certain that KVM @@ -3112,7 +3112,7 @@ struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) (void)kvm_follow_pfn(&kfp); return refcounted_page; } -EXPORT_SYMBOL_GPL(__gfn_to_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__gfn_to_page); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, bool writable) @@ -3146,7 +3146,7 @@ int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, return map->hva ? 0 : -EFAULT; } -EXPORT_SYMBOL_GPL(__kvm_vcpu_map); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_map); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { @@ -3174,7 +3174,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) map->page = NULL; map->pinned_page = NULL; } -EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_unmap); static int next_segment(unsigned long len, int offset) { @@ -3210,7 +3210,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, return __kvm_read_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_read_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_page); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len) @@ -3219,7 +3219,7 @@ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, return __kvm_read_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_page); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { @@ -3239,7 +3239,7 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) } return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { @@ -3259,7 +3259,7 @@ int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned l } return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest); static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, unsigned long len) @@ -3290,7 +3290,7 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, return __kvm_read_guest_atomic(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_atomic); /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */ static int __kvm_write_guest_page(struct kvm *kvm, @@ -3320,7 +3320,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_write_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_page); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len) @@ -3329,7 +3329,7 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest_page); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) @@ -3350,7 +3350,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, } return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len) @@ -3371,7 +3371,7 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, } return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest); static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, struct gfn_to_hva_cache *ghc, @@ -3420,7 +3420,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, struct kvm_memslots *slots = kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, @@ -3451,14 +3451,14 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_offset_cached); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_write_guest_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_cached); int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, @@ -3488,14 +3488,14 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_offset_cached); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_read_guest_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_cached); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) { @@ -3515,7 +3515,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } return 0; } -EXPORT_SYMBOL_GPL(kvm_clear_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_clear_guest); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, @@ -3540,7 +3540,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, set_bit_le(rel_gfn, memslot->dirty_bitmap); } } -EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty_in_slot); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { @@ -3549,7 +3549,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) memslot = gfn_to_memslot(kvm, gfn); mark_page_dirty_in_slot(kvm, memslot, gfn); } -EXPORT_SYMBOL_GPL(mark_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -3558,7 +3558,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); } -EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_mark_page_dirty); void kvm_sigset_activate(struct kvm_vcpu *vcpu) { @@ -3795,7 +3795,7 @@ out: trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_vcpu_halt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_halt); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { @@ -3807,7 +3807,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) return false; } -EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_wake_up); #ifndef CONFIG_S390 /* @@ -3859,7 +3859,7 @@ void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait) out: put_cpu(); } -EXPORT_SYMBOL_GPL(__kvm_vcpu_kick); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_kick); #endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) @@ -3882,7 +3882,7 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target) return ret; } -EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_yield_to); /* * Helper that checks whether a VCPU is eligible for directed yield. @@ -4037,7 +4037,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) /* Ensure vcpu is not eligible during next spinloop */ kvm_vcpu_set_dy_eligible(me, false); } -EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_on_spin); static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) { @@ -5019,7 +5019,7 @@ bool kvm_are_all_memslots_empty(struct kvm *kvm) return true; } -EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_are_all_memslots_empty); static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) @@ -5474,7 +5474,7 @@ bool file_is_kvm(struct file *file) { return file && file->f_op == &kvm_vm_fops; } -EXPORT_SYMBOL_GPL(file_is_kvm); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm); static int kvm_dev_ioctl_create_vm(unsigned long type) { @@ -5569,10 +5569,10 @@ static struct miscdevice kvm_dev = { #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); -EXPORT_SYMBOL_GPL(enable_virt_at_load); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load); __visible bool kvm_rebooting; -EXPORT_SYMBOL_GPL(kvm_rebooting); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting); static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); @@ -5723,7 +5723,7 @@ err_cpuhp: --kvm_usage_count; return r; } -EXPORT_SYMBOL_GPL(kvm_enable_virtualization); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization); void kvm_disable_virtualization(void) { @@ -5736,7 +5736,7 @@ void kvm_disable_virtualization(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } -EXPORT_SYMBOL_GPL(kvm_disable_virtualization); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization); static int kvm_init_virtualization(void) { @@ -5885,7 +5885,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } -EXPORT_SYMBOL_GPL(kvm_io_bus_write); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_write); int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie) @@ -5954,7 +5954,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } -EXPORT_SYMBOL_GPL(kvm_io_bus_read); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_read); static void __free_bus(struct rcu_head *rcu) { @@ -6078,7 +6078,7 @@ out_unlock: return iodev; } -EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_get_dev); static int kvm_debugfs_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), @@ -6415,7 +6415,7 @@ struct kvm_vcpu *kvm_get_running_vcpu(void) return vcpu; } -EXPORT_SYMBOL_GPL(kvm_get_running_vcpu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_running_vcpu); /** * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. @@ -6550,7 +6550,7 @@ err_cpu_kick_mask: kmem_cache_destroy(kvm_vcpu_cache); return r; } -EXPORT_SYMBOL_GPL(kvm_init); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init); void kvm_exit(void) { @@ -6573,4 +6573,4 @@ void kvm_exit(void) kvm_async_pf_deinit(); kvm_irqfd_exit(); } -EXPORT_SYMBOL_GPL(kvm_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_exit); -- cgit v1.2.3 From d11f6cd1bb4a416b4515702d020a7480ac667f0f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 29 Sep 2025 11:56:41 -0400 Subject: NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support Use STATX_DIOALIGN and STATX_DIO_READ_ALIGN to get DIO alignment attributes from the underlying filesystem and store them in the associated nfsd_file. This is done when the nfsd_file is first opened for each regular file. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfsd/filecache.c | 34 ++++++++++++++++++++++++++++++++++ fs/nfsd/filecache.h | 4 ++++ fs/nfsd/trace.h | 27 +++++++++++++++++++++++++++ fs/nfsd/vfs.h | 4 ++++ include/trace/misc/fs.h | 22 ++++++++++++++++++++++ 5 files changed, 91 insertions(+) (limited to 'include') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 732abf6b92a5..7ca1dedf4e04 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -231,6 +231,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, refcount_set(&nf->nf_ref, 1); nf->nf_may = need; nf->nf_mark = NULL; + nf->nf_dio_mem_align = 0; + nf->nf_dio_offset_align = 0; + nf->nf_dio_read_offset_align = 0; return nf; } @@ -1069,6 +1072,35 @@ nfsd_file_is_cached(struct inode *inode) return ret; } +static __be32 +nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf) +{ + struct inode *inode = file_inode(nf->nf_file); + struct kstat stat; + __be32 status; + + /* Currently only need to get DIO alignment info for regular files */ + if (!S_ISREG(inode->i_mode)) + return nfs_ok; + + status = fh_getattr(fhp, &stat); + if (status != nfs_ok) + return status; + + trace_nfsd_file_get_dio_attrs(inode, &stat); + + if (stat.result_mask & STATX_DIOALIGN) { + nf->nf_dio_mem_align = stat.dio_mem_align; + nf->nf_dio_offset_align = stat.dio_offset_align; + } + if (stat.result_mask & STATX_DIO_READ_ALIGN) + nf->nf_dio_read_offset_align = stat.dio_read_offset_align; + else + nf->nf_dio_read_offset_align = nf->nf_dio_offset_align; + + return nfs_ok; +} + static __be32 nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net, struct svc_cred *cred, @@ -1187,6 +1219,8 @@ open_file: } status = nfserrno(ret); trace_nfsd_file_open(nf, status); + if (status == nfs_ok) + status = nfsd_file_get_dio_attrs(fhp, nf); } } else status = nfserr_jukebox; diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 722b26c71e45..237a05c74211 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -54,6 +54,10 @@ struct nfsd_file { struct list_head nf_gc; struct rcu_head nf_rcu; ktime_t nf_birthtime; + + u32 nf_dio_mem_align; + u32 nf_dio_offset_align; + u32 nf_dio_read_offset_align; }; int nfsd_file_cache_init(void); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index a664fdf1161e..6e2c8e2aab10 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1133,6 +1133,33 @@ TRACE_EVENT(nfsd_file_alloc, ) ); +TRACE_EVENT(nfsd_file_get_dio_attrs, + TP_PROTO( + const struct inode *inode, + const struct kstat *stat + ), + TP_ARGS(inode, stat), + TP_STRUCT__entry( + __field(const void *, inode) + __field(unsigned long, mask) + __field(u32, mem_align) + __field(u32, offset_align) + __field(u32, read_offset_align) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->mask = stat->result_mask; + __entry->mem_align = stat->dio_mem_align; + __entry->offset_align = stat->dio_offset_align; + __entry->read_offset_align = stat->dio_read_offset_align; + ), + TP_printk("inode=%p flags=%s mem_align=%u offset_align=%u read_offset_align=%u", + __entry->inode, show_statx_mask(__entry->mask), + __entry->mem_align, __entry->offset_align, + __entry->read_offset_align + ) +); + TRACE_EVENT(nfsd_file_acquire, TP_PROTO( const struct svc_rqst *rqstp, diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index eff04959606f..fde3e0c11dba 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -185,6 +185,10 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) u32 request_mask = STATX_BASIC_STATS; struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; + struct inode *inode = d_inode(p.dentry); + + if (S_ISREG(inode->i_mode)) + request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN); if (fh->fh_maxsize == NFS4_FHSIZE) request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); diff --git a/include/trace/misc/fs.h b/include/trace/misc/fs.h index 0406ebe2a80a..7ead1c61f0cb 100644 --- a/include/trace/misc/fs.h +++ b/include/trace/misc/fs.h @@ -141,3 +141,25 @@ { ATTR_TIMES_SET, "TIMES_SET" }, \ { ATTR_TOUCH, "TOUCH"}, \ { ATTR_DELEG, "DELEG"}) + +#define show_statx_mask(flags) \ + __print_flags(flags, "|", \ + { STATX_TYPE, "TYPE" }, \ + { STATX_MODE, "MODE" }, \ + { STATX_NLINK, "NLINK" }, \ + { STATX_UID, "UID" }, \ + { STATX_GID, "GID" }, \ + { STATX_ATIME, "ATIME" }, \ + { STATX_MTIME, "MTIME" }, \ + { STATX_CTIME, "CTIME" }, \ + { STATX_INO, "INO" }, \ + { STATX_SIZE, "SIZE" }, \ + { STATX_BLOCKS, "BLOCKS" }, \ + { STATX_BASIC_STATS, "BASIC_STATS" }, \ + { STATX_BTIME, "BTIME" }, \ + { STATX_MNT_ID, "MNT_ID" }, \ + { STATX_DIOALIGN, "DIOALIGN" }, \ + { STATX_MNT_ID_UNIQUE, "MNT_ID_UNIQUE" }, \ + { STATX_SUBVOL, "SUBVOL" }, \ + { STATX_WRITE_ATOMIC, "WRITE_ATOMIC" }, \ + { STATX_DIO_READ_ALIGN, "DIO_READ_ALIGN" }) -- cgit v1.2.3 From 25ba2b84c38f624151a3ba36e56d41c39b9223ad Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 19 Sep 2025 10:36:26 -0400 Subject: nfs/localio: avoid issuing misaligned IO using O_DIRECT Add nfsd_file_dio_alignment and use it to avoid issuing misaligned IO using O_DIRECT. Any misaligned DIO falls back to using buffered IO. Because misaligned DIO is now handled safely, remove the nfs modparam 'localio_O_DIRECT_semantics' that was added to require users opt-in to the requirement that all O_DIRECT be properly DIO-aligned. Also, introduce nfs_iov_iter_aligned_bvec() which is a variant of iov_iter_aligned_bvec() that also verifies the offset associated with an iov_iter is DIO-aligned. NOTE: in a parallel effort, iov_iter_aligned_bvec() is being removed along with iov_iter_is_aligned(). Lastly, add pr_info_ratelimited if underlying filesystem returns -EINVAL because it was made to try O_DIRECT for IO that is not DIO-aligned (shouldn't happen, so its best to be louder if it does). Fixes: 3feec68563d ("nfs/localio: add direct IO enablement with sync and async IO support") Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/localio.c | 65 +++++++++++++++++++++++++++++++++++++++------- fs/nfsd/localio.c | 11 ++++++++ include/linux/nfslocalio.h | 2 ++ 3 files changed, 68 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 42ea50d42c99..b165922e5cb6 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -49,11 +49,6 @@ struct nfs_local_fsync_ctx { static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); -static bool localio_O_DIRECT_semantics __read_mostly = false; -module_param(localio_O_DIRECT_semantics, bool, 0644); -MODULE_PARM_DESC(localio_O_DIRECT_semantics, - "LOCALIO will use O_DIRECT semantics to filesystem."); - static inline bool nfs_client_is_local(const struct nfs_client *clp) { return !!rcu_access_pointer(clp->cl_uuid.net); @@ -322,12 +317,9 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, return NULL; } - if (localio_O_DIRECT_semantics && - test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { - iocb->kiocb.ki_filp = file; + init_sync_kiocb(&iocb->kiocb, file); + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) iocb->kiocb.ki_flags = IOCB_DIRECT; - } else - init_sync_kiocb(&iocb->kiocb, file); iocb->kiocb.ki_pos = hdr->args.offset; iocb->hdr = hdr; @@ -337,6 +329,30 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, return iocb; } +static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, + loff_t offset, unsigned int addr_mask, unsigned int len_mask) +{ + const struct bio_vec *bvec = i->bvec; + size_t skip = i->iov_offset; + size_t size = i->count; + + if ((offset | size) & len_mask) + return false; + do { + size_t len = bvec->bv_len; + + if (len > size) + len = size; + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) + return false; + bvec++; + size -= len; + skip = 0; + } while (size); + + return true; +} + static void nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir) { @@ -346,6 +362,25 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir) hdr->args.count + hdr->args.pgbase); if (hdr->args.pgbase != 0) iov_iter_advance(i, hdr->args.pgbase); + + if (iocb->kiocb.ki_flags & IOCB_DIRECT) { + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; + /* Verify the IO is DIO-aligned as required */ + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, + &nf_dio_offset_align, + &nf_dio_read_offset_align); + if (dir == READ) + nf_dio_offset_align = nf_dio_read_offset_align; + + if (nf_dio_mem_align && nf_dio_offset_align && + nfs_iov_iter_aligned_bvec(i, hdr->args.offset, + nf_dio_mem_align - 1, + nf_dio_offset_align - 1)) + return; /* is DIO-aligned */ + + /* Fallback to using buffered for this misaligned IO */ + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + } } static void @@ -406,6 +441,11 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) struct nfs_pgio_header *hdr = iocb->hdr; struct file *filp = iocb->kiocb.ki_filp; + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); + } + nfs_local_pgio_done(hdr, status); /* @@ -598,6 +638,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n"); + } + /* Handle short writes as if they are ENOSPC */ if (status > 0 && status < hdr->args.count) { hdr->mds_offset += status; diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index cb237f1b902a..9e0a37cd29d8 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -117,6 +117,16 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, return localio; } +static void nfsd_file_dio_alignment(struct nfsd_file *nf, + u32 *nf_dio_mem_align, + u32 *nf_dio_offset_align, + u32 *nf_dio_read_offset_align) +{ + *nf_dio_mem_align = nf->nf_dio_mem_align; + *nf_dio_offset_align = nf->nf_dio_offset_align; + *nf_dio_read_offset_align = nf->nf_dio_read_offset_align; +} + static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_net_try_get = nfsd_net_try_get, .nfsd_net_put = nfsd_net_put, @@ -124,6 +134,7 @@ static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_file_put_local = nfsd_file_put_local, .nfsd_file_get_local = nfsd_file_get_local, .nfsd_file_file = nfsd_file_file, + .nfsd_file_dio_alignment = nfsd_file_dio_alignment, }; void nfsd_localio_ops_init(void) diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 5c7c92659e73..7ca2715edccc 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -65,6 +65,8 @@ struct nfsd_localio_operations { struct net *(*nfsd_file_put_local)(struct nfsd_file __rcu **); struct nfsd_file *(*nfsd_file_get_local)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); + void (*nfsd_file_dio_alignment)(struct nfsd_file *, + u32 *, u32 *, u32 *); } ____cacheline_aligned; extern void nfsd_localio_ops_init(void); -- cgit v1.2.3 From 558ae4579810fa0fef011944230c65a6f3087f85 Mon Sep 17 00:00:00 2001 From: Hoyoung Seo Date: Tue, 30 Sep 2025 15:14:28 +0900 Subject: scsi: ufs: core: Include UTP error in INT_FATAL_ERRORS When a UTP error occurs in isolation, UFS is not currently recoverable. This is because the UTP error is not considered fatal in the error handling code, leading to either an I/O timeout or an OCS error. Add the UTP error flag to INT_FATAL_ERRORS so the controller will be reset in this situation. sd 0:0:0:0: [sda] tag#38 UNKNOWN(0x2003) Result: hostbyte=0x07 driverbyte=DRIVER_OK cmd_age=0s sd 0:0:0:0: [sda] tag#38 CDB: opcode=0x28 28 00 00 51 24 e2 00 00 08 00 I/O error, dev sda, sector 42542864 op 0x0:(READ) flags 0x80700 phys_seg 8 prio class 2 OCS error from controller = 9 for tag 39 pa_err[1] = 0x80000010 at 2667224756 us pa_err: total cnt=2 dl_err[0] = 0x80000002 at 2667148060 us dl_err[1] = 0x80002000 at 2667282844 us No record of nl_err No record of tl_err No record of dme_err No record of auto_hibern8_err fatal_err[0] = 0x804 at 2667282836 us --------------------------------------------------- REGISTER --------------------------------------------------- NAME OFFSET VALUE STD HCI SFR 0xfffffff0 0x0 AHIT 0x18 0x814 INTERRUPT STATUS 0x20 0x1000 INTERRUPT ENABLE 0x24 0x70ef5 [mkp: commit desc] Signed-off-by: Hoyoung Seo Reviewed-by: Bart Van Assche Message-Id: <20250930061428.617955-1-hy50.seo@samsung.com> Signed-off-by: Martin K. Petersen --- include/ufs/ufshci.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/ufs/ufshci.h b/include/ufs/ufshci.h index 612500a7088f..e64b70132101 100644 --- a/include/ufs/ufshci.h +++ b/include/ufs/ufshci.h @@ -180,6 +180,7 @@ static inline u32 ufshci_version(u32 major, u32 minor) #define UTP_TASK_REQ_COMPL 0x200 #define UIC_COMMAND_COMPL 0x400 #define DEVICE_FATAL_ERROR 0x800 +#define UTP_ERROR 0x1000 #define CONTROLLER_FATAL_ERROR 0x10000 #define SYSTEM_BUS_FATAL_ERROR 0x20000 #define CRYPTO_ENGINE_FATAL_ERROR 0x40000 @@ -199,7 +200,8 @@ static inline u32 ufshci_version(u32 major, u32 minor) CONTROLLER_FATAL_ERROR |\ SYSTEM_BUS_FATAL_ERROR |\ CRYPTO_ENGINE_FATAL_ERROR |\ - UIC_LINK_LOST) + UIC_LINK_LOST |\ + UTP_ERROR) /* HCS - Host Controller Status 30h */ #define DEVICE_PRESENT 0x1 -- cgit v1.2.3 From 6d0386ea99875313fdfd074eb74013b6e3b48a76 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Aug 2025 17:01:53 -0700 Subject: entry/kvm: KVM: Move KVM details related to signal/-EINTR into KVM proper Move KVM's morphing of pending signals into userspace exits into KVM proper, and drop the @vcpu param from xfer_to_guest_mode_handle_work(). How KVM responds to -EINTR is a detail that really belongs in KVM itself, and invoking kvm_handle_signal_exit() from kernel code creates an inverted module dependency. E.g. attempting to move kvm_handle_signal_exit() into kvm_main.c would generate an linker error when building kvm.ko as a module. Dropping KVM details will also converting the KVM "entry" code into a more generic virtualization framework so that it can be used when running as a Hyper-V root partition. Lastly, eliminating usage of "struct kvm_vcpu" outside of KVM is also nice to have for KVM x86 developers, as keeping the details of kvm_vcpu purely within KVM allows changing the layout of the structure without having to boot into a new kernel, e.g. allows rebuilding and reloading kvm.ko with a modified kvm_vcpu structure as part of debug/development. Signed-off-by: Sean Christopherson Reviewed-by: Thomas Gleixner Signed-off-by: Wei Liu --- arch/arm64/kvm/arm.c | 3 +-- arch/loongarch/kvm/vcpu.c | 3 +-- arch/riscv/kvm/vcpu.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 1 - arch/x86/kvm/x86.c | 3 +-- include/linux/entry-kvm.h | 11 +++-------- include/linux/kvm_host.h | 13 ++++++++++++- kernel/entry/kvm.c | 13 +++++-------- 8 files changed, 24 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 888f7c7abf54..418fd3043467 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -1177,7 +1176,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* * Check conditions before entering the guest */ - ret = xfer_to_guest_mode_handle_work(vcpu); + ret = kvm_xfer_to_guest_mode_handle_work(vcpu); if (!ret) ret = 1; diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index d1b8c50941ca..514256b25ba1 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -4,7 +4,6 @@ */ #include -#include #include #include #include @@ -251,7 +250,7 @@ static int kvm_enter_guest_check(struct kvm_vcpu *vcpu) /* * Check conditions before entering the guest */ - ret = xfer_to_guest_mode_handle_work(vcpu); + ret = kvm_xfer_to_guest_mode_handle_work(vcpu); if (ret < 0) return ret; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index f001e56403f9..251e787f2ebc 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -7,7 +7,6 @@ */ #include -#include #include #include #include @@ -910,7 +909,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_UNKNOWN; while (ret > 0) { /* Check conditions before entering the guest */ - ret = xfer_to_guest_mode_handle_work(vcpu); + ret = kvm_xfer_to_guest_mode_handle_work(vcpu); if (ret) continue; ret = 1; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa157fe5b7b3..d7c86613e50a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1c49bc681c4..0b13b8bf69e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include @@ -11241,7 +11240,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (__xfer_to_guest_mode_work_pending()) { kvm_vcpu_srcu_read_unlock(vcpu); - r = xfer_to_guest_mode_handle_work(vcpu); + r = kvm_xfer_to_guest_mode_handle_work(vcpu); kvm_vcpu_srcu_read_lock(vcpu); if (r) return r; diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 16149f6625e4..3644de7e6019 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -21,8 +21,6 @@ _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ ARCH_XFER_TO_GUEST_MODE_WORK) -struct kvm_vcpu; - /** * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest * mode work handling function. @@ -32,12 +30,10 @@ struct kvm_vcpu; * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be * replaced by architecture specific code. */ -static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, - unsigned long ti_work); +static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work); #ifndef arch_xfer_to_guest_mode_work -static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, - unsigned long ti_work) +static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work) { return 0; } @@ -46,11 +42,10 @@ static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, /** * xfer_to_guest_mode_handle_work - Check and handle pending work which needs * to be handled before going to guest mode - * @vcpu: Pointer to current's VCPU data * * Returns: 0 or an error code */ -int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu); +int xfer_to_guest_mode_handle_work(void); /** * xfer_to_guest_mode_prepare - Perform last minute preparation work that diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15656b7fba6c..598b9473e46d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2,7 +2,7 @@ #ifndef __KVM_HOST_H #define __KVM_HOST_H - +#include #include #include #include @@ -2450,6 +2450,17 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_INTR; vcpu->stat.signal_exits++; } + +static inline int kvm_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +{ + int r = xfer_to_guest_mode_handle_work(); + + if (r) { + WARN_ON_ONCE(r != -EINTR); + kvm_handle_signal_exit(vcpu); + } + return r; +} #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ /* diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 8485f63863af..6fc762eaacca 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -1,17 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include -static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) +static int xfer_to_guest_mode_work(unsigned long ti_work) { do { int ret; - if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { - kvm_handle_signal_exit(vcpu); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) return -EINTR; - } if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); @@ -19,7 +16,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(NULL); - ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); + ret = arch_xfer_to_guest_mode_handle_work(ti_work); if (ret) return ret; @@ -28,7 +25,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return 0; } -int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +int xfer_to_guest_mode_handle_work(void) { unsigned long ti_work; @@ -44,6 +41,6 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) return 0; - return xfer_to_guest_mode_work(vcpu, ti_work); + return xfer_to_guest_mode_work(ti_work); } EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); -- cgit v1.2.3 From 9be7e1e320ff2e7db4b23c8ec5f599bbfac94ede Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Aug 2025 17:01:54 -0700 Subject: entry: Rename "kvm" entry code assets to "virt" to genericize APIs Rename the "kvm" entry code files and Kconfigs to use generic "virt" nomenclature so that the code can be reused by other hypervisors (or rather, their root/dom0 partition drivers), without incorrectly suggesting the code somehow relies on and/or involves KVM. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Thomas Gleixner Reviewed-by: Joel Fernandes Signed-off-by: Wei Liu --- MAINTAINERS | 2 +- arch/arm64/kvm/Kconfig | 2 +- arch/loongarch/kvm/Kconfig | 2 +- arch/riscv/kvm/Kconfig | 2 +- arch/x86/kvm/Kconfig | 2 +- include/linux/entry-kvm.h | 95 ---------------------------------------------- include/linux/entry-virt.h | 95 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 6 +-- include/linux/rcupdate.h | 2 +- kernel/entry/Makefile | 2 +- kernel/entry/kvm.c | 46 ---------------------- kernel/entry/virt.c | 46 ++++++++++++++++++++++ kernel/rcu/tree.c | 6 +-- virt/kvm/Kconfig | 2 +- 14 files changed, 155 insertions(+), 155 deletions(-) delete mode 100644 include/linux/entry-kvm.h create mode 100644 include/linux/entry-virt.h delete mode 100644 kernel/entry/kvm.c create mode 100644 kernel/entry/virt.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa4..c255048333f0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10200,7 +10200,7 @@ L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/entry F: include/linux/entry-common.h -F: include/linux/entry-kvm.h +F: include/linux/entry-virt.h F: include/linux/irq-entry-common.h F: kernel/entry/ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 713248f240e0..6f4fc3caa31a 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -25,7 +25,7 @@ menuconfig KVM select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select KVM_XFER_TO_GUEST_WORK + select VIRT_XFER_TO_GUEST_WORK select KVM_VFIO select HAVE_KVM_DIRTY_RING_ACQ_REL select NEED_KVM_DIRTY_RING_WITH_BITMAP diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index 40eea6da7c25..ae64bbdf83a7 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -31,7 +31,7 @@ config KVM select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_MMU_NOTIFIER select KVM_MMIO - select KVM_XFER_TO_GUEST_WORK + select VIRT_XFER_TO_GUEST_WORK select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS help diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 5a62091b0809..c50328212917 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -30,7 +30,7 @@ config KVM select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING select KVM_MMIO - select KVM_XFER_TO_GUEST_WORK + select VIRT_XFER_TO_GUEST_WORK select KVM_GENERIC_MMU_NOTIFIER select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2c86673155c9..f81074b0c0a8 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -40,7 +40,7 @@ config KVM_X86 select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL - select KVM_XFER_TO_GUEST_WORK + select VIRT_XFER_TO_GUEST_WORK select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select HAVE_KVM_PM_NOTIFIER if PM diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h deleted file mode 100644 index 3644de7e6019..000000000000 --- a/include/linux/entry-kvm.h +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_ENTRYKVM_H -#define __LINUX_ENTRYKVM_H - -#include -#include -#include -#include -#include -#include - -/* Transfer to guest mode work */ -#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK - -#ifndef ARCH_XFER_TO_GUEST_MODE_WORK -# define ARCH_XFER_TO_GUEST_MODE_WORK (0) -#endif - -#define XFER_TO_GUEST_MODE_WORK \ - (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \ - _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ - ARCH_XFER_TO_GUEST_MODE_WORK) - -/** - * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest - * mode work handling function. - * @vcpu: Pointer to current's VCPU data - * @ti_work: Cached TIF flags gathered in xfer_to_guest_mode_handle_work() - * - * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be - * replaced by architecture specific code. - */ -static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work); - -#ifndef arch_xfer_to_guest_mode_work -static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work) -{ - return 0; -} -#endif - -/** - * xfer_to_guest_mode_handle_work - Check and handle pending work which needs - * to be handled before going to guest mode - * - * Returns: 0 or an error code - */ -int xfer_to_guest_mode_handle_work(void); - -/** - * xfer_to_guest_mode_prepare - Perform last minute preparation work that - * need to be handled while IRQs are disabled - * upon entering to guest. - * - * Has to be invoked with interrupts disabled before the last call - * to xfer_to_guest_mode_work_pending(). - */ -static inline void xfer_to_guest_mode_prepare(void) -{ - lockdep_assert_irqs_disabled(); - tick_nohz_user_enter_prepare(); -} - -/** - * __xfer_to_guest_mode_work_pending - Check if work is pending - * - * Returns: True if work pending, False otherwise. - * - * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from - * interrupt enabled code for racy quick checks with care. - */ -static inline bool __xfer_to_guest_mode_work_pending(void) -{ - unsigned long ti_work = read_thread_flags(); - - return !!(ti_work & XFER_TO_GUEST_MODE_WORK); -} - -/** - * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be - * handled before returning to guest mode - * - * Returns: True if work pending, False otherwise. - * - * Has to be invoked with interrupts disabled before the transition to - * guest mode. - */ -static inline bool xfer_to_guest_mode_work_pending(void) -{ - lockdep_assert_irqs_disabled(); - return __xfer_to_guest_mode_work_pending(); -} -#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ - -#endif diff --git a/include/linux/entry-virt.h b/include/linux/entry-virt.h new file mode 100644 index 000000000000..42c89e3e5ca7 --- /dev/null +++ b/include/linux/entry-virt.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_ENTRYVIRT_H +#define __LINUX_ENTRYVIRT_H + +#include +#include +#include +#include +#include +#include + +/* Transfer to guest mode work */ +#ifdef CONFIG_VIRT_XFER_TO_GUEST_WORK + +#ifndef ARCH_XFER_TO_GUEST_MODE_WORK +# define ARCH_XFER_TO_GUEST_MODE_WORK (0) +#endif + +#define XFER_TO_GUEST_MODE_WORK \ + (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \ + _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ + ARCH_XFER_TO_GUEST_MODE_WORK) + +/** + * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest + * mode work handling function. + * @vcpu: Pointer to current's VCPU data + * @ti_work: Cached TIF flags gathered in xfer_to_guest_mode_handle_work() + * + * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be + * replaced by architecture specific code. + */ +static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work); + +#ifndef arch_xfer_to_guest_mode_work +static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work) +{ + return 0; +} +#endif + +/** + * xfer_to_guest_mode_handle_work - Check and handle pending work which needs + * to be handled before going to guest mode + * + * Returns: 0 or an error code + */ +int xfer_to_guest_mode_handle_work(void); + +/** + * xfer_to_guest_mode_prepare - Perform last minute preparation work that + * need to be handled while IRQs are disabled + * upon entering to guest. + * + * Has to be invoked with interrupts disabled before the last call + * to xfer_to_guest_mode_work_pending(). + */ +static inline void xfer_to_guest_mode_prepare(void) +{ + lockdep_assert_irqs_disabled(); + tick_nohz_user_enter_prepare(); +} + +/** + * __xfer_to_guest_mode_work_pending - Check if work is pending + * + * Returns: True if work pending, False otherwise. + * + * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from + * interrupt enabled code for racy quick checks with care. + */ +static inline bool __xfer_to_guest_mode_work_pending(void) +{ + unsigned long ti_work = read_thread_flags(); + + return !!(ti_work & XFER_TO_GUEST_MODE_WORK); +} + +/** + * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be + * handled before returning to guest mode + * + * Returns: True if work pending, False otherwise. + * + * Has to be invoked with interrupts disabled before the transition to + * guest mode. + */ +static inline bool xfer_to_guest_mode_work_pending(void) +{ + lockdep_assert_irqs_disabled(); + return __xfer_to_guest_mode_work_pending(); +} +#endif /* CONFIG_VIRT_XFER_TO_GUEST_WORK */ + +#endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 598b9473e46d..70ac2267d5d0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2,7 +2,7 @@ #ifndef __KVM_HOST_H #define __KVM_HOST_H -#include +#include #include #include #include @@ -2444,7 +2444,7 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ -#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK +#ifdef CONFIG_VIRT_XFER_TO_GUEST_WORK static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) { vcpu->run->exit_reason = KVM_EXIT_INTR; @@ -2461,7 +2461,7 @@ static inline int kvm_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) } return r; } -#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ +#endif /* CONFIG_VIRT_XFER_TO_GUEST_WORK */ /* * If more than one page is being (un)accounted, @virt must be the address of diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb..1e1f3aa375d9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -129,7 +129,7 @@ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ -#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) +#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) void rcu_irq_work_resched(void); #else static __always_inline void rcu_irq_work_resched(void) { } diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 77fcd83dd663..2333d70802e4 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -14,4 +14,4 @@ CFLAGS_common.o += -fno-stack-protector obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o -obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o +obj-$(CONFIG_VIRT_XFER_TO_GUEST_WORK) += virt.o diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c deleted file mode 100644 index 6fc762eaacca..000000000000 --- a/kernel/entry/kvm.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include - -static int xfer_to_guest_mode_work(unsigned long ti_work) -{ - do { - int ret; - - if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - return -EINTR; - - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) - schedule(); - - if (ti_work & _TIF_NOTIFY_RESUME) - resume_user_mode_work(NULL); - - ret = arch_xfer_to_guest_mode_handle_work(ti_work); - if (ret) - return ret; - - ti_work = read_thread_flags(); - } while (ti_work & XFER_TO_GUEST_MODE_WORK); - return 0; -} - -int xfer_to_guest_mode_handle_work(void) -{ - unsigned long ti_work; - - /* - * This is invoked from the outer guest loop with interrupts and - * preemption enabled. - * - * KVM invokes xfer_to_guest_mode_work_pending() with interrupts - * disabled in the inner loop before going into guest mode. No need - * to disable interrupts here. - */ - ti_work = read_thread_flags(); - if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) - return 0; - - return xfer_to_guest_mode_work(ti_work); -} -EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); diff --git a/kernel/entry/virt.c b/kernel/entry/virt.c new file mode 100644 index 000000000000..c52f99249763 --- /dev/null +++ b/kernel/entry/virt.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +static int xfer_to_guest_mode_work(unsigned long ti_work) +{ + do { + int ret; + + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + return -EINTR; + + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) + schedule(); + + if (ti_work & _TIF_NOTIFY_RESUME) + resume_user_mode_work(NULL); + + ret = arch_xfer_to_guest_mode_handle_work(ti_work); + if (ret) + return ret; + + ti_work = read_thread_flags(); + } while (ti_work & XFER_TO_GUEST_MODE_WORK); + return 0; +} + +int xfer_to_guest_mode_handle_work(void) +{ + unsigned long ti_work; + + /* + * This is invoked from the outer guest loop with interrupts and + * preemption enabled. + * + * KVM invokes xfer_to_guest_mode_work_pending() with interrupts + * disabled in the inner loop before going into guest mode. No need + * to disable interrupts here. + */ + ti_work = read_thread_flags(); + if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) + return 0; + + return xfer_to_guest_mode_work(ti_work); +} +EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 174ee243b349..995489b72535 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -573,7 +573,7 @@ void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); -#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) +#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on * IRQ tail once IRQs get re-enabled on userspace/guest resume. @@ -602,7 +602,7 @@ noinstr void rcu_irq_work_resched(void) if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) return; - if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) + if (IS_ENABLED(CONFIG_VIRT_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) return; instrumentation_begin(); @@ -611,7 +611,7 @@ noinstr void rcu_irq_work_resched(void) } instrumentation_end(); } -#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */ +#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) */ #ifdef CONFIG_PROVE_RCU /** diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 727b542074e7..ce843db53831 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -87,7 +87,7 @@ config HAVE_KVM_VCPU_RUN_PID_CHANGE config HAVE_KVM_NO_POLL bool -config KVM_XFER_TO_GUEST_WORK +config VIRT_XFER_TO_GUEST_WORK bool config HAVE_KVM_PM_NOTIFIER -- cgit v1.2.3 From 94b04355e6397a0a70b69c2571fa5c7d9990b835 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 15 Sep 2025 16:46:03 -0700 Subject: Drivers: hv: Add CONFIG_HYPERV_VMBUS option At present VMBus driver is hinged off of CONFIG_HYPERV which entails lot of builtin code and encompasses too much. It's not always clear what depends on builtin hv code and what depends on VMBus. Setting CONFIG_HYPERV as a module and fudging the Makefile to switch to builtin adds even more confusion. VMBus is an independent module and should have its own config option. Also, there are scenarios like baremetal dom0/root where support is built in with CONFIG_HYPERV but without VMBus. Lastly, there are more features coming down that use CONFIG_HYPERV and add more dependencies on it. So, create a fine grained HYPERV_VMBUS option and update Kconfigs for dependency on VMBus. Signed-off-by: Mukesh Rathor Acked-by: Bjorn Helgaas # drivers/pci Signed-off-by: Wei Liu --- drivers/gpu/drm/Kconfig | 2 +- drivers/hid/Kconfig | 2 +- drivers/hv/Kconfig | 11 +++++++++-- drivers/hv/Makefile | 2 +- drivers/input/serio/Kconfig | 4 ++-- drivers/net/hyperv/Kconfig | 2 +- drivers/pci/Kconfig | 2 +- drivers/scsi/Kconfig | 2 +- drivers/uio/Kconfig | 2 +- drivers/video/fbdev/Kconfig | 2 +- include/asm-generic/mshyperv.h | 8 +++++--- net/vmw_vsock/Kconfig | 2 +- 12 files changed, 25 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index f7ea8e895c0c..58f34da061c6 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -398,7 +398,7 @@ source "drivers/gpu/drm/imagination/Kconfig" config DRM_HYPERV tristate "DRM Support for Hyper-V synthetic video device" - depends on DRM && PCI && HYPERV + depends on DRM && PCI && HYPERV_VMBUS select DRM_CLIENT_SELECTION select DRM_KMS_HELPER select DRM_GEM_SHMEM_HELPER diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index a57901203aeb..fe3dc8c0db99 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -1162,7 +1162,7 @@ config GREENASIA_FF config HID_HYPERV_MOUSE tristate "Microsoft Hyper-V mouse driver" - depends on HYPERV + depends on HYPERV_VMBUS help Select this option to enable the Hyper-V mouse driver. diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index e24f6299c376..29f8637f441a 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -45,18 +45,25 @@ config HYPERV_TIMER config HYPERV_UTILS tristate "Microsoft Hyper-V Utilities driver" - depends on HYPERV && CONNECTOR && NLS + depends on HYPERV_VMBUS && CONNECTOR && NLS depends on PTP_1588_CLOCK_OPTIONAL help Select this option to enable the Hyper-V Utilities. config HYPERV_BALLOON tristate "Microsoft Hyper-V Balloon driver" - depends on HYPERV + depends on HYPERV_VMBUS select PAGE_REPORTING help Select this option to enable Hyper-V Balloon driver. +config HYPERV_VMBUS + tristate "Microsoft Hyper-V VMBus driver" + depends on HYPERV + default HYPERV + help + Select this option to enable Hyper-V Vmbus driver. + config MSHV_ROOT tristate "Microsoft Hyper-V root partition support" depends on HYPERV && (X86_64 || ARM64) diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index 976189c725dc..4bb41663767d 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HYPERV) += hv_vmbus.o +obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o obj-$(CONFIG_MSHV_ROOT) += mshv_root.o diff --git a/drivers/input/serio/Kconfig b/drivers/input/serio/Kconfig index 17edc1597446..c7ef347a4dff 100644 --- a/drivers/input/serio/Kconfig +++ b/drivers/input/serio/Kconfig @@ -276,8 +276,8 @@ config SERIO_OLPC_APSP config HYPERV_KEYBOARD tristate "Microsoft Synthetic Keyboard driver" - depends on HYPERV - default HYPERV + depends on HYPERV_VMBUS + default HYPERV_VMBUS help Select this option to enable the Hyper-V Keyboard driver. diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig index c8cbd85adcf9..982964c1a9fb 100644 --- a/drivers/net/hyperv/Kconfig +++ b/drivers/net/hyperv/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config HYPERV_NET tristate "Microsoft Hyper-V virtual network driver" - depends on HYPERV + depends on HYPERV_VMBUS select UCS2_STRING select NLS help diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 9a249c65aedc..7065a8e5f9b1 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -221,7 +221,7 @@ config PCI_LABEL config PCI_HYPERV tristate "Hyper-V PCI Frontend" - depends on ((X86 && X86_64) || ARM64) && HYPERV && PCI_MSI && SYSFS + depends on ((X86 && X86_64) || ARM64) && HYPERV_VMBUS && PCI_MSI && SYSFS select PCI_HYPERV_INTERFACE select IRQ_MSI_LIB help diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 5522310bab8d..19d0884479a2 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -589,7 +589,7 @@ config XEN_SCSI_FRONTEND config HYPERV_STORAGE tristate "Microsoft Hyper-V virtual storage driver" - depends on SCSI && HYPERV + depends on SCSI && HYPERV_VMBUS depends on m || SCSI_FC_ATTRS != m default HYPERV help diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig index b060dcd7c635..6f86a61231e6 100644 --- a/drivers/uio/Kconfig +++ b/drivers/uio/Kconfig @@ -140,7 +140,7 @@ config UIO_MF624 config UIO_HV_GENERIC tristate "Generic driver for Hyper-V VMBus" - depends on HYPERV + depends on HYPERV_VMBUS help Generic driver that you can bind, dynamically, to any Hyper-V VMBus device. It is useful to provide direct access diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index c21484d15f0c..72c63eaeb983 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -1774,7 +1774,7 @@ config FB_BROADSHEET config FB_HYPERV tristate "Microsoft Hyper-V Synthetic Video support" - depends on FB && HYPERV + depends on FB && HYPERV_VMBUS select DMA_CMA if HAVE_DMA_CONTIGUOUS && CMA select FB_IOMEM_HELPERS_DEFERRED help diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index dbd4c2f3aee3..64ba6bc807d9 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -163,6 +163,7 @@ static inline u64 hv_generate_guest_id(u64 kernel_version) return guest_id; } +#if IS_ENABLED(CONFIG_HYPERV_VMBUS) /* Free the message slot and signal end-of-message if required */ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) { @@ -198,6 +199,10 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) } } +extern int vmbus_interrupt; +extern int vmbus_irq; +#endif /* CONFIG_HYPERV_VMBUS */ + int hv_get_hypervisor_version(union hv_hypervisor_version_info *info); void hv_setup_vmbus_handler(void (*handler)(void)); @@ -211,9 +216,6 @@ void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); void hv_remove_crash_handler(void); void hv_setup_mshv_handler(void (*handler)(void)); -extern int vmbus_interrupt; -extern int vmbus_irq; - #if IS_ENABLED(CONFIG_HYPERV) /* * Hypervisor's notion of virtual processor ID is different from diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index 56356d2980c8..8e803c4828c4 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -72,7 +72,7 @@ config VIRTIO_VSOCKETS_COMMON config HYPERV_VSOCKETS tristate "Hyper-V transport for Virtual Sockets" - depends on VSOCKETS && HYPERV + depends on VSOCKETS && HYPERV_VMBUS help This module implements a Hyper-V transport for Virtual Sockets. -- cgit v1.2.3 From 1a98f5699bd57c9b3f66ec54cc38571d5e42ffb1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Sep 2025 15:45:06 +0200 Subject: Revert "Documentation: net: add flow control guide and document ethtool API" This reverts commit 7bd80ed89d72285515db673803b021469ba71ee8. I should not have merged it to begin with due to pending review and changes to be addressed. Link: https://patch.msgid.link/c6f3af12df9b7998920a02027fc8893ce82afc4c.1759239721.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 -- Documentation/networking/flow_control.rst | 373 ------------------------- Documentation/networking/index.rst | 1 - Documentation/networking/phy.rst | 12 +- include/linux/ethtool.h | 45 +-- include/uapi/linux/ethtool_netlink_generated.h | 4 +- net/dcb/dcbnl.c | 2 - net/ethtool/pause.c | 4 - 8 files changed, 15 insertions(+), 453 deletions(-) delete mode 100644 Documentation/networking/flow_control.rst (limited to 'include') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index e4852505294f..6a0fb1974513 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -864,9 +864,7 @@ attribute-sets: - name: pause-stat - doc: Statistics counters for link-wide PAUSE frames (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-stat-cnt - enum-name: ethtool-a-pause-stat attributes: - name: unspec @@ -877,17 +875,13 @@ attribute-sets: type: pad - name: tx-frames - doc: Number of PAUSE frames transmitted. type: u64 - name: rx-frames - doc: Number of PAUSE frames received. type: u64 - name: pause - doc: Parameters for link-wide PAUSE (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-cnt - enum-name: ethtool-a-pause attributes: - name: unspec @@ -899,40 +893,19 @@ attribute-sets: nested-attributes: header - name: autoneg - doc: | - Acts as a mode selector for the driver. - On GET: indicates the driver's behavior. If true, the driver will - respect the negotiated outcome; if false, the driver will use a - forced configuration. - On SET: if true, the driver configures the PHY's advertisement based - on the rx and tx attributes. If false, the driver forces the MAC - into the state defined by the rx and tx attributes. type: u8 - name: rx - doc: | - Enable receiving PAUSE frames (pausing local TX). - On GET: reflects the currently preferred configuration state. type: u8 - name: tx - doc: | - Enable transmitting PAUSE frames (pausing peer TX). - On GET: reflects the currently preferred configuration state. type: u8 - name: stats - doc: | - Contains the pause statistics counters. The source of these - statistics is determined by stats-src. type: nest nested-attributes: pause-stat - name: stats-src - doc: | - Selects the source of the MAC statistics, values from - enum ethtool_mac_stats_src. This allows requesting statistics - from the individual components of the MAC Merge layer. type: u32 - name: eee diff --git a/Documentation/networking/flow_control.rst b/Documentation/networking/flow_control.rst deleted file mode 100644 index 48646d54513f..000000000000 --- a/Documentation/networking/flow_control.rst +++ /dev/null @@ -1,373 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _ethernet-flow-control: - -===================== -Ethernet Flow Control -===================== - -This document is a practical guide to Ethernet Flow Control in Linux, covering -what it is, how it works, and how to configure it. - -What is Flow Control? -===================== - -Flow control is a mechanism to prevent a fast sender from overwhelming a -slow receiver with data, which would cause buffer overruns and dropped packets. -The receiver can signal the sender to temporarily stop transmitting, giving it -time to process its backlog. - -Standards references -==================== - -Ethernet flow control mechanisms are specified across consolidated IEEE base -standards; some originated as amendments: - -- Collision-based flow control is part of CSMA/CD in **IEEE 802.3** - (half-duplex). -- Link-wide PAUSE is defined in **IEEE 802.3 Annex 31B** - (originally **802.3x**). -- Priority-based Flow Control (PFC) is defined in **IEEE 802.1Q Clause 36** - (originally **802.1Qbb**). - -In the remainder of this document, the consolidated clause numbers are used. - -How It Works: The Mechanisms -============================ - -The method used for flow control depends on the link's duplex mode. - -.. note:: - The user-visible ``ethtool`` pause API described in this document controls - **link-wide PAUSE** (IEEE 802.3 Annex 31B) only. It does not control the - collision-based behavior that exists on half-duplex links. - -1. Half-Duplex: Collision-Based Flow Control --------------------------------------------- -On half-duplex links, a device cannot send and receive simultaneously, so PAUSE -frames are not used. Flow control is achieved by leveraging the CSMA/CD -(Carrier Sense Multiple Access with Collision Detection) protocol itself. - -* **How it works**: To inhibit incoming data, a receiving device can force a - collision on the line. When the sending station detects this collision, it - terminates its transmission, sends a "jam" signal, and then executes the - "Collision backoff and retransmission" procedure as defined in IEEE 802.3, - Section 4.2.3.2.5. This algorithm makes the sender wait for a random - period before attempting to retransmit. By repeatedly forcing collisions, - the receiver can effectively throttle the sender's transmission rate. - -.. note:: - While this mechanism is part of the IEEE standard, there is currently no - generic kernel API to configure or control it. Drivers should not enable - this feature until a standardized interface is available. - -.. warning:: - On shared-medium networks (e.g. 10BASE2, or twisted-pair networks using a - hub rather than a switch) forcing collisions inhibits traffic **across the - entire shared segment**, not just a single point-to-point link. Enabling - such behavior is generally undesirable. - -2. Full-Duplex: Link-wide PAUSE (IEEE 802.3 Annex 31B) ------------------------------------------------------- -On full-duplex links, devices can send and receive at the same time. Flow -control is achieved by sending a special **PAUSE frame**, defined by IEEE -802.3 Annex 31B. This mechanism pauses all traffic on the link and is therefore -called *link-wide PAUSE*. - -* **What it is**: A standard Ethernet frame with a globally reserved - destination MAC address (``01-80-C2-00-00-01``). This address is in a range - that standard IEEE 802.1D-compliant bridges do not forward. However, some - unmanaged or misconfigured bridges have been reported to forward these - frames, which can disrupt flow control across a network. - -* **How it works**: The frame contains a MAC Control opcode for PAUSE - (``0x0001``) and a ``pause_time`` value, telling the sender how long to - wait before sending more data frames. This time is specified in units of - "pause quantum", where one quantum is the time it takes to transmit 512 bits. - For example, one pause quantum is 51.2 microseconds on a 10 Mbit/s link, - and 512 nanoseconds on a 1 Gbit/s link. A ``pause_time`` of zero indicates - that the transmitter can resume transmission, even if a previous non-zero - pause time has not yet elapsed. - -* **Who uses it**: Any full-duplex link, from 10 Mbit/s to multi-gigabit speeds. - -3. Full-Duplex: Priority-based Flow Control (PFC) (IEEE 802.1Q Clause 36) -------------------------------------------------------------------------- -Priority-based Flow Control is an enhancement to the standard PAUSE mechanism -that allows flow control to be applied independently to different classes of -traffic, identified by their priority level. - -* **What it is**: PFC allows a receiver to pause traffic for one or more of the - 8 standard priority levels without stopping traffic for other priorities. - This is critical in data center environments for protocols that cannot - tolerate packet loss due to congestion (e.g., Fibre Channel over Ethernet - or RoCE). - -* **How it works**: PFC uses a specific PAUSE frame format. It shares the same - globally reserved destination MAC address (``01-80-C2-00-00-01``) as legacy - PAUSE frames but uses a unique opcode (``0x0101``). The frame payload - contains two key fields: - - - **``priority_enable_vector``**: An 8-bit mask where each bit corresponds to - one of the 8 priorities. If a bit is set to 1, it means the pause time - for that priority is active. - - **``time_vector``**: A list of eight 2-octet fields, one for each priority. - Each field specifies the ``pause_time`` for its corresponding priority, - measured in units of ``pause_quanta`` (the time to transmit 512 bits). - -.. note:: - When PFC is enabled for at least one priority on a port, the standard - **link-wide PAUSE** (IEEE 802.3 Annex 31B) must be disabled for that port. - The two mechanisms are mutually exclusive (IEEE 802.1Q Clause 36). - -Configuring Flow Control -======================== - -Link-wide PAUSE and Priority-based Flow Control are configured with different -tools. - -Configuring Link-wide PAUSE with ``ethtool`` (IEEE 802.3 Annex 31B) -------------------------------------------------------------------- -Use ``ethtool -a `` to view and ``ethtool -A `` to change -the link-wide PAUSE settings. - -.. code-block:: bash - - # View current link-wide PAUSE settings - ethtool -a eth0 - - # Enable RX and TX pause, with autonegotiation - ethtool -A eth0 autoneg on rx on tx on - -**Key Configuration Concepts**: - -* **Pause Autoneg vs Generic Autoneg**: ``ethtool -A ... autoneg {on,off}`` - controls **Pause Autoneg** (Annex 31B) only. It is independent from the - **Generic link autonegotiation** configured with ``ethtool -s``. A device can - have Generic autoneg **on** while Pause Autoneg is **off**, and vice versa. - -* **If Pause Autoneg is off** (``-A ... autoneg off``): the device will **not** - advertise pause in the PHY. The MAC PAUSE state is **forced** according to - ``rx``/``tx`` and does not depend on partner capabilities or resolution. - Ensure the peer is configured complementarily for PAUSE to be effective. - -* **If generic autoneg is off** but **Pause Autoneg is on**, the pause policy - is **remembered** by the kernel and applied later when Generic autoneg is - enabled again. - -* **Autonegotiation Mode**: The PHY will *advertise* the ``rx`` and ``tx`` - capabilities. The final active state is determined by what both sides of the - link agree on. See the "PHY (Physical Layer Transceiver)" section below, - especially the *Resolution* subsection, for details of the negotiation rules. - -* **Forced Mode**: This mode is necessary when autonegotiation is not used or - not possible. This includes links where one or both partners have - autonegotiation disabled, or in setups without a PHY (e.g., direct - MAC-to-MAC connections). The driver bypasses PHY advertisement and - directly forces the MAC into the specified ``rx``/``tx`` state. The - configuration on both sides of the link must be complementary. For - example, if one side is set to ``tx on`` ``rx off``, the link partner must be - set to ``tx off`` ``rx on`` for flow control to function correctly. - -Configuring PFC with ``dcb`` (IEEE 802.1Q Clause 36) ----------------------------------------------------- -PFC is part of the Data Center Bridging (DCB) subsystem and is managed with the -``dcb`` tool (iproute2). Some deployments use ``dcbtool`` (lldpad) instead; this -document shows ``dcb(8)`` examples. - -**Viewing PFC Settings**: - -.. code-block:: text - - $ dcb pfc show dev eth0 - pfc-cap 8 macsec-bypass off delay 4096 - prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on - -This shows the PFC state (on/off) for each priority (0-7). - -**Changing PFC Settings**: - -.. code-block:: bash - - # Enable PFC on priorities 6 and 7, leaving others as they are - $ dcb pfc set dev eth0 prio-pfc 6:on 7:on - - # Disable PFC for all priorities except 6 and 7 - $ dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on - -Monitoring Flow Control -======================= - -The standard way to check if flow control is actively being used is to view the -pause-related statistics. - -**Monitoring Link-wide PAUSE**: -Use ``ethtool --include-statistics -a ``. - -.. code-block:: text - - $ ethtool --include-statistics -a eth0 - Pause parameters for eth0: - ... - Statistics: - tx_pause_frames: 0 - rx_pause_frames: 0 - -**Monitoring PFC**: -PFC statistics (sent and received frames per priority) are available -through the ``dcb`` tool. - -.. code-block:: text - - $ dcb pfc show dev eth0 requests indications - requests 0:0 1:0 2:0 3:1024 4:2048 5:0 6:0 7:0 - indications 0:0 1:0 2:0 3:512 4:4096 5:0 6:0 7:0 - -The ``requests`` counters track transmitted PFC frames (TX), and the -``indications`` counters track received PFC frames (RX). - -Link-wide PAUSE Autonegotiation Details -======================================= - -The autonegotiation process for link-wide PAUSE is managed by the PHY and -involves advertising capabilities and resolving the outcome. - -* Terminology (link-wide PAUSE): - - - **Symmetric pause**: both directions are paused when requested (TX+RX - enabled). - - **Asymmetric pause**: only one direction is paused (e.g., RX-only or - TX-only). - - In IEEE 802.3 advertisement/resolution, symmetric/asymmetric are encoded - using two bits (Pause/Asym) and resolved per the standard truth tables - below. - -* **Advertisement**: The PHY advertises the MAC's flow control capabilities. - This is done using two bits in the advertisement register: "Symmetric - Pause" (Pause) and "Asymmetric Pause" (Asym). These bits should be - interpreted as a combined value, not as independent flags. The kernel - converts the user's ``rx`` and ``tx`` settings into this two-bit value as - follows: - - .. code-block:: text - - tx rx | Pause Asym - -------+------------- - 0 0 | 0 0 - 0 1 | 1 1 - 1 0 | 0 1 - 1 1 | 1 0 - -* **Resolution**: After negotiation, the PHY reports the link partner's - advertised Pause and Asym bits. The final flow control mode is determined - by the combination of the local and partner advertisements, according to - the IEEE 802.3 standard: - - .. code-block:: text - - Local Device | Link Partner | Result - Pause Asym | Pause Asym | - -------------------+--------------------+--------- - 0 X | 0 X | Disabled - 0 1 | 1 0 | Disabled - 0 1 | 1 1 | TX only - 1 0 | 0 X | Disabled - 1 X | 1 X | TX + RX - 1 1 | 0 1 | RX only - - It is important to note that the advertised bits reflect the *current - configuration* of the MAC, which may not represent its full hardware - capabilities. - -Kernel Policy: "Set and Trust" -============================== - -The ethtool pause API is defined as a **wish policy** for -IEEE 802.3 link-wide PAUSE only. A user request is always accepted -as the preferred configuration, but it may not be possible to apply -it in all link states. - -Key constraints: - -- Link-wide PAUSE is not valid on half-duplex links. -- Link-wide PAUSE cannot be used together with Priority-based Flow Control - (PFC, IEEE 802.1Q Clause 36). -- If autonegotiation is active and the link is currently down, the future - mode is not yet known. - -Because of these constraints, the kernel stores the requested setting -and applies it only when the link is in a compatible state. - -Implications for userspace: - -1. Set once (the "wish"): the requested Rx/Tx PAUSE policy is - remembered even if it cannot be applied immediately. -2. Applied conditionally: when the link comes up, the kernel enables - PAUSE only if the active mode allows it. - -Component Roles in Flow Control -=============================== - -The configuration of flow control involves several components, each with a -distinct role. - -The MAC (Media Access Controller) ---------------------------------- -The MAC is the hardware component that actually sends and receives PAUSE -frames. Its capabilities define the upper limit of what the driver can support. -For link-wide PAUSE, MACs can vary in their support for symmetric (both -directions) or asymmetric (independent TX/RX) flow control. - -For PFC, the MAC must be capable of generating and interpreting the -priority-based PAUSE frames and managing separate pause states for each -traffic class. - -Many MACs also implement automatic PAUSE frame transmission based on the fill -level of their internal RX FIFO. This is typically configured with two -thresholds: - -* **FLOW_ON (High Water Mark)**: When the RX FIFO usage reaches this - threshold, the MAC automatically transmits a PAUSE frame to stop the sender. - -* **FLOW_OFF (Low Water Mark)**: When the RX FIFO usage drops below this - threshold, the MAC transmits a PAUSE frame with a quantum of zero to tell - the sender it can resume transmission. - -The PHY (Physical Layer Transceiver) ------------------------------------- -The PHY's role is distinct for each flow control mechanism: - -* **Link-wide PAUSE**: During the autonegotiation process, the PHY is - responsible for advertising the device's flow control capabilities. See the - "Link-wide PAUSE Autonegotiation Details" section for more information. - -* **Half-Duplex Collision-Based Flow Control**: The PHY is fundamental to the - CSMA/CD process. It performs carrier sensing (checking if the line is idle) - and collision detection, which is the mechanism leveraged to throttle the - sender. - -* **Priority-based Flow Control (PFC)**: The PHY is not directly involved in - negotiating PFC capabilities. Its role is to establish the physical link. - PFC negotiation happens at a higher layer via the Data Center Bridging - Capability Exchange Protocol (DCBX). - -User Space Interface -==================== -The primary user space tools are ``ethtool`` for link-wide PAUSE and ``dcb`` for -PFC. They communicate with the kernel to configure the network device driver -and underlying hardware. - -**Link-wide PAUSE Netlink Interface (``ethtool``)** - -See the ethtool Netlink spec (``Documentation/netlink/specs/ethtool.yaml``) -for the authoritative definition of the Pause control and Pause statistics -attributes. The generated UAPI is in -``include/uapi/linux/ethtool_netlink_generated.h``. - -**PFC Netlink Interface (``dcb``)** - -The authoritative definitions for DCB/PFC netlink attributes and commands are in -``include/uapi/linux/dcbnl.h``. See also the ``dcb(8)`` manual page and the DCB -subsystem documentation for userspace configuration details. - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 52aafdc85f6a..c775cababc8c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,7 +55,6 @@ Contents: eql fib_trie filter - flow_control generic-hdlc generic_netlink ../netlink/specs/index diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 40cc0a988d60..b0f2ef83735d 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -343,8 +343,16 @@ Some of the interface modes are described below: Pause frames / flow control =========================== -For detailed link-wide PAUSE and PFC behavior and configuration, see -flow_control.rst. +The PHY does not participate directly in flow control/pause frames except by +making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in +MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC +controller supports such a thing. Since flow control/pause frames generation +involves the Ethernet MAC driver, it is recommended that this driver takes care +of properly indicating advertisement and support for such features by setting +the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done +either before or after phy_connect() and/or as a result of implementing the +ethtool::set_pauseparam feature. + Keeping Close Tabs on the PAL ============================= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index eeed1ea50369..c2d8b4ec62eb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -953,48 +953,9 @@ struct kernel_ethtool_ts_info { * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. - * - * @get_pauseparam: Report the configured policy for link-wide PAUSE - * (IEEE 802.3 Annex 31B). Drivers must fill struct ethtool_pauseparam - * such that: - * @autoneg: - * This refers to **Pause Autoneg** (IEEE 802.3 Annex 31B) only - * and is independent of generic link autonegotiation configured - * via ethtool -s. - * true -> the device follows the negotiated result of pause - * autonegotiation (Pause/Asym); - * false -> the device uses a forced MAC state independent of - * negotiation. - * @rx_pause/@tx_pause: - * represent the desired policy (preferred configuration). - * In autoneg mode they describe what is to be advertised; - * in forced mode they describe the MAC state to apply. - * - * Drivers (and/or frameworks) should persist this policy across link - * changes and reapply appropriate MAC programming when link parameters - * change. - * - * @set_pauseparam: Apply a policy for link-wide PAUSE (IEEE 802.3 Annex 31B). - * If @autoneg is true: - * Arrange for pause advertisement (Pause/Asym) based on - * @rx_pause/@tx_pause and program the MAC to follow the - * negotiated result (which may be symmetric, asymmetric, or off - * depending on the link partner). - * If @autoneg is false: - * Do not rely on autonegotiation; force the MAC RX/TX pause - * state directly per @rx_pause/@tx_pause. - * - * Implementations that integrate with PHYLIB/PHYLINK should cooperate - * with those frameworks for advertisement and resolution; MAC drivers are - * still responsible for applying the required MAC state. - * - * Return: 0 on success or a negative errno. Return -EOPNOTSUPP if - * link-wide PAUSE is unsupported. If only symmetric pause is supported, - * reject unsupported asymmetric requests with -EINVAL (or document any - * coercion policy). - * - * See also: Documentation/networking/flow_control.rst - * + * @get_pauseparam: Report pause parameters + * @set_pauseparam: Set pause parameters. Returns a negative error code + * or zero. * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. by flashing an LED diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 3dd9d7cde86e..0e8ac0d974e2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -375,7 +375,7 @@ enum { ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -enum ethtool_a_pause_stat { +enum { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, @@ -385,7 +385,7 @@ enum ethtool_a_pause_stat { ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; -enum ethtool_a_pause { +enum { ETHTOOL_A_PAUSE_UNSPEC, ETHTOOL_A_PAUSE_HEADER, ETHTOOL_A_PAUSE_AUTONEG, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 91ee22f53774..03eb1d941fca 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -27,8 +27,6 @@ * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. - * See Documentation/networking/flow_control.rst for a high level description - * of the user space interface for Priority-based Flow Control (PFC). * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index eacf6a4859bf..0f9af1e66548 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* See Documentation/networking/flow_control.rst for a high level description of - * the userspace interface. - */ - #include "netlink.h" #include "common.h" -- cgit v1.2.3 From 5e1c88679174e4bfe5d152060b06d370bd85de80 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 4 Aug 2025 15:07:23 +0200 Subject: mfd: qnap-mcu: Include linux/types.h in qnap-mcu.h shared header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Relying on other components to include those basic types is unreliable and may cause compile errors like: ../include/linux/mfd/qnap-mcu.h:13:9: error: unknown type name ‘u32’ 13 | u32 baud_rate; | ^~~ ../include/linux/mfd/qnap-mcu.h:17:9: error: unknown type name ‘bool’ 17 | bool usb_led; | ^~~~ So make sure, the types used in the header are available. Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250804130726.3180806-2-heiko@sntech.de Signed-off-by: Lee Jones --- include/linux/mfd/qnap-mcu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mfd/qnap-mcu.h b/include/linux/mfd/qnap-mcu.h index 8d48c212fd44..42bf523f9a5b 100644 --- a/include/linux/mfd/qnap-mcu.h +++ b/include/linux/mfd/qnap-mcu.h @@ -7,6 +7,8 @@ #ifndef _LINUX_QNAP_MCU_H_ #define _LINUX_QNAP_MCU_H_ +#include + struct qnap_mcu; struct qnap_mcu_variant { -- cgit v1.2.3 From 9c5ad8374b1fe0c8c9eaabc1146d96a543858c4a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Aug 2025 17:17:56 +0200 Subject: mfd: arizona: Make legacy gpiolib interface optional The only machine that still uses the old gpio number based interface is the wlf_cragg_6410 board file. In order to remove the dependency on the interfaces, add #ifdef blocks here. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250808151822.536879-13-arnd@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/arizona-irq.c | 5 ++++- include/linux/mfd/arizona/pdata.h | 6 ++++++ sound/soc/codecs/arizona-jack.c | 17 ++++++++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c index 3f8622ee0e59..544016d420fe 100644 --- a/drivers/mfd/arizona-irq.c +++ b/drivers/mfd/arizona-irq.c @@ -136,7 +136,7 @@ static irqreturn_t arizona_irq_thread(int irq, void *data) dev_err(arizona->dev, "Failed to read main IRQ status: %d\n", ret); } - +#ifdef CONFIG_GPIOLIB_LEGACY /* * Poll the IRQ pin status to see if we're really done * if the interrupt controller can't do it for us. @@ -150,6 +150,7 @@ static irqreturn_t arizona_irq_thread(int irq, void *data) !gpio_get_value_cansleep(arizona->pdata.irq_gpio)) { poll = true; } +#endif } while (poll); pm_runtime_put_autosuspend(arizona->dev); @@ -349,6 +350,7 @@ int arizona_irq_init(struct arizona *arizona) goto err_map_main_irq; } +#ifdef CONFIG_GPIOLIB_LEGACY /* Used to emulate edge trigger and to work around broken pinmux */ if (arizona->pdata.irq_gpio) { if (gpio_to_irq(arizona->pdata.irq_gpio) != arizona->irq) { @@ -368,6 +370,7 @@ int arizona_irq_init(struct arizona *arizona) arizona->pdata.irq_gpio = 0; } } +#endif ret = request_threaded_irq(arizona->irq, NULL, arizona_irq_thread, flags, "arizona", arizona); diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 2d13bbea4f3a..f72e6d4b14a7 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -117,8 +117,10 @@ struct arizona_pdata { /** Check for line output with HPDET method */ bool hpdet_acc_id_line; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO used for mic isolation with HPDET */ int hpdet_id_gpio; +#endif /** Channel to use for headphone detection */ unsigned int hpdet_channel; @@ -129,8 +131,10 @@ struct arizona_pdata { /** Extra debounce timeout used during initial mic detection (ms) */ unsigned int micd_detect_debounce; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO for mic detection polarity */ int micd_pol_gpio; +#endif /** Mic detect ramp rate */ unsigned int micd_bias_start_time; @@ -184,8 +188,10 @@ struct arizona_pdata { /** Haptic actuator type */ unsigned int hap_act; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO for primary IRQ (used for edge triggered emulation) */ int irq_gpio; +#endif /** General purpose switch control */ unsigned int gpsw; diff --git a/sound/soc/codecs/arizona-jack.c b/sound/soc/codecs/arizona-jack.c index 22f9c431a0e5..6b55610ad535 100644 --- a/sound/soc/codecs/arizona-jack.c +++ b/sound/soc/codecs/arizona-jack.c @@ -461,7 +461,11 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, bool *mic) { struct arizona *arizona = info->arizona; +#ifdef CONFIG_GPIOLIB_LEGACY int id_gpio = arizona->pdata.hpdet_id_gpio; +#else + int id_gpio = 0; +#endif if (!arizona->pdata.hpdet_acc_id) return 0; @@ -472,6 +476,7 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, */ info->hpdet_res[info->num_hpdet_res++] = *reading; +#ifdef CONFIG_GPIOLIB_LEGACY /* Only check the mic directly if we didn't already ID it */ if (id_gpio && info->num_hpdet_res == 1) { dev_dbg(arizona->dev, "Measuring mic\n"); @@ -489,6 +494,7 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, ARIZONA_HP_POLL, ARIZONA_HP_POLL); return -EAGAIN; } +#endif /* OK, got both. Now, compare... */ dev_dbg(arizona->dev, "HPDET measured %d %d\n", @@ -529,7 +535,9 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data) { struct arizona_priv *info = data; struct arizona *arizona = info->arizona; +#ifdef CONFIG_GPIOLIB_LEGACY int id_gpio = arizona->pdata.hpdet_id_gpio; +#endif int ret, reading, state, report; bool mic = false; @@ -585,8 +593,10 @@ done: arizona_extcon_hp_clamp(info, false); +#ifdef CONFIG_GPIOLIB_LEGACY if (id_gpio) gpio_set_value_cansleep(id_gpio, 0); +#endif /* If we have a mic then reenable MICDET */ if (state && (mic || info->mic)) @@ -1317,6 +1327,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) regmap_update_bits(arizona->regmap, ARIZONA_GP_SWITCH_1, ARIZONA_SW1_MODE_MASK, arizona->pdata.gpsw); +#ifdef CONFIG_GPIOLIB_LEGACY if (pdata->micd_pol_gpio > 0) { if (info->micd_modes[0].gpio) mode = GPIOF_OUT_INIT_HIGH; @@ -1332,7 +1343,9 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) } info->micd_pol_gpio = gpio_to_desc(pdata->micd_pol_gpio); - } else { + } else +#endif + { if (info->micd_modes[0].gpio) mode = GPIOD_OUT_HIGH; else @@ -1353,6 +1366,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) } } +#ifdef CONFIG_GPIOLIB_LEGACY if (arizona->pdata.hpdet_id_gpio > 0) { ret = devm_gpio_request_one(dev, arizona->pdata.hpdet_id_gpio, GPIOF_OUT_INIT_LOW, @@ -1364,6 +1378,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) return ret; } } +#endif return 0; } -- cgit v1.2.3 From 719d02a25a24601c6fb8a7b1627e1abf015b6c2a Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 21 Aug 2025 20:23:34 +0200 Subject: mfd: bd71828, bd71815: Prepare for power-supply support Add core support for ROHM BD718(15/28/78) PMIC's charger blocks. Signed-off-by: Matti Vaittinen Signed-off-by: Andreas Kemnade Link: https://lore.kernel.org/r/20250821-bd71828-charger-v3-1-cc74ac4e0fb9@kemnade.info Signed-off-by: Lee Jones --- drivers/mfd/rohm-bd71828.c | 44 ++++++++++++++++++++++------ include/linux/mfd/rohm-bd71828.h | 63 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/mfd/rohm-bd71828.c b/drivers/mfd/rohm-bd71828.c index a14b7aa69c3c..84a64c3b9c9f 100644 --- a/drivers/mfd/rohm-bd71828.c +++ b/drivers/mfd/rohm-bd71828.c @@ -45,8 +45,8 @@ static const struct resource bd71828_rtc_irqs[] = { static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_RMV, "bd71815-dcin-rmv"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_OUT, "bd71815-clps-out"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_IN, "bd71815-clps-in"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_OUT, "bd71815-dcin-clps-out"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_IN, "bd71815-dcin-clps-in"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_OVP_RES, "bd71815-dcin-ovp-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_OVP_DET, "bd71815-dcin-ovp-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_MON_RES, "bd71815-dcin-mon-res"), @@ -56,7 +56,7 @@ static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_LOW_RES, "bd71815-vsys-low-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_LOW_DET, "bd71815-vsys-low-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_RES, "bd71815-vsys-mon-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_RES, "bd71815-vsys-mon-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_DET, "bd71815-vsys-mon-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_WDG_TEMP, "bd71815-chg-wdg-temp"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_WDG_TIME, "bd71815-chg-wdg"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_RECHARGE_RES, "bd71815-rechg-res"), @@ -87,10 +87,10 @@ static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_2_DET, "bd71815-bat-oc2-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_3_RES, "bd71815-bat-oc3-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_3_DET, "bd71815-bat-oc3-det"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_RES, "bd71815-bat-low-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_DET, "bd71815-bat-low-det"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_RES, "bd71815-bat-hi-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_DET, "bd71815-bat-hi-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_RES, "bd71815-temp-bat-low-res"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_DET, "bd71815-temp-bat-low-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_RES, "bd71815-temp-bat-hi-res"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_DET, "bd71815-temp-bat-hi-det"), }; static const struct mfd_cell bd71815_mfd_cells[] = { @@ -109,7 +109,30 @@ static const struct mfd_cell bd71815_mfd_cells[] = { }, }; -static const struct mfd_cell bd71828_mfd_cells[] = { +static const struct resource bd71828_power_irqs[] = { + DEFINE_RES_IRQ_NAMED(BD71828_INT_CHG_TOPOFF_TO_DONE, + "bd71828-chg-done"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_DCIN_DET, "bd71828-pwr-dcin-in"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_DCIN_RMV, "bd71828-pwr-dcin-out"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_BAT_LOW_VOLT_RES, + "bd71828-vbat-normal"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_BAT_LOW_VOLT_DET, "bd71828-vbat-low"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_HI_DET, "bd71828-btemp-hi"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_HI_RES, "bd71828-btemp-cool"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_LOW_DET, "bd71828-btemp-lo"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_LOW_RES, + "bd71828-btemp-warm"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_VF_DET, + "bd71828-temp-hi"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_VF_RES, + "bd71828-temp-norm"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_125_DET, + "bd71828-temp-125-over"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_125_RES, + "bd71828-temp-125-under"), +}; + +static struct mfd_cell bd71828_mfd_cells[] = { { .name = "bd71828-pmic", }, { .name = "bd71828-gpio", }, { .name = "bd71828-led", .of_compatible = "rohm,bd71828-leds" }, @@ -118,8 +141,11 @@ static const struct mfd_cell bd71828_mfd_cells[] = { * BD70528 clock gate are the register address and mask. */ { .name = "bd71828-clk", }, - { .name = "bd71827-power", }, { + .name = "bd71828-power", + .resources = bd71828_power_irqs, + .num_resources = ARRAY_SIZE(bd71828_power_irqs), + }, { .name = "bd71828-rtc", .resources = bd71828_rtc_irqs, .num_resources = ARRAY_SIZE(bd71828_rtc_irqs), diff --git a/include/linux/mfd/rohm-bd71828.h b/include/linux/mfd/rohm-bd71828.h index ce786c96404a..73a71ef69152 100644 --- a/include/linux/mfd/rohm-bd71828.h +++ b/include/linux/mfd/rohm-bd71828.h @@ -189,6 +189,69 @@ enum { /* Charger/Battey */ #define BD71828_REG_CHG_STATE 0x65 #define BD71828_REG_CHG_FULL 0xd2 +#define BD71828_REG_CHG_EN 0x6F +#define BD71828_REG_DCIN_STAT 0x68 +#define BD71828_MASK_DCIN_DET 0x01 +#define BD71828_REG_VDCIN_U 0x9c +#define BD71828_MASK_CHG_EN 0x01 +#define BD71828_CHG_MASK_DCIN_U 0x0f +#define BD71828_REG_BAT_STAT 0x67 +#define BD71828_REG_BAT_TEMP 0x6c +#define BD71828_MASK_BAT_TEMP 0x07 +#define BD71828_BAT_TEMP_OPEN 0x07 +#define BD71828_MASK_BAT_DET 0x20 +#define BD71828_MASK_BAT_DET_DONE 0x10 +#define BD71828_REG_CHG_STATE 0x65 +#define BD71828_REG_VBAT_U 0x8c +#define BD71828_MASK_VBAT_U 0x0f +#define BD71828_REG_VBAT_REX_AVG_U 0x92 + +#define BD71828_REG_OCV_PWRON_U 0x8A + +#define BD71828_REG_VBAT_MIN_AVG_U 0x8e +#define BD71828_REG_VBAT_MIN_AVG_L 0x8f + +#define BD71828_REG_CC_CNT3 0xb5 +#define BD71828_REG_CC_CNT2 0xb6 +#define BD71828_REG_CC_CNT1 0xb7 +#define BD71828_REG_CC_CNT0 0xb8 +#define BD71828_REG_CC_CURCD_AVG_U 0xb2 +#define BD71828_MASK_CC_CURCD_AVG_U 0x3f +#define BD71828_MASK_CC_CUR_DIR 0x80 +#define BD71828_REG_VM_BTMP_U 0xa1 +#define BD71828_REG_VM_BTMP_L 0xa2 +#define BD71828_MASK_VM_BTMP_U 0x0f +#define BD71828_REG_COULOMB_CTRL 0xc4 +#define BD71828_REG_COULOMB_CTRL2 0xd2 +#define BD71828_MASK_REX_CC_CLR 0x01 +#define BD71828_MASK_FULL_CC_CLR 0x10 +#define BD71828_REG_CC_CNT_FULL3 0xbd +#define BD71828_REG_CC_CNT_CHG3 0xc1 + +#define BD71828_REG_VBAT_INITIAL1_U 0x86 +#define BD71828_REG_VBAT_INITIAL1_L 0x87 + +#define BD71828_REG_VBAT_INITIAL2_U 0x88 +#define BD71828_REG_VBAT_INITIAL2_L 0x89 + +#define BD71828_REG_IBAT_U 0xb0 +#define BD71828_REG_IBAT_L 0xb1 + +#define BD71828_REG_IBAT_AVG_U 0xb2 +#define BD71828_REG_IBAT_AVG_L 0xb3 + +#define BD71828_REG_VSYS_AVG_U 0x96 +#define BD71828_REG_VSYS_AVG_L 0x97 +#define BD71828_REG_VSYS_MIN_AVG_U 0x98 +#define BD71828_REG_VSYS_MIN_AVG_L 0x99 +#define BD71828_REG_CHG_SET1 0x75 +#define BD71828_REG_ALM_VBAT_LIMIT_U 0xaa +#define BD71828_REG_BATCAP_MON_LIMIT_U 0xcc +#define BD71828_REG_CONF 0x64 + +#define BD71828_REG_DCIN_CLPS 0x71 + +#define BD71828_REG_MEAS_CLEAR 0xaf /* LEDs */ #define BD71828_REG_LED_CTRL 0x4A -- cgit v1.2.3 From 7d096cb3e16f09d9762ae8cef897cdbc13a40029 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 21 Aug 2025 14:46:33 +0800 Subject: virtio_ring: constify virtqueue pointer for DMA helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch constifies the virtqueue pointer for DMA helpers. Reviewed-by: Christoph Hellwig Reviewed-by: Xuan Zhuo Reviewed-by: Eugenio Pérez Signed-off-by: Jason Wang Message-Id: <20250821064641.5025-2-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang Reviewed-by: Eugenio Pérez --- drivers/virtio/virtio_ring.c | 25 +++++++++++++------------ include/linux/virtio.h | 12 ++++++------ 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index f5062061c408..103bad8cffff 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -3149,12 +3149,12 @@ EXPORT_SYMBOL_GPL(virtqueue_get_vring); * * return DMA address. Caller should check that by virtqueue_dma_mapping_error(). */ -dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, +dma_addr_t virtqueue_dma_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) { kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, dir); @@ -3176,11 +3176,12 @@ EXPORT_SYMBOL_GPL(virtqueue_dma_map_single_attrs); * Unmap the address that is mapped by the virtqueue_dma_map_* APIs. * */ -void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr, +void virtqueue_dma_unmap_single_attrs(const struct virtqueue *_vq, + dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return; @@ -3196,9 +3197,9 @@ EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs); * * Returns 0 means dma valid. Other means invalid dma address. */ -int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr) +int virtqueue_dma_mapping_error(const struct virtqueue *_vq, dma_addr_t addr) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return 0; @@ -3217,9 +3218,9 @@ EXPORT_SYMBOL_GPL(virtqueue_dma_mapping_error); * * return bool */ -bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr) +bool virtqueue_dma_need_sync(const struct virtqueue *_vq, dma_addr_t addr) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return false; @@ -3240,12 +3241,12 @@ EXPORT_SYMBOL_GPL(virtqueue_dma_need_sync); * the DMA address really needs to be synchronized * */ -void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq, +void virtqueue_dma_sync_single_range_for_cpu(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); if (!vq->use_dma_api) @@ -3266,12 +3267,12 @@ EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_cpu); * Before calling this function, use virtqueue_dma_need_sync() to confirm that * the DMA address really needs to be synchronized */ -void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, +void virtqueue_dma_sync_single_range_for_device(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); if (!vq->use_dma_api) diff --git a/include/linux/virtio.h b/include/linux/virtio.h index db31fc6f4f1f..eab71a440fba 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -262,18 +262,18 @@ void unregister_virtio_driver(struct virtio_driver *drv); module_driver(__virtio_driver, register_virtio_driver, \ unregister_virtio_driver) -dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, size_t size, +dma_addr_t virtqueue_dma_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs); -void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr, +void virtqueue_dma_unmap_single_attrs(const struct virtqueue *_vq, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr); +int virtqueue_dma_mapping_error(const struct virtqueue *_vq, dma_addr_t addr); -bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr); -void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq, dma_addr_t addr, +bool virtqueue_dma_need_sync(const struct virtqueue *_vq, dma_addr_t addr); +void virtqueue_dma_sync_single_range_for_cpu(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir); -void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, dma_addr_t addr, +void virtqueue_dma_sync_single_range_for_device(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir); -- cgit v1.2.3 From b41cb3bcf67fcb7b8297e5acc5bb3309c96c2ff2 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 21 Aug 2025 14:46:35 +0800 Subject: virtio: rename dma helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following patch will introduce virtio mapping function to avoid abusing DMA API for device that doesn't do DMA. To ease the introduction, this patch rename "dma" to "map" for the current dma mapping helpers. Reviewed-by: Christoph Hellwig Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Message-Id: <20250821064641.5025-4-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang Reviewed-by: Eugenio Pérez --- drivers/net/virtio_net.c | 28 +++++------ drivers/virtio/virtio_ring.c | 114 +++++++++++++++++++++---------------------- include/linux/virtio.h | 12 ++--- 3 files changed, 77 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 975bdc5dab84..31bd32bdecaf 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -962,7 +962,7 @@ static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len) if (dma->need_sync && len) { offset = buf - (head + sizeof(*dma)); - virtqueue_dma_sync_single_range_for_cpu(rq->vq, dma->addr, + virtqueue_map_sync_single_range_for_cpu(rq->vq, dma->addr, offset, len, DMA_FROM_DEVICE); } @@ -970,8 +970,8 @@ static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len) if (dma->ref) return; - virtqueue_dma_unmap_single_attrs(rq->vq, dma->addr, dma->len, - DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); + virtqueue_unmap_single_attrs(rq->vq, dma->addr, dma->len, + DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); put_page(page); } @@ -1038,13 +1038,13 @@ static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp) dma->len = alloc_frag->size - sizeof(*dma); - addr = virtqueue_dma_map_single_attrs(rq->vq, dma + 1, - dma->len, DMA_FROM_DEVICE, 0); - if (virtqueue_dma_mapping_error(rq->vq, addr)) + addr = virtqueue_map_single_attrs(rq->vq, dma + 1, + dma->len, DMA_FROM_DEVICE, 0); + if (virtqueue_map_mapping_error(rq->vq, addr)) return NULL; dma->addr = addr; - dma->need_sync = virtqueue_dma_need_sync(rq->vq, addr); + dma->need_sync = virtqueue_map_need_sync(rq->vq, addr); /* Add a reference to dma to prevent the entire dma from * being released during error handling. This reference @@ -5952,9 +5952,9 @@ static int virtnet_xsk_pool_enable(struct net_device *dev, if (!rq->xsk_buffs) return -ENOMEM; - hdr_dma = virtqueue_dma_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, - DMA_TO_DEVICE, 0); - if (virtqueue_dma_mapping_error(sq->vq, hdr_dma)) { + hdr_dma = virtqueue_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, + DMA_TO_DEVICE, 0); + if (virtqueue_map_mapping_error(sq->vq, hdr_dma)) { err = -ENOMEM; goto err_free_buffs; } @@ -5983,8 +5983,8 @@ err_sq: err_rq: xsk_pool_dma_unmap(pool, 0); err_xsk_map: - virtqueue_dma_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, - DMA_TO_DEVICE, 0); + virtqueue_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, + DMA_TO_DEVICE, 0); err_free_buffs: kvfree(rq->xsk_buffs); return err; @@ -6011,8 +6011,8 @@ static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) xsk_pool_dma_unmap(pool, 0); - virtqueue_dma_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr, - vi->hdr_len, DMA_TO_DEVICE, 0); + virtqueue_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr, + vi->hdr_len, DMA_TO_DEVICE, 0); kvfree(rq->xsk_buffs); return err; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 75e5f6336c8d..482a268af851 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -166,7 +166,7 @@ struct vring_virtqueue { bool packed_ring; /* Is DMA API used? */ - bool use_dma_api; + bool use_map_api; /* Can we use weak barriers? */ bool weak_barriers; @@ -268,7 +268,7 @@ static bool virtqueue_use_indirect(const struct vring_virtqueue *vq, * unconditionally on data path. */ -static bool vring_use_dma_api(const struct virtio_device *vdev) +static bool vring_use_map_api(const struct virtio_device *vdev) { if (!virtio_has_dma_quirk(vdev)) return true; @@ -291,14 +291,14 @@ static bool vring_use_dma_api(const struct virtio_device *vdev) static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring, const struct vring_desc_extra *extra) { - return vring->use_dma_api && (extra->addr != DMA_MAPPING_ERROR); + return vring->use_map_api && (extra->addr != DMA_MAPPING_ERROR); } size_t virtio_max_dma_size(const struct virtio_device *vdev) { size_t max_segment_size = SIZE_MAX; - if (vring_use_dma_api(vdev)) + if (vring_use_map_api(vdev)) max_segment_size = dma_max_mapping_size(vdev->dev.parent); return max_segment_size; @@ -309,7 +309,7 @@ static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, dma_addr_t *dma_handle, gfp_t flag, struct device *dma_dev) { - if (vring_use_dma_api(vdev)) { + if (vring_use_map_api(vdev)) { return dma_alloc_coherent(dma_dev, size, dma_handle, flag); } else { @@ -343,7 +343,7 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size, void *queue, dma_addr_t dma_handle, struct device *dma_dev) { - if (vring_use_dma_api(vdev)) + if (vring_use_map_api(vdev)) dma_free_coherent(dma_dev, size, queue, dma_handle); else free_pages_exact(queue, PAGE_ALIGN(size)); @@ -372,7 +372,7 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *len = sg->length; - if (!vq->use_dma_api) { + if (!vq->use_map_api) { /* * If DMA is not used, KMSAN doesn't know that the scatterlist * is initialized by the hardware. Explicitly check/unpoison it @@ -402,17 +402,17 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, void *cpu_addr, size_t size, enum dma_data_direction direction) { - if (!vq->use_dma_api) + if (!vq->use_map_api) return (dma_addr_t)virt_to_phys(cpu_addr); - return virtqueue_dma_map_single_attrs(&vq->vq, cpu_addr, - size, direction, 0); + return virtqueue_map_single_attrs(&vq->vq, cpu_addr, + size, direction, 0); } static int vring_mapping_error(const struct vring_virtqueue *vq, dma_addr_t addr) { - if (!vq->use_dma_api) + if (!vq->use_map_api) return 0; return dma_mapping_error(vring_dma_dev(vq), addr); @@ -449,7 +449,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, flags = extra->flags; if (flags & VRING_DESC_F_INDIRECT) { - if (!vq->use_dma_api) + if (!vq->use_map_api) goto out; } else if (!vring_need_unmap_buffer(vq, extra)) goto out; @@ -782,7 +782,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, extra = (struct vring_desc_extra *)&indir_desc[num]; - if (vq->use_dma_api) { + if (vq->use_map_api) { for (j = 0; j < num; j++) vring_unmap_one_split(vq, &extra[j]); } @@ -1150,7 +1150,7 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, vq->broken = false; #endif vq->dma_dev = dma_dev; - vq->use_dma_api = vring_use_dma_api(vdev); + vq->use_map_api = vring_use_map_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -1266,7 +1266,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, flags = extra->flags; if (flags & VRING_DESC_F_INDIRECT) { - if (!vq->use_dma_api) + if (!vq->use_map_api) return; } else if (!vring_need_unmap_buffer(vq, extra)) return; @@ -1351,7 +1351,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, desc[i].addr = cpu_to_le64(addr); desc[i].len = cpu_to_le32(len); - if (unlikely(vq->use_dma_api)) { + if (unlikely(vq->use_map_api)) { extra[i].addr = premapped ? DMA_MAPPING_ERROR : addr; extra[i].len = len; extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE; @@ -1373,7 +1373,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, sizeof(struct vring_packed_desc)); vq->packed.vring.desc[head].id = cpu_to_le16(id); - if (vq->use_dma_api) { + if (vq->use_map_api) { vq->packed.desc_extra[id].addr = addr; vq->packed.desc_extra[id].len = total_sg * sizeof(struct vring_packed_desc); @@ -1515,7 +1515,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, desc[i].len = cpu_to_le32(len); desc[i].id = cpu_to_le16(id); - if (unlikely(vq->use_dma_api)) { + if (unlikely(vq->use_map_api)) { vq->packed.desc_extra[curr].addr = premapped ? DMA_MAPPING_ERROR : addr; vq->packed.desc_extra[curr].len = len; @@ -1650,7 +1650,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq, vq->free_head = id; vq->vq.num_free += state->num; - if (unlikely(vq->use_dma_api)) { + if (unlikely(vq->use_map_api)) { curr = id; for (i = 0; i < state->num; i++) { vring_unmap_extra_packed(vq, @@ -1668,7 +1668,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq, if (!desc) return; - if (vq->use_dma_api) { + if (vq->use_map_api) { len = vq->packed.desc_extra[id].len; num = len / sizeof(struct vring_packed_desc); @@ -2121,7 +2121,7 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, #endif vq->packed_ring = true; vq->dma_dev = dma_dev; - vq->use_dma_api = vring_use_dma_api(vdev); + vq->use_map_api = vring_use_map_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -2433,7 +2433,7 @@ struct device *virtqueue_dma_dev(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - if (vq->use_dma_api) + if (vq->use_map_api) return vring_dma_dev(vq); else return NULL; @@ -3122,7 +3122,7 @@ const struct vring *virtqueue_get_vring(const struct virtqueue *vq) EXPORT_SYMBOL_GPL(virtqueue_get_vring); /** - * virtqueue_dma_map_single_attrs - map DMA for _vq + * virtqueue_map_single_attrs - map DMA for _vq * @_vq: the struct virtqueue we're talking about. * @ptr: the pointer of the buffer to do dma * @size: the size of the buffer to do dma @@ -3132,16 +3132,16 @@ EXPORT_SYMBOL_GPL(virtqueue_get_vring); * The caller calls this to do dma mapping in advance. The DMA address can be * passed to this _vq when it is in pre-mapped mode. * - * return DMA address. Caller should check that by virtqueue_dma_mapping_error(). + * return DMA address. Caller should check that by virtqueue_mapping_error(). */ -dma_addr_t virtqueue_dma_map_single_attrs(const struct virtqueue *_vq, void *ptr, - size_t size, - enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, + size_t size, + enum dma_data_direction dir, + unsigned long attrs) { const struct vring_virtqueue *vq = to_vvq(_vq); - if (!vq->use_dma_api) { + if (!vq->use_map_api) { kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, dir); return (dma_addr_t)virt_to_phys(ptr); } @@ -3154,85 +3154,85 @@ dma_addr_t virtqueue_dma_map_single_attrs(const struct virtqueue *_vq, void *ptr return dma_map_page_attrs(vring_dma_dev(vq), virt_to_page(ptr), offset_in_page(ptr), size, dir, attrs); } -EXPORT_SYMBOL_GPL(virtqueue_dma_map_single_attrs); +EXPORT_SYMBOL_GPL(virtqueue_map_single_attrs); /** - * virtqueue_dma_unmap_single_attrs - unmap DMA for _vq + * virtqueue_unmap_single_attrs - unmap map for _vq * @_vq: the struct virtqueue we're talking about. * @addr: the dma address to unmap * @size: the size of the buffer * @dir: DMA direction * @attrs: DMA Attrs * - * Unmap the address that is mapped by the virtqueue_dma_map_* APIs. + * Unmap the address that is mapped by the virtqueue_map_* APIs. * */ -void virtqueue_dma_unmap_single_attrs(const struct virtqueue *_vq, - dma_addr_t addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) +void virtqueue_unmap_single_attrs(const struct virtqueue *_vq, + dma_addr_t addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) { const struct vring_virtqueue *vq = to_vvq(_vq); - if (!vq->use_dma_api) + if (!vq->use_map_api) return; dma_unmap_page_attrs(vring_dma_dev(vq), addr, size, dir, attrs); } -EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs); +EXPORT_SYMBOL_GPL(virtqueue_unmap_single_attrs); /** - * virtqueue_dma_mapping_error - check dma address + * virtqueue_map_mapping_error - check dma address * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * * Returns 0 means dma valid. Other means invalid dma address. */ -int virtqueue_dma_mapping_error(const struct virtqueue *_vq, dma_addr_t addr) +int virtqueue_map_mapping_error(const struct virtqueue *_vq, dma_addr_t addr) { const struct vring_virtqueue *vq = to_vvq(_vq); - if (!vq->use_dma_api) + if (!vq->use_map_api) return 0; return dma_mapping_error(vring_dma_dev(vq), addr); } -EXPORT_SYMBOL_GPL(virtqueue_dma_mapping_error); +EXPORT_SYMBOL_GPL(virtqueue_map_mapping_error); /** - * virtqueue_dma_need_sync - check a dma address needs sync + * virtqueue_map_need_sync - check a dma address needs sync * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * - * Check if the dma address mapped by the virtqueue_dma_map_* APIs needs to be + * Check if the dma address mapped by the virtqueue_map_* APIs needs to be * synchronized * * return bool */ -bool virtqueue_dma_need_sync(const struct virtqueue *_vq, dma_addr_t addr) +bool virtqueue_map_need_sync(const struct virtqueue *_vq, dma_addr_t addr) { const struct vring_virtqueue *vq = to_vvq(_vq); - if (!vq->use_dma_api) + if (!vq->use_map_api) return false; return dma_need_sync(vring_dma_dev(vq), addr); } -EXPORT_SYMBOL_GPL(virtqueue_dma_need_sync); +EXPORT_SYMBOL_GPL(virtqueue_map_need_sync); /** - * virtqueue_dma_sync_single_range_for_cpu - dma sync for cpu + * virtqueue_map_sync_single_range_for_cpu - map sync for cpu * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * @offset: DMA address offset * @size: buf size for sync * @dir: DMA direction * - * Before calling this function, use virtqueue_dma_need_sync() to confirm that + * Before calling this function, use virtqueue_map_need_sync() to confirm that * the DMA address really needs to be synchronized * */ -void virtqueue_dma_sync_single_range_for_cpu(const struct virtqueue *_vq, +void virtqueue_map_sync_single_range_for_cpu(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) @@ -3240,25 +3240,25 @@ void virtqueue_dma_sync_single_range_for_cpu(const struct virtqueue *_vq, const struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); - if (!vq->use_dma_api) + if (!vq->use_map_api) return; dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); } -EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_cpu); +EXPORT_SYMBOL_GPL(virtqueue_map_sync_single_range_for_cpu); /** - * virtqueue_dma_sync_single_range_for_device - dma sync for device + * virtqueue_map_sync_single_range_for_device - map sync for device * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * @offset: DMA address offset * @size: buf size for sync * @dir: DMA direction * - * Before calling this function, use virtqueue_dma_need_sync() to confirm that + * Before calling this function, use virtqueue_map_need_sync() to confirm that * the DMA address really needs to be synchronized */ -void virtqueue_dma_sync_single_range_for_device(const struct virtqueue *_vq, +void virtqueue_map_sync_single_range_for_device(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) @@ -3266,12 +3266,12 @@ void virtqueue_dma_sync_single_range_for_device(const struct virtqueue *_vq, const struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); - if (!vq->use_dma_api) + if (!vq->use_map_api) return; dma_sync_single_range_for_device(dev, addr, offset, size, dir); } -EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_device); +EXPORT_SYMBOL_GPL(virtqueue_map_sync_single_range_for_device); MODULE_DESCRIPTION("Virtio ring implementation"); MODULE_LICENSE("GPL"); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index eab71a440fba..576e08bd7697 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -262,18 +262,18 @@ void unregister_virtio_driver(struct virtio_driver *drv); module_driver(__virtio_driver, register_virtio_driver, \ unregister_virtio_driver) -dma_addr_t virtqueue_dma_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, +dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs); -void virtqueue_dma_unmap_single_attrs(const struct virtqueue *_vq, dma_addr_t addr, +void virtqueue_unmap_single_attrs(const struct virtqueue *_vq, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -int virtqueue_dma_mapping_error(const struct virtqueue *_vq, dma_addr_t addr); +int virtqueue_map_mapping_error(const struct virtqueue *_vq, dma_addr_t addr); -bool virtqueue_dma_need_sync(const struct virtqueue *_vq, dma_addr_t addr); -void virtqueue_dma_sync_single_range_for_cpu(const struct virtqueue *_vq, dma_addr_t addr, +bool virtqueue_map_need_sync(const struct virtqueue *_vq, dma_addr_t addr); +void virtqueue_map_sync_single_range_for_cpu(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir); -void virtqueue_dma_sync_single_range_for_device(const struct virtqueue *_vq, dma_addr_t addr, +void virtqueue_map_sync_single_range_for_device(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir); -- cgit v1.2.3 From b16060c5c7d56455da3c3c50b4a20a83c2a30810 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 21 Aug 2025 14:46:36 +0800 Subject: virtio: introduce virtio_map container union MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following patch will introduce the mapping operations for virtio device. In order to achieve this, besides the dma device, virtio core needs to support a transport or device specific mapping metadata as well. So this patch introduces a union container of a dma device. The idea is the allow the transport layer to pass device specific mapping metadata which will be used as a parameter for the virtio mapping operations. For the transport or device that is using DMA, dma device is still being used. Signed-off-by: Jason Wang Message-Id: <20250821064641.5025-5-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang Reviewed-by: Eugenio Pérez --- drivers/virtio/virtio_ring.c | 104 ++++++++++++++++++++++--------------------- drivers/virtio/virtio_vdpa.c | 6 ++- include/linux/virtio.h | 5 +++ include/linux/virtio_ring.h | 7 +-- 4 files changed, 66 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 482a268af851..cda9bc2121bf 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -210,8 +210,7 @@ struct vring_virtqueue { /* DMA, allocation, and size information */ bool we_own_ring; - /* Device used for doing DMA */ - struct device *dma_dev; + union virtio_map map; #ifdef DEBUG /* They're supposed to lock for us. */ @@ -307,10 +306,10 @@ EXPORT_SYMBOL_GPL(virtio_max_dma_size); static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, dma_addr_t *dma_handle, gfp_t flag, - struct device *dma_dev) + union virtio_map map) { if (vring_use_map_api(vdev)) { - return dma_alloc_coherent(dma_dev, size, + return dma_alloc_coherent(map.dma_dev, size, dma_handle, flag); } else { void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag); @@ -341,10 +340,10 @@ static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, static void vring_free_queue(struct virtio_device *vdev, size_t size, void *queue, dma_addr_t dma_handle, - struct device *dma_dev) + union virtio_map map) { if (vring_use_map_api(vdev)) - dma_free_coherent(dma_dev, size, queue, dma_handle); + dma_free_coherent(map.dma_dev, size, queue, dma_handle); else free_pages_exact(queue, PAGE_ALIGN(size)); } @@ -356,7 +355,7 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size, */ static struct device *vring_dma_dev(const struct vring_virtqueue *vq) { - return vq->dma_dev; + return vq->map.dma_dev; } /* Map one sg entry. */ @@ -1056,12 +1055,13 @@ err_state: } static void vring_free_split(struct vring_virtqueue_split *vring_split, - struct virtio_device *vdev, struct device *dma_dev) + struct virtio_device *vdev, + union virtio_map map) { vring_free_queue(vdev, vring_split->queue_size_in_bytes, vring_split->vring.desc, vring_split->queue_dma_addr, - dma_dev); + map); kfree(vring_split->desc_state); kfree(vring_split->desc_extra); @@ -1072,7 +1072,7 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, u32 num, unsigned int vring_align, bool may_reduce_num, - struct device *dma_dev) + union virtio_map map) { void *queue = NULL; dma_addr_t dma_addr; @@ -1088,7 +1088,7 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, queue = vring_alloc_queue(vdev, vring_size(num, vring_align), &dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, - dma_dev); + map); if (queue) break; if (!may_reduce_num) @@ -1102,7 +1102,7 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, /* Try to get a single page. You are my only hope! */ queue = vring_alloc_queue(vdev, vring_size(num, vring_align), &dma_addr, GFP_KERNEL | __GFP_ZERO, - dma_dev); + map); } if (!queue) return -ENOMEM; @@ -1126,7 +1126,7 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, - struct device *dma_dev) + union virtio_map map) { struct vring_virtqueue *vq; int err; @@ -1149,7 +1149,7 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, #else vq->broken = false; #endif - vq->dma_dev = dma_dev; + vq->map = map; vq->use_map_api = vring_use_map_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && @@ -1187,21 +1187,21 @@ static struct virtqueue *vring_create_virtqueue_split( bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, - struct device *dma_dev) + union virtio_map map) { struct vring_virtqueue_split vring_split = {}; struct virtqueue *vq; int err; err = vring_alloc_queue_split(&vring_split, vdev, num, vring_align, - may_reduce_num, dma_dev); + may_reduce_num, map); if (err) return NULL; vq = __vring_new_virtqueue_split(index, &vring_split, vdev, weak_barriers, - context, notify, callback, name, dma_dev); + context, notify, callback, name, map); if (!vq) { - vring_free_split(&vring_split, vdev, dma_dev); + vring_free_split(&vring_split, vdev, map); return NULL; } @@ -1220,7 +1220,7 @@ static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) err = vring_alloc_queue_split(&vring_split, vdev, num, vq->split.vring_align, vq->split.may_reduce_num, - vring_dma_dev(vq)); + vq->map); if (err) goto err; @@ -1238,7 +1238,7 @@ static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) return 0; err_state_extra: - vring_free_split(&vring_split, vdev, vring_dma_dev(vq)); + vring_free_split(&vring_split, vdev, vq->map); err: virtqueue_reinit_split(vq); return -ENOMEM; @@ -1947,25 +1947,25 @@ static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num) static void vring_free_packed(struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, - struct device *dma_dev) + union virtio_map map) { if (vring_packed->vring.desc) vring_free_queue(vdev, vring_packed->ring_size_in_bytes, vring_packed->vring.desc, vring_packed->ring_dma_addr, - dma_dev); + map); if (vring_packed->vring.driver) vring_free_queue(vdev, vring_packed->event_size_in_bytes, vring_packed->vring.driver, vring_packed->driver_event_dma_addr, - dma_dev); + map); if (vring_packed->vring.device) vring_free_queue(vdev, vring_packed->event_size_in_bytes, vring_packed->vring.device, vring_packed->device_event_dma_addr, - dma_dev); + map); kfree(vring_packed->desc_state); kfree(vring_packed->desc_extra); @@ -1973,7 +1973,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed, static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, - u32 num, struct device *dma_dev) + u32 num, union virtio_map map) { struct vring_packed_desc *ring; struct vring_packed_desc_event *driver, *device; @@ -1985,7 +1985,7 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, ring = vring_alloc_queue(vdev, ring_size_in_bytes, &ring_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, - dma_dev); + map); if (!ring) goto err; @@ -1998,7 +1998,7 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, driver = vring_alloc_queue(vdev, event_size_in_bytes, &driver_event_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, - dma_dev); + map); if (!driver) goto err; @@ -2009,7 +2009,7 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, device = vring_alloc_queue(vdev, event_size_in_bytes, &device_event_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, - dma_dev); + map); if (!device) goto err; @@ -2021,7 +2021,7 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, return 0; err: - vring_free_packed(vring_packed, vdev, dma_dev); + vring_free_packed(vring_packed, vdev, map); return -ENOMEM; } @@ -2097,7 +2097,7 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, - struct device *dma_dev) + union virtio_map map) { struct vring_virtqueue *vq; int err; @@ -2120,7 +2120,7 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, vq->broken = false; #endif vq->packed_ring = true; - vq->dma_dev = dma_dev; + vq->map = map; vq->use_map_api = vring_use_map_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && @@ -2158,18 +2158,18 @@ static struct virtqueue *vring_create_virtqueue_packed( bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, - struct device *dma_dev) + union virtio_map map) { struct vring_virtqueue_packed vring_packed = {}; struct virtqueue *vq; - if (vring_alloc_queue_packed(&vring_packed, vdev, num, dma_dev)) + if (vring_alloc_queue_packed(&vring_packed, vdev, num, map)) return NULL; vq = __vring_new_virtqueue_packed(index, &vring_packed, vdev, weak_barriers, - context, notify, callback, name, dma_dev); + context, notify, callback, name, map); if (!vq) { - vring_free_packed(&vring_packed, vdev, dma_dev); + vring_free_packed(&vring_packed, vdev, map); return NULL; } @@ -2185,7 +2185,7 @@ static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) struct virtio_device *vdev = _vq->vdev; int err; - if (vring_alloc_queue_packed(&vring_packed, vdev, num, vring_dma_dev(vq))) + if (vring_alloc_queue_packed(&vring_packed, vdev, num, vq->map)) goto err_ring; err = vring_alloc_state_extra_packed(&vring_packed); @@ -2202,7 +2202,7 @@ static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) return 0; err_state_extra: - vring_free_packed(&vring_packed, vdev, vring_dma_dev(vq)); + vring_free_packed(&vring_packed, vdev, vq->map); err_ring: virtqueue_reinit_packed(vq); return -ENOMEM; @@ -2434,7 +2434,7 @@ struct device *virtqueue_dma_dev(struct virtqueue *_vq) struct vring_virtqueue *vq = to_vvq(_vq); if (vq->use_map_api) - return vring_dma_dev(vq); + return vq->map.dma_dev; else return NULL; } @@ -2719,19 +2719,20 @@ struct virtqueue *vring_create_virtqueue( void (*callback)(struct virtqueue *), const char *name) { + union virtio_map map = {.dma_dev = vdev->dev.parent}; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) return vring_create_virtqueue_packed(index, num, vring_align, vdev, weak_barriers, may_reduce_num, - context, notify, callback, name, vdev->dev.parent); + context, notify, callback, name, map); return vring_create_virtqueue_split(index, num, vring_align, vdev, weak_barriers, may_reduce_num, - context, notify, callback, name, vdev->dev.parent); + context, notify, callback, name, map); } EXPORT_SYMBOL_GPL(vring_create_virtqueue); -struct virtqueue *vring_create_virtqueue_dma( +struct virtqueue *vring_create_virtqueue_map( unsigned int index, unsigned int num, unsigned int vring_align, @@ -2742,19 +2743,19 @@ struct virtqueue *vring_create_virtqueue_dma( bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, - struct device *dma_dev) + union virtio_map map) { if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) return vring_create_virtqueue_packed(index, num, vring_align, vdev, weak_barriers, may_reduce_num, - context, notify, callback, name, dma_dev); + context, notify, callback, name, map); return vring_create_virtqueue_split(index, num, vring_align, vdev, weak_barriers, may_reduce_num, - context, notify, callback, name, dma_dev); + context, notify, callback, name, map); } -EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma); +EXPORT_SYMBOL_GPL(vring_create_virtqueue_map); /** * virtqueue_resize - resize the vring of vq @@ -2865,6 +2866,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, const char *name) { struct vring_virtqueue_split vring_split = {}; + union virtio_map map = {.dma_dev = vdev->dev.parent}; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) { struct vring_virtqueue_packed vring_packed = {}; @@ -2874,13 +2876,13 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, return __vring_new_virtqueue_packed(index, &vring_packed, vdev, weak_barriers, context, notify, callback, - name, vdev->dev.parent); + name, map); } vring_init(&vring_split.vring, num, pages, vring_align); return __vring_new_virtqueue_split(index, &vring_split, vdev, weak_barriers, context, notify, callback, name, - vdev->dev.parent); + map); } EXPORT_SYMBOL_GPL(vring_new_virtqueue); @@ -2894,19 +2896,19 @@ static void vring_free(struct virtqueue *_vq) vq->packed.ring_size_in_bytes, vq->packed.vring.desc, vq->packed.ring_dma_addr, - vring_dma_dev(vq)); + vq->map); vring_free_queue(vq->vq.vdev, vq->packed.event_size_in_bytes, vq->packed.vring.driver, vq->packed.driver_event_dma_addr, - vring_dma_dev(vq)); + vq->map); vring_free_queue(vq->vq.vdev, vq->packed.event_size_in_bytes, vq->packed.vring.device, vq->packed.device_event_dma_addr, - vring_dma_dev(vq)); + vq->map); kfree(vq->packed.desc_state); kfree(vq->packed.desc_extra); @@ -2915,7 +2917,7 @@ static void vring_free(struct virtqueue *_vq) vq->split.queue_size_in_bytes, vq->split.vring.desc, vq->split.queue_dma_addr, - vring_dma_dev(vq)); + vq->map); } } if (!vq->packed_ring) { diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 657b07a60788..dc557aa7c825 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -139,6 +139,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, struct vdpa_callback cb; struct virtqueue *vq; u64 desc_addr, driver_addr, device_addr; + union virtio_map map = {0}; /* Assume split virtqueue, switch to packed if necessary */ struct vdpa_vq_state state = {0}; u32 align, max_num, min_num = 1; @@ -185,9 +186,10 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, dma_dev = ops->get_vq_dma_dev(vdpa, index); else dma_dev = vdpa_get_dma_dev(vdpa); - vq = vring_create_virtqueue_dma(index, max_num, align, vdev, + map.dma_dev = dma_dev; + vq = vring_create_virtqueue_map(index, max_num, align, vdev, true, may_reduce_num, ctx, - notify, callback, name, dma_dev); + notify, callback, name, map); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 576e08bd7697..b4ba1a99e5ab 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -41,6 +41,11 @@ struct virtqueue { void *priv; }; +union virtio_map { + /* Device that performs DMA */ + struct device *dma_dev; +}; + int virtqueue_add_outbuf(struct virtqueue *vq, struct scatterlist sg[], unsigned int num, void *data, diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 9b33df741b63..c97a12c1cda3 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -3,6 +3,7 @@ #define _LINUX_VIRTIO_RING_H #include +#include #include #include @@ -79,9 +80,9 @@ struct virtqueue *vring_create_virtqueue(unsigned int index, /* * Creates a virtqueue and allocates the descriptor ring with per - * virtqueue DMA device. + * virtqueue mapping operations. */ -struct virtqueue *vring_create_virtqueue_dma(unsigned int index, +struct virtqueue *vring_create_virtqueue_map(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, @@ -91,7 +92,7 @@ struct virtqueue *vring_create_virtqueue_dma(unsigned int index, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name, - struct device *dma_dev); + union virtio_map map); /* * Creates a virtqueue with a standard layout but a caller-allocated -- cgit v1.2.3 From bee8c7c24b737338216dc0f87d6c47a4abaf609a Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 21 Aug 2025 14:46:38 +0800 Subject: virtio: introduce map ops in virtio core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces map operations for virtio device. Virtio used to use DMA API which is not necessarily the case since some devices doesn't do DMA. Instead of using tricks and abusing DMA API, let's simply abstract the current mapping logic into a virtio specific mapping operations. For the device or transport that doesn't do DMA, they can implement their own mapping logic without the need to trick DMA core. In this case the mapping metadata is opaque to the virtio core that will be passed back to the transport or device specific map operations. For other devices, DMA API will still be used, so map token will still be the dma device to minimize the changeset and performance impact. The mapping operations are abstracted as a independent structure instead of reusing virtio_config_ops. This allows the transport can simply reuse the structure for lower layers like vDPA. A set of new mapping helpers were introduced for the device that want to do mapping by themselves. Signed-off-by: Jason Wang Message-Id: <20250821064641.5025-7-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang Reviewed-by: Eugenio Pérez --- drivers/virtio/virtio_ring.c | 211 +++++++++++++++++++++++++++++++++--------- drivers/virtio/virtio_vdpa.c | 3 + include/linux/virtio.h | 25 +++++ include/linux/virtio_config.h | 72 ++++++++++++++ 4 files changed, 269 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 46515b017ccb..f91a432b3e53 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -297,8 +297,14 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev) { size_t max_segment_size = SIZE_MAX; - if (vring_use_map_api(vdev)) - max_segment_size = dma_max_mapping_size(vdev->dev.parent); + if (vring_use_map_api(vdev)) { + if (vdev->map) { + max_segment_size = + vdev->map->max_mapping_size(vdev->vmap); + } else + max_segment_size = + dma_max_mapping_size(vdev->dev.parent); + } return max_segment_size; } @@ -309,8 +315,8 @@ static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, union virtio_map map) { if (vring_use_map_api(vdev)) { - return dma_alloc_coherent(map.dma_dev, size, - map_handle, flag); + return virtqueue_map_alloc_coherent(vdev, map, size, + map_handle, flag); } else { void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag); @@ -343,7 +349,8 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size, union virtio_map map) { if (vring_use_map_api(vdev)) - dma_free_coherent(map.dma_dev, size, queue, map_handle); + virtqueue_map_free_coherent(vdev, map, size, + queue, map_handle); else free_pages_exact(queue, PAGE_ALIGN(size)); } @@ -358,6 +365,20 @@ static struct device *vring_dma_dev(const struct vring_virtqueue *vq) return vq->map.dma_dev; } +static int vring_mapping_error(const struct vring_virtqueue *vq, + dma_addr_t addr) +{ + struct virtio_device *vdev = vq->vq.vdev; + + if (!vq->use_map_api) + return 0; + + if (vdev->map) + return vdev->map->mapping_error(vq->map, addr); + else + return dma_mapping_error(vring_dma_dev(vq), addr); +} + /* Map one sg entry. */ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction, dma_addr_t *addr, @@ -387,11 +408,11 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist * the way it expects (we don't guarantee that the scatterlist * will exist for the lifetime of the mapping). */ - *addr = dma_map_page(vring_dma_dev(vq), - sg_page(sg), sg->offset, sg->length, - direction); + *addr = virtqueue_map_page_attrs(&vq->vq, sg_page(sg), + sg->offset, sg->length, + direction, 0); - if (dma_mapping_error(vring_dma_dev(vq), *addr)) + if (vring_mapping_error(vq, *addr)) return -ENOMEM; return 0; @@ -408,15 +429,6 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, size, direction, 0); } -static int vring_mapping_error(const struct vring_virtqueue *vq, - dma_addr_t addr) -{ - if (!vq->use_map_api) - return 0; - - return dma_mapping_error(vring_dma_dev(vq), addr); -} - static void virtqueue_init(struct vring_virtqueue *vq, u32 num) { vq->vq.num_free = num; @@ -453,11 +465,12 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, } else if (!vring_need_unmap_buffer(vq, extra)) goto out; - dma_unmap_page(vring_dma_dev(vq), - extra->addr, - extra->len, - (flags & VRING_DESC_F_WRITE) ? - DMA_FROM_DEVICE : DMA_TO_DEVICE); + virtqueue_unmap_page_attrs(&vq->vq, + extra->addr, + extra->len, + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE, + 0); out: return extra->next; @@ -1271,10 +1284,11 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, } else if (!vring_need_unmap_buffer(vq, extra)) return; - dma_unmap_page(vring_dma_dev(vq), - extra->addr, extra->len, - (flags & VRING_DESC_F_WRITE) ? - DMA_FROM_DEVICE : DMA_TO_DEVICE); + virtqueue_unmap_page_attrs(&vq->vq, + extra->addr, extra->len, + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE, + 0); } static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg, @@ -2433,7 +2447,7 @@ struct device *virtqueue_dma_dev(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - if (vq->use_map_api) + if (vq->use_map_api && !_vq->vdev->map) return vq->map.dma_dev; else return NULL; @@ -3123,6 +3137,107 @@ const struct vring *virtqueue_get_vring(const struct virtqueue *vq) } EXPORT_SYMBOL_GPL(virtqueue_get_vring); +/** + * virtqueue_map_alloc_coherent - alloc coherent mapping + * @vdev: the virtio device we are talking to + * @map: metadata for performing mapping + * @size: the size of the buffer + * @map_handle: the pointer to the mapped address + * @gfp: allocation flag (GFP_XXX) + * + * return virtual address or NULL on error + */ +void *virtqueue_map_alloc_coherent(struct virtio_device *vdev, + union virtio_map map, + size_t size, dma_addr_t *map_handle, + gfp_t gfp) +{ + if (vdev->map) + return vdev->map->alloc(map, size, + map_handle, gfp); + else + return dma_alloc_coherent(map.dma_dev, size, + map_handle, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_map_alloc_coherent); + +/** + * virtqueue_map_free_coherent - free coherent mapping + * @vdev: the virtio device we are talking to + * @map: metadata for performing mapping + * @size: the size of the buffer + * @map_handle: the mapped address that needs to be freed + * + */ +void virtqueue_map_free_coherent(struct virtio_device *vdev, + union virtio_map map, size_t size, void *vaddr, + dma_addr_t map_handle) +{ + if (vdev->map) + vdev->map->free(map, size, vaddr, + map_handle, 0); + else + dma_free_coherent(map.dma_dev, size, vaddr, map_handle); +} +EXPORT_SYMBOL_GPL(virtqueue_map_free_coherent); + +/** + * virtqueue_map_page_attrs - map a page to the device + * @_vq: the virtqueue we are talking to + * @page: the page that will be mapped by the device + * @offset: the offset in the page for a buffer + * @size: the buffer size + * @dir: mapping direction + * @attrs: mapping attributes + * + * Returns mapped address. Caller should check that by virtqueue_mapping_error(). + */ +dma_addr_t virtqueue_map_page_attrs(const struct virtqueue *_vq, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + const struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = _vq->vdev; + + if (vdev->map) + return vdev->map->map_page(vq->map, + page, offset, size, + dir, attrs); + + return dma_map_page_attrs(vring_dma_dev(vq), + page, offset, size, + dir, attrs); +} +EXPORT_SYMBOL_GPL(virtqueue_map_page_attrs); + +/** + * virtqueue_unmap_page_attrs - map a page to the device + * @_vq: the virtqueue we are talking to + * @map_handle: the mapped address + * @size: the buffer size + * @dir: mapping direction + * @attrs: unmapping attributes + */ +void virtqueue_unmap_page_attrs(const struct virtqueue *_vq, + dma_addr_t map_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = _vq->vdev; + + if (vdev->map) + vdev->map->unmap_page(vq->map, + map_handle, size, dir, attrs); + else + dma_unmap_page_attrs(vring_dma_dev(vq), map_handle, + size, dir, attrs); +} +EXPORT_SYMBOL_GPL(virtqueue_unmap_page_attrs); + /** * virtqueue_map_single_attrs - map DMA for _vq * @_vq: the struct virtqueue we're talking about. @@ -3134,7 +3249,7 @@ EXPORT_SYMBOL_GPL(virtqueue_get_vring); * The caller calls this to do dma mapping in advance. The DMA address can be * passed to this _vq when it is in pre-mapped mode. * - * return DMA address. Caller should check that by virtqueue_mapping_error(). + * return mapped address. Caller should check that by virtqueue_mapping_error(). */ dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, @@ -3153,8 +3268,8 @@ dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, "rejecting DMA map of vmalloc memory\n")) return DMA_MAPPING_ERROR; - return dma_map_page_attrs(vring_dma_dev(vq), virt_to_page(ptr), - offset_in_page(ptr), size, dir, attrs); + return virtqueue_map_page_attrs(&vq->vq, virt_to_page(ptr), + offset_in_page(ptr), size, dir, attrs); } EXPORT_SYMBOL_GPL(virtqueue_map_single_attrs); @@ -3179,12 +3294,12 @@ void virtqueue_unmap_single_attrs(const struct virtqueue *_vq, if (!vq->use_map_api) return; - dma_unmap_page_attrs(vring_dma_dev(vq), addr, size, dir, attrs); + virtqueue_unmap_page_attrs(_vq, addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(virtqueue_unmap_single_attrs); /** - * virtqueue_map_mapping_error - check dma address + * virtqueue_mapping_error - check dma address * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * @@ -3194,10 +3309,7 @@ int virtqueue_map_mapping_error(const struct virtqueue *_vq, dma_addr_t addr) { const struct vring_virtqueue *vq = to_vvq(_vq); - if (!vq->use_map_api) - return 0; - - return dma_mapping_error(vring_dma_dev(vq), addr); + return vring_mapping_error(vq, addr); } EXPORT_SYMBOL_GPL(virtqueue_map_mapping_error); @@ -3214,11 +3326,15 @@ EXPORT_SYMBOL_GPL(virtqueue_map_mapping_error); bool virtqueue_map_need_sync(const struct virtqueue *_vq, dma_addr_t addr) { const struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = _vq->vdev; if (!vq->use_map_api) return false; - return dma_need_sync(vring_dma_dev(vq), addr); + if (vdev->map) + return vdev->map->need_sync(vq->map, addr); + else + return dma_need_sync(vring_dma_dev(vq), addr); } EXPORT_SYMBOL_GPL(virtqueue_map_need_sync); @@ -3240,12 +3356,17 @@ void virtqueue_map_sync_single_range_for_cpu(const struct virtqueue *_vq, enum dma_data_direction dir) { const struct vring_virtqueue *vq = to_vvq(_vq); - struct device *dev = vring_dma_dev(vq); + struct virtio_device *vdev = _vq->vdev; if (!vq->use_map_api) return; - dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); + if (vdev->map) + vdev->map->sync_single_for_cpu(vq->map, + addr + offset, size, dir); + else + dma_sync_single_range_for_cpu(vring_dma_dev(vq), + addr, offset, size, dir); } EXPORT_SYMBOL_GPL(virtqueue_map_sync_single_range_for_cpu); @@ -3266,12 +3387,18 @@ void virtqueue_map_sync_single_range_for_device(const struct virtqueue *_vq, enum dma_data_direction dir) { const struct vring_virtqueue *vq = to_vvq(_vq); - struct device *dev = vring_dma_dev(vq); + struct virtio_device *vdev = _vq->vdev; if (!vq->use_map_api) return; - dma_sync_single_range_for_device(dev, addr, offset, size, dir); + if (vdev->map) + vdev->map->sync_single_for_device(vq->map, + addr + offset, + size, dir); + else + dma_sync_single_range_for_device(vring_dma_dev(vq), addr, + offset, size, dir); } EXPORT_SYMBOL_GPL(virtqueue_map_sync_single_range_for_device); diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index dc557aa7c825..d4be689e3626 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -195,6 +195,9 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, goto error_new_virtqueue; } + if (index == 0) + vdev->vmap = map; + vq->num_max = max_num; /* Setup virtqueue callback */ diff --git a/include/linux/virtio.h b/include/linux/virtio.h index b4ba1a99e5ab..3386a4a8d06b 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -166,9 +166,11 @@ struct virtio_device { struct virtio_device_id id; const struct virtio_config_ops *config; const struct vringh_config_ops *vringh_config; + const struct virtio_map_ops *map; struct list_head vqs; VIRTIO_DECLARE_FEATURES(features); void *priv; + union virtio_map vmap; #ifdef CONFIG_VIRTIO_DEBUG struct dentry *debugfs_dir; u64 debugfs_filter_features[VIRTIO_FEATURES_DWORDS]; @@ -267,6 +269,29 @@ void unregister_virtio_driver(struct virtio_driver *drv); module_driver(__virtio_driver, register_virtio_driver, \ unregister_virtio_driver) + +void *virtqueue_map_alloc_coherent(struct virtio_device *vdev, + union virtio_map mapping_token, + size_t size, dma_addr_t *dma_handle, + gfp_t gfp); + +void virtqueue_map_free_coherent(struct virtio_device *vdev, + union virtio_map mapping_token, + size_t size, void *vaddr, + dma_addr_t dma_handle); + +dma_addr_t virtqueue_map_page_attrs(const struct virtqueue *_vq, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction dir, + unsigned long attrs); + +void virtqueue_unmap_page_attrs(const struct virtqueue *_vq, + dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs); void virtqueue_unmap_single_attrs(const struct virtqueue *_vq, dma_addr_t addr, diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 7427b79d6f3d..16001e9f9b39 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -139,6 +139,78 @@ struct virtio_config_ops { int (*enable_vq_after_reset)(struct virtqueue *vq); }; +/** + * struct virtio_map_ops - operations for mapping buffer for a virtio device + * Note: For transport that has its own mapping logic it must + * implements all of the operations + * @map_page: map a buffer to the device + * map: metadata for performing mapping + * page: the page that will be mapped by the device + * offset: the offset in the page for a buffer + * size: the buffer size + * dir: mapping direction + * attrs: mapping attributes + * Returns: the mapped address + * @unmap_page: unmap a buffer from the device + * map: device specific mapping map + * map_handle: the mapped address + * size: the buffer size + * dir: mapping direction + * attrs: unmapping attributes + * @sync_single_for_cpu: sync a single buffer from device to cpu + * map: metadata for performing mapping + * map_handle: the mapping address to sync + * size: the size of the buffer + * dir: synchronization direction + * @sync_single_for_device: sync a single buffer from cpu to device + * map: metadata for performing mapping + * map_handle: the mapping address to sync + * size: the size of the buffer + * dir: synchronization direction + * @alloc: alloc a coherent buffer mapping + * map: metadata for performing mapping + * size: the size of the buffer + * map_handle: the mapping address to sync + * gfp: allocation flag (GFP_XXX) + * Returns: virtual address of the allocated buffer + * @free: free a coherent buffer mapping + * map: metadata for performing mapping + * size: the size of the buffer + * vaddr: virtual address of the buffer + * map_handle: the mapping address to sync + * attrs: unmapping attributes + * @need_sync: if the buffer needs synchronization + * map: metadata for performing mapping + * map_handle: the mapped address + * Returns: whether the buffer needs synchronization + * @mapping_error: if the mapping address is error + * map: metadata for performing mapping + * map_handle: the mapped address + * @max_mapping_size: get the maximum buffer size that can be mapped + * map: metadata for performing mapping + * Returns: the maximum buffer size that can be mapped + */ +struct virtio_map_ops { + dma_addr_t (*map_page)(union virtio_map map, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs); + void (*unmap_page)(union virtio_map map, dma_addr_t map_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + void (*sync_single_for_cpu)(union virtio_map map, dma_addr_t map_handle, + size_t size, enum dma_data_direction dir); + void (*sync_single_for_device)(union virtio_map map, + dma_addr_t map_handle, size_t size, + enum dma_data_direction dir); + void *(*alloc)(union virtio_map map, size_t size, + dma_addr_t *map_handle, gfp_t gfp); + void (*free)(union virtio_map map, size_t size, void *vaddr, + dma_addr_t map_handle, unsigned long attrs); + bool (*need_sync)(union virtio_map map, dma_addr_t map_handle); + int (*mapping_error)(union virtio_map map, dma_addr_t map_handle); + size_t (*max_mapping_size)(union virtio_map map); +}; + /* If driver didn't advertise the feature, it will never appear. */ void virtio_check_driver_offered_feature(const struct virtio_device *vdev, unsigned int fbit); -- cgit v1.2.3 From 58aca3dbc7d8891a016cb17d488af3002812793b Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 21 Aug 2025 14:46:39 +0800 Subject: vdpa: support virtio_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Virtio core switches from DMA device to virtio_map, let's do that as well for vDPA. Signed-off-by: Jason Wang Message-Id: <20250821064641.5025-8-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang Reviewed-by: Eugenio Pérez --- drivers/vdpa/alibaba/eni_vdpa.c | 2 +- drivers/vdpa/ifcvf/ifcvf_main.c | 2 +- drivers/vdpa/mlx5/core/mr.c | 4 ++-- drivers/vdpa/mlx5/net/mlx5_vnet.c | 13 ++++++++----- drivers/vdpa/octeon_ep/octep_vdpa_main.c | 2 +- drivers/vdpa/pds/vdpa_dev.c | 2 +- drivers/vdpa/solidrun/snet_main.c | 4 ++-- drivers/vdpa/vdpa.c | 2 +- drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- drivers/vdpa/vdpa_user/vduse_dev.c | 2 +- drivers/vdpa/virtio_pci/vp_vdpa.c | 2 +- drivers/vhost/vdpa.c | 6 ++++-- drivers/virtio/virtio_vdpa.c | 11 +++++------ include/linux/vdpa.h | 15 ++++++++------- 14 files changed, 37 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c index ad7f3447fe90..54aea086d08c 100644 --- a/drivers/vdpa/alibaba/eni_vdpa.c +++ b/drivers/vdpa/alibaba/eni_vdpa.c @@ -496,7 +496,7 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_master(pdev); pci_set_drvdata(pdev, eni_vdpa); - eni_vdpa->vdpa.dma_dev = &pdev->dev; + eni_vdpa->vdpa.vmap.dma_dev = &pdev->dev; eni_vdpa->queues = eni_vdpa_get_num_queues(eni_vdpa); eni_vdpa->vring = devm_kcalloc(&pdev->dev, eni_vdpa->queues, diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index ccf64d7bbfaa..979d188d74ee 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -713,7 +713,7 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, ifcvf_mgmt_dev->adapter = adapter; adapter->pdev = pdev; - adapter->vdpa.dma_dev = &pdev->dev; + adapter->vdpa.vmap.dma_dev = &pdev->dev; adapter->vdpa.mdev = mdev; adapter->vf = vf; vdpa_dev = &adapter->vdpa; diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index c7a20278bc3c..8870a7169267 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -378,7 +378,7 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr u64 pa, offset; u64 paend; struct scatterlist *sg; - struct device *dma = mvdev->vdev.dma_dev; + struct device *dma = mvdev->vdev.vmap.dma_dev; for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1); map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) { @@ -432,7 +432,7 @@ err_map: static void unmap_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) { - struct device *dma = mvdev->vdev.dma_dev; + struct device *dma = mvdev->vdev.vmap.dma_dev; destroy_direct_mr(mvdev, mr); dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 0ed2fc28e1ce..a7e76f175914 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3395,14 +3395,17 @@ static int mlx5_vdpa_reset_map(struct vdpa_device *vdev, unsigned int asid) return err; } -static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx) +static union virtio_map mlx5_get_vq_map(struct vdpa_device *vdev, u16 idx) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + union virtio_map map; if (is_ctrl_vq_idx(mvdev, idx)) - return &vdev->dev; + map.dma_dev = &vdev->dev; + else + map.dma_dev = mvdev->vdev.vmap.dma_dev; - return mvdev->vdev.dma_dev; + return map; } static void free_irqs(struct mlx5_vdpa_net *ndev) @@ -3686,7 +3689,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .set_map = mlx5_vdpa_set_map, .reset_map = mlx5_vdpa_reset_map, .set_group_asid = mlx5_set_group_asid, - .get_vq_dma_dev = mlx5_get_vq_dma_dev, + .get_vq_map = mlx5_get_vq_map, .free = mlx5_vdpa_free, .suspend = mlx5_vdpa_suspend, .resume = mlx5_vdpa_resume, /* Op disabled if not supported. */ @@ -3965,7 +3968,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, } ndev->mvdev.mlx_features = device_features; - mvdev->vdev.dma_dev = &mdev->pdev->dev; + mvdev->vdev.vmap.dma_dev = &mdev->pdev->dev; err = mlx5_vdpa_alloc_resources(&ndev->mvdev); if (err) goto err_alloc; diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c index 9b49efd24391..5818dae133a3 100644 --- a/drivers/vdpa/octeon_ep/octep_vdpa_main.c +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c @@ -516,7 +516,7 @@ static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, } oct_vdpa->pdev = pdev; - oct_vdpa->vdpa.dma_dev = &pdev->dev; + oct_vdpa->vdpa.vmap.dma_dev = &pdev->dev; oct_vdpa->vdpa.mdev = mdev; oct_vdpa->oct_hw = oct_hw; vdpa_dev = &oct_vdpa->vdpa; diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c index 301d95e08596..63d82263fb52 100644 --- a/drivers/vdpa/pds/vdpa_dev.c +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -643,7 +643,7 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, pdev = vdpa_aux->padev->vf_pdev; dma_dev = &pdev->dev; - pdsv->vdpa_dev.dma_dev = dma_dev; + pdsv->vdpa_dev.vmap.dma_dev = dma_dev; status = pds_vdpa_get_status(&pdsv->vdpa_dev); if (status == 0xff) { diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c index 55ec51c17ab3..39050aab147f 100644 --- a/drivers/vdpa/solidrun/snet_main.c +++ b/drivers/vdpa/solidrun/snet_main.c @@ -1052,8 +1052,8 @@ static int snet_vdpa_probe_vf(struct pci_dev *pdev) */ snet_reserve_irq_idx(pf_irqs ? pdev_pf : pdev, snet); - /*set DMA device*/ - snet->vdpa.dma_dev = &pdev->dev; + /* set map metadata */ + snet->vdpa.vmap.dma_dev = &pdev->dev; /* Register VDPA device */ ret = vdpa_register_device(&snet->vdpa, snet->cfg->vq_num); diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 8a372b51c21a..c71debeb8471 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -151,7 +151,7 @@ static void vdpa_release_dev(struct device *d) * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. * - * Return: Returns an error when parent/config/dma_dev is not set or fail to get + * Return: Returns an error when parent/config/map is not set or fail to get * ida. */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index c204fc8e471a..22ee53538444 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -272,7 +272,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, vringh_set_iotlb(&vdpasim->vqs[i].vring, &vdpasim->iommu[0], &vdpasim->iommu_lock); - vdpasim->vdpa.dma_dev = dev; + vdpasim->vdpa.vmap.dma_dev = dev; return vdpasim; diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 04620bb77203..f68ed569394c 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -2022,7 +2022,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) return ret; } set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops); - vdev->vdpa.dma_dev = &vdev->vdpa.dev; + vdev->vdpa.vmap.dma_dev = &vdev->vdpa.dev; vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev; return 0; diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 8787407f75b0..242641c0f2bd 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -520,7 +520,7 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; - vp_vdpa->vdpa.dma_dev = &pdev->dev; + vp_vdpa->vdpa.vmap.dma_dev = &pdev->dev; vp_vdpa->queues = vp_modern_get_num_queues(mdev); vp_vdpa->mdev = mdev; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index af1e1fdfd9ed..05a481e4c385 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1318,7 +1318,8 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; - struct device *dma_dev = vdpa_get_dma_dev(vdpa); + union virtio_map map = vdpa_get_map(vdpa); + struct device *dma_dev = map.dma_dev; int ret; /* Device want to do DMA by itself */ @@ -1353,7 +1354,8 @@ err_attach: static void vhost_vdpa_free_domain(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; - struct device *dma_dev = vdpa_get_dma_dev(vdpa); + union virtio_map map = vdpa_get_map(vdpa); + struct device *dma_dev = map.dma_dev; if (v->domain) { iommu_detach_device(v->domain, dma_dev); diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index d4be689e3626..8b27c6e8eebb 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -133,7 +133,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, const char *name, bool ctx) { struct vdpa_device *vdpa = vd_get_vdpa(vdev); - struct device *dma_dev; const struct vdpa_config_ops *ops = vdpa->config; bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify; struct vdpa_callback cb; @@ -182,11 +181,11 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, /* Create the vring */ align = ops->get_vq_align(vdpa); - if (ops->get_vq_dma_dev) - dma_dev = ops->get_vq_dma_dev(vdpa, index); + if (ops->get_vq_map) + map = ops->get_vq_map(vdpa, index); else - dma_dev = vdpa_get_dma_dev(vdpa); - map.dma_dev = dma_dev; + map = vdpa_get_map(vdpa); + vq = vring_create_virtqueue_map(index, max_num, align, vdev, true, may_reduce_num, ctx, notify, callback, name, map); @@ -467,7 +466,7 @@ static int virtio_vdpa_probe(struct vdpa_device *vdpa) if (!vd_dev) return -ENOMEM; - vd_dev->vdev.dev.parent = vdpa_get_dma_dev(vdpa); + vd_dev->vdev.dev.parent = vdpa_get_map(vdpa).dma_dev; vd_dev->vdev.dev.release = virtio_vdpa_release_dev; vd_dev->vdev.config = &virtio_vdpa_config_ops; vd_dev->vdpa = vdpa; diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2e7a30fe6b92..ae0451945851 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ struct vdpa_mgmt_dev; /** * struct vdpa_device - representation of a vDPA device * @dev: underlying device - * @dma_dev: the actual device that is performing DMA + * @vmap: the metadata passed to upper layer to be used for mapping * @driver_override: driver name to force a match; do not set directly, * because core frees it; use driver_set_override() to * set or clear it. @@ -87,7 +88,7 @@ struct vdpa_mgmt_dev; */ struct vdpa_device { struct device dev; - struct device *dma_dev; + union virtio_map vmap; const char *driver_override; const struct vdpa_config_ops *config; struct rw_semaphore cf_lock; /* Protects get/set config */ @@ -352,11 +353,11 @@ struct vdpa_map_file { * @vdev: vdpa device * @asid: address space identifier * Returns integer: success (0) or error (< 0) - * @get_vq_dma_dev: Get the dma device for a specific + * @get_vq_map: Get the map metadata for a specific * virtqueue (optional) * @vdev: vdpa device * @idx: virtqueue index - * Returns pointer to structure device or error (NULL) + * Returns map token union error (NULL) * @bind_mm: Bind the device to a specific address space * so the vDPA framework can use VA when this * callback is implemented. (optional) @@ -436,7 +437,7 @@ struct vdpa_config_ops { int (*reset_map)(struct vdpa_device *vdev, unsigned int asid); int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group, unsigned int asid); - struct device *(*get_vq_dma_dev)(struct vdpa_device *vdev, u16 idx); + union virtio_map (*get_vq_map)(struct vdpa_device *vdev, u16 idx); int (*bind_mm)(struct vdpa_device *vdev, struct mm_struct *mm); void (*unbind_mm)(struct vdpa_device *vdev); @@ -520,9 +521,9 @@ static inline void vdpa_set_drvdata(struct vdpa_device *vdev, void *data) dev_set_drvdata(&vdev->dev, data); } -static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev) +static inline union virtio_map vdpa_get_map(struct vdpa_device *vdev) { - return vdev->dma_dev; + return vdev->vmap; } static inline int vdpa_reset(struct vdpa_device *vdev, u32 flags) -- cgit v1.2.3 From 0d16cc439f36355d04b17ac45c3001d90969aa44 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 24 Sep 2025 15:00:44 +0800 Subject: vdpa: introduce map ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Virtio core allows the transport to provide device or transport specific mapping functions. This patch adds this support to vDPA. We can simply do this by allowing the vDPA parent to register a virtio_map_ops. Reviewed-by: Christoph Hellwig Signed-off-by: Jason Wang Message-Id: <20250924070045.10361-2-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Eugenio Pérez --- drivers/vdpa/alibaba/eni_vdpa.c | 3 ++- drivers/vdpa/ifcvf/ifcvf_main.c | 3 ++- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- drivers/vdpa/octeon_ep/octep_vdpa_main.c | 4 ++-- drivers/vdpa/pds/vdpa_dev.c | 3 ++- drivers/vdpa/solidrun/snet_main.c | 4 ++-- drivers/vdpa/vdpa.c | 3 +++ drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- drivers/vdpa/vdpa_user/vduse_dev.c | 3 ++- drivers/vdpa/virtio_pci/vp_vdpa.c | 3 ++- drivers/virtio/virtio_vdpa.c | 4 +++- include/linux/vdpa.h | 10 +++++++--- 12 files changed, 29 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c index 54aea086d08c..e476504db0c8 100644 --- a/drivers/vdpa/alibaba/eni_vdpa.c +++ b/drivers/vdpa/alibaba/eni_vdpa.c @@ -478,7 +478,8 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; eni_vdpa = vdpa_alloc_device(struct eni_vdpa, vdpa, - dev, &eni_vdpa_ops, 1, 1, NULL, false); + dev, &eni_vdpa_ops, NULL, + 1, 1, NULL, false); if (IS_ERR(eni_vdpa)) { ENI_ERR(pdev, "failed to allocate vDPA structure\n"); return PTR_ERR(eni_vdpa); diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 979d188d74ee..6658dc74d915 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -705,7 +705,8 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, vf = &ifcvf_mgmt_dev->vf; pdev = vf->pdev; adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, - &pdev->dev, &ifc_vdpa_ops, 1, 1, NULL, false); + &pdev->dev, &ifc_vdpa_ops, + NULL, 1, 1, NULL, false); if (IS_ERR(adapter)) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); return PTR_ERR(adapter); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index a7e76f175914..82034efb74fc 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3882,7 +3882,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, } ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mgtdev->vdpa_ops, - MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false); + NULL, MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false); if (IS_ERR(ndev)) return PTR_ERR(ndev); diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c index 5818dae133a3..9e8d07078606 100644 --- a/drivers/vdpa/octeon_ep/octep_vdpa_main.c +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c @@ -508,8 +508,8 @@ static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, u64 device_features; int ret; - oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, 1, 1, - NULL, false); + oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, + NULL, 1, 1, NULL, false); if (IS_ERR(oct_vdpa)) { dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep vdpa device"); return PTR_ERR(oct_vdpa); diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c index 63d82263fb52..36f61cc96e21 100644 --- a/drivers/vdpa/pds/vdpa_dev.c +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -632,7 +632,8 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, } pdsv = vdpa_alloc_device(struct pds_vdpa_device, vdpa_dev, - dev, &pds_vdpa_ops, 1, 1, name, false); + dev, &pds_vdpa_ops, NULL, + 1, 1, name, false); if (IS_ERR(pdsv)) { dev_err(dev, "Failed to allocate vDPA structure: %pe\n", pdsv); return PTR_ERR(pdsv); diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c index 39050aab147f..4588211d57eb 100644 --- a/drivers/vdpa/solidrun/snet_main.c +++ b/drivers/vdpa/solidrun/snet_main.c @@ -1008,8 +1008,8 @@ static int snet_vdpa_probe_vf(struct pci_dev *pdev) } /* Allocate vdpa device */ - snet = vdpa_alloc_device(struct snet, vdpa, &pdev->dev, &snet_config_ops, 1, 1, NULL, - false); + snet = vdpa_alloc_device(struct snet, vdpa, &pdev->dev, &snet_config_ops, + NULL, 1, 1, NULL, false); if (!snet) { SNET_ERR(pdev, "Failed to allocate a vdpa device\n"); ret = -ENOMEM; diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index c71debeb8471..34874beb0152 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -142,6 +142,7 @@ static void vdpa_release_dev(struct device *d) * initialized but before registered. * @parent: the parent device * @config: the bus operations that is supported by this device + * @map: the map operations that is supported by this device * @ngroups: number of groups supported by this device * @nas: number of address spaces supported by this device * @size: size of the parent structure that contains private data @@ -156,6 +157,7 @@ static void vdpa_release_dev(struct device *d) */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, + const struct virtio_map_ops *map, unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va) @@ -187,6 +189,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, vdev->dev.release = vdpa_release_dev; vdev->index = err; vdev->config = config; + vdev->map = map; vdev->features_valid = false; vdev->use_va = use_va; vdev->ngroups = ngroups; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 22ee53538444..c1c6431950e1 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -215,7 +215,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, else ops = &vdpasim_config_ops; - vdpa = __vdpa_alloc_device(NULL, ops, + vdpa = __vdpa_alloc_device(NULL, ops, NULL, dev_attr->ngroups, dev_attr->nas, dev_attr->alloc_size, dev_attr->name, use_va); diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index f68ed569394c..ad782d20a8ed 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -2009,7 +2009,8 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) return -EEXIST; vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, - &vduse_vdpa_config_ops, 1, 1, name, true); + &vduse_vdpa_config_ops, NULL, + 1, 1, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 242641c0f2bd..17a19a728c9c 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -511,7 +511,8 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, int ret, i; vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, - dev, &vp_vdpa_ops, 1, 1, name, false); + dev, &vp_vdpa_ops, NULL, + 1, 1, name, false); if (IS_ERR(vp_vdpa)) { dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 8b27c6e8eebb..2464fa655f09 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -466,9 +466,11 @@ static int virtio_vdpa_probe(struct vdpa_device *vdpa) if (!vd_dev) return -ENOMEM; - vd_dev->vdev.dev.parent = vdpa_get_map(vdpa).dma_dev; + vd_dev->vdev.dev.parent = vdpa->map ? &vdpa->dev : + vdpa_get_map(vdpa).dma_dev; vd_dev->vdev.dev.release = virtio_vdpa_release_dev; vd_dev->vdev.config = &virtio_vdpa_config_ops; + vd_dev->vdev.map = vdpa->map; vd_dev->vdpa = vdpa; vd_dev->vdev.id.device = ops->get_device_id(vdpa); diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index ae0451945851..4cf21d6e9cfd 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -76,6 +76,7 @@ struct vdpa_mgmt_dev; * because core frees it; use driver_set_override() to * set or clear it. * @config: the configuration ops for this device. + * @map: the map ops for this device * @cf_lock: Protects get and set access to configuration layout. * @index: device index * @features_valid: were features initialized? for legacy guests @@ -91,6 +92,7 @@ struct vdpa_device { union virtio_map vmap; const char *driver_override; const struct vdpa_config_ops *config; + const struct virtio_map_ops *map; struct rw_semaphore cf_lock; /* Protects get/set config */ unsigned int index; bool features_valid; @@ -447,6 +449,7 @@ struct vdpa_config_ops { struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, + const struct virtio_map_ops *map, unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va); @@ -458,6 +461,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * @member: the name of struct vdpa_device within the @dev_struct * @parent: the parent device * @config: the bus operations that is supported by this device + * @map: the map operations that is supported by this device * @ngroups: the number of virtqueue groups supported by this device * @nas: the number of address spaces * @name: name of the vdpa device @@ -465,10 +469,10 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * * Return allocated data structure or ERR_PTR upon error */ -#define vdpa_alloc_device(dev_struct, member, parent, config, ngroups, nas, \ - name, use_va) \ +#define vdpa_alloc_device(dev_struct, member, parent, config, map, \ + ngroups, nas, name, use_va) \ container_of((__vdpa_alloc_device( \ - parent, config, ngroups, nas, \ + parent, config, map, ngroups, nas, \ (sizeof(dev_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ dev_struct, member))), name, use_va)), \ -- cgit v1.2.3 From 1c14b0e4ba988381e362ad8a9651eff0b21bd47f Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 24 Sep 2025 15:00:45 +0800 Subject: vduse: switch to use virtio map API instead of DMA API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lacking the support of device specific mapping supported in virtio, VDUSE must trick the DMA API in order to make virtio-vdpa transport work. This is done by advertising vDPA device as dma device with a VDUSE specific dma_ops even if it doesn't do DMA at all. This will be fixed by this patch. Thanks to the new mapping operations support by virtio and vDPA. VDUSE can simply switch to advertise its specific mappings operations to virtio via virtio-vdpa then DMA API is not needed for VDUSE any more and iova domain could be used as the mapping token instead. Signed-off-by: Jason Wang Message-Id: <20250924070045.10361-3-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Eugenio Pérez --- drivers/vdpa/Kconfig | 8 +--- drivers/vdpa/vdpa_user/iova_domain.c | 2 +- drivers/vdpa/vdpa_user/iova_domain.h | 2 +- drivers/vdpa/vdpa_user/vduse_dev.c | 78 ++++++++++++++++++------------------ include/linux/virtio.h | 4 ++ 5 files changed, 46 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 559fb9d3271f..857cf288c876 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -34,13 +34,7 @@ config VDPA_SIM_BLOCK config VDPA_USER tristate "VDUSE (vDPA Device in Userspace) support" - depends on EVENTFD && MMU && HAS_DMA - # - # This driver incorrectly tries to override the dma_ops. It should - # never have done that, but for now keep it working on architectures - # that use dma ops - # - depends on ARCH_HAS_DMA_OPS + depends on EVENTFD && MMU select VHOST_IOTLB select IOMMU_IOVA help diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index 58116f89d8da..ccaed24b7ef8 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -447,7 +447,7 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, size_t size, dma_addr_t *dma_addr, - gfp_t flag, unsigned long attrs) + gfp_t flag) { struct iova_domain *iovad = &domain->consistent_iovad; unsigned long limit = domain->iova_limit; diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index 7f3f0928ec78..1f3c30be272a 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -64,7 +64,7 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, size_t size, dma_addr_t *dma_addr, - gfp_t flag, unsigned long attrs); + gfp_t flag); void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, void *vaddr, dma_addr_t dma_addr, diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index ad782d20a8ed..e7bced0b5542 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -814,59 +814,53 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .free = vduse_vdpa_free, }; -static void vduse_dev_sync_single_for_device(struct device *dev, +static void vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; vduse_domain_sync_single_for_device(domain, dma_addr, size, dir); } -static void vduse_dev_sync_single_for_cpu(struct device *dev, +static void vduse_dev_sync_single_for_cpu(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir); } -static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page, +static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; return vduse_domain_map_page(domain, page, offset, size, dir, attrs); } -static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) +static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); } -static void *vduse_dev_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag, - unsigned long attrs) +static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size, + dma_addr_t *dma_addr, gfp_t flag) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; unsigned long iova; void *addr; *dma_addr = DMA_MAPPING_ERROR; addr = vduse_domain_alloc_coherent(domain, size, - (dma_addr_t *)&iova, flag, attrs); + (dma_addr_t *)&iova, flag); if (!addr) return NULL; @@ -875,31 +869,45 @@ static void *vduse_dev_alloc_coherent(struct device *dev, size_t size, return addr; } -static void vduse_dev_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs) +static void vduse_dev_free_coherent(union virtio_map token, size_t size, + void *vaddr, dma_addr_t dma_addr, + unsigned long attrs) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs); } -static size_t vduse_dev_max_mapping_size(struct device *dev) +static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; + + return dma_addr < domain->bounce_size; +} + +static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr) +{ + if (unlikely(dma_addr == DMA_MAPPING_ERROR)) + return -ENOMEM; + return 0; +} + +static size_t vduse_dev_max_mapping_size(union virtio_map token) +{ + struct vduse_iova_domain *domain = token.iova_domain; return domain->bounce_size; } -static const struct dma_map_ops vduse_dev_dma_ops = { +static const struct virtio_map_ops vduse_map_ops = { .sync_single_for_device = vduse_dev_sync_single_for_device, .sync_single_for_cpu = vduse_dev_sync_single_for_cpu, .map_page = vduse_dev_map_page, .unmap_page = vduse_dev_unmap_page, .alloc = vduse_dev_alloc_coherent, .free = vduse_dev_free_coherent, + .need_sync = vduse_dev_need_sync, + .mapping_error = vduse_dev_mapping_error, .max_mapping_size = vduse_dev_max_mapping_size, }; @@ -2003,27 +2011,18 @@ static struct vduse_mgmt_dev *vduse_mgmt; static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) { struct vduse_vdpa *vdev; - int ret; if (dev->vdev) return -EEXIST; vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, - &vduse_vdpa_config_ops, NULL, + &vduse_vdpa_config_ops, &vduse_map_ops, 1, 1, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); dev->vdev = vdev; vdev->dev = dev; - vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask; - ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64)); - if (ret) { - put_device(&vdev->vdpa.dev); - return ret; - } - set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops); - vdev->vdpa.vmap.dma_dev = &vdev->vdpa.dev; vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev; return 0; @@ -2056,6 +2055,7 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, return -ENOMEM; } + dev->vdev->vdpa.vmap.iova_domain = dev->domain; ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); if (ret) { put_device(&dev->vdev->vdpa.dev); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 3386a4a8d06b..96c66126c074 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -41,9 +41,13 @@ struct virtqueue { void *priv; }; +struct vduse_iova_domain; + union virtio_map { /* Device that performs DMA */ struct device *dma_dev; + /* VDUSE specific mapping data */ + struct vduse_iova_domain *iova_domain; }; int virtqueue_add_outbuf(struct virtqueue *vq, -- cgit v1.2.3 From f97aef092e199c10a3da96ae79b571edd5362faa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 12:12:37 +0200 Subject: cpufreq: Make drivers using CPUFREQ_ETERNAL specify transition latency Commit a755d0e2d41b ("cpufreq: Honour transition_latency over transition_delay_us") caused platforms where cpuinfo.transition_latency is CPUFREQ_ETERNAL to get a very large transition latency whereas previously it had been capped at 10 ms (and later at 2 ms). This led to a user-observable regression between 6.6 and 6.12 as described by Shawn: "The dbs sampling_rate was 10000 us on 6.6 and suddently becomes 6442450 us (4294967295 / 1000 * 1.5) on 6.12 for these platforms because the default transition delay was dropped [...]. It slows down dbs governor's reacting to CPU loading change dramatically. Also, as transition_delay_us is used by schedutil governor as rate_limit_us, it shows a negative impact on device idle power consumption, because the device gets slightly less time in the lowest OPP." Evidently, the expectation of the drivers using CPUFREQ_ETERNAL as cpuinfo.transition_latency was that it would be capped by the core, but they may as well return a default transition latency value instead of CPUFREQ_ETERNAL and the core need not do anything with it. Accordingly, introduce CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS and make all of the drivers in question use it instead of CPUFREQ_ETERNAL. Also update the related Rust binding. Fixes: a755d0e2d41b ("cpufreq: Honour transition_latency over transition_delay_us") Closes: https://lore.kernel.org/linux-pm/20250922125929.453444-1-shawnguo2@yeah.net/ Reported-by: Shawn Guo Reviewed-by: Mario Limonciello (AMD) Reviewed-by: Jie Zhan Acked-by: Viresh Kumar Cc: 6.6+ # 6.6+ Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/2264949.irdbgypaU6@rafael.j.wysocki [ rjw: Fix typo in new symbol name, drop redundant type cast from Rust binding ] Tested-by: Shawn Guo # with cpufreq-dt driver Reviewed-by: Qais Yousef Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 2 +- drivers/cpufreq/imx6q-cpufreq.c | 2 +- drivers/cpufreq/mediatek-cpufreq-hw.c | 2 +- drivers/cpufreq/rcpufreq_dt.rs | 2 +- drivers/cpufreq/scmi-cpufreq.c | 2 +- drivers/cpufreq/scpi-cpufreq.c | 2 +- drivers/cpufreq/spear-cpufreq.c | 2 +- include/linux/cpufreq.h | 3 +++ rust/kernel/cpufreq.rs | 7 ++++--- 9 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 506437489b4d..7d5079fd1688 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -104,7 +104,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) transition_latency = dev_pm_opp_get_max_transition_latency(cpu_dev); if (!transition_latency) - transition_latency = CPUFREQ_ETERNAL; + transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; cpumask_copy(policy->cpus, priv->cpus); policy->driver_data = priv; diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index db1c88e9d3f9..e93697d3edfd 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -442,7 +442,7 @@ soc_opp_out: } if (of_property_read_u32(np, "clock-latency", &transition_latency)) - transition_latency = CPUFREQ_ETERNAL; + transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; /* * Calculate the ramp time for max voltage change in the diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index fce5aa5ceea0..ae4500ab4891 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -309,7 +309,7 @@ static int mtk_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) latency = readl_relaxed(data->reg_bases[REG_FREQ_LATENCY]) * 1000; if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; policy->fast_switch_possible = true; diff --git a/drivers/cpufreq/rcpufreq_dt.rs b/drivers/cpufreq/rcpufreq_dt.rs index 7e1fbf9a091f..3909022e1c74 100644 --- a/drivers/cpufreq/rcpufreq_dt.rs +++ b/drivers/cpufreq/rcpufreq_dt.rs @@ -123,7 +123,7 @@ impl cpufreq::Driver for CPUFreqDTDriver { let mut transition_latency = opp_table.max_transition_latency_ns() as u32; if transition_latency == 0 { - transition_latency = cpufreq::ETERNAL_LATENCY_NS; + transition_latency = cpufreq::DEFAULT_TRANSITION_LATENCY_NS; } policy diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 38c165d526d1..d2a110079f5f 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -294,7 +294,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) latency = perf_ops->transition_latency_get(ph, domain); if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index dcbb0ae7dd47..e530345baddf 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -157,7 +157,7 @@ static int scpi_cpufreq_init(struct cpufreq_policy *policy) latency = scpi_ops->get_transition_latency(cpu_dev); if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c index 707c71090cc3..2a1550e1aa21 100644 --- a/drivers/cpufreq/spear-cpufreq.c +++ b/drivers/cpufreq/spear-cpufreq.c @@ -182,7 +182,7 @@ static int spear_cpufreq_probe(struct platform_device *pdev) if (of_property_read_u32(np, "clock-latency", &spear_cpufreq.transition_latency)) - spear_cpufreq.transition_latency = CPUFREQ_ETERNAL; + spear_cpufreq.transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; cnt = of_property_count_u32_elems(np, "cpufreq_tbl"); if (cnt <= 0) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 40966512ea18..bc8c083bc16a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -32,6 +32,9 @@ */ #define CPUFREQ_ETERNAL (-1) + +#define CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS NSEC_PER_MSEC + #define CPUFREQ_NAME_LEN 16 /* Print length for names. Extra 1 space for accommodating '\n' in prints */ #define CPUFREQ_NAME_PLEN (CPUFREQ_NAME_LEN + 1) diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index eea57ba95f24..2ea735700ae7 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -39,7 +39,8 @@ use macros::vtable; const CPUFREQ_NAME_LEN: usize = bindings::CPUFREQ_NAME_LEN as usize; /// Default transition latency value in nanoseconds. -pub const ETERNAL_LATENCY_NS: u32 = bindings::CPUFREQ_ETERNAL as u32; +pub const DEFAULT_TRANSITION_LATENCY_NS: u32 = + bindings::CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; /// CPU frequency driver flags. pub mod flags { @@ -400,13 +401,13 @@ impl TableBuilder { /// The following example demonstrates how to create a CPU frequency table. /// /// ``` -/// use kernel::cpufreq::{ETERNAL_LATENCY_NS, Policy}; +/// use kernel::cpufreq::{DEFAULT_TRANSITION_LATENCY_NS, Policy}; /// /// fn update_policy(policy: &mut Policy) { /// policy /// .set_dvfs_possible_from_any_cpu(true) /// .set_fast_switch_possible(true) -/// .set_transition_latency_ns(ETERNAL_LATENCY_NS); +/// .set_transition_latency_ns(DEFAULT_TRANSITION_LATENCY_NS); /// /// pr_info!("The policy details are: {:?}\n", (policy.cpu(), policy.cur())); /// } -- cgit v1.2.3 From c28a280bd465690981099cd6e43dfcfa5c28b133 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 12:29:50 +0200 Subject: ACPI: CPPC: Do not use CPUFREQ_ETERNAL as an error value Instead of using CPUFREQ_ETERNAL for signaling an error condition in cppc_get_transition_latency(), change the return value type of that function to int and make it return a proper negative error code on failures. No intentional functional impact. Reviewed-by: Mario Limonciello (AMD) Reviewed-by: Jie Zhan Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Reviewed-by: Qais Yousef --- drivers/acpi/cppc_acpi.c | 16 +++++++--------- drivers/cpufreq/amd-pstate.c | 8 ++++---- drivers/cpufreq/cppc_cpufreq.c | 4 ++-- include/acpi/cppc_acpi.h | 6 +++--- 4 files changed, 16 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 6b649031808f..ab4651205e8a 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1876,7 +1876,7 @@ EXPORT_SYMBOL_GPL(cppc_set_perf); * If desired_reg is in the SystemMemory or SystemIo ACPI address space, * then assume there is no latency. */ -unsigned int cppc_get_transition_latency(int cpu_num) +int cppc_get_transition_latency(int cpu_num) { /* * Expected transition latency is based on the PCCT timing values @@ -1889,31 +1889,29 @@ unsigned int cppc_get_transition_latency(int cpu_num) * completion of a command before issuing the next command, * in microseconds. */ - unsigned int latency_ns = 0; struct cpc_desc *cpc_desc; struct cpc_register_resource *desired_reg; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu_num); struct cppc_pcc_data *pcc_ss_data; + int latency_ns = 0; cpc_desc = per_cpu(cpc_desc_ptr, cpu_num); if (!cpc_desc) - return CPUFREQ_ETERNAL; + return -ENODATA; desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; if (CPC_IN_SYSTEM_MEMORY(desired_reg) || CPC_IN_SYSTEM_IO(desired_reg)) return 0; - else if (!CPC_IN_PCC(desired_reg)) - return CPUFREQ_ETERNAL; - if (pcc_ss_id < 0) - return CPUFREQ_ETERNAL; + if (!CPC_IN_PCC(desired_reg) || pcc_ss_id < 0) + return -ENODATA; pcc_ss_data = pcc_data[pcc_ss_id]; if (pcc_ss_data->pcc_mpar) latency_ns = 60 * (1000 * 1000 * 1000 / pcc_ss_data->pcc_mpar); - latency_ns = max(latency_ns, pcc_ss_data->pcc_nominal * 1000); - latency_ns = max(latency_ns, pcc_ss_data->pcc_mrtt * 1000); + latency_ns = max_t(int, latency_ns, pcc_ss_data->pcc_nominal * 1000); + latency_ns = max_t(int, latency_ns, pcc_ss_data->pcc_mrtt * 1000); return latency_ns; } diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index b4c79fde1979..298e92d8cc03 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -872,10 +872,10 @@ static void amd_pstate_update_limits(struct cpufreq_policy *policy) */ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) { - u32 transition_delay_ns; + int transition_delay_ns; transition_delay_ns = cppc_get_transition_latency(cpu); - if (transition_delay_ns == CPUFREQ_ETERNAL) { + if (transition_delay_ns < 0) { if (cpu_feature_enabled(X86_FEATURE_AMD_FAST_CPPC)) return AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY; else @@ -891,10 +891,10 @@ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) */ static u32 amd_pstate_get_transition_latency(unsigned int cpu) { - u32 transition_latency; + int transition_latency; transition_latency = cppc_get_transition_latency(cpu); - if (transition_latency == CPUFREQ_ETERNAL) + if (transition_latency < 0) return AMD_PSTATE_TRANSITION_LATENCY; return transition_latency; diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index b71946937c52..e23d9abea135 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -310,9 +310,9 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) static unsigned int __cppc_cpufreq_get_transition_delay_us(unsigned int cpu) { - unsigned int transition_latency_ns = cppc_get_transition_latency(cpu); + int transition_latency_ns = cppc_get_transition_latency(cpu); - if (transition_latency_ns == CPUFREQ_ETERNAL) + if (transition_latency_ns < 0) return CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS / NSEC_PER_USEC; return transition_latency_ns / NSEC_PER_USEC; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 20f3d62e7a16..13fa81504844 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -160,7 +160,7 @@ extern unsigned int cppc_khz_to_perf(struct cppc_perf_caps *caps, unsigned int f extern bool acpi_cpc_valid(void); extern bool cppc_allow_fast_switch(void); extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); -extern unsigned int cppc_get_transition_latency(int cpu); +extern int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); extern bool cpc_supported_by_cpu(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); @@ -216,9 +216,9 @@ static inline bool cppc_allow_fast_switch(void) { return false; } -static inline unsigned int cppc_get_transition_latency(int cpu) +static inline int cppc_get_transition_latency(int cpu) { - return CPUFREQ_ETERNAL; + return -ENODATA; } static inline bool cpc_ffh_supported(void) { -- cgit v1.2.3 From 950c6451a5c38d375993c3b9da427e2e69b01c30 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 12:31:47 +0200 Subject: cpufreq: Drop unused symbol CPUFREQ_ETERNAL Drop CPUFREQ_ETERNAL that has no users any more along with all references to it in the documentation. No functional impact. Reviewed-by: Mario Limonciello (AMD) Reviewed-by: Jie Zhan Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Reviewed-by: Qais Yousef --- Documentation/admin-guide/pm/cpufreq.rst | 4 ---- Documentation/cpu-freq/cpu-drivers.rst | 3 +-- Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst | 3 +-- Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst | 3 +-- include/linux/cpufreq.h | 5 ----- 5 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index cacb9f0307dd..738d7b4dc33a 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -274,10 +274,6 @@ are the following: The time it takes to switch the CPUs belonging to this policy from one P-state to another, in nanoseconds. - If unknown or if known to be so high that the scaling driver does not - work with the `ondemand`_ governor, -1 (:c:macro:`CPUFREQ_ETERNAL`) - will be returned by reads from this attribute. - ``related_cpus`` List of all (online and offline) CPUs belonging to this policy. diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst index d84ededb66f9..c5635ac3de54 100644 --- a/Documentation/cpu-freq/cpu-drivers.rst +++ b/Documentation/cpu-freq/cpu-drivers.rst @@ -109,8 +109,7 @@ Then, the driver must fill in the following values: +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | the time it takes on this CPU to | | | switch between two frequencies in | -| | nanoseconds (if appropriate, else | -| | specify CPUFREQ_ETERNAL) | +| | nanoseconds | +-----------------------------------+--------------------------------------+ |policy->cur | The current operating frequency of | | | this CPU (if appropriate) | diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst index 2ca92042767b..8238f4c6e4f5 100644 --- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -112,8 +112,7 @@ CPUfreq核心层注册一个cpufreq_driver结构体。 | | | +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | CPU在两个频率之间切换所需的时间,以 | -| | 纳秒为单位(如不适用,设定为 | -| | CPUFREQ_ETERNAL) | +| | 纳秒为单位 | | | | +-----------------------------------+--------------------------------------+ |policy->cur | 该CPU当前的工作频率(如适用) | diff --git a/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst index add3de2d4523..5435c3928d4b 100644 --- a/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst @@ -112,8 +112,7 @@ CPUfreq核心層註冊一個cpufreq_driver結構體。 | | | +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | CPU在兩個頻率之間切換所需的時間,以 | -| | 納秒爲單位(如不適用,設定爲 | -| | CPUFREQ_ETERNAL) | +| | 納秒爲單位 | | | | +-----------------------------------+--------------------------------------+ |policy->cur | 該CPU當前的工作頻率(如適用) | diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index bc8c083bc16a..0465d1e6f72a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -26,13 +26,8 @@ *********************************************************************/ /* * Frequency values here are CPU kHz - * - * Maximum transition latency is in nanoseconds - if it's unknown, - * CPUFREQ_ETERNAL shall be used. */ -#define CPUFREQ_ETERNAL (-1) - #define CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS NSEC_PER_MSEC #define CPUFREQ_NAME_LEN 16 -- cgit v1.2.3 From 4e66293bb141df33d5eb1f922e16fe05913bf296 Mon Sep 17 00:00:00 2001 From: Bhanu Seshu Kumar Valluri Date: Wed, 1 Oct 2025 14:27:16 +0530 Subject: of: doc: Fix typo in doc comments. synthetized => synthesized definied => defined sucess => success Signed-off-by: Bhanu Seshu Kumar Valluri Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 2 +- drivers/of/overlay.c | 2 +- include/linux/of.h | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/of/irq.c b/drivers/of/irq.c index e7c12abd10ab..1ca66b6e267c 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -163,7 +163,7 @@ const __be32 *of_irq_parse_imap_parent(const __be32 *imap, int len, struct of_ph * @out_irq: structure of_phandle_args updated by this function * * This function is a low-level interrupt tree walking function. It - * can be used to do a partial walk with synthetized reg and interrupts + * can be used to do a partial walk with synthesized reg and interrupts * properties, for example when resolving PCI interrupts when no device * node exist for the parent. It takes an interrupt specifier structure as * input, walks the tree looking for any interrupt-map properties, translates diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c index 1af6f52d0708..255e8362f600 100644 --- a/drivers/of/overlay.c +++ b/drivers/of/overlay.c @@ -135,7 +135,7 @@ static BLOCKING_NOTIFIER_HEAD(overlay_notify_chain); * @nb: Notifier block to register * * Register for notification on overlay operations on device tree nodes. The - * reported actions definied by @of_reconfig_change. The notifier callback + * reported actions defined by @of_reconfig_change. The notifier callback * furthermore receives a pointer to the affected device tree node. * * Note that a notifier callback is not supposed to store pointers to a device diff --git a/include/linux/of.h b/include/linux/of.h index 5e2c6ed9370a..121a288ca92d 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1134,7 +1134,7 @@ static inline bool of_phandle_args_equal(const struct of_phandle_args *a1, * Search for a property in a device node and count the number of u8 elements * in it. * - * Return: The number of elements on sucess, -EINVAL if the property does + * Return: The number of elements on success, -EINVAL if the property does * not exist or its length does not match a multiple of u8 and -ENODATA if the * property does not have a value. */ @@ -1153,7 +1153,7 @@ static inline int of_property_count_u8_elems(const struct device_node *np, * Search for a property in a device node and count the number of u16 elements * in it. * - * Return: The number of elements on sucess, -EINVAL if the property does + * Return: The number of elements on success, -EINVAL if the property does * not exist or its length does not match a multiple of u16 and -ENODATA if the * property does not have a value. */ @@ -1172,7 +1172,7 @@ static inline int of_property_count_u16_elems(const struct device_node *np, * Search for a property in a device node and count the number of u32 elements * in it. * - * Return: The number of elements on sucess, -EINVAL if the property does + * Return: The number of elements on success, -EINVAL if the property does * not exist or its length does not match a multiple of u32 and -ENODATA if the * property does not have a value. */ @@ -1191,7 +1191,7 @@ static inline int of_property_count_u32_elems(const struct device_node *np, * Search for a property in a device node and count the number of u64 elements * in it. * - * Return: The number of elements on sucess, -EINVAL if the property does + * Return: The number of elements on success, -EINVAL if the property does * not exist or its length does not match a multiple of u64 and -ENODATA if the * property does not have a value. */ -- cgit v1.2.3 From b595edcb24727e7f93e7962c3f6f971cc16dd29e Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 1 Oct 2025 16:08:46 -0700 Subject: hyperv: Remove the spurious null directive line The file contains a line that consists of the lone # symbol followed by a newline. While that is a valid syntax as defined by the C99+ grammar (6.10.7 "Null directive"), it serves no apparent purpose in this case. Remove the null preprocessor directive. No functional changes. Fixes: e68bda71a238 ("hyperv: Add new Hyper-V headers in include/hyperv") Signed-off-by: Roman Kisel Reviewed-by: Easwar Hariharan Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 1be7f6a02304..77abddfc750e 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -597,8 +597,6 @@ struct ms_hyperv_tsc_page { /* HV_REFERENCE_TSC_PAGE */ #define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) #define HV_SYNIC_SINT_VECTOR_MASK (0xFF) -# - /* Hyper-V defined statically assigned SINTs */ #define HV_SYNIC_INTERCEPTION_SINT_INDEX 0x00000000 #define HV_SYNIC_IOMMU_FAULT_SINT_INDEX 0x00000001 -- cgit v1.2.3 From 510d76646a6a7beaa49fc0da7282e285a3dfce97 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Tue, 23 Sep 2025 19:21:36 +0800 Subject: block: Update a comment of disk statistics >From commit 074a7aca7afa ("block: move stats from disk to part0"), we know that: * {disk|all}_stat_*() are gone. * disk_stat_lock/unlock() are renamed to part_stat_lock/unlock(). Therefore, outdated comments should be updated accordingly. Fixes: 074a7aca7afa ("block: move stats from disk to part0") Signed-off-by: Tang Yizhou Signed-off-by: Jens Axboe --- include/linux/part_stat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index eeeff2a04529..729415e91215 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -17,8 +17,8 @@ struct disk_stats { /* * Macros to operate on percpu disk statistics: * - * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters and should - * be called between disk_stat_lock() and disk_stat_unlock(). + * part_stat_{add|sub|inc|dec}() modify the stat counters and should + * be called between part_stat_lock() and part_stat_unlock(). * * part_stat_read() can be called at any time. */ -- cgit v1.2.3 From 10aa5c80603088d10c2cd5e7e27d561a8fb59c7e Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Tue, 30 Sep 2025 14:27:52 +0200 Subject: drm/gpusvm, drm/xe: Fix userptr to not allow device private pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When userptr is used on SVM-enabled VMs, a non-NULL hmm_range::dev_private_owner value might mean that hmm_range_fault() attempts to return device private pages. Either that will fail, or the userptr code will not know how to handle those. Use NULL for hmm_range::dev_private_owner to migrate such pages to system. In order to do that, move the struct drm_gpusvm::device_private_page_owner field to struct drm_gpusvm_ctx::device_private_page_owner so that it doesn't remain immutable over the drm_gpusvm lifetime. v2: - Don't conditionally compile xe_svm_devm_owner(). - Kerneldoc xe_svm_devm_owner(). Fixes: 9e9787414882 ("drm/xe/userptr: replace xe_hmm with gpusvm") Cc: Matthew Auld Cc: Himal Prasad Ghimiray Cc: Matthew Brost Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Reviewed-by: Matthew Brost Acked-by: Maarten Lankhorst Link: https://lore.kernel.org/r/20250930122752.96034-1-thomas.hellstrom@linux.intel.com (cherry picked from commit ad298d9ec957414dbf3d51f3c8bca4b6d2416c0c) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/drm_gpusvm.c | 24 +++++++++++++----------- drivers/gpu/drm/xe/xe_svm.c | 11 +++-------- drivers/gpu/drm/xe/xe_svm.h | 14 ++++++++++++++ drivers/gpu/drm/xe/xe_userptr.c | 1 + drivers/gpu/drm/xe/xe_vm.c | 1 + include/drm/drm_gpusvm.h | 7 ++++--- 6 files changed, 36 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c index eeeeb99cfdf6..cb906765897e 100644 --- a/drivers/gpu/drm/drm_gpusvm.c +++ b/drivers/gpu/drm/drm_gpusvm.c @@ -361,7 +361,6 @@ static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = { * @name: Name of the GPU SVM. * @drm: Pointer to the DRM device structure. * @mm: Pointer to the mm_struct for the address space. - * @device_private_page_owner: Device private pages owner. * @mm_start: Start address of GPU SVM. * @mm_range: Range of the GPU SVM. * @notifier_size: Size of individual notifiers. @@ -383,7 +382,7 @@ static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = { */ int drm_gpusvm_init(struct drm_gpusvm *gpusvm, const char *name, struct drm_device *drm, - struct mm_struct *mm, void *device_private_page_owner, + struct mm_struct *mm, unsigned long mm_start, unsigned long mm_range, unsigned long notifier_size, const struct drm_gpusvm_ops *ops, @@ -395,15 +394,13 @@ int drm_gpusvm_init(struct drm_gpusvm *gpusvm, mmgrab(mm); } else { /* No full SVM mode, only core drm_gpusvm_pages API. */ - if (ops || num_chunks || mm_range || notifier_size || - device_private_page_owner) + if (ops || num_chunks || mm_range || notifier_size) return -EINVAL; } gpusvm->name = name; gpusvm->drm = drm; gpusvm->mm = mm; - gpusvm->device_private_page_owner = device_private_page_owner; gpusvm->mm_start = mm_start; gpusvm->mm_range = mm_range; gpusvm->notifier_size = notifier_size; @@ -684,6 +681,7 @@ static unsigned int drm_gpusvm_hmm_pfn_to_order(unsigned long hmm_pfn, * @notifier: Pointer to the GPU SVM notifier structure * @start: Start address * @end: End address + * @dev_private_owner: The device private page owner * * Check if pages between start and end have been faulted in on the CPU. Use to * prevent migration of pages without CPU backing store. @@ -692,14 +690,15 @@ static unsigned int drm_gpusvm_hmm_pfn_to_order(unsigned long hmm_pfn, */ static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm, struct drm_gpusvm_notifier *notifier, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + void *dev_private_owner) { struct hmm_range hmm_range = { .default_flags = 0, .notifier = ¬ifier->notifier, .start = start, .end = end, - .dev_private_owner = gpusvm->device_private_page_owner, + .dev_private_owner = dev_private_owner, }; unsigned long timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); @@ -753,6 +752,7 @@ err_free: * @gpuva_start: Start address of GPUVA which mirrors CPU * @gpuva_end: End address of GPUVA which mirrors CPU * @check_pages_threshold: Check CPU pages for present threshold + * @dev_private_owner: The device private page owner * * This function determines the chunk size for the GPU SVM range based on the * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual @@ -767,7 +767,8 @@ drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm, unsigned long fault_addr, unsigned long gpuva_start, unsigned long gpuva_end, - unsigned long check_pages_threshold) + unsigned long check_pages_threshold, + void *dev_private_owner) { unsigned long start, end; int i = 0; @@ -814,7 +815,7 @@ retry: * process-many-malloc' mallocs at least 64k at a time. */ if (end - start <= check_pages_threshold && - !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) { + !drm_gpusvm_check_pages(gpusvm, notifier, start, end, dev_private_owner)) { ++i; goto retry; } @@ -957,7 +958,8 @@ drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas, fault_addr, gpuva_start, gpuva_end, - ctx->check_pages_threshold); + ctx->check_pages_threshold, + ctx->device_private_page_owner); if (chunk_size == LONG_MAX) { err = -EINVAL; goto err_notifier_remove; @@ -1268,7 +1270,7 @@ int drm_gpusvm_get_pages(struct drm_gpusvm *gpusvm, .notifier = notifier, .start = pages_start, .end = pages_end, - .dev_private_owner = gpusvm->device_private_page_owner, + .dev_private_owner = ctx->device_private_page_owner, }; void *zdd; unsigned long timeout = diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index 7f2f1f041f1d..7e2db71ff34e 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -67,11 +67,6 @@ void xe_svm_range_debug(struct xe_svm_range *range, const char *operation) range_debug(range, operation); } -static void *xe_svm_devm_owner(struct xe_device *xe) -{ - return xe; -} - static struct drm_gpusvm_range * xe_svm_range_alloc(struct drm_gpusvm *gpusvm) { @@ -744,15 +739,14 @@ int xe_svm_init(struct xe_vm *vm) xe_svm_garbage_collector_work_func); err = drm_gpusvm_init(&vm->svm.gpusvm, "Xe SVM", &vm->xe->drm, - current->mm, xe_svm_devm_owner(vm->xe), 0, - vm->size, + current->mm, 0, vm->size, xe_modparam.svm_notifier_size * SZ_1M, &gpusvm_ops, fault_chunk_sizes, ARRAY_SIZE(fault_chunk_sizes)); drm_gpusvm_driver_set_lock(&vm->svm.gpusvm, &vm->lock); } else { err = drm_gpusvm_init(&vm->svm.gpusvm, "Xe SVM (simple)", - &vm->xe->drm, NULL, NULL, 0, 0, 0, NULL, + &vm->xe->drm, NULL, 0, 0, 0, NULL, NULL, 0); } @@ -1017,6 +1011,7 @@ static int __xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma, .devmem_only = need_vram && devmem_possible, .timeslice_ms = need_vram && devmem_possible ? vm->xe->atomic_svm_timeslice_ms : 0, + .device_private_page_owner = xe_svm_devm_owner(vm->xe), }; struct xe_validation_ctx vctx; struct drm_exec exec; diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h index cef6ee7d6fe3..0955d2ac8d74 100644 --- a/drivers/gpu/drm/xe/xe_svm.h +++ b/drivers/gpu/drm/xe/xe_svm.h @@ -6,6 +6,20 @@ #ifndef _XE_SVM_H_ #define _XE_SVM_H_ +struct xe_device; + +/** + * xe_svm_devm_owner() - Return the owner of device private memory + * @xe: The xe device. + * + * Return: The owner of this device's device private memory to use in + * hmm_range_fault()- + */ +static inline void *xe_svm_devm_owner(struct xe_device *xe) +{ + return xe; +} + #if IS_ENABLED(CONFIG_DRM_XE_GPUSVM) #include diff --git a/drivers/gpu/drm/xe/xe_userptr.c b/drivers/gpu/drm/xe/xe_userptr.c index 91d09af71ced..f16e92cd8090 100644 --- a/drivers/gpu/drm/xe/xe_userptr.c +++ b/drivers/gpu/drm/xe/xe_userptr.c @@ -54,6 +54,7 @@ int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma) struct xe_device *xe = vm->xe; struct drm_gpusvm_ctx ctx = { .read_only = xe_vma_read_only(vma), + .device_private_page_owner = NULL, }; lockdep_assert_held(&vm->lock); diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 0cacab20ff85..027e6ce648c5 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -2881,6 +2881,7 @@ static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op) ctx.read_only = xe_vma_read_only(vma); ctx.devmem_possible = devmem_possible; ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0; + ctx.device_private_page_owner = xe_svm_devm_owner(vm->xe); /* TODO: Threading the migration */ xa_for_each(&op->prefetch_range.range, i, svm_range) { diff --git a/include/drm/drm_gpusvm.h b/include/drm/drm_gpusvm.h index 5434048a2ca4..b92faa9a26b2 100644 --- a/include/drm/drm_gpusvm.h +++ b/include/drm/drm_gpusvm.h @@ -179,7 +179,6 @@ struct drm_gpusvm_range { * @name: Name of the GPU SVM * @drm: Pointer to the DRM device structure * @mm: Pointer to the mm_struct for the address space - * @device_private_page_owner: Device private pages owner * @mm_start: Start address of GPU SVM * @mm_range: Range of the GPU SVM * @notifier_size: Size of individual notifiers @@ -204,7 +203,6 @@ struct drm_gpusvm { const char *name; struct drm_device *drm; struct mm_struct *mm; - void *device_private_page_owner; unsigned long mm_start; unsigned long mm_range; unsigned long notifier_size; @@ -226,6 +224,8 @@ struct drm_gpusvm { /** * struct drm_gpusvm_ctx - DRM GPU SVM context * + * @device_private_page_owner: The device-private page owner to use for + * this operation * @check_pages_threshold: Check CPU pages for present if chunk is less than or * equal to threshold. If not present, reduce chunk * size. @@ -239,6 +239,7 @@ struct drm_gpusvm { * Context that is DRM GPUSVM is operating in (i.e. user arguments). */ struct drm_gpusvm_ctx { + void *device_private_page_owner; unsigned long check_pages_threshold; unsigned long timeslice_ms; unsigned int in_notifier :1; @@ -249,7 +250,7 @@ struct drm_gpusvm_ctx { int drm_gpusvm_init(struct drm_gpusvm *gpusvm, const char *name, struct drm_device *drm, - struct mm_struct *mm, void *device_private_page_owner, + struct mm_struct *mm, unsigned long mm_start, unsigned long mm_range, unsigned long notifier_size, const struct drm_gpusvm_ops *ops, -- cgit v1.2.3 From 16abbabc004bedeeaa702e11913da9d4fa70e63a Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Wed, 1 Oct 2025 08:10:28 +0200 Subject: dma-mapping: fix direction in dma_alloc direction traces Set __entry->dir to the actual "dir" parameter of all trace events in dma_alloc_class. This struct member was left uninitialized by mistake. Signed-off-by: Petr Tesarik Fixes: 3afff779a725 ("dma-mapping: trace dma_alloc/free direction") Cc: stable@vger.kernel.org Reviewed-by: Sean Anderson Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251001061028.412258-1-ptesarik@suse.com --- include/trace/events/dma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 5da59fd8121d..b3fef140ae15 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -133,6 +133,7 @@ DECLARE_EVENT_CLASS(dma_alloc_class, __entry->dma_addr = dma_addr; __entry->size = size; __entry->flags = flags; + __entry->dir = dir; __entry->attrs = attrs; ), -- cgit v1.2.3 From 7a0f94361ffd6e1d31c79023e8674b492bef05e3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 30 Sep 2025 19:24:26 -0700 Subject: net: psp: don't assume reply skbs will have a socket Rx path may be passing around unreferenced sockets, which means that skb_set_owner_edemux() may not set skb->sk and PSP will crash: KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] RIP: 0010:psp_reply_set_decrypted (./include/net/psp/functions.h:132 net/psp/psp_sock.c:287) tcp_v6_send_response.constprop.0 (net/ipv6/tcp_ipv6.c:979) tcp_v6_send_reset (net/ipv6/tcp_ipv6.c:1140 (discriminator 1)) tcp_v6_do_rcv (net/ipv6/tcp_ipv6.c:1683) tcp_v6_rcv (net/ipv6/tcp_ipv6.c:1912) Fixes: 659a2899a57d ("tcp: add datapath logic for PSP with inline key exchange") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251001022426.2592750-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/psp/functions.h | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/psp/psp_sock.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index ef7743664da3..c5c23a54774e 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -34,7 +34,7 @@ unsigned int psp_key_size(u32 version); void psp_sk_assoc_free(struct sock *sk); void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); void psp_twsk_assoc_free(struct inet_timewait_sock *tw); -void psp_reply_set_decrypted(struct sk_buff *skb); +void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb); static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { @@ -160,7 +160,7 @@ static inline void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void -psp_reply_set_decrypted(struct sk_buff *skb) { } +psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { } static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5ca97ede979c..ff11d3a85a36 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1668,7 +1668,7 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, nskb->ip_summed = CHECKSUM_NONE; if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); - psp_reply_set_decrypted(nskb); + psp_reply_set_decrypted(orig_sk, nskb); } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9622c2776ade..59c4977a811a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -974,7 +974,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); - psp_reply_set_decrypted(buff); + psp_reply_set_decrypted(sk, buff); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index 5324a7603bed..a931d825d1cc 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -279,12 +279,12 @@ void psp_twsk_assoc_free(struct inet_timewait_sock *tw) psp_assoc_put(pas); } -void psp_reply_set_decrypted(struct sk_buff *skb) +void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { struct psp_assoc *pas; rcu_read_lock(); - pas = psp_sk_get_assoc_rcu(skb->sk); + pas = psp_sk_get_assoc_rcu(sk); if (pas && pas->tx.spi) skb->decrypted = 1; rcu_read_unlock(); -- cgit v1.2.3 From 1b54b0756f051c11f5a5d0fbc1581e0b9a18e2bc Mon Sep 17 00:00:00 2001 From: Bhanu Seshu Kumar Valluri Date: Wed, 1 Oct 2025 16:27:15 +0530 Subject: net: doc: Fix typos in docs Fix typos in doc comments. Signed-off-by: Bhanu Seshu Kumar Valluri Link: https://patch.msgid.link/20251001105715.50462-1-bhanuseshukumar@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++-- net/tipc/crypto.c | 2 +- net/tipc/topsrv.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7a54a8b4d277..3c7634482356 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -297,7 +297,7 @@ static inline const char *phy_modes(phy_interface_t interface) * * Description: maps RGMII supported link speeds into the clock rates. * This can also be used for MII, GMII, and RMII interface modes as the - * clock rates are indentical, but the caller must be aware that errors + * clock rates are identical, but the caller must be aware that errors * for unsupported clock rates will not be signalled. * * Returns: clock rate or negative errno @@ -519,7 +519,7 @@ enum phy_state { * struct phy_c45_device_ids - 802.3-c45 Device Identifiers * @devices_in_package: IEEE 802.3 devices in package register value. * @mmds_present: bit vector of MMDs present. - * @device_ids: The device identifer for each present device. + * @device_ids: The device identifier for each present device. */ struct phy_c45_device_ids { u32 devices_in_package; diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index ea5bb131ebd0..751904f10aab 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1797,7 +1797,7 @@ exit: * @b: bearer where the message has been received * * If the decryption is successful, the decrypted skb is returned directly or - * as the callback, the encryption header and auth tag will be trimed out + * as the callback, the encryption header and auth tag will be trimmed out * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete(). * Otherwise, the skb will be freed! * Note: RX key(s) can be re-aligned, or in case of no key suitable, TX diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index ffe577bf6b51..aad7f96b6009 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -57,7 +57,7 @@ * @conn_idr: identifier set of connection * @idr_lock: protect the connection identifier set * @idr_in_use: amount of allocated identifier entry - * @net: network namspace instance + * @net: network namespace instance * @awork: accept work item * @rcv_wq: receive workqueue * @send_wq: send workqueue @@ -83,7 +83,7 @@ struct tipc_topsrv { * @sock: socket handler associated with connection * @flags: indicates connection state * @server: pointer to connected server - * @sub_list: lsit to all pertaing subscriptions + * @sub_list: list to all pertaining subscriptions * @sub_lock: lock protecting the subscription list * @rwork: receive work item * @outqueue: pointer to first outbound message in queue -- cgit v1.2.3 From 384b52ce32110db974d3b61d463af48347eb73fb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 1 Oct 2025 19:34:29 +0200 Subject: PM: runtime: Introduce one more usage counter guard Follow previous commit 9a0abc39450a ("PM: runtime: Add auto-cleanup macros for "resume and get" operations") and define a runtime PM usage counter guard in which pm_runtime_get_noresume() and pm_runtime_put_noidle() will be used for incrementing and decrementing it, respectively. Signed-off-by: Rafael J. Wysocki Reviewed-by: Jonathan Cameron --- include/linux/pm_runtime.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index edb8aed5ef62..a3f44f6c2da1 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -614,6 +614,9 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) return __pm_runtime_put_autosuspend(dev); } +DEFINE_GUARD(pm_runtime_noresume, struct device *, + pm_runtime_get_noresume(_T), pm_runtime_put_noidle(_T)); + DEFINE_GUARD(pm_runtime_active, struct device *, pm_runtime_get_sync(_T), pm_runtime_put(_T)); DEFINE_GUARD(pm_runtime_active_auto, struct device *, -- cgit v1.2.3 From b8179af120943e2fc099ea87caa234039a709a66 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 29 Jul 2025 08:46:35 +0200 Subject: mm/memory_hotplug: activate node before adding new memory blocks The sysfs attributes for memory blocks require the node ID to be set and initialized, so move the node activation before adding new memory blocks. This also has the nice side effect that the BUG_ON() can be converted into a WARN_ON() as we now can handle registration errors. Link: https://lkml.kernel.org/r/20250729064637.51662-3-hare@kernel.org Fixes: b9ff036082cd ("mm/memory_hotplug.c: make add_memory_resource use __try_online_node") Signed-off-by: Hannes Reinecke Acked-by: David Hildenbrand Acked-by: Oscar Salvador Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 2 +- mm/memory_hotplug.c | 32 +++++++++++++++++--------------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 894d3891292b..fb212a889e65 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -879,7 +879,7 @@ static void remove_memory_block(struct memory_block *memory) * Called under device_hotplug_lock. */ int create_memory_block_devices(unsigned long start, unsigned long size, - struct vmem_altmap *altmap, + int nid, struct vmem_altmap *altmap, struct memory_group *group) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); @@ -893,7 +893,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_OFFLINE, altmap, group); + ret = add_memory_block(block_id, nid, MEM_OFFLINE, altmap, group); if (ret) break; } diff --git a/include/linux/memory.h b/include/linux/memory.h index 40eb70ccb09d..4a29153e372e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -159,7 +159,7 @@ static inline unsigned long memory_block_advised_max_size(void) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - struct vmem_altmap *altmap, + int nid, struct vmem_altmap *altmap, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e9f14de4a9c9..0be83039c3b5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1477,7 +1477,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, } /* create memory block devices after memory was added */ - ret = create_memory_block_devices(cur_start, memblock_size, + ret = create_memory_block_devices(cur_start, memblock_size, nid, params.altmap, group); if (ret) { arch_remove_memory(cur_start, memblock_size, NULL); @@ -1539,8 +1539,16 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) ret = __try_online_node(nid, false); if (ret < 0) - goto error; - new_node = ret; + goto error_memblock_remove; + if (ret) { + node_set_online(nid); + ret = register_one_node(nid); + if (WARN_ON(ret)) { + node_set_offline(nid); + goto error_memblock_remove; + } + new_node = true; + } /* * Self hosted memmap array @@ -1556,24 +1564,13 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, NULL, group); + ret = create_memory_block_devices(start, size, nid, NULL, group); if (ret) { arch_remove_memory(start, size, params.altmap); goto error; } } - if (new_node) { - /* If sysfs file of new node can't be created, cpu on the node - * can't be hot-added. There is no rollback way now. - * So, check by BUG_ON() to catch it reluctantly.. - * We online node here. We can't roll back from here. - */ - node_set_online(nid); - ret = register_one_node(nid); - BUG_ON(ret); - } - register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); @@ -1597,6 +1594,11 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) return ret; error: + if (new_node) { + node_set_offline(nid); + unregister_one_node(nid); + } +error_memblock_remove: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); error_mem_hotplug_end: -- cgit v1.2.3 From 0a947c14e48cbf9de222836170282e0167a9e096 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 29 Jul 2025 08:46:36 +0200 Subject: drivers/base: move memory_block_add_nid() into the caller Now the node id only needs to be set for early memory, so move memory_block_add_nid() into the caller and rename it into memory_block_add_nid_early(). This allows us to further simplify the code by dropping the 'context' argument to do_register_memory_block_under_node(). Link: https://lkml.kernel.org/r/20250729064637.51662-4-hare@kernel.org Suggested-by: David Hildenbrand Signed-off-by: Hannes Reinecke Acked-by: David Hildenbrand Acked-by: Oscar Salvador Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- drivers/base/memory.c | 36 ++++++++++++++++++------------------ drivers/base/node.c | 10 ++++------ include/linux/memory.h | 3 +-- 3 files changed, 23 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index fb212a889e65..6d84a02cfa5d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -769,21 +769,22 @@ static struct zone *early_node_zone_for_memory_block(struct memory_block *mem, #ifdef CONFIG_NUMA /** - * memory_block_add_nid() - Indicate that system RAM falling into this memory - * block device (partially) belongs to the given node. + * memory_block_add_nid_early() - Indicate that early system RAM falling into + * this memory block device (partially) belongs + * to the given node. * @mem: The memory block device. * @nid: The node id. - * @context: The memory initialization context. * - * Indicate that system RAM falling into this memory block (partially) belongs - * to the given node. If the context indicates ("early") that we are adding the - * node during node device subsystem initialization, this will also properly - * set/adjust mem->zone based on the zone ranges of the given node. + * Indicate that early system RAM falling into this memory block (partially) + * belongs to the given node. This will also properly set/adjust mem->zone based + * on the zone ranges of the given node. + * + * Memory hotplug handles this on memory block creation, where we can only have + * a single nid span a memory block. */ -void memory_block_add_nid(struct memory_block *mem, int nid, - enum meminit_context context) +void memory_block_add_nid_early(struct memory_block *mem, int nid) { - if (context == MEMINIT_EARLY && mem->nid != nid) { + if (mem->nid != nid) { /* * For early memory we have to determine the zone when setting * the node id and handle multiple nodes spanning a single @@ -797,15 +798,14 @@ void memory_block_add_nid(struct memory_block *mem, int nid, mem->zone = early_node_zone_for_memory_block(mem, nid); else mem->zone = NULL; + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. If we span multiple nodes (not applicable + * to hotplugged memory), zone == NULL will prohibit memory offlining + * and consequently unplug. + */ + mem->nid = nid; } - - /* - * If this memory block spans multiple nodes, we only indicate - * the last processed node. If we span multiple nodes (not applicable - * to hotplugged memory), zone == NULL will prohibit memory offlining - * and consequently unplug. - */ - mem->nid = nid; } #endif diff --git a/drivers/base/node.c b/drivers/base/node.c index 67b01d579737..6b6e55a98b79 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -781,13 +781,10 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG static void do_register_memory_block_under_node(int nid, - struct memory_block *mem_blk, - enum meminit_context context) + struct memory_block *mem_blk) { int ret; - memory_block_add_nid(mem_blk, nid, context); - ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, kobject_name(&mem_blk->dev.kobj)); @@ -815,7 +812,7 @@ static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk, { int nid = *(int *)arg; - do_register_memory_block_under_node(nid, mem_blk, MEMINIT_HOTPLUG); + do_register_memory_block_under_node(nid, mem_blk); return 0; } @@ -855,7 +852,8 @@ static void register_memory_blocks_under_nodes(void) if (!mem) continue; - do_register_memory_block_under_node(nid, mem, MEMINIT_EARLY); + memory_block_add_nid_early(mem, nid); + do_register_memory_block_under_node(nid, mem); put_device(&mem->dev); } diff --git a/include/linux/memory.h b/include/linux/memory.h index 4a29153e372e..43d378038ce2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -202,8 +202,7 @@ static inline unsigned long phys_to_block_id(unsigned long phys) } #ifdef CONFIG_NUMA -void memory_block_add_nid(struct memory_block *mem, int nid, - enum meminit_context context); +void memory_block_add_nid_early(struct memory_block *mem, int nid); #endif /* CONFIG_NUMA */ int memory_block_advise_max_size(unsigned long size); unsigned long memory_block_advised_max_size(void); -- cgit v1.2.3 From de7342228b7343774d6a9981c2ddbfb5e201044b Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sat, 4 Oct 2025 22:23:29 +0800 Subject: bpf: Finish constification of 1st parameter of bpf_d_path() The commit 1b8abbb12128 ("bpf...d_path(): constify path argument") constified the first parameter of the bpf_d_path(), but failed to update it in all places. Finish constification. Otherwise the selftest fail to build: .../selftests/bpf/bpf_experimental.h:222:12: error: conflicting types for 'bpf_path_d_path' 222 | extern int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz) __ksym; | ^ .../selftests/bpf/tools/include/vmlinux.h:153922:12: note: previous declaration is here 153922 | extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __weak __ksym; Fixes: 1b8abbb12128 ("bpf...d_path(): constify path argument") Signed-off-by: Rong Tao Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- scripts/bpf_doc.py | 1 + tools/include/uapi/linux/bpf.h | 2 +- tools/testing/selftests/bpf/progs/verifier_vfs_accept.c | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ae83d8649ef1..6829936d33f5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4891,7 +4891,7 @@ union bpf_attr { * * **-ENOENT** if the bpf_local_storage cannot be found. * - * long bpf_d_path(struct path *path, char *buf, u32 sz) + * long bpf_d_path(const struct path *path, char *buf, u32 sz) * Description * Return full path for given **struct path** object, which * needs to be the kernel BTF *path* object. The path is diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index c77dc40f7689..15d113a1bc1d 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -788,6 +788,7 @@ class PrinterHelpersHeader(Printer): 'struct task_struct', 'struct cgroup', 'struct path', + 'const struct path', 'struct btf_ptr', 'struct inode', 'struct socket', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ae83d8649ef1..6829936d33f5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4891,7 +4891,7 @@ union bpf_attr { * * **-ENOENT** if the bpf_local_storage cannot be found. * - * long bpf_d_path(struct path *path, char *buf, u32 sz) + * long bpf_d_path(const struct path *path, char *buf, u32 sz) * Description * Return full path for given **struct path** object, which * needs to be the kernel BTF *path* object. The path is diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c index 3e2d76ee8050..55398c04290a 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c @@ -70,7 +70,7 @@ __success int BPF_PROG(path_d_path_from_file_argument, struct file *file) { int ret; - struct path *path; + const struct path *path; /* The f_path member is a path which is embedded directly within a * file. Therefore, a pointer to such embedded members are still -- cgit v1.2.3 From b71a6e2a1b710ea31c363a72c75f510ef35d8e69 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 2 Oct 2025 17:12:35 +0900 Subject: i2c: rename wait_for_completion callback to wait_for_completion_cb Functionally no change. Remove the ambiguity of 'wait_for_completion'. It helps development of the DEPT dependency tracker, but seems favorable in any case. Signed-off-by: Byungchul Park [wsa: reworded commit message] Signed-off-by: Wolfram Sang --- drivers/i2c/algos/i2c-algo-pca.c | 2 +- drivers/i2c/busses/i2c-pca-isa.c | 2 +- drivers/i2c/busses/i2c-pca-platform.c | 2 +- include/linux/i2c-algo-pca.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/i2c/algos/i2c-algo-pca.c b/drivers/i2c/algos/i2c-algo-pca.c index 74b66aec33d4..ee86df4cff4b 100644 --- a/drivers/i2c/algos/i2c-algo-pca.c +++ b/drivers/i2c/algos/i2c-algo-pca.c @@ -30,7 +30,7 @@ static int i2c_debug; #define pca_clock(adap) adap->i2c_clock #define pca_set_con(adap, val) pca_outw(adap, I2C_PCA_CON, val) #define pca_get_con(adap) pca_inw(adap, I2C_PCA_CON) -#define pca_wait(adap) adap->wait_for_completion(adap->data) +#define pca_wait(adap) adap->wait_for_completion_cb(adap->data) static void pca_reset(struct i2c_algo_pca_data *adap) { diff --git a/drivers/i2c/busses/i2c-pca-isa.c b/drivers/i2c/busses/i2c-pca-isa.c index 85e8cf58e8bf..0cbf2f509527 100644 --- a/drivers/i2c/busses/i2c-pca-isa.c +++ b/drivers/i2c/busses/i2c-pca-isa.c @@ -95,7 +95,7 @@ static struct i2c_algo_pca_data pca_isa_data = { /* .data intentionally left NULL, not needed with ISA */ .write_byte = pca_isa_writebyte, .read_byte = pca_isa_readbyte, - .wait_for_completion = pca_isa_waitforcompletion, + .wait_for_completion_cb = pca_isa_waitforcompletion, .reset_chip = pca_isa_resetchip, }; diff --git a/drivers/i2c/busses/i2c-pca-platform.c b/drivers/i2c/busses/i2c-pca-platform.c index 87da8241b927..c0f35ebbe37d 100644 --- a/drivers/i2c/busses/i2c-pca-platform.c +++ b/drivers/i2c/busses/i2c-pca-platform.c @@ -180,7 +180,7 @@ static int i2c_pca_pf_probe(struct platform_device *pdev) } i2c->algo_data.data = i2c; - i2c->algo_data.wait_for_completion = i2c_pca_pf_waitforcompletion; + i2c->algo_data.wait_for_completion_cb = i2c_pca_pf_waitforcompletion; if (i2c->gpio) i2c->algo_data.reset_chip = i2c_pca_pf_resetchip; else diff --git a/include/linux/i2c-algo-pca.h b/include/linux/i2c-algo-pca.h index 7c522fdd9ea7..e305bf32e40a 100644 --- a/include/linux/i2c-algo-pca.h +++ b/include/linux/i2c-algo-pca.h @@ -71,7 +71,7 @@ struct i2c_algo_pca_data { void *data; /* private low level data */ void (*write_byte) (void *data, int reg, int val); int (*read_byte) (void *data, int reg); - int (*wait_for_completion) (void *data); + int (*wait_for_completion_cb) (void *data); void (*reset_chip) (void *data); /* For PCA9564, use one of the predefined frequencies: * 330000, 288000, 217000, 146000, 88000, 59000, 44000, 36000 -- cgit v1.2.3 From 929bf010e0599ddef6b640cd314f1de65dd1ca3e Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 14 Aug 2025 14:47:10 +0800 Subject: mm: introduce num_pages_contiguous() Let's add a simple helper for determining the number of contiguous pages that represent contiguous PFNs. In an ideal world, this helper would be simpler or not even required. Unfortunately, on some configs we still have to maintain (SPARSEMEM without VMEMMAP), the memmap is allocated per memory section, and we might run into weird corner cases of false positives when blindly testing for contiguous pages only. One example of such false positives would be a memory section-sized hole that does not have a memmap. The surrounding memory sections might get "struct pages" that are contiguous, but the PFNs are actually not. This helper will, for example, be useful for determining contiguous PFNs in a GUP result, to batch further operations across returned "struct page"s. VFIO will utilize this interface to accelerate the VFIO DMA map process. Implementation based on Linus' suggestions to avoid new usage of nth_page() where avoidable. Suggested-by: Linus Torvalds Suggested-by: Jason Gunthorpe Signed-off-by: Li Zhe Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20250814064714.56485-2-lizhe.67@bytedance.com Signed-off-by: Alex Williamson --- include/linux/mm.h | 7 ++++++- include/linux/mm_inline.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06978b4dbeb8..f092ce3530bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1833,7 +1833,12 @@ static inline unsigned long memdesc_section(memdesc_flags_t mdf) { return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } -#endif +#else /* !SECTION_IN_PAGE_FLAGS */ +static inline unsigned long memdesc_section(memdesc_flags_t mdf) +{ + return 0; +} +#endif /* SECTION_IN_PAGE_FLAGS */ /** * folio_pfn - Return the Page Frame Number of a folio. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index d6c1011b38f2..f6a2b2d20016 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -617,4 +617,40 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma) return true; } +/** + * num_pages_contiguous() - determine the number of contiguous pages + * that represent contiguous PFNs + * @pages: an array of page pointers + * @nr_pages: length of the array, at least 1 + * + * Determine the number of contiguous pages that represent contiguous PFNs + * in @pages, starting from the first page. + * + * In some kernel configs contiguous PFNs will not have contiguous struct + * pages. In these configurations num_pages_contiguous() will return a num + * smaller than ideal number. The caller should continue to check for pfn + * contiguity after each call to num_pages_contiguous(). + * + * Returns the number of contiguous pages. + */ +static inline size_t num_pages_contiguous(struct page **pages, size_t nr_pages) +{ + struct page *cur_page = pages[0]; + unsigned long section = memdesc_section(cur_page->flags); + size_t i; + + for (i = 1; i < nr_pages; i++) { + if (++cur_page != pages[i]) + break; + /* + * In unproblematic kernel configs, page_to_section() == 0 and + * the whole check will get optimized out. + */ + if (memdesc_section(cur_page->flags) != section) + break; + } + + return i; +} + #endif -- cgit v1.2.3 From 95920c2ed02bde551ab654e9749c2ca7bc3100e0 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 30 Sep 2025 13:43:29 +0200 Subject: page_pool: Fix PP_MAGIC_MASK to avoid crashing on some 32-bit arches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Helge reported that the introduction of PP_MAGIC_MASK let to crashes on boot on his 32-bit parisc machine. The cause of this is the mask is set too wide, so the page_pool_page_is_pp() incurs false positives which crashes the machine. Just disabling the check in page_pool_is_pp() will lead to the page_pool code itself malfunctioning; so instead of doing this, this patch changes the define for PP_DMA_INDEX_BITS to avoid mistaking arbitrary kernel pointers for page_pool-tagged pages. The fix relies on the kernel pointers that alias with the pp_magic field always being above PAGE_OFFSET. With this assumption, we can use the lowest bit of the value of PAGE_OFFSET as the upper bound of the PP_DMA_INDEX_MASK, which should avoid the false positives. Because we cannot rely on PAGE_OFFSET always being a compile-time constant, nor on it always being >0, we fall back to disabling the dma_index storage when there are not enough bits available. This leaves us in the situation we were in before the patch in the Fixes tag, but only on a subset of architecture configurations. This seems to be the best we can do until the transition to page types in complete for page_pool pages. v2: - Make sure there's at least 8 bits available and that the PAGE_OFFSET bit calculation doesn't wrap Link: https://lore.kernel.org/all/aMNJMFa5fDalFmtn@p100/ Fixes: ee62ce7a1d90 ("page_pool: Track DMA-mapped pages and unmap them when destroying the pool") Cc: stable@vger.kernel.org # 6.15+ Tested-by: Helge Deller Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: Mina Almasry Tested-by: Helge Deller Link: https://patch.msgid.link/20250930114331.675412-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mm.h | 22 ++++++++------- net/core/page_pool.c | 76 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 66 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ae97a0b8ec7..0905eb6b55ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4159,14 +4159,13 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); * since this value becomes part of PP_SIGNATURE; meaning we can just use the * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is - * 0, we make sure that we leave the two topmost bits empty, as that guarantees - * we won't mistake a valid kernel pointer for a value we set, regardless of the - * VMSPLIT setting. + * 0, we use the lowest bit of PAGE_OFFSET as the boundary if that value is + * known at compile-time. * - * Altogether, this means that the number of bits available is constrained by - * the size of an unsigned long (at the upper end, subtracting two bits per the - * above), and the definition of PP_SIGNATURE (with or without - * POISON_POINTER_DELTA). + * If the value of PAGE_OFFSET is not known at compile time, or if it is too + * small to leave at least 8 bits available above PP_SIGNATURE, we define the + * number of bits to be 0, which turns off the DMA index tracking altogether + * (see page_pool_register_dma_index()). */ #define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) #if POISON_POINTER_DELTA > 0 @@ -4175,8 +4174,13 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); */ #define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) #else -/* Always leave out the topmost two; see above. */ -#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +/* Use the lowest bit of PAGE_OFFSET if there's at least 8 bits available; see above */ +#define PP_DMA_INDEX_MIN_OFFSET (1 << (PP_DMA_INDEX_SHIFT + 8)) +#define PP_DMA_INDEX_BITS ((__builtin_constant_p(PAGE_OFFSET) && \ + PAGE_OFFSET >= PP_DMA_INDEX_MIN_OFFSET && \ + !(PAGE_OFFSET & (PP_DMA_INDEX_MIN_OFFSET - 1))) ? \ + MIN(32, __ffs(PAGE_OFFSET) - PP_DMA_INDEX_SHIFT) : 0) + #endif #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 492728f9e021..1a5edec485f1 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -468,11 +468,60 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, } } +static int page_pool_register_dma_index(struct page_pool *pool, + netmem_ref netmem, gfp_t gfp) +{ + int err = 0; + u32 id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + goto out; + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto out; + } + + netmem_set_dma_index(netmem, id); +out: + return err; +} + +static int page_pool_release_dma_index(struct page_pool *pool, + netmem_ref netmem) +{ + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + return 0; + + id = netmem_get_dma_index(netmem); + if (!id) + return -1; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return -1; + + netmem_set_dma_index(netmem, 0); + + return 0; +} + static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; int err; - u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -491,18 +540,10 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t g goto unmap_failed; } - if (in_softirq()) - err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), - PP_DMA_INDEX_LIMIT, gfp); - else - err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), - PP_DMA_INDEX_LIMIT, gfp); - if (err) { - WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + err = page_pool_register_dma_index(pool, netmem, gfp); + if (err) goto unset_failed; - } - netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; @@ -680,8 +721,6 @@ void page_pool_clear_pp_info(netmem_ref netmem) static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool, netmem_ref netmem) { - struct page *old, *page = netmem_to_page(netmem); - unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -690,15 +729,7 @@ static __always_inline void __page_pool_release_netmem_dma(struct page_pool *poo */ return; - id = netmem_get_dma_index(netmem); - if (!id) - return; - - if (in_softirq()) - old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); - else - old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); - if (old != page) + if (page_pool_release_dma_index(pool, netmem)) return; dma = page_pool_get_dma_addr_netmem(netmem); @@ -708,7 +739,6 @@ static __always_inline void __page_pool_release_netmem_dma(struct page_pool *poo PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); - netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need -- cgit v1.2.3 From beb97995b97532e1f215e3295e6843e59862f94b Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Tue, 7 Oct 2025 14:18:18 +0800 Subject: io_uring: use tab indentation for IORING_SEND_VECTORIZED comment Be consistent with tab style of "liburing/src/include/liburing/io_uring.h". Signed-off-by: Haiyue Wang Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a0cc1cc0dd01..263bed13473e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -404,7 +404,7 @@ enum io_uring_op { * will be contiguous from the starting buffer ID. * * IORING_SEND_VECTORIZED If set, SEND[_ZC] will take a pointer to a io_vec - * to allow vectorized send operations. + * to allow vectorized send operations. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) -- cgit v1.2.3 From 1ed06c83506ecaaf1836ddeb7c65772ff86d8d53 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Oct 2025 11:06:25 +0200 Subject: block: remove bio_iov_iter_get_pages Switch the only caller to bio_iov_iter_get_pages, and explain why it does not have any alignment requirements. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Keith Busch Reviewed-by: Qu Wenruo Signed-off-by: Jens Axboe --- block/blk-map.c | 6 +++++- include/linux/bio.h | 5 ----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/block/blk-map.c b/block/blk-map.c index 165f2234f00f..6cce652c7fa6 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -283,7 +283,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask); if (!bio) return -ENOMEM; - ret = bio_iov_iter_get_pages(bio, iter); + /* + * No alignment requirements on our part to support arbitrary + * passthrough commands. + */ + ret = bio_iov_iter_get_pages_aligned(bio, iter, 0); if (ret) goto out_put; ret = blk_rq_append_bio(rq, bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index a64a30131031..b01dae9506de 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -449,11 +449,6 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask); -static inline int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) -{ - return bio_iov_iter_get_pages_aligned(bio, iter, 0); -} - void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); -- cgit v1.2.3 From 82dd5d763c9b718e2d655b9565e0a06a91bb83dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Oct 2025 11:06:26 +0200 Subject: block: rename bio_iov_iter_get_pages_aligned to bio_iov_iter_get_pages Now that the bio_iov_iter_get_pages is free again, use it instead of the more complicated now. Also drop the unused export. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Keith Busch Reviewed-by: Qu Wenruo Signed-off-by: Jens Axboe --- block/bio.c | 5 ++--- block/blk-map.c | 2 +- include/linux/bio.h | 2 +- include/linux/blkdev.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index 3a1a848940dd..b3a79285c278 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1316,7 +1316,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, } /** - * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio + * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added * @len_align_mask: the mask to align the total size to, 0 for any length @@ -1336,7 +1336,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ -int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { int ret = 0; @@ -1360,7 +1360,6 @@ int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, return bio_iov_iter_align_down(bio, iter, len_align_mask); return ret; } -EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned); static void submit_bio_wait_endio(struct bio *bio) { diff --git a/block/blk-map.c b/block/blk-map.c index 6cce652c7fa6..60faf036fb6e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -287,7 +287,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, * No alignment requirements on our part to support arbitrary * passthrough commands. */ - ret = bio_iov_iter_get_pages_aligned(bio, iter, 0); + ret = bio_iov_iter_get_pages(bio, iter, 0); if (ret) goto out_put; ret = blk_rq_append_bio(rq, bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index b01dae9506de..16c1c85613b7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -446,7 +446,7 @@ int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); -int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 066e5309bd45..c97e8b0e67b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1876,7 +1876,7 @@ static inline int bio_split_rw_at(struct bio *bio, static inline int bio_iov_iter_get_bdev_pages(struct bio *bio, struct iov_iter *iter, struct block_device *bdev) { - return bio_iov_iter_get_pages_aligned(bio, iter, + return bio_iov_iter_get_pages(bio, iter, bdev_logical_block_size(bdev) - 1); } -- cgit v1.2.3 From 506aa235f6e0baa00bf792df82a5e9f618b7a5d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Oct 2025 11:06:28 +0200 Subject: block: move bio_iov_iter_get_bdev_pages to block/fops.c Keep bio_iov_iter_get_bdev_pages local with the callers, as blindly looking at the bdev logical block size is often not the best idea unless on a block device. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Keith Busch Reviewed-by: Qu Wenruo Signed-off-by: Jens Axboe --- block/fops.c | 13 ++++++++++--- include/linux/blkdev.h | 7 ------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/block/fops.c b/block/fops.c index c2c0396ea9ee..5e3db9fead77 100644 --- a/block/fops.c +++ b/block/fops.c @@ -43,6 +43,13 @@ static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, (bdev_logical_block_size(bdev) - 1); } +static inline int blkdev_iov_iter_get_pages(struct bio *bio, + struct iov_iter *iter, struct block_device *bdev) +{ + return bio_iov_iter_get_pages(bio, iter, + bdev_logical_block_size(bdev) - 1); +} + #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, @@ -78,7 +85,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; - ret = bio_iov_iter_get_bdev_pages(&bio, iter, bdev); + ret = blkdev_iov_iter_get_pages(&bio, iter, bdev); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; @@ -212,7 +219,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; - ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); + ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); @@ -348,7 +355,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, */ bio_iov_bvec_set(bio, iter); } else { - ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); + ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) goto out_bio_put; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c97e8b0e67b6..d4db5039836d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1873,13 +1873,6 @@ static inline int bio_split_rw_at(struct bio *bio, return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); } -static inline int bio_iov_iter_get_bdev_pages(struct bio *bio, - struct iov_iter *iter, struct block_device *bdev) -{ - return bio_iov_iter_get_pages(bio, iter, - bdev_logical_block_size(bdev) - 1); -} - #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 1f086d2508ebe494d13fd587d1f5e2b908379efc Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 3 Oct 2025 16:05:46 -0400 Subject: drm/amdkfd: Fix two comments in kfd_ioctl.h Queue read and write pointers are "to KFD", not "from KFD". Suggested-by: Robert Liu Signed-off-by: Felix Kuehling Reviewed-by: Alex Deucher Reviewed-by: Robert Liu Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 04c7d283dc7d..5d1727a6d040 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -67,8 +67,8 @@ struct kfd_ioctl_get_version_args { struct kfd_ioctl_create_queue_args { __u64 ring_base_address; /* to KFD */ - __u64 write_pointer_address; /* from KFD */ - __u64 read_pointer_address; /* from KFD */ + __u64 write_pointer_address; /* to KFD */ + __u64 read_pointer_address; /* to KFD */ __u64 doorbell_offset; /* from KFD */ __u32 ring_size; /* to KFD */ -- cgit v1.2.3 From 8375b76517cb52bac0903071feedc218c45d74d2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 21 Sep 2025 08:44:56 +0300 Subject: kho: replace kho_preserve_phys() with kho_preserve_pages() to make it clear that KHO operates on pages rather than on a random physical address. The kho_preserve_pages() will be also used in upcoming support for vmalloc preservation. Link: https://lkml.kernel.org/r/20250921054458.4043761-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Reviewed-by: Jason Gunthorpe Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Chris Li Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 5 +++-- kernel/kexec_handover.c | 25 +++++++++++-------------- mm/memblock.c | 4 +++- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 559d13a3bc44..cec663b39861 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -18,6 +18,7 @@ enum kho_event { struct folio; struct notifier_block; +struct page; #define DECLARE_KHOSER_PTR(name, type) \ union { \ @@ -43,7 +44,7 @@ bool kho_is_enabled(void); bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); -int kho_preserve_phys(phys_addr_t phys, size_t size); +int kho_preserve_pages(struct page *page, unsigned int nr_pages); struct folio *kho_restore_folio(phys_addr_t phys); int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); @@ -71,7 +72,7 @@ static inline int kho_preserve_folio(struct folio *folio) return -EOPNOTSUPP; } -static inline int kho_preserve_phys(phys_addr_t phys, size_t size) +static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages) { return -EOPNOTSUPP; } diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index b8d0d63f6145..1c44a55f758e 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -725,26 +725,23 @@ int kho_preserve_folio(struct folio *folio) EXPORT_SYMBOL_GPL(kho_preserve_folio); /** - * kho_preserve_phys - preserve a physically contiguous range across kexec. - * @phys: physical address of the range. - * @size: size of the range. + * kho_preserve_pages - preserve contiguous pages across kexec + * @page: first page in the list. + * @nr_pages: number of pages. * - * Instructs KHO to preserve the memory range from @phys to @phys + @size - * across kexec. + * Preserve a contiguous list of order 0 pages. Must be restored using + * kho_restore_pages() to ensure the pages are restored properly as order 0. * * Return: 0 on success, error code on failure */ -int kho_preserve_phys(phys_addr_t phys, size_t size) +int kho_preserve_pages(struct page *page, unsigned int nr_pages) { - unsigned long pfn = PHYS_PFN(phys); + struct kho_mem_track *track = &kho_out.ser.track; + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn = start_pfn; unsigned long failed_pfn = 0; - const unsigned long start_pfn = pfn; - const unsigned long end_pfn = PHYS_PFN(phys + size); int err = 0; - struct kho_mem_track *track = &kho_out.ser.track; - - if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) - return -EINVAL; while (pfn < end_pfn) { const unsigned int order = @@ -764,7 +761,7 @@ int kho_preserve_phys(phys_addr_t phys, size_t size) return err; } -EXPORT_SYMBOL_GPL(kho_preserve_phys); +EXPORT_SYMBOL_GPL(kho_preserve_pages); /* Handling for debug/kho/out */ diff --git a/mm/memblock.c b/mm/memblock.c index 120a501a887a..e23e16618e9b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2452,8 +2452,10 @@ static int reserve_mem_kho_finalize(struct kho_serialization *ser) for (i = 0; i < reserved_mem_count; i++) { struct reserve_mem_table *map = &reserved_mem_table[i]; + struct page *page = phys_to_page(map->start); + unsigned int nr_pages = map->size >> PAGE_SHIFT; - err |= kho_preserve_phys(map->start, map->size); + err |= kho_preserve_pages(page, nr_pages); } err |= kho_preserve_folio(page_folio(kho_fdt)); -- cgit v1.2.3 From a667300bd53f272a3055238bcefe108f88836270 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 21 Sep 2025 08:44:57 +0300 Subject: kho: add support for preserving vmalloc allocations A vmalloc allocation is preserved using binary structure similar to global KHO memory tracker. It's a linked list of pages where each page is an array of physical address of pages in vmalloc area. kho_preserve_vmalloc() hands out the physical address of the head page to the caller. This address is used as the argument to kho_vmalloc_restore() to restore the mapping in the vmalloc address space and populate it with the preserved pages. [pasha.tatashin@soleen.com: free chunks using free_page() not kfree()] Link: https://lkml.kernel.org/r/mafs0a52idbeg.fsf@kernel.org [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20250921054458.4043761-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Chris Li Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 28 ++++ kernel/kexec_handover.c | 281 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 309 insertions(+) (limited to 'include') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index cec663b39861..25042c1d8d54 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -39,13 +39,24 @@ struct page; struct kho_serialization; +struct kho_vmalloc_chunk; +struct kho_vmalloc { + DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *); + unsigned int total_pages; + unsigned short flags; + unsigned short order; +}; + #ifdef CONFIG_KEXEC_HANDOVER bool kho_is_enabled(void); bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); int kho_preserve_pages(struct page *page, unsigned int nr_pages); +int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); struct folio *kho_restore_folio(phys_addr_t phys); +struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); +void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); @@ -77,11 +88,28 @@ static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages) return -EOPNOTSUPP; } +static inline int kho_preserve_vmalloc(void *ptr, + struct kho_vmalloc *preservation) +{ + return -EOPNOTSUPP; +} + static inline struct folio *kho_restore_folio(phys_addr_t phys) { return NULL; } +static inline struct page *kho_restore_pages(phys_addr_t phys, + unsigned int nr_pages) +{ + return NULL; +} + +static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) +{ + return NULL; +} + static inline int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) { diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 1c44a55f758e..76f0940fb485 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -274,6 +275,37 @@ struct folio *kho_restore_folio(phys_addr_t phys) } EXPORT_SYMBOL_GPL(kho_restore_folio); +/** + * kho_restore_pages - restore list of contiguous order 0 pages. + * @phys: physical address of the first page. + * @nr_pages: number of pages. + * + * Restore a contiguous list of order 0 pages that was preserved with + * kho_preserve_pages(). + * + * Return: 0 on success, error code on failure + */ +struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages) +{ + const unsigned long start_pfn = PHYS_PFN(phys); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn = start_pfn; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + struct page *page = kho_restore_page(PFN_PHYS(pfn)); + + if (!page) + return NULL; + split_page(page, order); + pfn += 1 << order; + } + + return pfn_to_page(start_pfn); +} +EXPORT_SYMBOL_GPL(kho_restore_pages); + /* Serialize and deserialize struct kho_mem_phys across kexec * * Record all the bitmaps in a linked list of pages for the next kernel to @@ -763,6 +795,255 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) } EXPORT_SYMBOL_GPL(kho_preserve_pages); +struct kho_vmalloc_hdr { + DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); +}; + +#define KHO_VMALLOC_SIZE \ + ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \ + sizeof(phys_addr_t)) + +struct kho_vmalloc_chunk { + struct kho_vmalloc_hdr hdr; + phys_addr_t phys[KHO_VMALLOC_SIZE]; +}; + +static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE); + +/* vmalloc flags KHO supports */ +#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP) + +/* KHO internal flags for vmalloc preservations */ +#define KHO_VMALLOC_ALLOC 0x0001 +#define KHO_VMALLOC_HUGE_VMAP 0x0002 + +static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags) +{ + unsigned short kho_flags = 0; + + if (vm_flags & VM_ALLOC) + kho_flags |= KHO_VMALLOC_ALLOC; + if (vm_flags & VM_ALLOW_HUGE_VMAP) + kho_flags |= KHO_VMALLOC_HUGE_VMAP; + + return kho_flags; +} + +static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags) +{ + unsigned int vm_flags = 0; + + if (kho_flags & KHO_VMALLOC_ALLOC) + vm_flags |= VM_ALLOC; + if (kho_flags & KHO_VMALLOC_HUGE_VMAP) + vm_flags |= VM_ALLOW_HUGE_VMAP; + + return vm_flags; +} + +static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur) +{ + struct kho_vmalloc_chunk *chunk; + int err; + + chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL); + if (!chunk) + return NULL; + + err = kho_preserve_pages(virt_to_page(chunk), 1); + if (err) + goto err_free; + if (cur) + KHOSER_STORE_PTR(cur->hdr.next, chunk); + return chunk; + +err_free: + free_page((unsigned long)chunk); + return NULL; +} + +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) +{ + struct kho_mem_track *track = &kho_out.ser.track; + unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); + + __kho_unpreserve(track, pfn, pfn + 1); + + for (int i = 0; chunk->phys[i]; i++) { + pfn = PHYS_PFN(chunk->phys[i]); + __kho_unpreserve(track, pfn, pfn + 1); + } +} + +static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) +{ + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first); + + while (chunk) { + struct kho_vmalloc_chunk *tmp = chunk; + + kho_vmalloc_unpreserve_chunk(chunk); + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + free_page((unsigned long)tmp); + } +} + +/** + * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec + * @ptr: pointer to the area in vmalloc address space + * @preservation: placeholder for preservation metadata + * + * Instructs KHO to preserve the area in vmalloc address space at @ptr. The + * physical pages mapped at @ptr will be preserved and on successful return + * @preservation will hold the physical address of a structure that describes + * the preservation. + * + * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably + * restored on the same node + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk; + struct vm_struct *vm = find_vm_area(ptr); + unsigned int order, flags, nr_contig_pages; + unsigned int idx = 0; + int err; + + if (!vm) + return -EINVAL; + + if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS) + return -EOPNOTSUPP; + + flags = vmalloc_flags_to_kho(vm->flags); + order = get_vm_area_page_order(vm); + + chunk = new_vmalloc_chunk(NULL); + if (!chunk) + return -ENOMEM; + KHOSER_STORE_PTR(preservation->first, chunk); + + nr_contig_pages = (1 << order); + for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) { + phys_addr_t phys = page_to_phys(vm->pages[i]); + + err = kho_preserve_pages(vm->pages[i], nr_contig_pages); + if (err) + goto err_free; + + chunk->phys[idx++] = phys; + if (idx == ARRAY_SIZE(chunk->phys)) { + chunk = new_vmalloc_chunk(chunk); + if (!chunk) + goto err_free; + idx = 0; + } + } + + preservation->total_pages = vm->nr_pages; + preservation->flags = flags; + preservation->order = order; + + return 0; + +err_free: + kho_vmalloc_free_chunks(preservation); + return err; +} +EXPORT_SYMBOL_GPL(kho_preserve_vmalloc); + +/** + * kho_restore_vmalloc - recreates and populates an area in vmalloc address + * space from the preserved memory. + * @preservation: preservation metadata. + * + * Recreates an area in vmalloc address space and populates it with memory that + * was preserved using kho_preserve_vmalloc(). + * + * Return: pointer to the area in the vmalloc address space, NULL on failure. + */ +void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); + unsigned int align, order, shift, vm_flags; + unsigned long total_pages, contig_pages; + unsigned long addr, size; + struct vm_struct *area; + struct page **pages; + unsigned int idx = 0; + int err; + + vm_flags = kho_flags_to_vmalloc(preservation->flags); + if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS) + return NULL; + + total_pages = preservation->total_pages; + pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; + order = preservation->order; + contig_pages = (1 << order); + shift = PAGE_SHIFT + order; + align = 1 << shift; + + while (chunk) { + struct page *page; + + for (int i = 0; chunk->phys[i]; i++) { + phys_addr_t phys = chunk->phys[i]; + + if (idx + contig_pages > total_pages) + goto err_free_pages_array; + + page = kho_restore_pages(phys, contig_pages); + if (!page) + goto err_free_pages_array; + + for (int j = 0; j < contig_pages; j++) + pages[idx++] = page; + + phys += contig_pages * PAGE_SIZE; + } + + page = kho_restore_pages(virt_to_phys(chunk), 1); + if (!page) + goto err_free_pages_array; + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + __free_page(page); + } + + if (idx != total_pages) + goto err_free_pages_array; + + area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift, + vm_flags, VMALLOC_START, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); + if (!area) + goto err_free_pages_array; + + addr = (unsigned long)area->addr; + size = get_vm_area_size(area); + err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift); + if (err) + goto err_free_vm_area; + + area->nr_pages = total_pages; + area->pages = pages; + + return area->addr; + +err_free_vm_area: + free_vm_area(area); +err_free_pages_array: + kvfree(pages); + return NULL; +} +EXPORT_SYMBOL_GPL(kho_restore_vmalloc); + /* Handling for debug/kho/out */ static struct dentry *debugfs_root; -- cgit v1.2.3 From fcc0669c5aa681994c507b50f1c706c969d99730 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 22 Sep 2025 15:02:03 -0700 Subject: memcg: skip cgroup_file_notify if spinning is not allowed Generally memcg charging is allowed from all the contexts including NMI where even spinning on spinlock can cause locking issues. However one call chain was missed during the addition of memcg charging from any context support. That is try_charge_memcg() -> memcg_memory_event() -> cgroup_file_notify(). The possible function call tree under cgroup_file_notify() can acquire many different spin locks in spinning mode. Some of them are cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's just skip cgroup_file_notify() from memcg charging if the context does not allow spinning. Alternative approach was also explored where instead of skipping cgroup_file_notify(), we defer the memcg event processing to irq_work [1]. However it adds complexity and it was decided to keep things simple until we need more memcg events with !allow_spinning requirement. Link: https://lore.kernel.org/all/5qi2llyzf7gklncflo6gxoozljbm4h3tpnuv4u4ej4ztysvi6f@x44v7nz2wdzd/ [1] Link: https://lkml.kernel.org/r/20250922220203.261714-1-shakeel.butt@linux.dev Fixes: 3ac4638a734a ("memcg: make memcg_rstat_updated nmi safe") Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Closes: https://lore.kernel.org/all/20250905061919.439648-1-yepeilin@google.com/ Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Kumar Kartikeya Dwivedi Cc: Muchun Song Cc: Peilin Ye Cc: Roman Gushchin Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 26 +++++++++++++++++++------- mm/memcontrol.c | 7 ++++--- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 16fe0306e50e..873e510d6f8d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1001,22 +1001,28 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, count_memcg_events_mm(mm, idx, 1); } -static inline void memcg_memory_event(struct mem_cgroup *memcg, - enum memcg_memory_event event) +static inline void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, + bool allow_spinning) { bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || event == MEMCG_SWAP_FAIL; + /* For now only MEMCG_MAX can happen with !allow_spinning context. */ + VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); + atomic_long_inc(&memcg->memory_events_local[event]); - if (!swap_event) + if (!swap_event && allow_spinning) cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); - if (swap_event) - cgroup_file_notify(&memcg->swap_events_file); - else - cgroup_file_notify(&memcg->events_file); + if (allow_spinning) { + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); + } if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; @@ -1026,6 +1032,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, !mem_cgroup_is_root(memcg)); } +static inline void memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event) +{ + __memcg_memory_event(memcg, event, true); +} + static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e090f29eb03b..4deda33625f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2307,12 +2307,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, bool drained = false; bool raised_max_event = false; unsigned long pflags; + bool allow_spinning = gfpflags_allow_spinning(gfp_mask); retry: if (consume_stock(memcg, nr_pages)) return 0; - if (!gfpflags_allow_spinning(gfp_mask)) + if (!allow_spinning) /* Avoid the refill and flush of the older stock */ batch = nr_pages; @@ -2348,7 +2349,7 @@ retry: if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - memcg_memory_event(mem_over_limit, MEMCG_MAX); + __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning); raised_max_event = true; psi_memstall_enter(&pflags); @@ -2415,7 +2416,7 @@ force: * a MEMCG_MAX event. */ if (!raised_max_event) - memcg_memory_event(mem_over_limit, MEMCG_MAX); + __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning); /* * The allocation either can't fail or will lead to more memory -- cgit v1.2.3 From f04aad36a07cc17b7a5d5b9a2d386ce6fae63e93 Mon Sep 17 00:00:00 2001 From: Jakub Acs Date: Wed, 1 Oct 2025 09:03:52 +0000 Subject: mm/ksm: fix flag-dropping behavior in ksm_madvise syzkaller discovered the following crash: (kernel BUG) [ 44.607039] ------------[ cut here ]------------ [ 44.607422] kernel BUG at mm/userfaultfd.c:2067! [ 44.608148] Oops: invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN NOPTI [ 44.608814] CPU: 1 UID: 0 PID: 2475 Comm: reproducer Not tainted 6.16.0-rc6 #1 PREEMPT(none) [ 44.609635] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 44.610695] RIP: 0010:userfaultfd_release_all+0x3a8/0x460 [ 44.617726] Call Trace: [ 44.617926] [ 44.619284] userfaultfd_release+0xef/0x1b0 [ 44.620976] __fput+0x3f9/0xb60 [ 44.621240] fput_close_sync+0x110/0x210 [ 44.622222] __x64_sys_close+0x8f/0x120 [ 44.622530] do_syscall_64+0x5b/0x2f0 [ 44.622840] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 44.623244] RIP: 0033:0x7f365bb3f227 Kernel panics because it detects UFFD inconsistency during userfaultfd_release_all(). Specifically, a VMA which has a valid pointer to vma->vm_userfaultfd_ctx, but no UFFD flags in vma->vm_flags. The inconsistency is caused in ksm_madvise(): when user calls madvise() with MADV_UNMEARGEABLE on a VMA that is registered for UFFD in MINOR mode, it accidentally clears all flags stored in the upper 32 bits of vma->vm_flags. Assuming x86_64 kernel build, unsigned long is 64-bit and unsigned int and int are 32-bit wide. This setup causes the following mishap during the &= ~VM_MERGEABLE assignment. VM_MERGEABLE is a 32-bit constant of type unsigned int, 0x8000'0000. After ~ is applied, it becomes 0x7fff'ffff unsigned int, which is then promoted to unsigned long before the & operation. This promotion fills upper 32 bits with leading 0s, as we're doing unsigned conversion (and even for a signed conversion, this wouldn't help as the leading bit is 0). & operation thus ends up AND-ing vm_flags with 0x0000'0000'7fff'ffff instead of intended 0xffff'ffff'7fff'ffff and hence accidentally clears the upper 32-bits of its value. Fix it by changing `VM_MERGEABLE` constant to unsigned long, using the BIT() macro. Note: other VM_* flags are not affected: This only happens to the VM_MERGEABLE flag, as the other VM_* flags are all constants of type int and after ~ operation, they end up with leading 1 and are thus converted to unsigned long with leading 1s. Note 2: After commit 31defc3b01d9 ("userfaultfd: remove (VM_)BUG_ON()s"), this is no longer a kernel BUG, but a WARNING at the same place: [ 45.595973] WARNING: CPU: 1 PID: 2474 at mm/userfaultfd.c:2067 but the root-cause (flag-drop) remains the same. [akpm@linux-foundation.org: rust bindgen wasn't able to handle BIT(), from Miguel] Link: https://lore.kernel.org/oe-kbuild-all/202510030449.VfSaAjvd-lkp@intel.com/ Link: https://lkml.kernel.org/r/20251001090353.57523-2-acsjakub@amazon.de Fixes: 7677f7fd8be7 ("userfaultfd: add minor fault registration mode") Signed-off-by: Jakub Acs Signed-off-by: Miguel Ojeda Acked-by: David Hildenbrand Acked-by: SeongJae Park Tested-by: Alice Ryhl Tested-by: Miguel Ojeda Cc: Xu Xin Cc: Chengming Zhou Cc: Peter Xu Cc: Axel Rasmussen Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- rust/bindings/bindings_helper.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06978b4dbeb8..70a2a76007d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -323,7 +323,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ -#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#define VM_MERGEABLE BIT(31) /* KSM may merge identical pages */ #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 04b75d4d01c3..2e43c66635a2 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -108,6 +108,7 @@ const xa_mark_t RUST_CONST_HELPER_XA_PRESENT = XA_PRESENT; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC = XA_FLAGS_ALLOC; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC1 = XA_FLAGS_ALLOC1; +const vm_flags_t RUST_CONST_HELPER_VM_MERGEABLE = VM_MERGEABLE; #if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_RUST) #include "../../drivers/android/binder/rust_binder.h" -- cgit v1.2.3 From 27c0a7b05d13a0dc54ed0b95fc12218210fdea1a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 31 Jul 2025 12:02:27 -0700 Subject: libceph: Use HMAC-SHA256 library instead of crypto_shash Use the HMAC-SHA256 library functions instead of crypto_shash. This is simpler and faster. Signed-off-by: Eric Biggers Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 4 ++- net/ceph/Kconfig | 3 +- net/ceph/messenger_v2.c | 77 ++++++++++++------------------------------ 3 files changed, 26 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 1717cc57cdac..4b49592a738f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -2,6 +2,7 @@ #ifndef __FS_CEPH_MESSENGER_H #define __FS_CEPH_MESSENGER_H +#include #include #include #include @@ -412,7 +413,8 @@ struct ceph_connection_v2_info { struct ceph_msg_data_cursor in_cursor; struct ceph_msg_data_cursor out_cursor; - struct crypto_shash *hmac_tfm; /* post-auth signature */ + struct hmac_sha256_key hmac_key; /* post-auth signature */ + bool hmac_key_set; struct crypto_aead *gcm_tfm; /* on-wire encryption */ struct aead_request *gcm_req; struct crypto_wait gcm_wait; diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index 0aa21fcbf6ec..ea60e3ef0834 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -6,8 +6,7 @@ config CEPH_LIB select CRYPTO_AES select CRYPTO_CBC select CRYPTO_GCM - select CRYPTO_HMAC - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 5483b4eed94e..c54c8b5a6526 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -709,7 +709,7 @@ static int setup_crypto(struct ceph_connection *con, dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n", __func__, con, con->v2.con_mode, session_key_len, con_secret_len); - WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req); + WARN_ON(con->v2.hmac_key_set || con->v2.gcm_tfm || con->v2.gcm_req); if (con->v2.con_mode != CEPH_CON_MODE_CRC && con->v2.con_mode != CEPH_CON_MODE_SECURE) { @@ -723,22 +723,8 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_none */ } - noio_flag = memalloc_noio_save(); - con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); - memalloc_noio_restore(noio_flag); - if (IS_ERR(con->v2.hmac_tfm)) { - ret = PTR_ERR(con->v2.hmac_tfm); - con->v2.hmac_tfm = NULL; - pr_err("failed to allocate hmac tfm context: %d\n", ret); - return ret; - } - - ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key, - session_key_len); - if (ret) { - pr_err("failed to set hmac key: %d\n", ret); - return ret; - } + hmac_sha256_preparekey(&con->v2.hmac_key, session_key, session_key_len); + con->v2.hmac_key_set = true; if (con->v2.con_mode == CEPH_CON_MODE_CRC) { WARN_ON(con_secret_len); @@ -793,38 +779,26 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_x, secure mode */ } -static int ceph_hmac_sha256(struct ceph_connection *con, - const struct kvec *kvecs, int kvec_cnt, u8 *hmac) +static void ceph_hmac_sha256(struct ceph_connection *con, + const struct kvec *kvecs, int kvec_cnt, + u8 hmac[SHA256_DIGEST_SIZE]) { - SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ - int ret; + struct hmac_sha256_ctx ctx; int i; - dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con, - con->v2.hmac_tfm, kvec_cnt); + dout("%s con %p hmac_key_set %d kvec_cnt %d\n", __func__, con, + con->v2.hmac_key_set, kvec_cnt); - if (!con->v2.hmac_tfm) { + if (!con->v2.hmac_key_set) { memset(hmac, 0, SHA256_DIGEST_SIZE); - return 0; /* auth_none */ + return; /* auth_none */ } - desc->tfm = con->v2.hmac_tfm; - ret = crypto_shash_init(desc); - if (ret) - goto out; - - for (i = 0; i < kvec_cnt; i++) { - ret = crypto_shash_update(desc, kvecs[i].iov_base, - kvecs[i].iov_len); - if (ret) - goto out; - } - - ret = crypto_shash_final(desc, hmac); - -out: - shash_desc_zero(desc); - return ret; /* auth_x, both plain and secure modes */ + /* auth_x, both plain and secure modes */ + hmac_sha256_init(&ctx, &con->v2.hmac_key); + for (i = 0; i < kvec_cnt; i++) + hmac_sha256_update(&ctx, kvecs[i].iov_base, kvecs[i].iov_len); + hmac_sha256_final(&ctx, hmac); } static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce) @@ -1455,17 +1429,14 @@ static int prepare_auth_request_more(struct ceph_connection *con, static int prepare_auth_signature(struct ceph_connection *con) { void *buf; - int ret; buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, con_secure(con))); if (!buf) return -ENOMEM; - ret = ceph_hmac_sha256(con, con->v2.in_sign_kvecs, - con->v2.in_sign_kvec_cnt, CTRL_BODY(buf)); - if (ret) - return ret; + ceph_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, SHA256_DIGEST_SIZE); @@ -2460,10 +2431,8 @@ static int process_auth_signature(struct ceph_connection *con, return -EINVAL; } - ret = ceph_hmac_sha256(con, con->v2.out_sign_kvecs, - con->v2.out_sign_kvec_cnt, hmac); - if (ret) - return ret; + ceph_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt, + hmac); ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { @@ -3814,10 +3783,8 @@ void ceph_con_v2_reset_protocol(struct ceph_connection *con) memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN); memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN); - if (con->v2.hmac_tfm) { - crypto_free_shash(con->v2.hmac_tfm); - con->v2.hmac_tfm = NULL; - } + memzero_explicit(&con->v2.hmac_key, sizeof(con->v2.hmac_key)); + con->v2.hmac_key_set = false; if (con->v2.gcm_req) { aead_request_free(con->v2.gcm_req); con->v2.gcm_req = NULL; -- cgit v1.2.3 From 59699a5a7114f09f890e86c09a6b32afb5eaa64c Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 6 Aug 2025 11:48:53 +0200 Subject: libceph: make ceph_con_get_out_msg() return the message pointer The caller in messenger_v1.c loads it anyway, so let's keep the pointer in the register instead of reloading it from memory. This eliminates a tiny bit of unnecessary overhead. Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 4 ++-- net/ceph/messenger_v1.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 4b49592a738f..9ebcac2981fd 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -550,7 +550,7 @@ void ceph_addr_set_port(struct ceph_entity_addr *addr, int p); void ceph_con_process_message(struct ceph_connection *con); int ceph_con_in_msg_alloc(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); -void ceph_con_get_out_msg(struct ceph_connection *con); +struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con); /* messenger_v1.c */ int ceph_con_v1_try_read(struct ceph_connection *con); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9f6d860411cb..b6c7bfc03503 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2110,7 +2110,7 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con, return ret; } -void ceph_con_get_out_msg(struct ceph_connection *con) +struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con) { struct ceph_msg *msg; @@ -2141,7 +2141,7 @@ void ceph_con_get_out_msg(struct ceph_connection *con) * message or in case of a fault. */ WARN_ON(con->out_msg); - con->out_msg = ceph_msg_get(msg); + return con->out_msg = ceph_msg_get(msg); } /* diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 0cb61c76b9b8..eebe4e19d75a 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -210,8 +210,7 @@ static void prepare_write_message(struct ceph_connection *con) &con->v1.out_temp_ack); } - ceph_con_get_out_msg(con); - m = con->out_msg; + m = ceph_con_get_out_msg(con); dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), -- cgit v1.2.3 From 7399212dcf64d90a6ab239bdd98bd325d922fc7e Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 6 Aug 2025 11:48:54 +0200 Subject: libceph: pass the message pointer instead of loading con->out_msg This pointer is in a register anyway, so let's use that instead of reloading from memory everywhere. [ idryomov: formatting ] Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 4 +- net/ceph/messenger.c | 4 +- net/ceph/messenger_v1.c | 45 ++++++----- net/ceph/messenger_v2.c | 168 +++++++++++++++++++++-------------------- 4 files changed, 114 insertions(+), 107 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 9ebcac2981fd..6aa4c6478c9f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -555,7 +555,7 @@ struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con); /* messenger_v1.c */ int ceph_con_v1_try_read(struct ceph_connection *con); int ceph_con_v1_try_write(struct ceph_connection *con); -void ceph_con_v1_revoke(struct ceph_connection *con); +void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg); void ceph_con_v1_revoke_incoming(struct ceph_connection *con); bool ceph_con_v1_opened(struct ceph_connection *con); void ceph_con_v1_reset_session(struct ceph_connection *con); @@ -564,7 +564,7 @@ void ceph_con_v1_reset_protocol(struct ceph_connection *con); /* messenger_v2.c */ int ceph_con_v2_try_read(struct ceph_connection *con); int ceph_con_v2_try_write(struct ceph_connection *con); -void ceph_con_v2_revoke(struct ceph_connection *con); +void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg); void ceph_con_v2_revoke_incoming(struct ceph_connection *con); bool ceph_con_v2_opened(struct ceph_connection *con); void ceph_con_v2_reset_session(struct ceph_connection *con); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b6c7bfc03503..08a6a083609f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1793,9 +1793,9 @@ void ceph_msg_revoke(struct ceph_msg *msg) WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was sending\n", __func__, con, msg); if (ceph_msgr2(from_msgr(con->msgr))) - ceph_con_v2_revoke(con); + ceph_con_v2_revoke(con, msg); else - ceph_con_v1_revoke(con); + ceph_con_v1_revoke(con, msg); ceph_msg_put(con->out_msg); con->out_msg = NULL; } else { diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index eebe4e19d75a..cc4a36ef8462 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -169,10 +169,9 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len) * Prepare footer for currently outgoing message, and finish things * off. Assumes out_kvec* are already valid.. we just add on to the end. */ -static void prepare_write_message_footer(struct ceph_connection *con) +static void prepare_write_message_footer(struct ceph_connection *con, + struct ceph_msg *m) { - struct ceph_msg *m = con->out_msg; - m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); @@ -230,31 +229,31 @@ static void prepare_write_message(struct ceph_connection *con) /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); - con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); + m->hdr.crc = cpu_to_le32(crc); + memcpy(&con->v1.out_hdr, &m->hdr, sizeof(con->v1.out_hdr)); /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); - con->out_msg->footer.front_crc = cpu_to_le32(crc); + m->footer.front_crc = cpu_to_le32(crc); if (m->middle) { crc = crc32c(0, m->middle->vec.iov_base, m->middle->vec.iov_len); - con->out_msg->footer.middle_crc = cpu_to_le32(crc); + m->footer.middle_crc = cpu_to_le32(crc); } else - con->out_msg->footer.middle_crc = 0; + m->footer.middle_crc = 0; dout("%s front_crc %u middle_crc %u\n", __func__, - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - con->out_msg->footer.flags = 0; + le32_to_cpu(m->footer.front_crc), + le32_to_cpu(m->footer.middle_crc)); + m->footer.flags = 0; /* is there a data payload? */ - con->out_msg->footer.data_crc = 0; + m->footer.data_crc = 0; if (m->data_length) { - prepare_message_data(con->out_msg, m->data_length); + prepare_message_data(m, m->data_length); con->v1.out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ - prepare_write_message_footer(con); + prepare_write_message_footer(con, m); } ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); @@ -461,9 +460,9 @@ out: * 0 -> socket full, but more to do * <0 -> error */ -static int write_partial_message_data(struct ceph_connection *con) +static int write_partial_message_data(struct ceph_connection *con, + struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); u32 crc; @@ -515,7 +514,7 @@ static int write_partial_message_data(struct ceph_connection *con) else msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; con_out_kvec_reset(con); - prepare_write_message_footer(con); + prepare_write_message_footer(con, msg); return 1; /* must return > 0 to indicate success */ } @@ -1471,6 +1470,7 @@ bad_tag: */ int ceph_con_v1_try_write(struct ceph_connection *con) { + struct ceph_msg *msg; int ret = 1; dout("try_write start %p state %d\n", con, con->state); @@ -1517,14 +1517,15 @@ more: } /* msg pages? */ - if (con->out_msg) { + msg = con->out_msg; + if (msg) { if (con->v1.out_msg_done) { - ceph_msg_put(con->out_msg); + ceph_msg_put(msg); con->out_msg = NULL; /* we're done with this one */ goto do_next; } - ret = write_partial_message_data(con); + ret = write_partial_message_data(con, msg); if (ret == 1) goto more; /* we need to send the footer, too! */ if (ret == 0) @@ -1563,10 +1564,8 @@ out: return ret; } -void ceph_con_v1_revoke(struct ceph_connection *con) +void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; - WARN_ON(con->v1.out_skip); /* footer */ if (con->v1.out_msg_done) { diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c54c8b5a6526..b44e936f3865 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1560,10 +1560,11 @@ static int prepare_ack(struct ceph_connection *con) return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8); } -static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) +static void prepare_epilogue_plain(struct ceph_connection *con, + struct ceph_msg *msg, bool aborted) { dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con, - con->out_msg, aborted, con->v2.out_epil.front_crc, + msg, aborted, con->v2.out_epil.front_crc, con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc); encode_epilogue_plain(con, aborted); @@ -1574,10 +1575,9 @@ static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) * For "used" empty segments, crc is -1. For unused (trailing) * segments, crc is 0. */ -static void prepare_message_plain(struct ceph_connection *con) +static void prepare_message_plain(struct ceph_connection *con, + struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; - prepare_head_plain(con, con->v2.out_buf, sizeof(struct ceph_msg_header2), NULL, 0, false); @@ -1618,7 +1618,7 @@ static void prepare_message_plain(struct ceph_connection *con) con->v2.out_state = OUT_S_QUEUE_DATA; } else { con->v2.out_epil.data_crc = 0; - prepare_epilogue_plain(con, false); + prepare_epilogue_plain(con, msg, false); con->v2.out_state = OUT_S_FINISH_MESSAGE; } } @@ -1630,7 +1630,8 @@ static void prepare_message_plain(struct ceph_connection *con) * allocate pages for the entire tail of the message (currently up * to ~32M) and two sgs arrays (up to ~256K each)... */ -static int prepare_message_secure(struct ceph_connection *con) +static int prepare_message_secure(struct ceph_connection *con, + struct ceph_msg *msg) { void *zerop = page_address(ceph_zero_page); struct sg_table enc_sgt = {}; @@ -1645,7 +1646,7 @@ static int prepare_message_secure(struct ceph_connection *con) if (ret) return ret; - tail_len = tail_onwire_len(con->out_msg, true); + tail_len = tail_onwire_len(msg, true); if (!tail_len) { /* * Empty message: once the head is written, @@ -1656,7 +1657,7 @@ static int prepare_message_secure(struct ceph_connection *con) } encode_epilogue_secure(con, false); - ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, + ret = setup_message_sgs(&sgt, msg, zerop, zerop, zerop, &con->v2.out_epil, NULL, 0, false); if (ret) goto out; @@ -1685,7 +1686,7 @@ static int prepare_message_secure(struct ceph_connection *con) goto out; dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con, - con->out_msg, sgt.orig_nents, enc_page_cnt); + msg, sgt.orig_nents, enc_page_cnt); con->v2.out_state = OUT_S_QUEUE_ENC_PAGE; out: @@ -1694,19 +1695,19 @@ out: return ret; } -static int prepare_message(struct ceph_connection *con) +static int prepare_message(struct ceph_connection *con, struct ceph_msg *msg) { int lens[] = { sizeof(struct ceph_msg_header2), - front_len(con->out_msg), - middle_len(con->out_msg), - data_len(con->out_msg) + front_len(msg), + middle_len(msg), + data_len(msg) }; struct ceph_frame_desc desc; int ret; dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con, - con->out_msg, lens[0], lens[1], lens[2], lens[3]); + msg, lens[0], lens[1], lens[2], lens[3]); if (con->in_seq > con->in_seq_acked) { dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, @@ -1717,15 +1718,15 @@ static int prepare_message(struct ceph_connection *con) reset_out_kvecs(con); init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4); encode_preamble(&desc, con->v2.out_buf); - fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr, + fill_header2(CTRL_BODY(con->v2.out_buf), &msg->hdr, con->in_seq_acked); if (con_secure(con)) { - ret = prepare_message_secure(con); + ret = prepare_message_secure(con, msg); if (ret) return ret; } else { - prepare_message_plain(con); + prepare_message_plain(con, msg); } ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); @@ -3153,20 +3154,20 @@ int ceph_con_v2_try_read(struct ceph_connection *con) } } -static void queue_data(struct ceph_connection *con) +static void queue_data(struct ceph_connection *con, struct ceph_msg *msg) { struct bio_vec bv; con->v2.out_epil.data_crc = -1; - ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg, - data_len(con->out_msg)); + ceph_msg_data_cursor_init(&con->v2.out_cursor, msg, + data_len(msg)); get_bvec_at(&con->v2.out_cursor, &bv); set_out_bvec(con, &bv, true); con->v2.out_state = OUT_S_QUEUE_DATA_CONT; } -static void queue_data_cont(struct ceph_connection *con) +static void queue_data_cont(struct ceph_connection *con, struct ceph_msg *msg) { struct bio_vec bv; @@ -3187,7 +3188,7 @@ static void queue_data_cont(struct ceph_connection *con) * we are done. */ reset_out_kvecs(con); - prepare_epilogue_plain(con, false); + prepare_epilogue_plain(con, msg, false); con->v2.out_state = OUT_S_FINISH_MESSAGE; } @@ -3219,7 +3220,7 @@ static void queue_enc_page(struct ceph_connection *con) con->v2.out_state = OUT_S_FINISH_MESSAGE; } -static void queue_zeros(struct ceph_connection *con) +static void queue_zeros(struct ceph_connection *con, struct ceph_msg *msg) { dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero); @@ -3236,7 +3237,7 @@ static void queue_zeros(struct ceph_connection *con) * Once it's written, we are done patching up for the revoke. */ reset_out_kvecs(con); - prepare_epilogue_plain(con, true); + prepare_epilogue_plain(con, msg, true); con->v2.out_state = OUT_S_FINISH_MESSAGE; } @@ -3263,6 +3264,7 @@ static void finish_message(struct ceph_connection *con) static int populate_out_iter(struct ceph_connection *con) { + struct ceph_msg *msg; int ret; dout("%s con %p state %d out_state %d\n", __func__, con, con->state, @@ -3278,18 +3280,18 @@ static int populate_out_iter(struct ceph_connection *con) switch (con->v2.out_state) { case OUT_S_QUEUE_DATA: WARN_ON(!con->out_msg); - queue_data(con); + queue_data(con, con->out_msg); goto populated; case OUT_S_QUEUE_DATA_CONT: WARN_ON(!con->out_msg); - queue_data_cont(con); + queue_data_cont(con, con->out_msg); goto populated; case OUT_S_QUEUE_ENC_PAGE: queue_enc_page(con); goto populated; case OUT_S_QUEUE_ZEROS: WARN_ON(con->out_msg); /* revoked */ - queue_zeros(con); + queue_zeros(con, con->out_msg); goto populated; case OUT_S_FINISH_MESSAGE: finish_message(con); @@ -3309,8 +3311,8 @@ static int populate_out_iter(struct ceph_connection *con) return ret; } } else if (!list_empty(&con->out_queue)) { - ceph_con_get_out_msg(con); - ret = prepare_message(con); + msg = ceph_con_get_out_msg(con); + ret = prepare_message(con, msg); if (ret) { pr_err("prepare_message failed: %d\n", ret); return ret; @@ -3422,17 +3424,18 @@ static u32 crc32c_zeros(u32 crc, int zero_len) return crc; } -static void prepare_zero_front(struct ceph_connection *con, int resid) +static void prepare_zero_front(struct ceph_connection *con, + struct ceph_msg *msg, int resid) { int sent; - WARN_ON(!resid || resid > front_len(con->out_msg)); - sent = front_len(con->out_msg) - resid; + WARN_ON(!resid || resid > front_len(msg)); + sent = front_len(msg) - resid; dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); if (sent) { con->v2.out_epil.front_crc = - crc32c(-1, con->out_msg->front.iov_base, sent); + crc32c(-1, msg->front.iov_base, sent); con->v2.out_epil.front_crc = crc32c_zeros(con->v2.out_epil.front_crc, resid); } else { @@ -3443,17 +3446,18 @@ static void prepare_zero_front(struct ceph_connection *con, int resid) out_zero_add(con, resid); } -static void prepare_zero_middle(struct ceph_connection *con, int resid) +static void prepare_zero_middle(struct ceph_connection *con, + struct ceph_msg *msg, int resid) { int sent; - WARN_ON(!resid || resid > middle_len(con->out_msg)); - sent = middle_len(con->out_msg) - resid; + WARN_ON(!resid || resid > middle_len(msg)); + sent = middle_len(msg) - resid; dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); if (sent) { con->v2.out_epil.middle_crc = - crc32c(-1, con->out_msg->middle->vec.iov_base, sent); + crc32c(-1, msg->middle->vec.iov_base, sent); con->v2.out_epil.middle_crc = crc32c_zeros(con->v2.out_epil.middle_crc, resid); } else { @@ -3464,61 +3468,64 @@ static void prepare_zero_middle(struct ceph_connection *con, int resid) out_zero_add(con, resid); } -static void prepare_zero_data(struct ceph_connection *con) +static void prepare_zero_data(struct ceph_connection *con, + struct ceph_msg *msg) { dout("%s con %p\n", __func__, con); - con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg)); - out_zero_add(con, data_len(con->out_msg)); + con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(msg)); + out_zero_add(con, data_len(msg)); } -static void revoke_at_queue_data(struct ceph_connection *con) +static void revoke_at_queue_data(struct ceph_connection *con, + struct ceph_msg *msg) { int boundary; int resid; - WARN_ON(!data_len(con->out_msg)); + WARN_ON(!data_len(msg)); WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); - boundary = front_len(con->out_msg) + middle_len(con->out_msg); + boundary = front_len(msg) + middle_len(msg); if (resid > boundary) { resid -= boundary; WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head\n", __func__, con); - if (front_len(con->out_msg)) - prepare_zero_front(con, front_len(con->out_msg)); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); - prepare_zero_data(con); + if (front_len(msg)) + prepare_zero_front(con, msg, front_len(msg)); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); + prepare_zero_data(con, msg); WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); con->v2.out_state = OUT_S_QUEUE_ZEROS; return; } - boundary = middle_len(con->out_msg); + boundary = middle_len(msg); if (resid > boundary) { resid -= boundary; dout("%s con %p was sending front\n", __func__, con); - prepare_zero_front(con, resid); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); - prepare_zero_data(con); - queue_zeros(con); + prepare_zero_front(con, msg, resid); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); + prepare_zero_data(con, msg); + queue_zeros(con, msg); return; } WARN_ON(!resid); dout("%s con %p was sending middle\n", __func__, con); - prepare_zero_middle(con, resid); - prepare_zero_data(con); - queue_zeros(con); + prepare_zero_middle(con, msg, resid); + prepare_zero_data(con, msg); + queue_zeros(con, msg); } -static void revoke_at_queue_data_cont(struct ceph_connection *con) +static void revoke_at_queue_data_cont(struct ceph_connection *con, + struct ceph_msg *msg) { int sent, resid; /* current piece of data */ - WARN_ON(!data_len(con->out_msg)); + WARN_ON(!data_len(msg)); WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); WARN_ON(!resid || resid > con->v2.out_bvec.bv_len); @@ -3537,10 +3544,11 @@ static void revoke_at_queue_data_cont(struct ceph_connection *con) con->v2.out_iter.count -= resid; out_zero_add(con, con->v2.out_cursor.total_resid); - queue_zeros(con); + queue_zeros(con, msg); } -static void revoke_at_finish_message(struct ceph_connection *con) +static void revoke_at_finish_message(struct ceph_connection *con, + struct ceph_msg *msg) { int boundary; int resid; @@ -3548,39 +3556,39 @@ static void revoke_at_finish_message(struct ceph_connection *con) WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); - if (!front_len(con->out_msg) && !middle_len(con->out_msg) && - !data_len(con->out_msg)) { + if (!front_len(msg) && !middle_len(msg) && + !data_len(msg)) { WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head (empty message) - noop\n", __func__, con); return; } - boundary = front_len(con->out_msg) + middle_len(con->out_msg) + + boundary = front_len(msg) + middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN; if (resid > boundary) { resid -= boundary; WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head\n", __func__, con); - if (front_len(con->out_msg)) - prepare_zero_front(con, front_len(con->out_msg)); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); + if (front_len(msg)) + prepare_zero_front(con, msg, front_len(msg)); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); con->v2.out_state = OUT_S_QUEUE_ZEROS; return; } - boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN; + boundary = middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN; if (resid > boundary) { resid -= boundary; dout("%s con %p was sending front\n", __func__, con); - prepare_zero_front(con, resid); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_front(con, msg, resid); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; - queue_zeros(con); + queue_zeros(con, msg); return; } @@ -3588,9 +3596,9 @@ static void revoke_at_finish_message(struct ceph_connection *con) if (resid > boundary) { resid -= boundary; dout("%s con %p was sending middle\n", __func__, con); - prepare_zero_middle(con, resid); + prepare_zero_middle(con, msg, resid); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; - queue_zeros(con); + queue_zeros(con, msg); return; } @@ -3598,7 +3606,7 @@ static void revoke_at_finish_message(struct ceph_connection *con) dout("%s con %p was sending epilogue - noop\n", __func__, con); } -void ceph_con_v2_revoke(struct ceph_connection *con) +void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg) { WARN_ON(con->v2.out_zero); @@ -3611,13 +3619,13 @@ void ceph_con_v2_revoke(struct ceph_connection *con) switch (con->v2.out_state) { case OUT_S_QUEUE_DATA: - revoke_at_queue_data(con); + revoke_at_queue_data(con, msg); break; case OUT_S_QUEUE_DATA_CONT: - revoke_at_queue_data_cont(con); + revoke_at_queue_data_cont(con, msg); break; case OUT_S_FINISH_MESSAGE: - revoke_at_finish_message(con); + revoke_at_finish_message(con, msg); break; default: WARN(1, "bad out_state %d", con->v2.out_state); -- cgit v1.2.3 From 207696b17f38e869e59889b44d395ab24bb678d3 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 18 Sep 2025 22:30:18 +0300 Subject: tpm: use a map for tpm2_calc_ordinal_duration() The current shenanigans for duration calculation introduce too much complexity for a trivial problem, and further the code is hard to patch and maintain. Address these issues with a flat look-up table, which is easy to understand and patch. If leaf driver specific patching is required in future, it is easy enough to make a copy of this table during driver initialization and add the chip parameter back. 'chip->duration' is retained for TPM 1.x. As the first entry for this new behavior address TCG spec update mentioned in this issue: https://github.com/raspberrypi/linux/issues/7054 Therefore, for TPM_SelfTest the duration is set to 3000 ms. This does not categorize a as bug, given that this is introduced to the spec after the feature was originally made. Reviewed-by: Serge Hallyn Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-interface.c | 2 +- drivers/char/tpm/tpm.h | 2 +- drivers/char/tpm/tpm2-cmd.c | 127 ++++++++++----------------------------- include/linux/tpm.h | 5 +- 4 files changed, 37 insertions(+), 99 deletions(-) (limited to 'include') diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index b71725827743..c9f173001d0e 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -52,7 +52,7 @@ MODULE_PARM_DESC(suspend_pcr, unsigned long tpm_calc_ordinal_duration(struct tpm_chip *chip, u32 ordinal) { if (chip->flags & TPM_CHIP_FLAG_TPM2) - return tpm2_calc_ordinal_duration(chip, ordinal); + return tpm2_calc_ordinal_duration(ordinal); else return tpm1_calc_ordinal_duration(chip, ordinal); } diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 7bb87fa5f7a1..2726bd38e5ac 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -299,7 +299,7 @@ ssize_t tpm2_get_tpm_pt(struct tpm_chip *chip, u32 property_id, ssize_t tpm2_get_pcr_allocation(struct tpm_chip *chip); int tpm2_auto_startup(struct tpm_chip *chip); void tpm2_shutdown(struct tpm_chip *chip, u16 shutdown_type); -unsigned long tpm2_calc_ordinal_duration(struct tpm_chip *chip, u32 ordinal); +unsigned long tpm2_calc_ordinal_duration(u32 ordinal); int tpm2_probe(struct tpm_chip *chip); int tpm2_get_cc_attrs_tbl(struct tpm_chip *chip); int tpm2_find_cc(struct tpm_chip *chip, u32 cc); diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 524d802ede26..7d77f6fbc152 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -28,120 +28,57 @@ static struct tpm2_hash tpm2_hash_map[] = { int tpm2_get_timeouts(struct tpm_chip *chip) { - /* Fixed timeouts for TPM2 */ chip->timeout_a = msecs_to_jiffies(TPM2_TIMEOUT_A); chip->timeout_b = msecs_to_jiffies(TPM2_TIMEOUT_B); chip->timeout_c = msecs_to_jiffies(TPM2_TIMEOUT_C); chip->timeout_d = msecs_to_jiffies(TPM2_TIMEOUT_D); - - /* PTP spec timeouts */ - chip->duration[TPM_SHORT] = msecs_to_jiffies(TPM2_DURATION_SHORT); - chip->duration[TPM_MEDIUM] = msecs_to_jiffies(TPM2_DURATION_MEDIUM); - chip->duration[TPM_LONG] = msecs_to_jiffies(TPM2_DURATION_LONG); - - /* Key creation commands long timeouts */ - chip->duration[TPM_LONG_LONG] = - msecs_to_jiffies(TPM2_DURATION_LONG_LONG); - chip->flags |= TPM_CHIP_FLAG_HAVE_TIMEOUTS; - return 0; } -/** - * tpm2_ordinal_duration_index() - returns an index to the chip duration table - * @ordinal: TPM command ordinal. - * - * The function returns an index to the chip duration table - * (enum tpm_duration), that describes the maximum amount of - * time the chip could take to return the result for a particular ordinal. - * - * The values of the MEDIUM, and LONG durations are taken - * from the PC Client Profile (PTP) specification (750, 2000 msec) - * - * LONG_LONG is for commands that generates keys which empirically takes - * a longer time on some systems. - * - * Return: - * * TPM_MEDIUM - * * TPM_LONG - * * TPM_LONG_LONG - * * TPM_UNDEFINED +/* + * Contains the maximum durations in milliseconds for TPM2 commands. */ -static u8 tpm2_ordinal_duration_index(u32 ordinal) -{ - switch (ordinal) { - /* Startup */ - case TPM2_CC_STARTUP: /* 144 */ - return TPM_MEDIUM; - - case TPM2_CC_SELF_TEST: /* 143 */ - return TPM_LONG; - - case TPM2_CC_GET_RANDOM: /* 17B */ - return TPM_LONG; - - case TPM2_CC_SEQUENCE_UPDATE: /* 15C */ - return TPM_MEDIUM; - case TPM2_CC_SEQUENCE_COMPLETE: /* 13E */ - return TPM_MEDIUM; - case TPM2_CC_EVENT_SEQUENCE_COMPLETE: /* 185 */ - return TPM_MEDIUM; - case TPM2_CC_HASH_SEQUENCE_START: /* 186 */ - return TPM_MEDIUM; - - case TPM2_CC_VERIFY_SIGNATURE: /* 177 */ - return TPM_LONG_LONG; - - case TPM2_CC_PCR_EXTEND: /* 182 */ - return TPM_MEDIUM; - - case TPM2_CC_HIERARCHY_CONTROL: /* 121 */ - return TPM_LONG; - case TPM2_CC_HIERARCHY_CHANGE_AUTH: /* 129 */ - return TPM_LONG; - - case TPM2_CC_GET_CAPABILITY: /* 17A */ - return TPM_MEDIUM; - - case TPM2_CC_NV_READ: /* 14E */ - return TPM_LONG; - - case TPM2_CC_CREATE_PRIMARY: /* 131 */ - return TPM_LONG_LONG; - case TPM2_CC_CREATE: /* 153 */ - return TPM_LONG_LONG; - case TPM2_CC_CREATE_LOADED: /* 191 */ - return TPM_LONG_LONG; - - default: - return TPM_UNDEFINED; - } -} +static const struct { + unsigned long ordinal; + unsigned long duration; +} tpm2_ordinal_duration_map[] = { + {TPM2_CC_STARTUP, 750}, + {TPM2_CC_SELF_TEST, 3000}, + {TPM2_CC_GET_RANDOM, 2000}, + {TPM2_CC_SEQUENCE_UPDATE, 750}, + {TPM2_CC_SEQUENCE_COMPLETE, 750}, + {TPM2_CC_EVENT_SEQUENCE_COMPLETE, 750}, + {TPM2_CC_HASH_SEQUENCE_START, 750}, + {TPM2_CC_VERIFY_SIGNATURE, 30000}, + {TPM2_CC_PCR_EXTEND, 750}, + {TPM2_CC_HIERARCHY_CONTROL, 2000}, + {TPM2_CC_HIERARCHY_CHANGE_AUTH, 2000}, + {TPM2_CC_GET_CAPABILITY, 750}, + {TPM2_CC_NV_READ, 2000}, + {TPM2_CC_CREATE_PRIMARY, 30000}, + {TPM2_CC_CREATE, 30000}, + {TPM2_CC_CREATE_LOADED, 30000}, +}; /** - * tpm2_calc_ordinal_duration() - calculate the maximum command duration - * @chip: TPM chip to use. + * tpm2_calc_ordinal_duration() - Calculate the maximum command duration * @ordinal: TPM command ordinal. * - * The function returns the maximum amount of time the chip could take - * to return the result for a particular ordinal in jiffies. - * - * Return: A maximal duration time for an ordinal in jiffies. + * Returns the maximum amount of time the chip is expected by kernel to + * take in jiffies. */ -unsigned long tpm2_calc_ordinal_duration(struct tpm_chip *chip, u32 ordinal) +unsigned long tpm2_calc_ordinal_duration(u32 ordinal) { - unsigned int index; + int i; - index = tpm2_ordinal_duration_index(ordinal); + for (i = 0; i < ARRAY_SIZE(tpm2_ordinal_duration_map); i++) + if (ordinal == tpm2_ordinal_duration_map[i].ordinal) + return msecs_to_jiffies(tpm2_ordinal_duration_map[i].duration); - if (index != TPM_UNDEFINED) - return chip->duration[index]; - else - return msecs_to_jiffies(TPM2_DURATION_DEFAULT); + return msecs_to_jiffies(TPM2_DURATION_DEFAULT); } - struct tpm2_pcr_read_out { __be32 update_cnt; __be32 pcr_selects_cnt; diff --git a/include/linux/tpm.h b/include/linux/tpm.h index b0e9eb5ef022..dc0338a783f3 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -228,10 +228,11 @@ enum tpm2_timeouts { TPM2_TIMEOUT_B = 4000, TPM2_TIMEOUT_C = 200, TPM2_TIMEOUT_D = 30, +}; + +enum tpm2_durations { TPM2_DURATION_SHORT = 20, - TPM2_DURATION_MEDIUM = 750, TPM2_DURATION_LONG = 2000, - TPM2_DURATION_LONG_LONG = 300000, TPM2_DURATION_DEFAULT = 120000, }; -- cgit v1.2.3 From a8482d2c9071d75c920eba0db36428898250ea57 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 11 Oct 2025 12:31:53 +0200 Subject: Revert "i2c: boardinfo: Annotate code used in init phase only" This reverts commit 1a2b423be6a89dd07d5fc27ea042be68697a6a49 because we got a regression report and need time to find out the details. Reported-by: Konrad Dybcio Closes: https://lore.kernel.org/r/29ec0082-4dd4-4120-acd2-44b35b4b9487@oss.qualcomm.com Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-boardinfo.c | 4 ++-- include/linux/i2c.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/i2c/i2c-boardinfo.c b/drivers/i2c/i2c-boardinfo.c index 338800321f8b..4df8ad092df3 100644 --- a/drivers/i2c/i2c-boardinfo.c +++ b/drivers/i2c/i2c-boardinfo.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL_GPL(__i2c_board_lock); LIST_HEAD(__i2c_board_list); EXPORT_SYMBOL_GPL(__i2c_board_list); -int __i2c_first_dynamic_bus_num __ro_after_init; +int __i2c_first_dynamic_bus_num; EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); @@ -48,7 +48,7 @@ EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); * The board info passed can safely be __initdata, but be careful of embedded * pointers (for platform_data, functions, etc) since that won't be copied. */ -int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) +int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) { int status; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 11a19241e360..20fd41b51d5c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -499,7 +499,7 @@ static inline struct i2c_client *i2c_verify_client(struct device *dev) * Modules for add-on boards must use other calls. */ #ifdef CONFIG_I2C_BOARDINFO -int __init +int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n); #else -- cgit v1.2.3